mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r953667 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/impl/similarity/file/ test/java/org/apache/mahout/cf/taste/impl/ test/java/org/apache/mahout/cf/taste/impl/model/file/ test/java/org/apache/mahout/cf/taste/impl/simil...
Date Fri, 11 Jun 2010 11:34:11 GMT
Author: srowen
Date: Fri Jun 11 11:34:10 2010
New Revision: 953667

URL: http://svn.apache.org/viewvc?rev=953667&view=rev
Log:
MAHOUT-412

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarityTest.java
Modified:
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java

Added: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java?rev=953667&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
(added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
Fri Jun 11 11:34:10 2010
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Pattern;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity.ItemItemSimilarity;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * An {@link ItemSimilarity} backed by a comma-delimited file. This class typically expects a file where each line
+ * contains an item ID, followed by another item ID, followed by a similarity value, separated by commas. You may also
+ * use tabs.
+ * </p>
+ *
+ * <p>
+ * The similarity value is assumed to be parseable as a <code>double</code> having a value between -1 and 1. The
+ * item IDs are parsed as <code>long</code>s. Similarities are symmetric so for a pair of items you do not have to
+ * include 2 lines in the file.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ *
+ * <p>
+ * This class is not intended for use with very large amounts of data. For that, a JDBC-backed {@link ItemSimilarity}
+ * and a database are more appropriate.
+ * </p>
+ */
+public class FileItemSimilarity implements ItemSimilarity {
+
+  /** Default value for the minimum reload interval, in milliseconds. */
+  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+
+  private static final Logger log = LoggerFactory.getLogger(FileItemSimilarity.class);
+
+  /**
+   * In-memory similarity built from the data file; <code>null</code> until the first load. Volatile so that
+   * a delegate published by {@link #reload()} in one thread is visible to readers in other threads.
+   */
+  private volatile ItemSimilarity delegate;
+  /** Serializes loads of the data file. */
+  private final ReentrantLock reloadLock;
+  private final File dataFile;
+  /** Modification timestamp of the data file as observed at construction or at the last reload. */
+  private volatile long lastModified;
+  /** Set to true once the data file has been read at least once. */
+  private volatile boolean loaded;
+  private final long minReloadIntervalMS;
+
+  /**
+   * Creates a similarity backed by the given file, using {@link #DEFAULT_MIN_RELOAD_INTERVAL_MS} as the
+   * minimum reload interval. The file is not read here; it is loaded on first use.
+   *
+   * @param dataFile
+   *          file containing the similarity data
+   * @throws IllegalArgumentException
+   *           if dataFile is null
+   * @throws FileNotFoundException
+   *           if dataFile does not exist or is a directory
+   * @throws IOException
+   *           declared by the signature; this constructor itself only raises the
+   *           {@link FileNotFoundException} case
+   */
+  public FileItemSimilarity(File dataFile) throws IOException {
+    this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+  }
+
+  /**
+   * Creates a similarity backed by the given file. The file is not read here; it is loaded on first use.
+   *
+   * @param dataFile
+   *          file containing the similarity data
+   * @param minReloadIntervalMS
+   *          the minimum interval in milliseconds after which a full reload of the original datafile is done
+   *          when refresh() is called
+   * @throws IllegalArgumentException
+   *           if dataFile is null
+   * @throws FileNotFoundException
+   *           if dataFile does not exist or is a directory
+   * @see #FileItemSimilarity(File)
+   */
+  public FileItemSimilarity(File dataFile, long minReloadIntervalMS) throws IOException {
+    if (dataFile == null) {
+      throw new IllegalArgumentException("dataFile is null");
+    }
+    if (!dataFile.exists() || dataFile.isDirectory()) {
+      throw new FileNotFoundException(dataFile.toString());
+    }
+
+    log.info("Creating FileItemSimilarity for file {}", dataFile);
+
+    this.dataFile = dataFile.getAbsoluteFile();
+    this.lastModified = dataFile.lastModified();
+    this.loaded = false;
+    this.minReloadIntervalMS = minReloadIntervalMS;
+    this.reloadLock = new ReentrantLock();
+  }
+
+  /** Delegates to the in-memory similarity, loading the data file first if that has not happened yet. */
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    checkLoaded();
+    return delegate.itemSimilarities(itemID1, itemID2s);
+  }
+
+  /** Delegates to the in-memory similarity, loading the data file first if that has not happened yet. */
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    checkLoaded();
+    return delegate.itemSimilarity(itemID1, itemID2);
+  }
+
+  /**
+   * Reloads the data file if nothing has been loaded yet, or if the file's modification timestamp is more
+   * than the minimum reload interval newer than the timestamp recorded at the last load.
+   *
+   * @param alreadyRefreshed
+   *          not used by this implementation
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // Compare via subtraction: "lastModified + minReloadIntervalMS" overflows for very large intervals,
+    // which would turn "practically never reload" into "always reload".
+    if (delegate == null || dataFile.lastModified() - lastModified > minReloadIntervalMS) {
+      log.debug("File has changed; reloading...");
+      reload();
+    }
+  }
+
+  /**
+   * Ensures the data file has been read once before the delegate is used. If another thread is currently
+   * loading, this waits for it to finish rather than returning with no delegate available.
+   */
+  private void checkLoaded() {
+    if (!loaded) {
+      reloadLock.lock();
+      try {
+        // Re-check under the lock: another thread may have completed the load while we were waiting.
+        if (!loaded) {
+          reload();
+        }
+      } finally {
+        reloadLock.unlock();
+      }
+    }
+  }
+
+  /**
+   * Reads the whole data file and replaces the in-memory delegate. The lock is always acquired (blocking
+   * while a load is running in another thread); it is reentrant, so calling this while already holding it
+   * is fine. If reading fails, the exception propagates and the previous state is left untouched.
+   */
+  protected void reload() {
+    reloadLock.lock();
+    try {
+      // Capture the timestamp before reading so a change made during the read is picked up by a later refresh.
+      long newLastModified = dataFile.lastModified();
+      delegate = new GenericItemSimilarity(new FileItemItemSimilarityIterable(dataFile));
+      lastModified = newLastModified;
+      loaded = true;
+    } finally {
+      reloadLock.unlock();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "FileItemSimilarity[dataFile:" + dataFile + ']';
+  }
+
+  /**
+   * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity}
+   */
+  static class FileItemItemSimilarityIterable implements Iterable<ItemItemSimilarity> {
+
+    private final File similaritiesFile;
+
+    FileItemItemSimilarityIterable(File similaritiesFile) {
+      this.similaritiesFile = similaritiesFile;
+    }
+
+    /**
+     * @return a fresh iterator over the file
+     * @throws IllegalArgumentException
+     *           if the file cannot be opened for reading
+     */
+    @Override
+    public Iterator<ItemItemSimilarity> iterator() {
+      return new FileItemItemSimilarityIterator(similaritiesFile);
+    }
+
+    /**
+     * a simple iterator using a {@link FileLineIterator} internally, parsing each
+     * line into an {@link ItemItemSimilarity}
+     */
+    static class FileItemItemSimilarityIterator implements Iterator<ItemItemSimilarity> {
+
+      /** Fields may be separated by a comma or a tab. */
+      private static final Pattern SEPARATOR = Pattern.compile("[,\t]");
+
+      private final FileLineIterator lineIterator;
+
+      FileItemItemSimilarityIterator(File similaritiesFile) {
+        try {
+          lineIterator = new FileLineIterator(similaritiesFile);
+        } catch (IOException e) {
+          throw new IllegalArgumentException("Cannot read similarities file", e);
+        }
+      }
+
+      @Override
+      public boolean hasNext() {
+        return lineIterator.hasNext();
+      }
+
+      /**
+       * Parses the next line as <code>itemID1,itemID2,value</code>. Fields beyond the third are ignored.
+       *
+       * @throws IllegalArgumentException
+       *           if the line has fewer than three fields; a field that is not a valid number raises
+       *           {@link NumberFormatException}, which is a subclass
+       */
+      @Override
+      public ItemItemSimilarity next() {
+        String line = lineIterator.next();
+        String[] tokens = SEPARATOR.split(line);
+        if (tokens.length < 3) {
+          // Report the offending line instead of failing with a bare ArrayIndexOutOfBoundsException.
+          throw new IllegalArgumentException(
+              "Malformed similarity line, expected 'itemID1,itemID2,value' but got: " + line);
+        }
+        return new ItemItemSimilarity(Long.parseLong(tokens[0]), Long.parseLong(tokens[1]),
+            Double.parseDouble(tokens[2]));
+      }
+
+      @Override
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+    }
+
+  }
+}

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java?rev=953667&r1=953666&r2=953667&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/TasteTestCase.java Fri
Jun 11 11:34:10 2010
@@ -26,6 +26,12 @@ import org.apache.mahout.cf.taste.model.
 import org.apache.mahout.cf.taste.model.Preference;
 import org.apache.mahout.cf.taste.model.PreferenceArray;
 
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -70,4 +76,14 @@ public abstract class TasteTestCase exte
     return false;
   }
 
+  /**
+   * Writes the given lines to the file as UTF-8 text, terminating each one via
+   * {@link PrintWriter#println(String)}. The writer is always closed, even if writing fails.
+   *
+   * @param file
+   *          file to write to
+   * @param lines
+   *          lines to write, in order
+   * @throws FileNotFoundException
+   *           if the file cannot be opened for writing
+   */
+  protected static void writeLines(File file, String... lines) throws FileNotFoundException {
+    FileOutputStream stream = new FileOutputStream(file);
+    PrintWriter out = new PrintWriter(new OutputStreamWriter(stream, Charset.forName("UTF-8")));
+    try {
+      for (int i = 0; i < lines.length; i++) {
+        out.println(lines[i]);
+      }
+    } finally {
+      out.close();
+    }
+  }
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java?rev=953667&r1=953666&r2=953667&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
(original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
Fri Jun 11 11:34:10 2010
@@ -31,14 +31,7 @@ import org.apache.mahout.cf.taste.neighb
 import org.apache.mahout.cf.taste.recommender.Recommender;
 import org.apache.mahout.cf.taste.similarity.UserSimilarity;
 
-import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.nio.charset.Charset;
 import java.util.NoSuchElementException;
 
 /** <p>Tests {@link FileDataModel}.</p> */
@@ -68,15 +61,7 @@ public final class FileDataModelTest ext
   public void setUp() throws Exception {
     super.setUp();
     testFile = getTestTempFile("test.txt");
-    PrintWriter writer =
-        new PrintWriter(new OutputStreamWriter(new FileOutputStream(testFile), Charset.forName("UTF-8")));
-    try {
-      for (String data : DATA) {
-        writer.println(data);
-      }
-    } finally {
-      writer.close();
-    }
+    writeLines(testFile, DATA);
     model = new FileDataModel(testFile);
   }
 
@@ -171,7 +156,7 @@ public final class FileDataModelTest ext
 
   public void testExplicitRefreshAfterCompleteFileUpdate() throws Exception {
     File file = getTestTempFile("refresh");
-    write(file, "123,456,3.0");
+    writeLines(file, "123,456,3.0");
 
     /* create a FileDataModel that always reloads when the underlying file has changed */
     FileDataModel dataModel = new FileDataModel(file, false, 0L);
@@ -180,7 +165,7 @@ public final class FileDataModelTest ext
     /* change the underlying file,
      * we have to wait at least a second to see the change in the file's lastModified timestamp
*/
     Thread.sleep(2000L);
-    write(file, "123,456,5.0");
+    writeLines(file, "123,456,5.0");
     dataModel.refresh(null);
 
     assertEquals(5.0f, dataModel.getPreferenceValue(123L, 456L));
@@ -190,12 +175,4 @@ public final class FileDataModelTest ext
     assertTrue(model.toString().length() > 0);
   }
 
-  private static void write(File file, String content) throws IOException {
-    BufferedWriter writer = new BufferedWriter(new FileWriter(file));
-    try {
-      writer.write(content);
-    } finally {
-      writer.close();
-    }
-  }
 }

Added: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarityTest.java?rev=953667&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarityTest.java
(added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarityTest.java
Fri Jun 11 11:34:10 2010
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity.ItemItemSimilarity;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * <p>Tests {@link FileItemSimilarity}: loading from a file, refresh behavior after the file changes,
+ * and the file-backed {@link Iterable} used to feed a {@link GenericItemSimilarity}.</p>
+ */
+public class FileItemSimilarityTest extends TasteTestCase {
+
+  /** Initial content of the test file. */
+  private static final String[] DATA = {
+      "1,5,0.125",
+      "1,7,0.5" };
+
+  /** Content written over the test file to simulate an update: one changed value and one new pair. */
+  private static final String[] CHANGED_DATA = {
+      "1,5,0.125",
+      "1,7,0.9",
+      "7,8,0.112" };
+
+  private File testFile;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    testFile = getTestTempFile("test.txt");
+    writeLines(testFile, DATA);
+  }
+
+  public void testLoadFromFile() throws Exception {
+    ItemSimilarity fromFile = new FileItemSimilarity(testFile);
+    assertMatchesInitialData(fromFile);
+  }
+
+  public void testNoRefreshAfterFileUpdate() throws Exception {
+    ItemSimilarity fromFile = new FileItemSimilarity(testFile, 0L);
+
+    /* call a method to make sure the original file is loaded */
+    fromFile.itemSimilarity(1L, 5L);
+
+    overwriteFileWithChangedData();
+
+    /* we shouldn't see any changes in the data as we have not yet refreshed */
+    assertEquals(0.5d, fromFile.itemSimilarity(1L, 7L));
+    assertEquals(0.5d, fromFile.itemSimilarity(7L, 1L));
+    assertTrue(Double.isNaN(fromFile.itemSimilarity(7L, 8L)));
+  }
+
+  public void testRefreshAfterFileUpdate() throws Exception {
+    ItemSimilarity fromFile = new FileItemSimilarity(testFile, 0L);
+
+    /* call a method to make sure the original file is loaded */
+    fromFile.itemSimilarity(1L, 5L);
+
+    overwriteFileWithChangedData();
+
+    fromFile.refresh(null);
+
+    /* we should now see the changes in the data */
+    assertEquals(0.9d, fromFile.itemSimilarity(1L, 7L));
+    assertEquals(0.9d, fromFile.itemSimilarity(7L, 1L));
+    assertEquals(0.125d, fromFile.itemSimilarity(1L, 5L));
+    assertEquals(0.125d, fromFile.itemSimilarity(5L, 1L));
+
+    assertFalse(Double.isNaN(fromFile.itemSimilarity(7L, 8L)));
+    assertEquals(0.112d, fromFile.itemSimilarity(7L, 8L));
+    assertEquals(0.112d, fromFile.itemSimilarity(8L, 7L));
+  }
+
+  public void testFileNotFoundExceptionForNonExistingFile() throws Exception {
+    try {
+      new FileItemSimilarity(new File("xKsdfksdfsdf"));
+      fail();
+    } catch (FileNotFoundException expected) {
+      /* this is the outcome the test is looking for */
+    }
+  }
+
+  public void testFileItemItemSimilarityIterable() throws Exception {
+    Iterable<ItemItemSimilarity> pairs = new FileItemSimilarity.FileItemItemSimilarityIterable(testFile);
+    GenericItemSimilarity inMemory = new GenericItemSimilarity(pairs);
+    assertMatchesInitialData(inMemory);
+  }
+
+  public void testToString() throws Exception {
+    ItemSimilarity fromFile = new FileItemSimilarity(testFile);
+    assertTrue(fromFile.toString().length() > 0);
+  }
+
+  /**
+   * Change the underlying file. We have to wait at least a second beforehand to see the change
+   * in the file's lastModified timestamp.
+   */
+  private void overwriteFileWithChangedData() throws Exception {
+    Thread.sleep(2000L);
+    writeLines(testFile, CHANGED_DATA);
+  }
+
+  /**
+   * Asserts that the given similarity reflects exactly the initial file content: both stored pairs are
+   * readable in either direction, the pair that is absent yields NaN, and the bulk lookup agrees.
+   */
+  private void assertMatchesInitialData(ItemSimilarity similarity) throws Exception {
+    assertEquals(0.125d, similarity.itemSimilarity(1L, 5L));
+    assertEquals(0.125d, similarity.itemSimilarity(5L, 1L));
+    assertEquals(0.5d, similarity.itemSimilarity(1L, 7L));
+    assertEquals(0.5d, similarity.itemSimilarity(7L, 1L));
+
+    assertTrue(Double.isNaN(similarity.itemSimilarity(7L, 8L)));
+
+    double[] valuesForOne = similarity.itemSimilarities(1L, new long[] { 5L, 7L });
+    assertNotNull(valuesForOne);
+    assertEquals(2, valuesForOne.length);
+    assertEquals(0.125d, valuesForOne[0]);
+    assertEquals(0.5d, valuesForOne[1]);
+  }
+
+}



Mime
View raw message