mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r805857 - in /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example: bookcrossing/BookCrossingDataModel.java jester/JesterDataModel.java netflix/NetflixDataModel.java netflix/TransposeToByUser.java
Date Wed, 19 Aug 2009 16:07:24 GMT
Author: srowen
Date: Wed Aug 19 16:07:23 2009
New Revision: 805857

URL: http://svn.apache.org/viewvc?rev=805857&view=rev
Log:
More cleanup especially of dead code in BookCrossingDataModel

Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java?rev=805857&r1=805856&r2=805857&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java	(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java	Wed Aug 19 16:07:23 2009
@@ -18,7 +18,6 @@
 package org.apache.mahout.cf.taste.example.bookcrossing;
 
 import org.apache.mahout.cf.taste.example.grouplens.GroupLensDataModel;
-import org.apache.mahout.cf.taste.impl.common.FastMap;
 import org.apache.mahout.cf.taste.impl.common.FileLineIterable;
 import org.apache.mahout.cf.taste.impl.common.IOUtils;
 import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
@@ -29,93 +28,25 @@
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.nio.charset.Charset;
-import java.util.Map;
 
-public final class BookCrossingDataModel extends FileDataModel {
 
-  private Map<String, String[]> userDataMap;
-  private final File usersFile;
+/**
+ * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
+ * data needed by this class. The BX-Book-Ratings.csv file is needed.
+ */
+public final class BookCrossingDataModel extends FileDataModel {
 
   public BookCrossingDataModel() throws IOException {
-    this(GroupLensDataModel.readResourceToTempFile("/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
-         GroupLensDataModel.readResourceToTempFile("/org/apache/mahout/cf/taste/example/bookcrossing/BX-Users.csv"));
+    this(GroupLensDataModel.readResourceToTempFile(
+            "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"));
   }
 
   /**
    * @param ratingsFile BookCrossing ratings file in its native format
-   * @param usersFile BookCrossing books file in its native format
    * @throws IOException if an error occurs while reading or writing files
    */
-  public BookCrossingDataModel(File ratingsFile, File usersFile) throws IOException {
+  public BookCrossingDataModel(File ratingsFile) throws IOException {
     super(convertBCFile(ratingsFile));
-    this.usersFile = usersFile;
-  }
-
-  @Override
-  protected void reload() {
-    userDataMap = new FastMap<String, String[]>(5001);
-
-    for (String line : new FileLineIterable(usersFile, true)) {
-      String[] tokens = tokenizeLine(line, 3);
-      if (tokens != null) {
-        String id = tokens[0];
-        userDataMap.put(id, new String[] { tokens[1], tokens[2] });
-      }
-    }
-    super.reload();
-    userDataMap = null;
-  }
-
-  private static String[] tokenizeLine(String line, int numTokens) {
-    String[] result = new String[numTokens];
-    int pos = 0;
-    int token = 0;
-    int start = 0;
-    int end = 0;
-    boolean inQuote = false;
-    int length = line.length();
-    while (pos < length && token < numTokens) {
-      char c = line.charAt(pos);
-      if (c == '"') {
-        if (inQuote) {
-          if (line.charAt(pos - 1) != '\\') {
-            end = pos;
-            inQuote = false;
-          }
-        } else {
-          start = pos + 1;
-          inQuote = true;
-        }
-      } else if (c == ';' && !inQuote) {
-        if (start == end) {
-          // last token was unquoted
-          end = pos + 1;
-        }
-        result[token] = line.substring(start, end);
-        start = pos + 1;
-        end = pos + 1;
-        token++;
-      }
-      pos++;
-    }
-    if (token == numTokens - 1) {
-      // one more at end
-      if (start == end) {
-        // last token was unquoted
-        end = pos;
-      }
-      result[token] = line.substring(start, end);
-      token++;
-    }
-    if (token != numTokens) {
-      return null;
-    }
-    for (int i = 0; i < result.length; i++) {
-      if ("NULL".equalsIgnoreCase(result[i])) {
-        result[i] = null;
-      }
-    }
-    return result;
   }
 
   private static File convertBCFile(File originalFile) throws IOException {
@@ -125,11 +56,8 @@
       try {
         writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(resultFile), Charset.forName("UTF-8")));
         for (String line : new FileLineIterable(originalFile, true)) {
-          if (line.indexOf(',') >= 0) {
-            // crude hack to work around corruptions in data file -- some bad lines with commas in them
-            continue;
-          }
-          String convertedLine = line.replace(';', ',').replace("\"", "");
+          // Delete commas, make semicolon delimiter into comma delimter, then remove quotes
+          String convertedLine = line.replace(",", "").replace(';', ',').replace("\"", "");
           writer.println(convertedLine);
         }
         writer.flush();

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java?rev=805857&r1=805856&r2=805857&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java	(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java	Wed Aug 19 16:07:23 2009
@@ -21,16 +21,12 @@
 import org.apache.mahout.cf.taste.impl.model.GenericPreference;
 import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
 import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FileLineIterator;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
 import org.apache.mahout.cf.taste.model.Preference;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.List;
-import java.util.Map;
 
 public final class JesterDataModel extends FileDataModel {
 
@@ -58,7 +54,7 @@
   protected void processLine(String line, FastByIDMap<Collection<Preference>> data, char delimiter) {
     String[] jokePrefs = line.split(",");
     int count = Integer.parseInt(jokePrefs[0]);
-    List<Preference> prefs = new ArrayList<Preference>(count);
+    Collection<Preference> prefs = new ArrayList<Preference>(count);
     for (int itemID = 1; itemID < jokePrefs.length; itemID++) { // yes skip first one, just a count
       String jokePref = jokePrefs[itemID];
       if (!"99".equals(jokePref)) {

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java?rev=805857&r1=805856&r2=805857&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java	(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java	Wed Aug 19 16:07:23 2009
@@ -76,7 +76,7 @@
 		delegate = new GenericDataModel(users);
 	}
 
-	private FastByIDMap<PreferenceArray> readUsers(File dataDirectory) throws IOException {
+	private FastByIDMap<PreferenceArray> readUsers(File dataDirectory) {
 		FastByIDMap<Collection<Preference>> userIDPrefMap = new FastByIDMap<Collection<Preference>>();
 
 		int counter = 0;

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java?rev=805857&r1=805856&r2=805857&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java	(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java	Wed Aug 19 16:07:23 2009
@@ -40,7 +40,7 @@
 
   private TransposeToByUser() {}
 
-  public static void main(String[] args) throws Exception {
+  public static void main(String[] args) throws IOException {
 
     File dataDirectory = new File(args[0]);
     File byItemDirectory = new File(dataDirectory, "training_set");
@@ -100,7 +100,7 @@
     }
   }
 
-  private static void appendStringsToFile(List<String> strings, File file) throws IOException {
+  private static void appendStringsToFile(Iterable<String> strings, File file) throws IOException {
     PrintWriter outputStreamWriter =
         new PrintWriter(new OutputStreamWriter(new FileOutputStream(file, true), Charset.forName("UTF-8")));
     try {



Mime
View raw message