mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r734822 - in /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix: NetflixDataModel.java NetflixFileDataModel.java NetflixMovie.java TransposeToByUser.java
Date Thu, 15 Jan 2009 21:20:49 GMT
Author: srowen
Date: Thu Jan 15 13:20:49 2009
New Revision: 734822

URL: http://svn.apache.org/viewvc?rev=734822&view=rev
Log: (empty)

Added:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixFileDataModel.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java
Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixMovie.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java?rev=734822&r1=734821&r2=734822&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixDataModel.java
Thu Jan 15 13:20:49 2009
@@ -71,7 +71,7 @@
 		log.info("Creating NetflixDataModel for directory: {}", dataDirectory);
 
 		log.info("Reading movie data...");
-		List<NetflixMovie> movies = readMovies(dataDirectory);
+		List<NetflixMovie> movies = NetflixMovie.readMovies(dataDirectory);
 
 		log.info("Reading preference data...");
 		List<User> users = readUsers(dataDirectory, movies);
@@ -88,10 +88,7 @@
 		for (File movieFile : new File(dataDirectory, "training_set").listFiles(filenameFilter))
{
       Iterator<String> lineIterator = new FileLineIterable(movieFile, false).iterator();
 			String line = lineIterator.next();
-			if (line == null) {
-				throw new IOException("Can't read first line of file " + movieFile);
-			}
-			int movieID = Integer.parseInt(line.substring(0, line.length() - 1));
+			int movieID = Integer.parseInt(line.substring(0, line.length() - 1)); // strip colon
 			NetflixMovie movie = movies.get(movieID - 1);
 			if (movie == null) {
 				throw new IllegalArgumentException("No such movie: " + movieID);

Added: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixFileDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixFileDataModel.java?rev=734822&view=auto
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixFileDataModel.java
(added)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixFileDataModel.java
Thu Jan 15 13:20:49 2009
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.netflix;
+
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.User;
+import org.apache.mahout.cf.taste.model.Item;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.FileLineIterable;
+import org.apache.mahout.cf.taste.impl.common.ArrayIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.ArrayList;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+/**
+ * A {@link DataModel} for the Netflix data set that reads a movie's ratings from its file under
+ * <code>training_set</code> each time they are requested. Only the list of movies is loaded when
+ * the model is constructed.
+ *
+ * <p>All user-oriented operations are still unimplemented and throw
+ * {@link UnsupportedOperationException}; the model is read-only.</p>
+ */
+public final class NetflixFileDataModel implements DataModel {
+
+  private final File dataDirectory;
+  private final List<NetflixMovie> movies;
+
+  /**
+   * @param dataDirectory root directory of the data set; handed to
+   *  {@link NetflixMovie#readMovies(File)} and used as the parent of <code>training_set</code>
+   * @throws IllegalArgumentException if <code>dataDirectory</code> is null
+   * @throws FileNotFoundException if <code>dataDirectory</code> does not exist or is not a directory
+   */
+  public NetflixFileDataModel(File dataDirectory) throws IOException {
+    if (dataDirectory == null) {
+      throw new IllegalArgumentException("dataDirectory is null");
+    }
+    boolean usableDirectory = dataDirectory.exists() && dataDirectory.isDirectory();
+    if (!usableDirectory) {
+      throw new FileNotFoundException(dataDirectory.toString());
+    }
+    this.movies = NetflixMovie.readMovies(dataDirectory);
+    this.dataDirectory = dataDirectory;
+  }
+
+  /** Not implemented yet. */
+  @Override
+  public Iterable<? extends User> getUsers() {
+    throw new UnsupportedOperationException(); // TODO
+  }
+
+  /** Not implemented yet; always throws {@link UnsupportedOperationException}. */
+  @Override
+  public User getUser(Object id) {
+    throw new UnsupportedOperationException(); // TODO
+  }
+
+  /** @return the movies read at construction time */
+  @Override
+  public Iterable<? extends Item> getItems() {
+    return movies;
+  }
+
+  /**
+   * @param id movie ID; must be an {@link Integer}. IDs are 1-based: ID <code>n</code> is the
+   *  element at position <code>n - 1</code> of the movie list
+   */
+  @Override
+  public Item getItem(Object id) {
+    int position = (Integer) id - 1;
+    return movies.get(position);
+  }
+
+  /** Wraps the result of {@link #getPreferencesForItemAsArray(Object)}. */
+  @Override
+  public Iterable<? extends Preference> getPreferencesForItem(Object itemID) {
+    Preference[] prefsForItem = getPreferencesForItemAsArray(itemID);
+    return new ArrayIterator<Preference>(prefsForItem);
+  }
+
+  /**
+   * Reads the ratings of one movie from
+   * <code>training_set/mv_00&lt;ID left-padded with zeros to 5 characters&gt;.txt</code>.
+   * On each line read, the text before the first comma is taken as the user ID and the text
+   * between the first and second comma as the rating.
+   *
+   * <p>NOTE: every line read triggers a call to {@link #getUser(Object)}, which is still a stub,
+   * so this method currently throws {@link UnsupportedOperationException} as soon as it reaches
+   * the first rating line.</p>
+   */
+  @Override
+  public Preference[] getPreferencesForItemAsArray(Object itemID) {
+    String unpaddedID = String.valueOf(itemID);
+    StringBuilder fileName = new StringBuilder("mv_00");
+    for (int length = unpaddedID.length(); length < 5; length++) {
+      fileName.append('0');
+    }
+    fileName.append(unpaddedID).append(".txt");
+
+    Item movie = getItem(itemID);
+    File trainingSetDirectory = new File(dataDirectory, "training_set");
+    File movieFile = new File(trainingSetDirectory, fileName.toString());
+
+    List<Preference> collected = new ArrayList<Preference>();
+    // second argument 'true': presumably skips the file's first (header) line -- confirm
+    // against FileLineIterable
+    for (String ratingLine : new FileLineIterable(movieFile, true)) {
+      int userEnd = ratingLine.indexOf(',');
+      int ratingEnd = ratingLine.indexOf(',', userEnd + 1);
+      Integer userID = Integer.valueOf(ratingLine.substring(0, userEnd));
+      double rating = Double.parseDouble(ratingLine.substring(userEnd + 1, ratingEnd));
+      collected.add(new GenericPreference(getUser(userID), movie, rating));
+    }
+    return collected.toArray(new Preference[collected.size()]);
+  }
+
+  /** @return the number of movies read at construction time */
+  @Override
+  public int getNumItems() {
+    return movies.size();
+  }
+
+  /** Not implemented yet. */
+  @Override
+  public int getNumUsers() {
+    throw new UnsupportedOperationException(); // TODO
+  }
+
+  /** Not implemented yet. */
+  @Override
+  public int getNumUsersWithPreferenceFor(Object... itemIDs) {
+    throw new UnsupportedOperationException(); // TODO
+  }
+
+  /**
+   * This model cannot be modified.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void setPreference(Object userID, Object itemID, double value) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * This model cannot be modified.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void removePreference(Object userID, Object itemID) {
+    throw new UnsupportedOperationException();
+  }
+
+  /** Does nothing. */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // do nothing
+  }
+
+}

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixMovie.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixMovie.java?rev=734822&r1=734821&r2=734822&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixMovie.java
(original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/NetflixMovie.java
Thu Jan 15 13:20:49 2009
@@ -18,6 +18,11 @@
 package org.apache.mahout.cf.taste.example.netflix;
 
 import org.apache.mahout.cf.taste.model.Item;
+import org.apache.mahout.cf.taste.impl.common.FileLineIterable;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.File;
 
 final class NetflixMovie implements Item {
 
@@ -66,4 +71,19 @@
 		return id + ":" + title;
 	}
 
+  /**
+   * Reads <code>movie_titles.txt</code> from the given directory. On each line the text before
+   * the first comma is parsed as the movie ID and everything after the second comma is used as
+   * the title.
+   *
+   * @param dataDirectory directory containing <code>movie_titles.txt</code>
+   * @return the movies in file order
+   * @throws IllegalStateException if a movie's ID does not equal its 1-based position in the file
+   */
+  static List<NetflixMovie> readMovies(File dataDirectory) {
+    File titlesFile = new File(dataDirectory, "movie_titles.txt");
+    List<NetflixMovie> result = new ArrayList<NetflixMovie>(17770);
+    for (String entry : new FileLineIterable(titlesFile, false)) {
+      int idEnd = entry.indexOf(',');
+      int secondSeparator = entry.indexOf(',', idEnd + 1);
+      int movieID = Integer.parseInt(entry.substring(0, idEnd));
+      String title = entry.substring(secondSeparator + 1);
+      result.add(new NetflixMovie(movieID, title));
+      // callers look movies up by (ID - 1), so IDs must be dense and in order
+      if (result.size() != movieID) {
+        throw new IllegalStateException("A movie is missing from movie_titles.txt");
+      }
+    }
+    return result;
+  }
+
+
 }

Added: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java?rev=734822&view=auto
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java
(added)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/netflix/TransposeToByUser.java
Thu Jan 15 13:20:49 2009
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.netflix;
+
+import org.apache.mahout.cf.taste.impl.common.FastMap;
+import org.apache.mahout.cf.taste.impl.common.FileLineIterable;
+import org.apache.mahout.cf.taste.impl.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.Writer;
+import java.io.OutputStreamWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Command-line tool that transposes the Netflix training set from one file per movie
+ * (<code>training_set</code>) into files grouped by user, written under
+ * <code>training_set_by_user</code> in the same data directory.
+ *
+ * <p>The first line of each input file is taken as the movie ID after dropping its last
+ * character. On every following line the text before the first comma is the user ID and the text
+ * between the first and second comma is the rating. Each output line has the form
+ * <code>movieID,rating</code> and is appended to
+ * <code>training_set_by_user/(userID % 10000)/(userID / 10000).txt</code>.</p>
+ */
+public final class TransposeToByUser {
+
+  private static final Logger log = LoggerFactory.getLogger(TransposeToByUser.class);
+
+  /** Number of distinct users buffered in memory before the buffered lines are written out. */
+  private static final int MAX_CACHED_USERS = 100000;
+
+  /** Divisor that maps a user ID to its intermediate directory (remainder) and file (quotient). */
+  private static final int USER_ID_DIVISOR = 10000;
+
+  private TransposeToByUser() {}
+
+  /**
+   * @param args <code>args[0]</code> is the data directory, which must contain
+   *  <code>training_set</code> and must not yet contain <code>training_set_by_user</code>
+   * @throws IllegalArgumentException if the argument is missing or the directories are not in
+   *  the state described above
+   * @throws IOException if a directory cannot be created or listed, or output cannot be written
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length < 1) {
+      throw new IllegalArgumentException("Usage: TransposeToByUser <dataDirectory>");
+    }
+
+    File dataDirectory = new File(args[0]);
+    File byItemDirectory = new File(dataDirectory, "training_set");
+    File byUserDirectory = new File(dataDirectory, "training_set_by_user");
+
+    if (!dataDirectory.exists() || !dataDirectory.isDirectory()) {
+      throw new IllegalArgumentException(dataDirectory + " is not a directory");
+    }
+    if (!byItemDirectory.exists() || !byItemDirectory.isDirectory()) {
+      throw new IllegalArgumentException(byItemDirectory + " is not a directory");
+    }
+    if (byUserDirectory.exists()) {
+      throw new IllegalArgumentException(byUserDirectory + " already exists");
+    }
+
+    if (!byUserDirectory.mkdirs()) {
+      throw new IOException("Could not create " + byUserDirectory);
+    }
+
+    // listFiles() returns null rather than throwing when the directory cannot be read
+    File[] byItemFiles = byItemDirectory.listFiles();
+    if (byItemFiles == null) {
+      throw new IOException("Could not list files in " + byItemDirectory);
+    }
+
+    Map<String, List<String>> byUserEntryCache =
+        new FastMap<String, List<String>>(MAX_CACHED_USERS);
+
+    for (File byItemFile : byItemFiles) {
+      log.info("Processing {}", byItemFile);
+      Iterator<String> lineIterator = new FileLineIterable(byItemFile, false).iterator();
+      String line = lineIterator.next();
+      String movieIDString = line.substring(0, line.length() - 1); // drop last char of header
+      while (lineIterator.hasNext()) {
+        line = lineIterator.next();
+        int firstComma = line.indexOf(',');
+        String userIDString = line.substring(0, firstComma);
+        int secondComma = line.indexOf(',', firstComma + 1);
+        String ratingString = line.substring(firstComma, secondComma); // keep comma
+        List<String> cachedLines = byUserEntryCache.get(userIDString);
+        if (cachedLines == null) {
+          cachedLines = new ArrayList<String>();
+          byUserEntryCache.put(userIDString, cachedLines);
+        }
+        cachedLines.add(movieIDString + ratingString);
+        maybeFlushCache(byUserDirectory, byUserEntryCache);
+      }
+    }
+
+    // Whatever is still buffered after the last input file has not been written yet;
+    // without this final flush those ratings would be lost.
+    flushCache(byUserDirectory, byUserEntryCache);
+
+  }
+
+  /** Writes out and empties the cache once it holds at least {@link #MAX_CACHED_USERS} users. */
+  private static void maybeFlushCache(File byUserDirectory,
+                                      Map<String, List<String>> byUserEntryCache)
+      throws IOException {
+    if (byUserEntryCache.size() >= MAX_CACHED_USERS) {
+      flushCache(byUserDirectory, byUserEntryCache);
+    }
+  }
+
+  /** Appends every cached user's lines to that user's file, then empties the cache. */
+  private static void flushCache(File byUserDirectory,
+                                 Map<String, List<String>> byUserEntryCache)
+      throws IOException {
+    if (byUserEntryCache.isEmpty()) {
+      return;
+    }
+    log.info("Flushing cache");
+    for (Map.Entry<String, List<String>> entry : byUserEntryCache.entrySet()) {
+      int userIDValue = Integer.parseInt(entry.getKey());
+      File intermediateDir =
+          new File(byUserDirectory, String.valueOf(userIDValue % USER_ID_DIVISOR));
+      // mkdirs() also returns false when the directory already exists, hence the second check
+      if (!intermediateDir.mkdirs() && !intermediateDir.isDirectory()) {
+        throw new IOException("Could not create " + intermediateDir);
+      }
+      File userIDFile = new File(intermediateDir, (userIDValue / USER_ID_DIVISOR) + ".txt");
+      appendStringsToFile(entry.getValue(), userIDFile);
+    }
+    byUserEntryCache.clear();
+  }
+
+  /** Appends each string, followed by a newline, to the end of the given file. */
+  private static void appendStringsToFile(List<String> strings, File file) throws IOException {
+    Writer outputStreamWriter = new OutputStreamWriter(new FileOutputStream(file, true));
+    try {
+      for (String s : strings) {
+        outputStreamWriter.write(s);
+        outputStreamWriter.write('\n');
+      }
+      // Flush explicitly so a write failure surfaces here; quietClose presumably swallows
+      // exceptions raised while closing (confirm against IOUtils).
+      outputStreamWriter.flush();
+    } finally {
+      IOUtils.quietClose(outputStreamWriter);
+    }
+  }
+
+}



Mime
View raw message