mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject svn commit: r1084367 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/hadoop/item/ main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ test/java/org/apache/mahout/cf/taste/hadoop/item/ test/java/org/apache/mahout/cf/taste/hado...
Date Tue, 22 Mar 2011 21:48:22 GMT
Author: ssc
Date: Tue Mar 22 21:48:22 2011
New Revision: 1084367

URL: http://svn.apache.org/viewvc?rev=1084367&view=rev
Log:
MAHOUT-628 Add an option to prune away users with fewer than a given number of preferences
to ItemSimilarityJob and RecommenderJob

Added:
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
      - copied, changed from r1084304, mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Removed:
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java Tue Mar 22 21:48:22 2011
@@ -100,6 +100,7 @@ public final class RecommenderJob extend
   
   private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
   private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
 
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
@@ -116,6 +117,8 @@ public final class RecommenderJob extend
     addOption("maxPrefsPerUser", "mp",
         "Maximum number of preferences considered per user in final recommendation phase",
         String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation "
+        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
     addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
         String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
     addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of cooccurrences per item to this "
@@ -139,6 +142,7 @@ public final class RecommenderJob extend
     String filterFile = parsedArgs.get("--filterFile");
     boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
     int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
+    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
     int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
     int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
     String similarityClassname = parsedArgs.get("--similarityClassname");
@@ -172,13 +176,14 @@ public final class RecommenderJob extend
         ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
         SequenceFileOutputFormat.class);
       toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
+      toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
       toUserVector.waitForCompletion(true);
     }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Job countUsers = prepareJob(inputPath,
+      Job countUsers = prepareJob(userVectorPath,
                                   countUsersPath,
-                                  TextInputFormat.class,
+                                  SequenceFileInputFormat.class,
                                   CountUsersMapper.class,
                                   CountUsersKeyWritable.class,
                                   VarLongWritable.class,

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java Tue Mar 22 21:48:22 2011
@@ -46,7 +46,17 @@ import org.apache.mahout.math.VectorWrit
  */
 public final class ToUserVectorReducer extends
     Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
-  
+
+  public static final String MIN_PREFERENCES_PER_USER = ToUserVectorReducer.class.getName()
+
+      ".minPreferencesPerUser";
+  private int minPreferences;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1);
+  }
+
   @Override
   protected void reduce(VarLongWritable userID,
                         Iterable<VarLongWritable> itemPrefs,
@@ -58,9 +68,11 @@ public final class ToUserVectorReducer e
       userVector.set(index, value);
     }
 
-    VectorWritable vw = new VectorWritable(userVector);
-    vw.setWritesLaxPrecision(true);
-    context.write(userID, vw);
+    if (userVector.getNumNondefaultElements() >= minPreferences) {
+      VectorWritable vw = new VectorWritable(userVector);
+      vw.setWritesLaxPrecision(true);
+      context.write(userID, vw);
+    }
   }
   
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java Tue Mar 22 21:48:22 2011
@@ -24,21 +24,19 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
 import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * Maps out the userIDs in a way that we can use a secondary sort on them
  */
 public class CountUsersMapper extends
-    Mapper<LongWritable,Text,CountUsersKeyWritable, VarLongWritable> {
+    Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable> {
 
   @Override
-  protected void map(LongWritable key,
-                     Text value,
+  protected void map(VarLongWritable key,
+                     VectorWritable value,
                      Context context) throws IOException, InterruptedException {
-
-    String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
-    long userID = Long.parseLong(tokens[0]);
-
+    long userID = key.get();
     context.write(new CountUsersKeyWritable(userID), new VarLongWritable(userID));
   }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Tue Mar 22 21:48:22 2011
@@ -87,6 +87,7 @@ public final class ItemSimilarityJob ext
 
   private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
   private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
 
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new ItemSimilarityJob(), args);
@@ -100,9 +101,13 @@ public final class ItemSimilarityJob ext
     addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use "
         + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
     addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number "
-        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
+        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
+        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
     addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of cooccurrences per item to this number "
-        + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+        + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
+        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
+        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
     addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
 
     Map<String,String> parsedArgs = parseArguments(args);
@@ -113,6 +118,7 @@ public final class ItemSimilarityJob ext
     String similarityClassName = parsedArgs.get("--similarityClassname");
     int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
     int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
+    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
     boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
 
     Path inputPath = getInputPath();
@@ -137,21 +143,6 @@ public final class ItemSimilarityJob ext
       itemIDIndex.waitForCompletion(true);
     }
 
-    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Job countUsers = prepareJob(inputPath,
-                                  countUsersPath,
-                                  TextInputFormat.class,
-                                  CountUsersMapper.class,
-                                  CountUsersKeyWritable.class,
-                                  VarLongWritable.class,
-                                  CountUsersReducer.class,
-                                  VarIntWritable.class,
-                                  NullWritable.class,
-                                  TextOutputFormat.class);
-      countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
-      countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
-      countUsers.waitForCompletion(true);
-    }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
       Job toUserVector = prepareJob(inputPath,
@@ -165,10 +156,27 @@ public final class ItemSimilarityJob ext
                                   VectorWritable.class,
                                   SequenceFileOutputFormat.class);
       toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
+      toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
       toUserVector.waitForCompletion(true);
     }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job countUsers = prepareJob(userVectorPath,
+                                  countUsersPath,
+                                  SequenceFileInputFormat.class,
+                                  CountUsersMapper.class,
+                                  CountUsersKeyWritable.class,
+                                  VarLongWritable.class,
+                                  CountUsersReducer.class,
+                                  VarIntWritable.class,
+                                  NullWritable.class,
+                                  TextOutputFormat.class);
+      countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
+      countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
+      countUsers.waitForCompletion(true);
+    }
+
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
       Job maybePruneAndTransponse = prepareJob(userVectorPath,
                                   itemUserMatrixPath,
                                   SequenceFileInputFormat.class,

Added: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java?rev=1084367&view=auto
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java (added)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java Tue Mar 22 21:48:22 2011
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.MathHelper;
+import org.easymock.classextension.EasyMock;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+/**
+ * tests {@link org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer}
+ */
+public class ToUserVectorReducerTest extends TasteTestCase {
+
+  @Test
+  public void testToUsersReducerMinPreferencesUserIgnored() throws Exception {
+    Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
+        EasyMock.createMock(Reducer.Context.class);
+
+    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    setField(reducer, "minPreferences", 2);
+
+    EasyMock.replay(context);
+
+    reducer.reduce(new VarLongWritable(123), Arrays.asList(new VarLongWritable(456)), context);
+
+    EasyMock.verify(context);
+  }
+
+  @Test
+  public void testToUsersReducerMinPreferencesUserPasses() throws Exception {
+    Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
+        EasyMock.createMock(Reducer.Context.class);
+
+    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    setField(reducer, "minPreferences", 2);
+
+    context.write(EasyMock.eq(new VarLongWritable(123)), MathHelper.vectorMatches(
+        MathHelper.elem(TasteHadoopUtils.idToIndex(456L), 1.0), MathHelper.elem(TasteHadoopUtils.idToIndex(789L), 1.0)));
+
+    EasyMock.replay(context);
+
+    reducer.reduce(new VarLongWritable(123), Arrays.asList(new VarLongWritable(456), new VarLongWritable(789)), context);
+
+    EasyMock.verify(context);
+  }
+
+}

Copied: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java (from r1084304, mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java)
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java&r1=1084304&r2=1084367&rev=1084367&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java Tue Mar 22 21:48:22 2011
@@ -54,22 +54,22 @@ import org.junit.Test;
  * Unit tests for the mappers and reducers in org.apache.mahout.cf.taste.hadoop.similarity.item
  * some integration tests with tiny data sets at the end
  */
-public final class ItemSimilarityTest extends TasteTestCase {
+public final class ItemSimilarityJobTest extends TasteTestCase {
 
   /**
    * Tests {@link CountUsersMapper}
    */
   @Test
   public void testCountUsersMapper() throws Exception {
-    Mapper<LongWritable,Text,CountUsersKeyWritable,VarLongWritable>.Context context =
+    Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable>.Context context =
         EasyMock.createMock(Mapper.Context.class);
     context.write(keyForUserID(12L), EasyMock.eq(new VarLongWritable(12L)));
     context.write(keyForUserID(35L), EasyMock.eq(new VarLongWritable(35L)));
     EasyMock.replay(context);
 
     CountUsersMapper mapper = new CountUsersMapper();
-    mapper.map(null, new Text("12,100,1.3"), context);
-    mapper.map(null, new Text("35,100,3.0"), context);
+    mapper.map(new VarLongWritable(12), new VectorWritable(), context);
+    mapper.map(new VarLongWritable(35), new VectorWritable(), context);
 
     EasyMock.verify(context);
   }



Mime
View raw message