mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1170492 - in /mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer: DictionaryVectorizer.java term/TermCountCombiner.java term/TermCountReducer.java
Date Wed, 14 Sep 2011 09:21:30 GMT
Author: srowen
Date: Wed Sep 14 09:21:30 2011
New Revision: 1170492

URL: http://svn.apache.org/viewvc?rev=1170492&view=rev
Log:
MAHOUT-808 Avoid filtering too much by applying minSupport in combiner

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
      - copied, changed from r1170071, mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java Wed Sep 14 09:21:30 2011
@@ -49,6 +49,7 @@ import org.apache.mahout.math.VectorWrit
 import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
+import org.apache.mahout.vectorizer.term.TermCountCombiner;
 import org.apache.mahout.vectorizer.term.TermCountMapper;
 import org.apache.mahout.vectorizer.term.TermCountReducer;
 
@@ -332,7 +333,7 @@ public final class DictionaryVectorizer 
     job.setMapperClass(TermCountMapper.class);
     
     job.setInputFormatClass(SequenceFileInputFormat.class);
-    job.setCombinerClass(TermCountReducer.class);
+    job.setCombinerClass(TermCountCombiner.class);
     job.setReducerClass(TermCountReducer.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     

Copied: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
(from r1170071, mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java)
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java&r1=1170071&r2=1170492&rev=1170492&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java Wed Sep 14 09:21:30 2011
@@ -17,19 +17,16 @@
 
 package org.apache.mahout.vectorizer.term;
 
-import java.io.IOException;
-
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.vectorizer.DictionaryVectorizer;
+
+import java.io.IOException;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * @see TermCountReducer
  */
-public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
-
-  private int minSupport;
+public class TermCountCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
 
   @Override
   protected void reduce(Text key, Iterable<LongWritable> values, Context context)
@@ -38,16 +35,7 @@ public class TermCountReducer extends Re
     for (LongWritable value : values) {
       sum += value.get();
     }
-    if (sum >= minSupport) {
-      context.write(key, new LongWritable(sum));
-    }
-  }
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    minSupport = context.getConfiguration().getInt(DictionaryVectorizer.MIN_SUPPORT,
-                                                   DictionaryVectorizer.DEFAULT_MIN_SUPPORT);
+    context.write(key, new LongWritable(sum));
   }
 
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java Wed Sep 14 09:21:30 2011
@@ -25,7 +25,9 @@ import org.apache.hadoop.mapreduce.Reduc
 import org.apache.mahout.vectorizer.DictionaryVectorizer;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * This accumulates all the words and the weights and sums them up.
+ *
+ * @see TermCountCombiner
  */
 public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
 



Mime
View raw message