mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r1211828 - /mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
Date Thu, 08 Dec 2011 11:11:40 GMT
Author: gsingers
Date: Thu Dec  8 11:11:39 2011
New Revision: 1211828

URL: http://svn.apache.org/viewvc?rev=1211828&view=rev
Log:
MAHOUT-688: fix dropping of maxDFPercent

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1211828&r1=1211827&r2=1211828&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Thu Dec  8 11:11:39 2011
@@ -84,13 +84,18 @@ public final class SparseVectorsFromSequ
     Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
       abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
       "The minimum document frequency.  Default is 1").withShortName("md").create();
-    
+
+    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
+      abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
+          + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.").withShortName("x").create();
+
     Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false).withArgument(
       abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create()).withDescription(
       "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors." +
               "  Can be used to remove really high frequency terms."
           + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors " +
-              "will be filtered out. Default is -1.0.").withShortName("xs").create();
+              "will be filtered out. Default is -1.0.  Overrides maxDFPercent").withShortName("xs").create();
     
     Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
       abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -134,7 +139,7 @@ public final class SparseVectorsFromSequ
     
     Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
         .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
-        .withOption(maxDFSigmaOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
+        .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
         .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
         .withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
         .withOption(logNormalizeOpt)
@@ -218,6 +223,10 @@ public final class SparseVectorsFromSequ
       if (cmdLine.hasOption(minDFOpt)) {
         minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
       }
+      int maxDFPercent = 99;
+      if (cmdLine.hasOption(maxDFPercentOpt)) {
+        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
+      }
       double maxDFSigma = -1.0;
       if (cmdLine.hasOption(maxDFSigmaOpt)) {
     	  maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
@@ -269,7 +278,7 @@ public final class SparseVectorsFromSequ
                  outputDir, conf, chunkSize);
        }
 
-       long maxDF = -1;
+       long maxDF = maxDFPercent;//if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
          Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
          Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);



Mime
View raw message