mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jeast...@apache.org
Subject svn commit: r947427 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/ examples/src/main/java/org/ap...
Date Sun, 23 May 2010 15:22:30 GMT
Author: jeastman
Date: Sun May 23 15:22:28 2010
New Revision: 947427

URL: http://svn.apache.org/viewvc?rev=947427&view=rev
Log:
MAHOUT-294: fixed -k option as optional but added required=true for Dirichlet
MAHOUT-398: added minimal vector renaming to improve clarity
MAHOUT-397: fixes to allow setting -nr in vector output stages

All tests ran before I installed the Java update. Will test on EC2 again today.

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
    mahout/trunk/examples/bin/build-reuters.sh
    mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
Sun May 23 15:22:28 2010
@@ -83,7 +83,7 @@ public class DirichletDriver {
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
     Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
-    Option kOpt = DefaultOptionCreator.kOption().create();
+    Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
     Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
     Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
     Option alphaOpt = DefaultOptionCreator.alphaOption().create();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Sun May
23 15:22:28 2010
@@ -145,10 +145,10 @@ public final class LDADriver {
     double oldLL = Double.NEGATIVE_INFINITY;
     boolean converged = false;
 
-    for (int iteration = 0; ((maxIterations < 1) || (iteration < maxIterations)) && !converged; iteration++) {
+    for (int iteration = 1; ((maxIterations < 1) || (iteration <= maxIterations)) && !converged; iteration++) {
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
-      Path stateOut = new Path(output, "state-" + (iteration + 1));
+      Path stateOut = new Path(output, "state-" + iteration);
       double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, topicSmoothing,
numReducers);
       double relChange = (oldLL - ll) / oldLL;
 
@@ -157,7 +157,7 @@ public final class LDADriver {
       log.info("(Old LL: {})", oldLL);
       log.info("(Rel Change: {})", relChange);
 
-      converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
+      converged = (iteration > 3) && (relChange < OVERALL_CONVERGENCE);
       stateIn = stateOut;
       oldLL = ll;
     }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
Sun May 23 15:22:28 2010
@@ -112,7 +112,7 @@ public final class DefaultOptionCreator 
    * Returns a default command line option for specification of numbers of clusters to create.
Used by Dirichlet, FuzzyKmeans, Kmeans
    */
   public static DefaultOptionBuilder kOption() {
-    return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+    return new DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
         new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
         "The number of clusters to create").withShortName("k");
   }

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
@@ -38,14 +38,15 @@ fi
 
 cd ../..
 ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/
./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
 
-# to use k-Means clustering, uncomment the next two lines
-#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
+# to use k-Means clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
 #./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
 
-# to use LDA clustering, uncomment the next two lines
-#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# to use LDA clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
+#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
+#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
(original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
Sun May 23 15:22:28 2010
@@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
     private final FileSystem fs;
     
     public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
-      if (chunkSizeInMB < 64) {
-        chunkSizeInMB = 64;
-      } else if (chunkSizeInMB > 1984) {
+      if (chunkSizeInMB > 1984) {
         chunkSizeInMB = 1984;
       }
       maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
(original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Sun May 23 15:22:28 2010
@@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
       abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
+              + " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
     Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
         .withDescription(
-          "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+          "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
         .withShortName("seq").create();
     
     Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
+      "If set, overwrite the output directory").withShortName("ow").create();
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
     
@@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
       if (cmdLine.hasOption(numReduceTasksOpt)) {
         reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       }
-      log.info("Pass1 reduce tasks: {}", reduceTasks);
+      log.info("Number of reduce tasks: {}", reduceTasks);
       
       Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
       if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
         TFIDFConverter.processTfIdf(
           new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
           new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent,
norm,
-          sequentialAccessOutput);
+          sequentialAccessOutput, reduceTasks);
       }
     } catch (OptionException e) {
       log.error("Exception", e);

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
(original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
Sun May 23 15:22:28 2010
@@ -69,13 +69,16 @@ public final class PartialVectorMerger {
    *          output directory were the partial vectors have to be created
    * @param normPower
    *          The normalization value. Must be greater than or equal to 0 or equal to {@link
#NO_NORMALIZING}
+   * @param numReducers 
+   *          The number of reducers to spawn
    * @throws IOException
    */
   public static void mergePartialVectors(List<Path> partialVectorPaths,
                                          Path output,
                                          float normPower,
                                          int dimension,
-                                         boolean sequentialAccess) throws IOException {
+                                         boolean sequentialAccess, 
+                                         int numReducers) throws IOException {
     if (normPower != NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
@@ -101,6 +104,7 @@ public final class PartialVectorMerger {
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setReducerClass(PartialVectorMergeReducer.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);
     
     HadoopUtil.overwriteOutput(output);
 

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
(original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
Sun May 23 15:22:28 2010
@@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
  */
 public final class DictionaryVectorizer {
   
-  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
   
   public static final String MIN_SUPPORT = "min.support";
   
@@ -153,7 +153,7 @@ public final class DictionaryVectorizer 
       Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
       makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
-        maxTermDimension[0], sequentialAccess);
+        maxTermDimension[0], sequentialAccess, numReducers);
     }
     
     Configuration conf = new Configuration();
@@ -162,7 +162,7 @@ public final class DictionaryVectorizer 
     Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
     if (dictionaryChunks.size() > 1) {
       PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
-        sequentialAccess);
+        sequentialAccess, numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -245,6 +245,8 @@ public final class DictionaryVectorizer 
    *          location of the chunk of features and the id's
    * @param output
    *          output directory were the partial vectors have to be created
+   * @param numReducers 
+   *          the desired number of reducer tasks
    * @throws IOException
    */
   private static void makePartialVectors(Path input,
@@ -252,7 +254,8 @@ public final class DictionaryVectorizer 
                                          Path dictionaryFilePath,
                                          Path output,
                                          int dimension,
-                                         boolean sequentialAccess) throws IOException {
+                                         boolean sequentialAccess, 
+                                         int numReducers) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -279,6 +282,7 @@ public final class DictionaryVectorizer 
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setReducerClass(TFPartialVectorReducer.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);
 
     HadoopUtil.overwriteOutput(output);
     

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
(original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
Sun May 23 15:22:28 2010
@@ -66,7 +66,7 @@ public final class TFIDFConverter {
   
   public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
   
-  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
   
   private static final String FREQUENCY_FILE = "frequency.file-";
   
@@ -99,17 +99,21 @@ public final class TFIDFConverter {
    * @param output
    *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s
of the document
    *          are generated
-   * @param minDf
-   *          The minimum document frequency. Default 1
-   * @param maxDFPercent
-   *          The max percentage of vectors for the DF. Can be used to remove really high
frequency features.
-   *          Expressed as an integer between 0 and 100. Default 99
    * @param chunkSizeInMegabytes
    *          the size in MB of the feature => id chunk to be kept in memory at each node
during Map/Reduce
    *          stage. Its recommended you calculated this based on the number of cores and
the free memory
    *          available to you per node. Say, you have 2 cores and around 1GB extra memory
to spare we
    *          recommend you use a split size of around 400-500MB so that two simultaneous
reducers can create
    *          partial vectors without thrashing the system due to increased swapping
+   * @param minDf
+   *          The minimum document frequency. Default 1
+   * @param maxDFPercent
+   *          The max percentage of vectors for the DF. Can be used to remove really high
frequency features.
+   *          Expressed as an integer between 0 and 100. Default 99
+   * @param numReducers 
+   *          The number of reducers to spawn. This also affects the possible parallelism
since each reducer
+   *          will typically produce a single output file containing tf-idf vectors for a
subset of the
+   *          documents in the corpus.
    * @throws IOException
    */
   public static void processTfIdf(Path input,
@@ -118,7 +122,8 @@ public final class TFIDFConverter {
                                   int minDf,
                                   int maxDFPercent,
                                   float normPower,
-                                  boolean sequentialAccessOutput) throws IOException {
+                                  boolean sequentialAccessOutput, 
+                                  int numReducers) throws IOException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
     } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -158,7 +163,7 @@ public final class TFIDFConverter {
     Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
     if (dictionaryChunks.size() > 1) {
       PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
-        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
+        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
(original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Sun May 23 15:22:28 2010
@@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
     DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
       getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
     TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
-                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false);
+                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false, 1);
     
   }
 }



Mime
View raw message