mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jeast...@apache.org
Subject svn commit: r945447 - in /mahout/trunk: conf/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache/...
Date Tue, 18 May 2010 01:08:21 GMT
Author: jeastman
Date: Tue May 18 01:08:20 2010
New Revision: 945447

URL: http://svn.apache.org/viewvc?rev=945447&view=rev
Log:
Clustering command line cleanup:
- refactored all clustering Driver main methods to move command line options to
  DefaultOptionCreator to improve command uniformity, defaults and to clean up
  messy drivers. Impacted Canopy, Dirichlet, FuzzyKMeans, KMeans and MeanShift
  drivers.
- added command option comments to conf/*.props consistent with above.
- added new files for canopy.props and lda.props
- removed Dirichlet's prototypeSize argument in favor of reading the first data point
  to determine cardinality

All tests run

Added:
    mahout/trunk/conf/canopy.props
    mahout/trunk/conf/lda.props
Modified:
    mahout/trunk/conf/dirichlet.props
    mahout/trunk/conf/fkmeans.props
    mahout/trunk/conf/kmeans.props
    mahout/trunk/conf/meanshift.props
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Added: mahout/trunk/conf/canopy.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/canopy.props?rev=945447&view=auto
==============================================================================
--- mahout/trunk/conf/canopy.props (added)
+++ mahout/trunk/conf/canopy.props Tue May 18 01:08:20 2010
@@ -0,0 +1,14 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#t1|t1 = <T1 threshold value>
+#t2|t2 = <T2 threshold value>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#cl|clustering = <cluster points if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+
+
+
+

Modified: mahout/trunk/conf/dirichlet.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/dirichlet.props?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/conf/dirichlet.props (original)
+++ mahout/trunk/conf/dirichlet.props Tue May 18 01:08:20 2010
@@ -0,0 +1,19 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#x|maxIter = <number of iterations>
+#k|k = <number of models>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#m|alpha = <Dirichlet alpha_0 value. Default: 1.0>
+#md|modelDistClass = <model distribution class name. Default: NormalModelDistribution>
+#mp|modelPrototypeClass = <vector class name for models. Default: RandomAccessSparseVector>
+#r|maxRed = <number of reducers. Default: 1>
+#cl|clustering = <cluster points if present>
+#e|emitMostLikely = <emit most likely cluster if clustering. Default: true>
+#t|threshold = <threshold if clustering and not emitMostLikely. Default: 0.0>
+
+
+
+

Modified: mahout/trunk/conf/fkmeans.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/fkmeans.props?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/conf/fkmeans.props (original)
+++ mahout/trunk/conf/fkmeans.props Tue May 18 01:08:20 2010
@@ -0,0 +1,16 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#c|clusters = /path/to/initial/clusters
+#o|output = /path/to/output
+#m|m = <the coefficient normalization factor, must be greater than 1.0>
+#x|max = <the maximum number of iterations to attempt>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+#cd|convergenceDelta = <the convergence threshold. Default: 0.5>
+#u|numMap = <the number of mapper tasks to launch. Default: 10>
+#r|numReduce = <the number of reduce tasks to launch. Default: 1>
+#cl|clustering = <cluster points if present>
+#e|emitMostLikely = <emit most likely cluster if clustering. Default: true>
+#t|threshold = <threshold if clustering and not emitMostLikely. Default: 0.0>

Modified: mahout/trunk/conf/kmeans.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/kmeans.props?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/conf/kmeans.props (original)
+++ mahout/trunk/conf/kmeans.props Tue May 18 01:08:20 2010
@@ -1,5 +1,12 @@
+# The following parameters must be specified
 #i|input = /path/to/input
+#c|clusters = /path/to/initial/clusters
 #o|output = /path/to/output
-#c|clusters = /path/to/put/clusters
-#x|max = <numIterations>
-#k|k = <numClusters>
+#x|max = <the maximum number of iterations to attempt>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#cl|clustering = <cluster points if present>
+#dm|distance = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>
+#cd|convergenceDelta = <the convergence threshold. Default: 0.5>
+#r|numReduce = <the number of reduce tasks to launch. Default: 1>
\ No newline at end of file

Added: mahout/trunk/conf/lda.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/lda.props?rev=945447&view=auto
==============================================================================
--- mahout/trunk/conf/lda.props (added)
+++ mahout/trunk/conf/lda.props Tue May 18 01:08:20 2010
@@ -0,0 +1,11 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#k|numTopics = <number of topics>
+#v|numWords = <number of words in corpus>
+
+# The following parameters all have default values if not specified
+#a|topicSmoothing = <topic smoothing. Default: 50/numTopics>
+#maxIter|maxIter = <maximum number of iterations. Default: -1 (until converged)>
+#numReducers|numReducers = <the number of reducers. Default: 10>
+

Modified: mahout/trunk/conf/meanshift.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/meanshift.props?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/conf/meanshift.props (original)
+++ mahout/trunk/conf/meanshift.props Tue May 18 01:08:20 2010
@@ -0,0 +1,13 @@
+# The following parameters must be specified
+#i|input = /path/to/input
+#o|output = /path/to/output
+#t1|t1 = <T1 threshold value>
+#t2|t2 = <T2 threshold value>
+#cd|convergenceDelta = <the convergence threshold to halt iteration>
+#x|maxIter = <the maximum number of iterations to attempt>
+
+# The following parameters all have default values if not specified
+#ow|overwrite = <clear output directory if present>
+#cl|clustering = <cluster points if present>
+#ic|inputIsCanopies = <if present, input directory contains MeanShiftCanopies (vs. VectorWritable)>
+#dm|distanceMeasure = <distance measure class name. Default: SquaredEuclideanDistanceMeasure>

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Tue May 18 01:08:20 2010
@@ -42,6 +42,7 @@ import org.apache.mahout.clustering.Clus
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
@@ -57,37 +58,18 @@ public final class CanopyDriver {
   }
 
   public static void main(String[] args) throws IOException {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The Path to put the output in")
-        .withShortName("o").create();
-
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-        "If set, overwrite the output directory").withShortName("w").create();
-
-    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
-        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
-
-    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
-        abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v").create();
-    Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
-        abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("t1").withShortName("t1").create();
-    Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
-        abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("t2").withShortName("t2").create();
+    Option helpOpt = DefaultOptionCreator.helpOption();
+    Option inputOpt = DefaultOptionCreator.inputOption().create();
+    Option outputOpt = DefaultOptionCreator.outputOption().create();
+    Option measureClassOpt = DefaultOptionCreator.distanceMeasureOption().create();
+    Option t1Opt = DefaultOptionCreator.t1Option().create();
+    Option t2Opt = DefaultOptionCreator.t2Option().create();
 
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
 
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput).withOption(
-        measureClassOpt).withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt).withOption(helpOpt).create();
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput).withOption(
+        measureClassOpt).withOption(t1Opt).withOption(t2Opt).withOption(clusteringOpt).withOption(helpOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -104,18 +86,11 @@ public final class CanopyDriver {
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
-      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
-      if (cmdLine.hasOption(measureClassOpt)) {
-        measureClass = cmdLine.getValue(measureClassOpt).toString();
-      }
-
-      // Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
-      // RandomAccessSparseVector.class
-      // : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+      String measureClass = cmdLine.getValue(measureClassOpt).toString();
       double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
       double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
 
-      runJob(input, output, measureClass, t1, t2, false);
+      runJob(input, output, measureClass, t1, t2, cmdLine.hasOption(clusteringOpt));
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Tue May 18 01:08:20 2010
@@ -30,15 +30,18 @@ import org.apache.commons.cli2.builder.D
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputLogFilter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.mahout.clustering.Cluster;
@@ -76,54 +79,23 @@ public class DirichletDriver {
   }
 
   public static void main(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
+    Option helpOpt = DefaultOptionCreator.helpOption();
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
-    Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
-    Option topicsOpt = DefaultOptionCreator.kOption().create();
-    Option helpOpt = DefaultOptionCreator.helpOption();
-
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-        "If set, overwrite the output directory").withShortName("w").create();
-
-    Option mOpt = obuilder.withLongName("alpha").withRequired(true).withShortName("m").withArgument(
-        abuilder.withName("alpha").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The alpha0 value for the DirichletDistribution.").create();
-
-    Option modelOpt = obuilder.withLongName("modelClass").withRequired(true).withShortName("d").withArgument(
-        abuilder.withName("modelClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The ModelDistribution class name. " + "Defaults to org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution")
-        .create();
-
-    Option prototypeOpt = obuilder.withLongName("modelPrototypeClass").withRequired(false).withShortName("p").withArgument(
-        abuilder.withName("prototypeClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The ModelDistribution prototype Vector class name. " + "Defaults to org.apache.mahout.math.RandomAccessSparseVector")
-        .create();
-
-    Option sizeOpt = obuilder.withLongName("prototypeSize").withRequired(true).withShortName("s").withArgument(
-        abuilder.withName("prototypeSize").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The ModelDistribution prototype Vector size. ").create();
-
-    Option numRedOpt = obuilder.withLongName("maxRed").withRequired(true).withShortName("r").withArgument(
-        abuilder.withName("maxRed").withMinimum(1).withMaximum(1).create()).withDescription("The number of reduce tasks.").create();
-
-    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
-        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
-
-    Option emitMostLikelyOpt = obuilder.withLongName("emitMostLikely").withRequired(false).withShortName("e").withArgument(
-        abuilder.withName("emitMostLikely").withMinimum(1).withMaximum(1).create()).withDescription(
-        "True if clustering emits most likely point only, false for threshold clustering").create();
-
-    Option thresholdOpt = obuilder.withLongName("threshold").withRequired(false).withShortName("t").withArgument(
-        abuilder.withName("threshold").withMinimum(1).withMaximum(1).create()).withDescription("The pdf threshold").create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput).withOption(
-        modelOpt).withOption(prototypeOpt).withOption(sizeOpt).withOption(maxIterOpt).withOption(mOpt).withOption(topicsOpt)
-        .withOption(helpOpt).withOption(numRedOpt).withOption(clusteringOpt).withOption(emitMostLikelyOpt).withOption(thresholdOpt)
-        .create();
+    Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
+    Option kOpt = DefaultOptionCreator.kOption().create();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
+    Option alphaOpt = DefaultOptionCreator.alphaOption().create();
+    Option modelDistOpt = DefaultOptionCreator.modelDistributionOption().create();
+    Option prototypeOpt = DefaultOptionCreator.modelPrototypeOption().create();
+    Option numRedOpt = DefaultOptionCreator.numReducersOption().create();
+    Option emitMostLikelyOpt = DefaultOptionCreator.emitMostLikelyOption().create();
+    Option thresholdOpt = DefaultOptionCreator.thresholdOption().create();
+
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput).withOption(
+        modelDistOpt).withOption(prototypeOpt).withOption(maxIterOpt).withOption(alphaOpt).withOption(kOpt).withOption(helpOpt)
+        .withOption(numRedOpt).withOption(clusteringOpt).withOption(emitMostLikelyOpt).withOption(thresholdOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -139,33 +111,17 @@ public class DirichletDriver {
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
-      String modelFactory = "org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution";
-      if (cmdLine.hasOption(modelOpt)) {
-        modelFactory = cmdLine.getValue(modelOpt).toString();
-      }
-      String modelPrototype = "org.apache.mahout.math.RandomAccessSparseVector";
-      if (cmdLine.hasOption(prototypeOpt)) {
-        modelPrototype = cmdLine.getValue(prototypeOpt).toString();
-      }
-      int prototypeSize = Integer.parseInt(cmdLine.getValue(sizeOpt).toString());
+      String modelFactory = cmdLine.getValue(modelDistOpt).toString();
+      String modelPrototype = cmdLine.getValue(prototypeOpt).toString();
+      int numModels = Integer.parseInt(cmdLine.getValue(kOpt).toString());
       int numReducers = Integer.parseInt(cmdLine.getValue(numRedOpt).toString());
-      int numModels = Integer.parseInt(cmdLine.getValue(topicsOpt).toString());
       int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
-      boolean runClustering = true;
-      if (cmdLine.hasOption(clusteringOpt)) {
-        runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
-      }
-      boolean emitMostLikely = true;
-      if (cmdLine.hasOption(emitMostLikelyOpt)) {
-        emitMostLikely = Boolean.parseBoolean(cmdLine.getValue(emitMostLikelyOpt).toString());
-      }
-      double threshold = 0;
-      if (cmdLine.hasOption(thresholdOpt)) {
-        threshold = Double.parseDouble(cmdLine.getValue(thresholdOpt).toString());
-      }
-      double alpha_0 = Double.parseDouble(cmdLine.getValue(mOpt).toString());
-      runJob(input, output, modelFactory, modelPrototype, prototypeSize, numModels, maxIterations, alpha_0, numReducers,
-          runClustering, emitMostLikely, threshold);
+      boolean emitMostLikely = Boolean.parseBoolean(cmdLine.getValue(emitMostLikelyOpt).toString());
+      double threshold = Double.parseDouble(cmdLine.getValue(thresholdOpt).toString());
+      double alpha_0 = Double.parseDouble(cmdLine.getValue(alphaOpt).toString());
+
+      runJob(input, output, modelFactory, modelPrototype, numModels, maxIterations, alpha_0, numReducers, cmdLine
+          .hasOption(clusteringOpt), emitMostLikely, threshold);
     } catch (OptionException e) {
       log.error("Exception parsing command line: ", e);
       CommandLineUtil.printHelp(group);
@@ -181,37 +137,8 @@ public class DirichletDriver {
    *          the directory pathname for output points
    * @param modelFactory
    *          the String ModelDistribution class name to use
-   * @param numClusters
-   *          the number of models
-   * @param maxIterations
-   *          the maximum number of iterations
-   * @param alpha_0
-   *          the alpha_0 value for the DirichletDistribution
-   * @param numReducers
-   *          the number of Reducers desired
-   * @deprecated since it presumes 2-d, dense vector model prototypes
-   */
-  @Deprecated
-  public static void runJob(Path input, Path output, String modelFactory, int numClusters, int maxIterations, double alpha_0,
-      int numReducers) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
-      SecurityException, NoSuchMethodException, InvocationTargetException {
-    runJob(input, output, modelFactory, "org.apache.mahout.math.DenseVector", 2, numClusters, maxIterations, alpha_0, numReducers,
-        false, true, 0);
-  }
-
-  /**
-   * Run the job using supplied arguments
-   * 
-   * @param input
-   *          the directory pathname for input points
-   * @param output
-   *          the directory pathname for output points
-   * @param modelFactory
-   *          the String ModelDistribution class name to use
    * @param modelPrototype
    *          the String class name of the model prototype
-   * @param prototypeSize
-   *          the int size of the prototype to use
    * @param numClusters
    *          the number of models
    * @param maxIterations
@@ -227,19 +154,22 @@ public class DirichletDriver {
    * @param threshold 
    *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    */
-  public static void runJob(Path input, Path output, String modelFactory, String modelPrototype, int prototypeSize,
-      int numClusters, int maxIterations, double alpha_0, int numReducers, boolean runClustering, boolean emitMostLikely,
-      double threshold) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
-      SecurityException, NoSuchMethodException, InvocationTargetException {
+  public static void runJob(Path input, Path output, String modelFactory, String modelPrototype, int numClusters,
+      int maxIterations, double alpha_0, int numReducers, boolean runClustering, boolean emitMostLikely, double threshold)
+      throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, SecurityException,
+      NoSuchMethodException, InvocationTargetException {
 
     Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
-    writeInitialState(output, clustersIn, modelFactory, modelPrototype, prototypeSize, numClusters, alpha_0);
+
+    int protoSize = readPrototypeSize(input);
+
+    writeInitialState(output, clustersIn, modelFactory, modelPrototype, protoSize, numClusters, alpha_0);
 
     for (int iteration = 1; iteration <= maxIterations; iteration++) {
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
       Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
-      runIteration(input, clustersIn, clustersOut, modelFactory, modelPrototype, prototypeSize, numClusters, alpha_0, numReducers);
+      runIteration(input, clustersIn, clustersOut, modelFactory, modelPrototype, protoSize, numClusters, alpha_0, numReducers);
       // now point the input to the old output directory
       clustersIn = clustersOut;
     }
@@ -249,6 +179,24 @@ public class DirichletDriver {
     }
   }
 
+  private static int readPrototypeSize(Path input) throws IOException, InstantiationException, IllegalAccessException {
+    JobConf job = new JobConf(DirichletDriver.class);
+    FileSystem fs = FileSystem.get(input.toUri(), job);
+    FileStatus[] status = fs.listStatus(input, new OutputLogFilter());
+    int protoSize = 0;
+    for (FileStatus s : status) {
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), job);
+      WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
+      VectorWritable value = new VectorWritable();
+      if (reader.next(key, value)) {
+        protoSize = value.get().size();
+      }
+      reader.close();
+      break;
+    }
+    return protoSize;
+  }
+
   private static void writeInitialState(Path output, Path stateIn, String modelFactory, String modelPrototype, int prototypeSize,
       int numModels, double alpha_0) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
       SecurityException, NoSuchMethodException, InvocationTargetException {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Tue May 18 01:08:20 2010
@@ -49,6 +49,7 @@ import org.apache.mahout.clustering.Weig
 import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -56,80 +57,36 @@ import org.slf4j.LoggerFactory;
 public final class FuzzyKMeansDriver {
 
   private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansDriver.class);
-  
+
   private FuzzyKMeansDriver() {
   }
 
   public static void main(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
-
-    Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
-        abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
+    Option inputOpt = DefaultOptionCreator.inputOption().create();
+    Option outputOpt = DefaultOptionCreator.outputOption().create();
+    Option measureClassOpt = DefaultOptionCreator.distanceMeasureOption().create();
+    Option clustersOpt = DefaultOptionCreator.clustersInOption().withDescription(
         "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
             + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first")
-        .withShortName("c").create();
-
-    Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
-        abuilder.withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
-            + " as the Centroid and written to the clusters output path.").withShortName("k").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The Path to put the output in")
-        .withShortName("o").create();
-
-    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
-        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("dm").create();
-
-    Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
-        abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The threshold below which the clusters are considered to be converged.  Default is 0.5").withShortName("d").create();
-
-    Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of iterations to perform.  Default is 20").withShortName("x").create();
-
-    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
-        abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-        "If set, overwrite the output directory").withShortName("w").create();
-
-    Option mOpt = obuilder.withLongName("m").withRequired(true).withArgument(
-        abuilder.withName("m").withMinimum(1).withMaximum(1).create()).withDescription(
-        "coefficient normalization factor, must be greater than 1").withShortName("m").create();
-
-    Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
-        abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription("The number of reduce tasks")
-        .withShortName("r").create();
-
-    Option numMapTasksOpt = obuilder.withLongName("numMap").withRequired(false).withArgument(
-        abuilder.withName("numMap").withMinimum(1).withMaximum(1).create()).withDescription("The number of map tasks")
-        .withShortName("u").create();
-
-    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
-        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
-
-    Option emitMostLikelyOpt = obuilder.withLongName("emitMostLikely").withRequired(false).withShortName("e").withArgument(
-        abuilder.withName("emitMostLikely").withMinimum(1).withMaximum(1).create()).withDescription(
-        "True if clustering emits most likely point only, false for threshold clustering").create();
-
-    Option thresholdOpt = obuilder.withLongName("threshold").withRequired(false).withShortName("t").withArgument(
-        abuilder.withName("threshold").withMinimum(1).withMaximum(1).create()).withDescription("The pdf threshold").create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(
-        measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(kOpt).withOption(mOpt).withOption(
-        vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).withOption(emitMostLikelyOpt).withOption(thresholdOpt)
         .create();
+    Option kOpt = DefaultOptionCreator.kOption().withDescription(
+        "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
+            + " as the Centroid and written to the clusters input path.").create();
+    Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
+    Option maxIterationsOpt = DefaultOptionCreator.maxIterationsOption().create();
+    Option helpOpt = DefaultOptionCreator.helpOption();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option mOpt = DefaultOptionCreator.mOption().create();
+    Option numReduceTasksOpt = DefaultOptionCreator.numReducersOption().create();
+    Option numMapTasksOpt = DefaultOptionCreator.numMappersOption().create();
+    Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
+    Option emitMostLikelyOpt = DefaultOptionCreator.emitMostLikelyOption().create();
+    Option thresholdOpt = DefaultOptionCreator.thresholdOption().create();
+
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt)
+        .withOption(measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(kOpt).withOption(mOpt)
+        .withOption(overwriteOutput).withOption(helpOpt).withOption(numMapTasksOpt).withOption(numReduceTasksOpt).withOption(
+            emitMostLikelyOpt).withOption(thresholdOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -146,47 +103,20 @@ public final class FuzzyKMeansDriver {
       if (cmdLine.hasOption(measureClassOpt)) {
         measureClass = cmdLine.getValue(measureClassOpt).toString();
       }
-      double convergenceDelta = 0.5;
-      if (cmdLine.hasOption(convergenceDeltaOpt)) {
-        convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
-      }
+      double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
       float m = Float.parseFloat(cmdLine.getValue(mOpt).toString());
 
-      // Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
-      // RandomAccessSparseVector.class
-      // : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
-
-      int numReduceTasks = 10;
-      if (cmdLine.hasOption(numReduceTasksOpt)) {
-        numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
-      }
-
-      int numMapTasks = 50;
-      if (cmdLine.hasOption(numMapTasksOpt)) {
-        numMapTasks = Integer.parseInt(cmdLine.getValue(numMapTasksOpt).toString());
-      }
-
-      int maxIterations = 20;
-      if (cmdLine.hasOption(maxIterationsOpt)) {
-        maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
-      }
-
+      int numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
+      int numMapTasks = Integer.parseInt(cmdLine.getValue(numMapTasksOpt).toString());
+      int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
-
+      boolean emitMostLikely = Boolean.parseBoolean(cmdLine.getValue(emitMostLikelyOpt).toString());
+      double threshold = Double.parseDouble(cmdLine.getValue(thresholdOpt).toString());
       if (cmdLine.hasOption(kOpt)) {
         clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString()));
       }
-
-      boolean emitMostLikely = true;
-      if (cmdLine.hasOption(emitMostLikelyOpt)) {
-        emitMostLikely = Boolean.parseBoolean(cmdLine.getValue(emitMostLikelyOpt).toString());
-      }
-      double threshold = 0;
-      if (cmdLine.hasOption(thresholdOpt)) {
-        threshold = Double.parseDouble(cmdLine.getValue(thresholdOpt).toString());
-      }
       runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numMapTasks, numReduceTasks, m, cmdLine
           .hasOption(clusteringOpt), emitMostLikely, threshold);
 
@@ -249,8 +179,8 @@ public final class FuzzyKMeansDriver {
 
     // now actually cluster the points
     log.info("Clustering ");
-    runClustering(input, clustersIn, new Path(output, Cluster.CLUSTERED_POINTS_DIR), measureClass, convergenceDelta, numMapTasks, m,
-        emitMostLikely, threshold);
+    runClustering(input, clustersIn, new Path(output, Cluster.CLUSTERED_POINTS_DIR), measureClass, convergenceDelta, numMapTasks,
+        m, emitMostLikely, threshold);
   }
 
   /**
@@ -275,8 +205,8 @@ public final class FuzzyKMeansDriver {
    *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
    * @return true if the iteration successfully runs
    */
-  private static boolean runIteration(Path input, Path clustersIn, Path clustersOut, String measureClass,
-      double convergenceDelta, int numMapTasks, int numReduceTasks, int iterationNumber, float m) {
+  private static boolean runIteration(Path input, Path clustersIn, Path clustersOut, String measureClass, double convergenceDelta,
+      int numMapTasks, int numReduceTasks, int iterationNumber, float m) {
 
     JobConf conf = new JobConf(FuzzyKMeansDriver.class);
     conf.setJobName("Fuzzy K Means{" + iterationNumber + '}');

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Tue May 18 01:08:20 2010
@@ -42,6 +42,7 @@ import org.apache.hadoop.mapred.Sequence
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
@@ -54,67 +55,27 @@ public final class KMeansDriver {
   private KMeansDriver() {
   }
 
-  /**
-   * @param args
-   *          Expects 7 args and they all correspond to the order of the params in {@link #runJob}
-   */
   public static void main(String[] args) throws Exception {
-
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
-
-    Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
-        abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The input centroids, as Vectors. " + "Must be a SequenceFile of Writable, Cluster/Canopy.  "
-            + "If k is also specified, then a random set of vectors will be selected " + "and written out to this path first")
-        .withShortName("c").create();
-
-    Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
-        abuilder.withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen "
-            + "as the Centroid and written to the clusters output path.").withShortName("k").create();
-
-    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The Path to put the output in")
-        .withShortName("o").create();
-
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-        "If set, overwrite the output directory").withShortName("w").create();
-
-    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
-        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
-
-    Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
-        abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The threshold below which the clusters are considered to be converged.  Default is 0.5").withShortName("d").create();
-
-    Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The maximum number of iterations to perform.  Default is 20").withShortName("x").create();
-
-    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
-        abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v").create();
-
-    Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
-        abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription("The number of reduce tasks")
-        .withShortName("r").create();
-
-    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
-        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(
-        measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt)
-        .withOption(kOpt).withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).withOption(clusteringOpt)
+    Option inputOpt = DefaultOptionCreator.inputOption().create();
+    Option clustersOpt = DefaultOptionCreator.clustersInOption().withDescription(
+        "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
+            + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first")
         .create();
+    Option kOpt = DefaultOptionCreator.kOption().withDescription(
+        "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
+            + " as the Centroid and written to the clusters input path.").create();
+    Option outputOpt = DefaultOptionCreator.outputOption().create();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option measureClassOpt = DefaultOptionCreator.distanceMeasureOption().create();
+    Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
+    Option maxIterationsOpt = DefaultOptionCreator.maxIterationsOption().create();
+    Option numReduceTasksOpt = DefaultOptionCreator.numReducersOption().create();
+    Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
+    Option helpOpt = DefaultOptionCreator.helpOption();
+
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt)
+        .withOption(measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt)
+        .withOption(kOpt).withOption(overwriteOutput).withOption(helpOpt).withOption(clusteringOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -127,38 +88,18 @@ public final class KMeansDriver {
       Path input = new Path(cmdLine.getValue(inputOpt).toString());
       Path clusters = new Path(cmdLine.getValue(clustersOpt).toString());
       Path output = new Path(cmdLine.getValue(outputOpt).toString());
-      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
-      if (cmdLine.hasOption(measureClassOpt)) {
-        measureClass = cmdLine.getValue(measureClassOpt).toString();
-      }
-      double convergenceDelta = 0.5;
-      if (cmdLine.hasOption(convergenceDeltaOpt)) {
-        convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
-      }
-
-      // Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
-      // RandomAccessSparseVector.class
-      // : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
-
-      int maxIterations = 20;
-      if (cmdLine.hasOption(maxIterationsOpt)) {
-        maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
-      }
-      int numReduceTasks = 2;
-      if (cmdLine.hasOption(numReduceTasksOpt)) {
-        numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
-      }
+      String measureClass = cmdLine.getValue(measureClassOpt).toString();
+      double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
+      int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
+      int numReduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
       if (cmdLine.hasOption(kOpt)) {
         clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString()));
       }
-      boolean runClustering = true;
-      if (cmdLine.hasOption(clusteringOpt)) {
-        runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
-      }
-      runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks, runClustering);
+      runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks, cmdLine
+          .hasOption(clusteringOpt));
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
@@ -185,14 +126,8 @@ public final class KMeansDriver {
    * @param runClustering 
    *          true if points are to be clustered after iterations are completed
    */
-  public static void runJob(Path input,
-                            Path clustersIn,
-                            Path output,
-                            String measureClass,
-                            double convergenceDelta,
-                            int maxIterations,
-                            int numReduceTasks,
-                            boolean runClustering) throws IOException {
+  public static void runJob(Path input, Path clustersIn, Path output, String measureClass, double convergenceDelta,
+      int maxIterations, int numReduceTasks, boolean runClustering) throws IOException {
     // iterate until the clusters converge
     String delta = Double.toString(convergenceDelta);
     if (log.isInfoEnabled()) {
@@ -235,12 +170,8 @@ public final class KMeansDriver {
    *          the number of reducer tasks
    * @return true if the iteration successfully runs
    */
-  private static boolean runIteration(Path input,
-                                      Path clustersIn,
-                                      Path clustersOut,
-                                      String measureClass,
-                                      String convergenceDelta,
-                                      int numReduceTasks) throws IOException {
+  private static boolean runIteration(Path input, Path clustersIn, Path clustersOut, String measureClass, String convergenceDelta,
+      int numReduceTasks) throws IOException {
     JobConf conf = new JobConf(KMeansDriver.class);
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(KMeansInfo.class);
@@ -284,11 +215,8 @@ public final class KMeansDriver {
    * @param convergenceDelta
    *          the convergence delta value
    */
-  private static void runClustering(Path input,
-                                    Path clustersIn,
-                                    Path output,
-                                    String measureClass,
-                                    String convergenceDelta) throws IOException {
+  private static void runClustering(Path input, Path clustersIn, Path output, String measureClass, String convergenceDelta)
+      throws IOException {
     if (log.isInfoEnabled()) {
       log.info("Running Clustering");
       log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] { input, clustersIn, output, measureClass });

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Tue May 18 01:08:20 2010
@@ -45,6 +45,7 @@ import org.apache.mahout.clustering.fuzz
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -60,40 +61,21 @@ public final class MeanShiftCanopyDriver
   }
 
   public static void main(String[] args) throws IOException {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
     Option helpOpt = DefaultOptionCreator.helpOption();
-    Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-        "If set, overwrite the output directory").withShortName("w").create();
-
-    Option inputIsCanopiesOpt = obuilder.withLongName("inputIsCanopies").withRequired(true).withShortName("i").withArgument(
-        abuilder.withName("inputIsCanopies").withMinimum(1).withMaximum(1).create()).withDescription(
-        "True if the input directory already contains MeanShiftCanopies").create();
-
-    Option modelOpt = obuilder.withLongName("distanceClass").withRequired(true).withShortName("d").withArgument(
-        abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
-        "The distance measure class name.").create();
-
-    Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(true).withShortName("t1").withArgument(
-        abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
-        .create();
-
-    Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(true).withShortName("t2").withArgument(
-        abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
-        .create();
-
-    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
-        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
-
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput).withOption(
-        modelOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(threshold2Opt)
-        .withOption(clusteringOpt).withOption(maxIterOpt).withOption(inputIsCanopiesOpt).create();
+    Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
+    Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
+    Option inputIsCanopiesOpt = DefaultOptionCreator.inputIsCanopiesOption().create();
+    Option measureClassOpt = DefaultOptionCreator.distanceMeasureOption().create();
+    Option threshold1Opt = DefaultOptionCreator.t1Option().create();
+    Option threshold2Opt = DefaultOptionCreator.t2Option().create();
+    Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
+
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(overwriteOutput)
+        .withOption(measureClassOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(
+            threshold2Opt).withOption(clusteringOpt).withOption(maxIterOpt).withOption(inputIsCanopiesOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -103,14 +85,10 @@ public final class MeanShiftCanopyDriver
         CommandLineUtil.printHelp(group);
         return;
       }
-      boolean runClustering = true;
-      if (cmdLine.hasOption(clusteringOpt)) {
-        runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
-      }
 
       Path input = new Path(cmdLine.getValue(inputOpt).toString());
       Path output = new Path(cmdLine.getValue(outputOpt).toString());
-      String measureClassName = cmdLine.getValue(modelOpt).toString();
+      String measureClass = cmdLine.getValue(measureClassOpt).toString();
       if (cmdLine.hasOption(overwriteOutput)) {
         HadoopUtil.overwriteOutput(output);
       }
@@ -118,9 +96,8 @@ public final class MeanShiftCanopyDriver
       double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt).toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
       int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
-      boolean inputIsCanopies = Boolean.parseBoolean(cmdLine.getValue(inputIsCanopiesOpt).toString());
-      createCanopyFromVectors(input, new Path(output, "intial-canopies"));
-      runJob(input, output, measureClassName, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering);
+      runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations, cmdLine.hasOption(inputIsCanopiesOpt), cmdLine
+          .hasOption(clusteringOpt));
     } catch (OptionException e) {
       log.error("Exception parsing command line: ", e);
       CommandLineUtil.printHelp(group);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java Tue May 18 01:08:20 2010
@@ -20,76 +20,190 @@ package org.apache.mahout.common.command
 import org.apache.commons.cli2.Option;
 import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.math.RandomAccessSparseVector;
 
 public final class DefaultOptionCreator {
-  
-  private DefaultOptionCreator() { }
-  
+
+  private DefaultOptionCreator() {
+  }
+
   /**
-   * Returns a default command line option for convergence delta specification.
+   * Returns a default command line option for help.
+   */
+  public static Option helpOption() {
+    return new DefaultOptionBuilder().withLongName("help").withDescription("Print out help").withShortName("h").create();
+  }
+
+  /**
+   * Returns a default command line option for input directory specification.
    */
-  public static DefaultOptionBuilder convergenceOption() {
-    return new DefaultOptionBuilder().withLongName("convergenceDelta").withRequired(true).withShortName("v")
-        .withArgument(
-          new ArgumentBuilder().withName("convergenceDelta").withMinimum(1).withMaximum(1).create())
-        .withDescription("The convergence delta value.");
+  public static DefaultOptionBuilder inputOption() {
+    return new DefaultOptionBuilder().withLongName("input").withRequired(true).withShortName("i").withArgument(
+        new ArgumentBuilder().withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+        "Path to job input directory. Must be a SequenceFile of VectorWritable");
+  }
+
+  public static DefaultOptionBuilder clustersInOption() {
+    return new DefaultOptionBuilder().withLongName("clusters").withRequired(true).withArgument(
+        new ArgumentBuilder().withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The path to the initial clusters directory. Must be a SequenceFile of some type of Cluster").withShortName("c");
   }
-  
+
   /**
    * Returns a default command line option for output directory specification.
    */
   public static DefaultOptionBuilder outputOption() {
-    return new DefaultOptionBuilder().withLongName("output").withRequired(true).withShortName("o")
-        .withArgument(new ArgumentBuilder().withName("output").withMinimum(1).withMaximum(1).create())
-        .withDescription("The directory pathname for output.");
+    return new DefaultOptionBuilder().withLongName("output").withRequired(true).withShortName("o").withArgument(
+        new ArgumentBuilder().withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The directory pathname for output.");
   }
-  
+
   /**
-   * Returns a default command line option for input directory specification.
+   * Returns a default command line option for output directory overwriting
    */
-  public static DefaultOptionBuilder inputOption() {
-    return new DefaultOptionBuilder().withLongName("input").withRequired(true).withShortName("i")
-        .withArgument(new ArgumentBuilder().withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription("Path to job input directory");
+  public static DefaultOptionBuilder overwriteOption() {
+    return new DefaultOptionBuilder().withLongName("overwrite").withRequired(false).withDescription(
+        "If present, overwrite the output directory before running job").withShortName("ow");
   }
-  
+
+  /**
+   * Returns a default command line option for clustering specification
+   */
+  public static DefaultOptionBuilder clusteringOption() {
+    return new DefaultOptionBuilder().withLongName("clustering").withRequired(false).withDescription(
+        "If present, run clustering after the iterations have taken place").withShortName("cl");
+  }
+
+  /**
+   * Returns a default command line option for specification of distance measure class to use.
+   */
+  public static DefaultOptionBuilder distanceMeasureOption() {
+    return new DefaultOptionBuilder().withLongName("distanceMeasure").withRequired(false).withShortName("dm").withArgument(
+        new ArgumentBuilder().withName("distanceMeasure").withDefault(SquaredEuclideanDistanceMeasure.class.getName()).withMinimum(
+            1).withMaximum(1).create()).withDescription("The classname of the DistanceMeasure. Default is SquaredEuclidean");
+  }
+
+  public static DefaultOptionBuilder t1Option() {
+    return new DefaultOptionBuilder().withLongName("t1").withRequired(true).withArgument(
+        new ArgumentBuilder().withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("T1 threshold value")
+        .withShortName("t1");
+  }
+
+  public static DefaultOptionBuilder t2Option() {
+    return new DefaultOptionBuilder().withLongName("t2").withRequired(true).withArgument(
+        new ArgumentBuilder().withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("T2 threshold value")
+        .withShortName("t2");
+  }
+
+  /**
+   * Returns a default command line option for specification of max number of iterations.
+   */
+  public static DefaultOptionBuilder maxIterationsOption() {
+    return new DefaultOptionBuilder().withLongName("maxIter").withRequired(true).withShortName("x").withArgument(
+        new ArgumentBuilder().withName("maxIter").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The maximum number of iterations.");
+  }
+
   /**
    * Returns a default command line option for specification of numbers of clusters to create.
    */
   public static DefaultOptionBuilder kOption() {
-    return new DefaultOptionBuilder()
-        .withLongName("k")
-        .withRequired(true)
-        .withArgument(new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The k in k-Means. k random Vectors will be chosen as the Centroid and written to the clusters output path.")
-        .withShortName("k");
+    return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+        new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The number of clusters to create").withShortName("k");
+  }
+
+  /**
+   * Returns a default command line option for convergence delta specification.
+   */
+  public static DefaultOptionBuilder convergenceOption() {
+    return new DefaultOptionBuilder().withLongName("convergenceDelta").withRequired(false).withShortName("cd").withArgument(
+        new ArgumentBuilder().withName("convergenceDelta").withDefault("0.5").withMinimum(1).withMaximum(1).create())
+        .withDescription("The convergence delta value. Default is 0.5");
   }
-  
+
   /**
-   * Returns a default command line option for specification of max number of iterations.
+   * Returns a default command line option for alpha specification
    */
-  public static DefaultOptionBuilder maxIterOption() {
-    return new DefaultOptionBuilder().withLongName("maxIter").withRequired(true).withShortName("x")
-        .withArgument(new ArgumentBuilder().withName("maxIter").withMinimum(1).withMaximum(1).create())
-        .withDescription("The maximum number of iterations.");
+  public static DefaultOptionBuilder alphaOption() {
+    return new DefaultOptionBuilder().withLongName("alpha").withRequired(false).withShortName("m").withArgument(
+        new ArgumentBuilder().withName("alpha").withDefault("1.0").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The alpha0 value for the DirichletDistribution. Defaults to 1.0");
   }
-  
+
   /**
-   * Returns a default command line option for specification of distance measure class to use.
+   * Returns a default command line option for model distribution class specification
+   */
+  public static DefaultOptionBuilder modelDistributionOption() {
+    return new DefaultOptionBuilder().withLongName("modelDistClass").withRequired(false).withShortName("md").withArgument(
+        new ArgumentBuilder().withName("modelDistClass").withDefault(NormalModelDistribution.class.getName()).withMinimum(1)
+            .withMaximum(1).create()).withDescription("The ModelDistribution class name. " + "Defaults to NormalModelDistribution");
+  }
+
+  /**
+   * Returns a default command line option for model prototype class specification
    */
-  public static DefaultOptionBuilder distanceOption() {
-    return new DefaultOptionBuilder().withLongName("measure").withRequired(true).withShortName("d")
-        .withArgument(new ArgumentBuilder().withName("measure").withMinimum(1).withMaximum(1).create())
-        .withDescription("The classname of the DistanceMeasure.");
+  public static DefaultOptionBuilder modelPrototypeOption() {
+    return new DefaultOptionBuilder().withLongName("modelPrototypeClass").withRequired(false).withShortName("mp").withArgument(
+        new ArgumentBuilder().withName("prototypeClass").withDefault(RandomAccessSparseVector.class.getName()).withMinimum(1)
+            .withMaximum(1).create()).withDescription(
+        "The ModelDistribution prototype Vector class name. " + "Defaults to RandomAccessSparseVector");
   }
-  
+
   /**
-   * Returns a default command line option for help.
-   * */
-  public static Option helpOption() {
-    return new DefaultOptionBuilder().withLongName("help").withDescription("Print out help").withShortName(
-      "h").create();
+   * Returns a default command line option for specifying the number of Mappers
+   */
+  public static DefaultOptionBuilder numMappersOption() {
+    return new DefaultOptionBuilder().withLongName("numMap").withRequired(false).withArgument(
+        new ArgumentBuilder().withName("numMap").withDefault("10").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The number of map tasks. Defaults to 10").withShortName("u");
   }
-  
+
+  /**
+   * Returns a default command line option for specifying the max number of reducers
+   */
+  public static DefaultOptionBuilder numReducersOption() {
+    return new DefaultOptionBuilder().withLongName("maxRed").withRequired(false).withShortName("r").withArgument(
+        new ArgumentBuilder().withName("maxRed").withDefault("1").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The number of reduce tasks. Defaults to 1");
+  }
+
+  /**
+   * Returns a default command line option for specifying the emitMostLikely flag
+   */
+  public static DefaultOptionBuilder emitMostLikelyOption() {
+    return new DefaultOptionBuilder().withLongName("emitMostLikely").withRequired(false).withShortName("e").withArgument(
+        new ArgumentBuilder().withName("emitMostLikely").withDefault("false").withMinimum(1).withMaximum(1).create())
+        .withDescription("True if clustering should emit the most likely point only, false for threshold clustering");
+  }
+
+  /**
+   * Returns a default command line option for specifying the clustering threshold value
+   */
+  public static DefaultOptionBuilder thresholdOption() {
+    return new DefaultOptionBuilder().withLongName("threshold").withRequired(false).withShortName("t").withArgument(
+        new ArgumentBuilder().withName("threshold").withDefault("0").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The pdf threshold used for cluster determination. Default is 0");
+  }
+
+  /**
+   * Returns a default command line option for specifying the FuzzyKMeans coefficient normalization factor, 'm'
+   */
+  public static DefaultOptionBuilder mOption() {
+    return new DefaultOptionBuilder().withLongName("m").withRequired(true).withArgument(
+        new ArgumentBuilder().withName("m").withMinimum(1).withMaximum(1).create()).withDescription(
+        "coefficient normalization factor, must be greater than 1").withShortName("m");
+  }
+
+  /**
+   * Returns a default command line option for specifying that the MeanShift input directory already contains Canopies vs. Vectors
+   */
+  public static DefaultOptionBuilder inputIsCanopiesOption() {
+    return new DefaultOptionBuilder().withLongName("inputIsCanopies").withRequired(false).withShortName("ic").withArgument(
+        new ArgumentBuilder().withName("inputIsCanopies").withMinimum(1).withMaximum(1).create()).withDescription(
+        "If present, the input directory already contains MeanShiftCanopies");
+  }
+
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Tue May 18 01:08:20 2010
@@ -215,8 +215,8 @@ public class TestMapReduce extends Mahou
     generateSamples(100, 2, 2, 1);
     ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
     // Now run the driver
-    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"),
-      "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", 20, 5, 1.0, 1);
+    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"), "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", "org.apache.mahout.math.DenseVector", 20, 5, 1.0, 1,
+    false, true, 0);
     // and inspect results
     List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
     JobConf conf = new JobConf(KMeansDriver.class);
@@ -253,8 +253,8 @@ public class TestMapReduce extends Mahou
   public void testDriverMnRIterations() throws Exception {
     generate4Datasets();
     // Now run the driver
-    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"),
-      "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", 20, 3, 1.0, 1);
+    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"), "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", "org.apache.mahout.math.DenseVector", 20, 3, 1.0, 1,
+    false, true, 0);
     // and inspect results
     List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
     JobConf conf = new JobConf(KMeansDriver.class);
@@ -289,8 +289,8 @@ public class TestMapReduce extends Mahou
   public void testDriverMnRnIterations() throws Exception {
     generate4Datasets();
     // Now run the driver
-    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"),
-      "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", 20, 3, 1.0, 2);
+    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"), "org.apache.mahout.clustering.dirichlet.models.SampledNormalDistribution", "org.apache.mahout.math.DenseVector", 20, 3, 1.0, 2,
+    false, true, 0);
     // and inspect results
     List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
     JobConf conf = new JobConf(KMeansDriver.class);
@@ -325,8 +325,8 @@ public class TestMapReduce extends Mahou
     generateSamples(500, 2, 2, 1);
     ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data4.txt"), fs, conf);
     // Now run the driver
-    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"),
-      "org.apache.mahout.clustering.dirichlet.models.AsymmetricSampledNormalDistribution", 20, 3, 1.0, 2);
+    DirichletDriver.runJob(getTestTempDirPath("input"), getTestTempDirPath("output"), "org.apache.mahout.clustering.dirichlet.models.AsymmetricSampledNormalDistribution", "org.apache.mahout.math.DenseVector", 20, 3, 1.0, 2,
+    false, true, 0);
     // and inspect results
     List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
     JobConf conf = new JobConf(KMeansDriver.class);

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Tue May 18 01:08:20 2010
@@ -61,7 +61,7 @@ public class Job {
     
     Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
     Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
-    Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
+    Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().withRequired(false).create();
     Option topicsOpt = DefaultOptionCreator.kOption().withRequired(false).create();
     
     Option redOpt = obuilder.withLongName("reducerNum").withRequired(false).withArgument(
@@ -150,7 +150,7 @@ public class Job {
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput, vectorClassName);
     DirichletDriver.runJob(directoryContainingConvertedInput, output, modelFactory,
-      vectorClassName, 60, numModels, maxIterations, alpha_0, numReducers, true, true, 0);
+      vectorClassName, numModels, maxIterations, alpha_0, numReducers, true, true, 0);
   }
   
   /**

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Tue May 18 01:08:20 2010
@@ -57,7 +57,7 @@ public final class Job {
     Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
     Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
-    Option maxIterationsOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
+    Option maxIterationsOpt = DefaultOptionCreator.maxIterationsOption().withRequired(false).create();
 
     Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
         abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -95,9 +95,6 @@ public final class Job {
       double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
       int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt, 10).toString());
-      // String className = cmdLine.getValue(vectorClassOpt,
-      // "org.apache.mahout.math.RandomAccessSparseVector").toString();
-      // Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
 
       runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
     } catch (OptionException e) {

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Tue May 18 01:08:20 2010
@@ -54,7 +54,7 @@ public final class Job {
     Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
     Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
-    Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
+    Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().withRequired(false).create();
     Option helpOpt = DefaultOptionCreator.helpOption();
 
     Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d").withArgument(

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Tue May 18 01:08:20 2010
@@ -73,7 +73,7 @@ public class CDbwDriver {
 
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
-    Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
+    Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
     Option helpOpt = DefaultOptionCreator.helpOption();
 
     Option modelOpt = obuilder.withLongName("modelClass").withRequired(true).withShortName("d").withArgument(

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Tue May 18 01:08:20 2010
@@ -193,7 +193,7 @@ public class TestClusterDumper extends M
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     DirichletDriver.runJob(getTestTempDirPath("testdata"), output,
                            L1ModelDistribution.class.getName(), prototype.getDelegate().getClass().getName(),
-                           prototype.size(), 15, 10, 1.0, 1, true, true, 0);
+                           15, 10, 1.0, 1, true, true, 0);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"),
                                                     new Path(output, "clusteredPoints"));

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=945447&r1=945446&r2=945447&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Tue May 18 01:08:20 2010
@@ -189,7 +189,7 @@ public class TestCDbwEvaluator extends M
     Vector prototype = new DenseVector(2);
     DirichletDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
                            L1ModelDistribution.class.getName(), prototype.getClass().getName(),
-                           prototype.size(), 15, 5, 1.0, 1, true, true, 0);
+                           15, 5, 1.0, 1, true, true, 0);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
     CDbwDriver.runJob(new Path(output, "clusters-5"), new Path(output, "clusteredPoints"), output,



Mime
View raw message