mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jeast...@apache.org
Subject svn commit: r939867 - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/kmeans/ core/src/main/java/org/apache/mahout/clustering/meanshift/ core/src/test/java/org/apache/m...
Date Fri, 30 Apr 2010 22:52:12 GMT
Author: jeastman
Date: Fri Apr 30 22:52:11 2010
New Revision: 939867

URL: http://svn.apache.org/viewvc?rev=939867&view=rev
Log:
MAHOUT-236:
- removed output directory deletion from DirichletDriver.writeInitialState
- added runClustering option to KMeansDriver and MeanShiftCanopyDriver
- refactored methods from MeanShiftCanopyJob into MeanShiftCanopyDriver and removed job
- adjusted TestKmeansClustering and TestMeanShift
- adjusted synthetic control examples to employ ClusterDumper of outputs
- adjusted TestClusterDumper and TestCDbwEvaluator for KMeans and MeanShift job API changes
- decreased number of iterations in unit tests to improve performance
- all tests run

Removed:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Fri Apr 30 22:52:11 2010
@@ -247,10 +247,8 @@ public class DirichletDriver {
       IllegalAccessException, IOException, SecurityException, NoSuchMethodException, InvocationTargetException {
 
     DirichletState<VectorWritable> state = createState(modelFactory, modelPrototype, prototypeSize, numModels, alpha_0);
-    JobConf job = new JobConf(KMeansDriver.class);
-    Path outPath = new Path(output);
-    FileSystem fs = FileSystem.get(outPath.toUri(), job);
-    fs.delete(outPath, true);
+    JobConf job = new JobConf(DirichletDriver.class);
+    FileSystem fs = FileSystem.get(new Path(output).toUri(), job);
     for (int i = 0; i < numModels; i++) {
       Path path = new Path(stateIn + "/part-" + i);
       SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, Text.class, DirichletCluster.class);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri Apr 30 22:52:11 2010
@@ -49,86 +49,78 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public final class KMeansDriver {
-    
+
   private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
-  
-  private KMeansDriver() {}
-  
+
+  private KMeansDriver() {
+  }
+
   /**
    * @param args
    *          Expects 7 args and they all correspond to the order of the params in {@link #runJob}
    */
   public static void main(String[] args) throws Exception {
-    
+
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
-    
-    Option clustersOpt = obuilder
-        .withLongName("clusters")
-        .withRequired(true)
-        .withArgument(abuilder.withName("clusters").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The input centroids, as Vectors. "
-              + "Must be a SequenceFile of Writable, Cluster/Canopy.  "
-              + "If k is also specified, then a random set of vectors will be selected "
-              + "and written out to this path first")
+        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+    Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
+        abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The input centroids, as Vectors. " + "Must be a SequenceFile of Writable, Cluster/Canopy.  "
+            + "If k is also specified, then a random set of vectors will be selected " + "and written out to this path first")
         .withShortName("c").create();
-    
-    Option kOpt = obuilder
-        .withLongName("k")
-        .withRequired(false)
-        .withArgument(abuilder.withName("k").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen "
-              + "as the Centroid and written to the clusters output path.")
-        .withShortName("k").create();
-
-   Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Path to put the output in").withShortName("o").create();
-    
+
+    Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
+        abuilder.withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen "
+            + "as the Centroid and written to the clusters output path.").withShortName("k").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The Path to put the output in")
+        .withShortName("o").create();
+
     Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
-    
+        "If set, overwrite the output directory").withShortName("w").create();
+
     Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
-      abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
-    
+        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
+
     Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
-      abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The threshold below which the clusters are considered to be converged.  Default is 0.5")
-        .withShortName("d").create();
-    
+        abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The threshold below which the clusters are considered to be converged.  Default is 0.5").withShortName("d").create();
+
     Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-      abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The maximum number of iterations to perform.  Default is 20").withShortName("x").create();
-    
+        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The maximum number of iterations to perform.  Default is 20").withShortName("x").create();
+
     Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
-      abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v")
-        .create();
-    
+        abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v").create();
+
     Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
-      abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The number of reduce tasks").withShortName("r").create();
-    
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription("The number of reduce tasks")
+        .withShortName("r").create();
+
+    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
+        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
+
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(
+        measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt)
+        .withOption(kOpt).withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).withOption(clusteringOpt)
         .create();
-    
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(
-      outputOpt).withOption(measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
-        .withOption(numReduceTasksOpt).withOption(kOpt).withOption(vectorClassOpt)
-        .withOption(overwriteOutput).withOption(helpOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
@@ -144,11 +136,11 @@ public final class KMeansDriver {
       if (cmdLine.hasOption(convergenceDeltaOpt)) {
         convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
       }
-      
+
       // Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
       // RandomAccessSparseVector.class
       // : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
-      
+
       int maxIterations = 20;
       if (cmdLine.hasOption(maxIterationsOpt)) {
         maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
@@ -161,16 +153,19 @@ public final class KMeansDriver {
         HadoopUtil.overwriteOutput(output);
       }
       if (cmdLine.hasOption(kOpt)) {
-        clusters = RandomSeedGenerator.buildRandom(input, clusters,
-          Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
+        clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
       }
-      runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks);
+      boolean runClustering = true;
+      if (cmdLine.hasOption(clusteringOpt)) {
+        runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
+      }
+      runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks, runClustering);
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   /**
    * Run the job using supplied arguments
    * 
@@ -188,21 +183,17 @@ public final class KMeansDriver {
    *          the maximum number of iterations
    * @param numReduceTasks
    *          the number of reducers
+   * @param runClustering 
+   *          true if points are to be clustered after iterations are completed
    */
-  public static void runJob(String input,
-                            String clustersIn,
-                            String output,
-                            String measureClass,
-                            double convergenceDelta,
-                            int maxIterations,
-                            int numReduceTasks) {
+  public static void runJob(String input, String clustersIn, String output, String measureClass, double convergenceDelta,
+      int maxIterations, int numReduceTasks, boolean runClustering) {
     // iterate until the clusters converge
     String delta = Double.toString(convergenceDelta);
     if (log.isInfoEnabled()) {
-      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output,
-                                                                               measureClass});
-      log.info("convergence: {} max Iterations: {} num Reduce Tasks: {} Input Vectors: {}",
-        new Object[] {convergenceDelta, maxIterations, numReduceTasks, VectorWritable.class.getName()});
+      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] { input, clustersIn, output, measureClass });
+      log.info("convergence: {} max Iterations: {} num Reduce Tasks: {} Input Vectors: {}", new Object[] { convergenceDelta,
+          maxIterations, numReduceTasks, VectorWritable.class.getName() });
     }
     boolean converged = false;
     int iteration = 1;
@@ -215,11 +206,13 @@ public final class KMeansDriver {
       clustersIn = clustersOut;
       iteration++;
     }
-    // now actually cluster the points
-    log.info("Clustering ");
-    runClustering(input, clustersIn, output + Cluster.CLUSTERED_POINTS_DIR, measureClass, delta);
+    if (runClustering) {
+      // now actually cluster the points
+      log.info("Clustering ");
+      runClustering(input, clustersIn, output + Cluster.CLUSTERED_POINTS_DIR, measureClass, delta);
+    }
   }
-  
+
   /**
    * Run the job using supplied arguments
    * 
@@ -239,19 +232,14 @@ public final class KMeansDriver {
    *          The iteration number
    * @return true if the iteration successfully runs
    */
-  private static boolean runIteration(String input,
-                                      String clustersIn,
-                                      String clustersOut,
-                                      String measureClass,
-                                      String convergenceDelta,
-                                      int numReduceTasks,
-                                      int iteration) {
+  private static boolean runIteration(String input, String clustersIn, String clustersOut, String measureClass,
+      String convergenceDelta, int numReduceTasks, int iteration) {
     JobConf conf = new JobConf(KMeansDriver.class);
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(KMeansInfo.class);
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(Cluster.class);
-    
+
     FileInputFormat.setInputPaths(conf, new Path(input));
     Path outPath = new Path(clustersOut);
     FileOutputFormat.setOutputPath(conf, outPath);
@@ -264,7 +252,7 @@ public final class KMeansDriver {
     conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
     conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
     conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-    
+
     try {
       JobClient.runJob(conf);
       FileSystem fs = FileSystem.get(outPath.toUri(), conf);
@@ -274,7 +262,7 @@ public final class KMeansDriver {
       return true;
     }
   }
-  
+
   /**
    * Run the job using supplied arguments
    * 
@@ -289,41 +277,36 @@ public final class KMeansDriver {
    * @param convergenceDelta
    *          the convergence delta value
    */
-  private static void runClustering(String input,
-                                    String clustersIn,
-                                    String output,
-                                    String measureClass,
-                                    String convergenceDelta) {
+  private static void runClustering(String input, String clustersIn, String output, String measureClass, String convergenceDelta) {
     if (log.isInfoEnabled()) {
       log.info("Running Clustering");
-      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output,
-                                                                               measureClass});
+      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] { input, clustersIn, output, measureClass });
       log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
     }
     JobConf conf = new JobConf(KMeansDriver.class);
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
-    
+
     conf.setOutputKeyClass(IntWritable.class);
     conf.setOutputValueClass(WeightedVectorWritable.class);
-    
+
     FileInputFormat.setInputPaths(conf, new Path(input));
     Path outPath = new Path(output);
     FileOutputFormat.setOutputPath(conf, outPath);
-    
+
     conf.setMapperClass(KMeansClusterMapper.class);
     conf.setNumReduceTasks(0);
     conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
     conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
     conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-    
+
     try {
       JobClient.runJob(conf);
     } catch (IOException e) {
       log.warn(e.toString(), e);
     }
   }
-  
+
   /**
    * Return if all of the Clusters in the parts in the filePath have converged or not
    * 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Fri Apr 30 22:52:11 2010
@@ -28,6 +28,8 @@ import org.apache.commons.cli2.builder.D
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
@@ -37,6 +39,7 @@ import org.apache.hadoop.mapred.JobClien
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.common.CommandLineUtil;
@@ -50,10 +53,12 @@ public final class MeanShiftCanopyDriver
 
   public static final String STATE_IN_KEY = "org.apache.mahout.clustering.meanshift.stateInKey";
 
+  protected static final String CONTROL_CONVERGED = "/control/converged";
+
   private MeanShiftCanopyDriver() {
   }
 
-  public static void main(String[] args) {
+  public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
@@ -62,6 +67,10 @@ public final class MeanShiftCanopyDriver
     Option outputOpt = DefaultOptionCreator.outputOption().create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
     Option helpOpt = DefaultOptionCreator.helpOption();
+    Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
+    Option inputIsCanopiesOpt = obuilder.withLongName("inputIsCanopies").withRequired(true).withShortName("i").withArgument(
+        abuilder.withName("inputIsCanopies").withMinimum(1).withMaximum(1).create()).withDescription(
+        "True if the input directory already contains MeanShiftCanopies").create();
 
     Option modelOpt = obuilder.withLongName("distanceClass").withRequired(true).withShortName("d").withArgument(
         abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -75,8 +84,12 @@ public final class MeanShiftCanopyDriver
         abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
         .create();
 
+    Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
+        "If true, run clustering after the iterations have taken place").withShortName("cl").create();
+
     Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(helpOpt)
-        .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(threshold2Opt).create();
+        .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(threshold2Opt).withOption(clusteringOpt).withOption(
+            maxIterOpt).withOption(inputIsCanopiesOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -86,6 +99,10 @@ public final class MeanShiftCanopyDriver
         CommandLineUtil.printHelp(group);
         return;
       }
+      boolean runClustering = true;
+      if (cmdLine.hasOption(clusteringOpt)) {
+        runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
+      }
 
       String input = cmdLine.getValue(inputOpt).toString();
       String output = cmdLine.getValue(outputOpt).toString();
@@ -93,9 +110,10 @@ public final class MeanShiftCanopyDriver
       double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt).toString());
       double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt).toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
+      int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
+      boolean inputIsCanopies = Boolean.parseBoolean(cmdLine.getValue(inputIsCanopiesOpt).toString());
       createCanopyFromVectors(input, output + "/intial-canopies");
-      runJob(output + "/intial-canopies", output, output + MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, measureClassName, t1, t2,
-          convergenceDelta);
+      runJob(input, output, measureClassName, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering);
     } catch (OptionException e) {
       log.error("Exception parsing command line: ", e);
       CommandLineUtil.printHelp(group);
@@ -103,7 +121,7 @@ public final class MeanShiftCanopyDriver
   }
 
   /**
-   * Run the job
+   * Run an iteration
    * 
    * @param input
    *          the input pathname String
@@ -120,7 +138,7 @@ public final class MeanShiftCanopyDriver
    * @param convergenceDelta
    *          the double convergence criteria
    */
-  public static void runJob(String input, String output, String control, String measureClassName, double t1, double t2,
+  static void runIteration(String input, String output, String control, String measureClassName, double t1, double t2,
       double convergenceDelta) {
 
     Configurable client = new JobClient();
@@ -160,7 +178,7 @@ public final class MeanShiftCanopyDriver
    * @param output
    *          the output pathname String
    */
-  public static void createCanopyFromVectors(String input, String output) {
+  static void createCanopyFromVectors(String input, String output) {
 
     Configurable client = new JobClient();
     JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
@@ -195,25 +213,23 @@ public final class MeanShiftCanopyDriver
    * @param output
    *          the directory pathname for output clustered points
    */
-  public static void runClustering(String input,
-                                    String clustersIn,
-                                    String output) {
-    
+  static void runClustering(String input, String clustersIn, String output) {
+
     JobConf conf = new JobConf(FuzzyKMeansDriver.class);
     conf.setJobName("Mean Shift Clustering");
-    
+
     conf.setOutputKeyClass(IntWritable.class);
     conf.setOutputValueClass(WeightedVectorWritable.class);
-    
+
     FileInputFormat.setInputPaths(conf, new Path(input));
     Path outPath = new Path(output);
     FileOutputFormat.setOutputPath(conf, outPath);
-    
+
     conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
-    
+
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
-    
+
     // uncomment it to run locally
     // conf.set("mapred.job.tracker", "local");
     conf.setNumReduceTasks(0);
@@ -224,4 +240,60 @@ public final class MeanShiftCanopyDriver
       log.warn(e.toString(), e);
     }
   }
+
+  /**
+   * Run the job where the input format can be either Vectors or Canopies
+   * 
+   * @param input
+   *          the input pathname String
+   * @param output
+   *          the output pathname String
+   * @param measureClassName
+   *          the DistanceMeasure class name
+   * @param t1
+   *          the T1 distance threshold
+   * @param t2
+   *          the T2 distance threshold
+   * @param convergenceDelta
+   *          the double convergence criteria
+   * @param maxIterations
+   *          an int number of iterations
+   * @param inputIsCanopies 
+              true if the input path already contains MeanShiftCanopies and does not need to be converted from Vectors
+   * @param runClustering 
+   *          true if the input points are to be clustered once the iterations complete
+   */
+  public static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+      int maxIterations, boolean inputIsCanopies, boolean runClustering) throws IOException {
+    // delete the output directory
+    Configuration conf = new JobConf(MeanShiftCanopyDriver.class);
+
+    String clustersIn = output + Cluster.INITIAL_CLUSTERS_DIR;
+    if (inputIsCanopies) {
+      clustersIn = input;
+    } else {
+      createCanopyFromVectors(input, clustersIn);
+    }
+
+    // iterate until the clusters converge
+    boolean converged = false;
+    int iteration = 1;
+    while (!converged && (iteration <= maxIterations)) {
+      log.info("Iteration {}", iteration);
+      // point the output to a new directory per iteration
+      String clustersOut = output + Cluster.CLUSTERS_DIR + iteration;
+      String controlOut = output + CONTROL_CONVERGED;
+      runIteration(clustersIn, clustersOut, controlOut, measureClassName, t1, t2, convergenceDelta);
+      converged = FileSystem.get(conf).exists(new Path(controlOut));
+      // now point the input to the old output directory
+      clustersIn = clustersOut;
+      iteration++;
+    }
+
+    if (runClustering) {
+      // now cluster the points
+      MeanShiftCanopyDriver.runClustering((inputIsCanopies ? input : output + Cluster.INITIAL_CLUSTERS_DIR), clustersIn, output
+          + Cluster.CLUSTERED_POINTS_DIR);
+    }
+  }
 }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Fri Apr 30 22:52:11 2010
@@ -364,7 +364,7 @@ public class TestKmeansClustering extend
       // now run the Job
       HadoopUtil.overwriteOutput("output");
       KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10,
-          k + 1);
+          k + 1, true);
       // now compare the expected clusters with actual
       File outDir = new File("output/clusteredPoints");
       assertTrue("output dir exists?", outDir.exists());
@@ -412,7 +412,7 @@ public class TestKmeansClustering extend
     CanopyDriver.runJob("testdata/points", "output", ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, false);
 
     // now run the KMeans job
-    KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+    KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
 
     // now compare the expected clusters with actual
     File outDir = new File("output/clusteredPoints");

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Fri Apr 30 22:52:11 2010
@@ -304,7 +304,7 @@ public class TestMeanShift extends Mahou
     ClusteringTestUtils.writePointsToFile(points, "testdata/file1", fs, conf);
     ClusteringTestUtils.writePointsToFile(points, "testdata/file2", fs, conf);
     // now run the Job
-    MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10);
+    MeanShiftCanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10, false, false);
     JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
     Path outPart = new Path("output/clusters-3/part-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, outPart, conf);

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Fri Apr 30 22:52:11 2010
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -123,9 +124,11 @@ public final class Job {
    *          the canopy T1 threshold
    * @param t2
    *          the canopy T2 threshold
+   * @throws IllegalAccessException 
+   * @throws InstantiationException 
    */
   private static void runJob(String input, String output, String measureClassName,
-                             double t1, double t2) throws IOException {
+                             double t1, double t2) throws IOException, InstantiationException, IllegalAccessException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
     
@@ -139,6 +142,10 @@ public final class Job {
     InputDriver.runJob(input, directoryContainingConvertedInput,
       "org.apache.mahout.math.RandomAccessSparseVector");
     CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2, true);
+    
+    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-0", "output/clusteredPoints");
+    clusterDumper.printClusters(null);
+
   }
   
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Fri Apr 30 22:52:11 2010
@@ -43,6 +43,7 @@ import org.apache.mahout.clustering.synt
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -161,9 +162,12 @@ public class Job {
     fs.mkdirs(outPath);
     String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
     InputDriver.runJob(input, directoryContainingConvertedInput, vectorClassName);
-    DirichletDriver.runJob(directoryContainingConvertedInput, output + "/state", modelFactory,
+    DirichletDriver.runJob(directoryContainingConvertedInput, output, modelFactory,
       vectorClassName, 60, numModels, maxIterations, alpha_0, numReducers, true, true, 0);
-    printResults(output + "/state", modelFactory, vectorClassName, 60, maxIterations, numModels, alpha_0);
+    
+    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-5", "output/clusteredPoints");
+    clusterDumper.printClusters(null);
+
   }
   
   /**

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Fri Apr 30 22:52:11 2010
@@ -38,58 +38,59 @@ import org.apache.mahout.clustering.synt
 import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public final class Job {
-  
+
   private static final Logger log = LoggerFactory.getLogger(Job.class);
-  
-  private Job() { }
-  
+
+  private Job() {
+  }
+
   public static void main(String[] args) throws Exception {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
     Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
     Option maxIterationsOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
-    
+
     Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
-      abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
-    
+        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
+
     Option t1Opt = obuilder.withLongName("t1").withRequired(false).withArgument(
-      abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("The t1 value to use.")
-        .withShortName("m").create();
+        abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("The t1 value to use.").withShortName("m")
+        .create();
     Option t2Opt = obuilder.withLongName("t2").withRequired(false).withArgument(
-      abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("The t2 value to use.")
-        .withShortName("m").create();
-    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
-      abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v")
+        abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("The t2 value to use.").withShortName("m")
         .create();
-    
+    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+        abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The Vector implementation class name.  Default is RandomAccessSparseVector.class").withShortName("v").create();
+
     Option helpOpt = DefaultOptionCreator.helpOption();
-    
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
-      measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
-        .withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt).withOption(helpOpt).create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(measureClassOpt).withOption(
+        convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt)
+        .withOption(helpOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
       String input = cmdLine.getValue(inputOpt, "testdata").toString();
       String output = cmdLine.getValue(outputOpt, "output").toString();
-      String measureClass = cmdLine.getValue(measureClassOpt,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+      String measureClass = cmdLine.getValue(measureClassOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure")
+          .toString();
       double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
       double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
@@ -97,14 +98,14 @@ public final class Job {
       // String className = cmdLine.getValue(vectorClassOpt,
       // "org.apache.mahout.math.RandomAccessSparseVector").toString();
       // Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
-      
+
       runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   /**
    * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
    * parameters. All output data will be written to the output directory, which will be initially deleted if
@@ -127,17 +128,14 @@ public final class Job {
    *          the double convergence criteria for iterations
    * @param maxIterations
    *          the int maximum number of iterations
+   * @throws IllegalAccessException 
+   * @throws InstantiationException 
    */
-  private static void runJob(String input,
-                             String output,
-                             String measureClass,
-                             double t1,
-                             double t2,
-                             double convergenceDelta,
-                             int maxIterations) throws IOException {
+  private static void runJob(String input, String output, String measureClass, double t1, double t2, double convergenceDelta,
+      int maxIterations) throws IOException, InstantiationException, IllegalAccessException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
-    
+
     Path outPath = new Path(output);
     client.setConf(conf);
     FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -146,14 +144,14 @@ public final class Job {
     }
     String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
     log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput,
-      "org.apache.mahout.math.RandomAccessSparseVector");
+    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.runJob(directoryContainingConvertedInput,
-      output + Cluster.INITIAL_CLUSTERS_DIR, measureClass, t1, t2, false);
+    CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClass, t1, t2, false);
     log.info("Running KMeans");
-    KMeansDriver.runJob(directoryContainingConvertedInput,
-      output + Cluster.INITIAL_CLUSTERS_DIR, output, measureClass, convergenceDelta,
-      maxIterations, 1);
+    KMeansDriver.runJob(directoryContainingConvertedInput, output + Cluster.INITIAL_CLUSTERS_DIR, output, measureClass,
+        convergenceDelta, maxIterations, 1, true);
+
+    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-10", "output/clusteredPoints");
+    clusterDumper.printClusters(null);
   }
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Fri Apr 30 22:52:11 2010
@@ -32,48 +32,49 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public final class Job {
 
   private static final Logger log = LoggerFactory.getLogger(Job.class);
-  
+
   private static final String CLUSTERED_POINTS_OUTPUT_DIRECTORY = "/clusteredPoints";
-  
-  private Job() {}
-  
+
+  private Job() {
+  }
+
   public static void main(String[] args) throws Exception {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
     Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
     Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
     Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
     Option helpOpt = DefaultOptionCreator.helpOption();
-    
-    Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d")
-        .withArgument(abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create())
-        .withDescription("The distance measure class name.").create();
-    
-    Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1")
-        .withArgument(abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create())
-        .withDescription("The T1 distance threshold.").create();
-    
-    Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2")
-        .withArgument(abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create())
-        .withDescription("The T1 distance threshold.").create();
-    
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
-        .withOption(modelOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt)
-        .withOption(maxIterOpt).withOption(threshold2Opt).create();
-    
+
+    Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d").withArgument(
+        abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The distance measure class name.").create();
+
+    Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1").withArgument(
+        abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
+        .create();
+
+    Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2").withArgument(
+        abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
+        .create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(helpOpt)
+        .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(maxIterOpt).withOption(threshold2Opt).create();
+
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -82,11 +83,10 @@ public final class Job {
         CommandLineUtil.printHelp(group);
         return;
       }
-      
+
       String input = cmdLine.getValue(inputOpt, "testdata").toString();
       String output = cmdLine.getValue(outputOpt, "output").toString();
-      String measureClassName = cmdLine.getValue(modelOpt,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+      String measureClassName = cmdLine.getValue(modelOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
       double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt, "47.6").toString());
       double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt, "1").toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
@@ -97,7 +97,7 @@ public final class Job {
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   /**
    * Run the meanshift clustering job on an input dataset using the given distance measure, t1, t2 and
    * iteration parameters. All output data will be written to the output directory, which will be initially
@@ -120,17 +120,14 @@ public final class Job {
    *          the double convergence criteria for iterations
    * @param maxIterations
    *          the int maximum number of iterations
+   * @throws IllegalAccessException 
+   * @throws InstantiationException 
    */
-  private static void runJob(String input,
-                             String output,
-                             String measureClassName,
-                             double t1,
-                             double t2,
-                             double convergenceDelta,
-                             int maxIterations) throws IOException {
+  private static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+      int maxIterations) throws IOException, InstantiationException, IllegalAccessException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
-    
+
     Path outPath = new Path(output);
     client.setConf(conf);
     FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -139,11 +136,12 @@ public final class Job {
     }
     String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
     InputDriver.runJob(input, directoryContainingConvertedInput);
-    MeanShiftCanopyJob.runJob(directoryContainingConvertedInput, output + "/meanshift", measureClassName, t1,
-      t2, convergenceDelta, maxIterations, true);
-    FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
-    OutputDriver.runJob(status[status.length - 1].getPath().toString(), output
-                                                                        + CLUSTERED_POINTS_OUTPUT_DIRECTORY);
+    MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2,
+        convergenceDelta, maxIterations, true, true);
+
+    ClusterDumper clusterDumper = new ClusterDumper("output/clusters-10", "output/clusteredPoints");
+    clusterDumper.printClusters(null);
+
   }
-  
+
 }

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Fri Apr 30 22:52:11 2010
@@ -39,7 +39,7 @@ import org.apache.mahout.clustering.diri
 import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.CosineDistanceMeasure;
@@ -163,7 +163,7 @@ public class TestClusterDumper extends M
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.runJob("testdata/points", "output", EuclideanDistanceMeasure.class.getName(), 8, 4, false);
     // now run the KMeans job
-    KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+    KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper("output/clusters-2", "output/clusteredPoints");
     clusterDumper.printClusters(termDictionary);
@@ -181,7 +181,7 @@ public class TestClusterDumper extends M
   }
 
   public void testMeanShift() throws Exception {
-    MeanShiftCanopyJob.runJob("testdata/points", "output", CosineDistanceMeasure.class.getName(), 0.5, 0.01, 0.05, 10);
+    MeanShiftCanopyDriver.runJob("testdata/points", "output", CosineDistanceMeasure.class.getName(), 0.5, 0.01, 0.05, 10, false, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper("output/clusters-1", "output/clusteredPoints");
     clusterDumper.printClusters(termDictionary);

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Fri Apr 30 22:52:11 2010
@@ -39,7 +39,7 @@ import org.apache.mahout.clustering.diri
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -163,7 +163,7 @@ public class TestCDbwEvaluator extends M
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, false);
     // now run the KMeans job
-    KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+    KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
     int numIterations = 2;
     CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
         numIterations, 1);
@@ -183,7 +183,7 @@ public class TestCDbwEvaluator extends M
   }
 
   public void testMeanShift() throws Exception {
-    MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10);
+    MeanShiftCanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10, false, true);
     int numIterations = 2;
     CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
         numIterations, 1);



Mime
View raw message