mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jeast...@apache.org
Subject svn commit: r1000295 [2/2] - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache...
Date Thu, 23 Sep 2010 03:03:22 GMT
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Thu Sep 23 03:03:21 2010
@@ -34,6 +34,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.WeightedVectorWritable;
@@ -52,16 +53,20 @@ import org.junit.Test;
 
 public final class TestCanopyCreation extends MahoutTestCase {
 
-  private static final double[][] RAW = {
-      { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 }
-  };
+  private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
 
   private List<Canopy> referenceManhattan;
+
   private final DistanceMeasure manhattanDistanceMeasure = new ManhattanDistanceMeasure();
+
   private List<Vector> manhattanCentroids;
+
   private List<Canopy> referenceEuclidean;
+
   private final DistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
+
   private List<Vector> euclideanCentroids;
+
   private FileSystem fs;
 
   private static List<VectorWritable> getPointsWritable() {
@@ -130,10 +135,7 @@ public final class TestCanopyCreation ex
       double[] refCentroid = expectedCentroids[canopyIx];
       Vector testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
-        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
-                     refCentroid[pointIx],
-                     testCentroid.get(pointIx),
-                     EPSILON);
+        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
       }
     }
   }
@@ -152,9 +154,7 @@ public final class TestCanopyCreation ex
       double[] refCentroid = expectedCentroids[canopyIx];
       Vector testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
-        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
-                     refCentroid[pointIx], testCentroid.get(pointIx),
-                     EPSILON);
+        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
       }
     }
   }
@@ -171,8 +171,9 @@ public final class TestCanopyCreation ex
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
-        DummyRecordWriter.build(mapper, conf, writer);
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter.build(mapper,
+                                                                                                                  conf,
+                                                                                                                  writer);
     mapper.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -202,8 +203,9 @@ public final class TestCanopyCreation ex
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
-        DummyRecordWriter.build(mapper, conf, writer);
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter.build(mapper,
+                                                                                                                  conf,
+                                                                                                                  writer);
     mapper.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -297,7 +299,7 @@ public final class TestCanopyCreation ex
     ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, config);
     // now run the Canopy Driver
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, false, false);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
@@ -324,17 +326,17 @@ public final class TestCanopyCreation ex
   @Test
   public void testCanopyGenEuclideanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
-    Configuration job = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, job);
-    ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, job);
+    Configuration config = new Configuration();
+    ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, config);
     // now run the Canopy Driver
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, euclideanDistanceMeasure, 3.1, 2.1, false, false);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, euclideanDistanceMeasure, 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
-    FileSystem fs = FileSystem.get(path.toUri(), job);
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+    FileSystem fs = FileSystem.get(path.toUri(), config);
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, config);
     Writable key = new Text();
     Canopy value = new Canopy();
     assertTrue("more to come", reader.next(key, value));
@@ -429,7 +431,7 @@ public final class TestCanopyCreation ex
     ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
     // now run the Canopy Driver in sequential mode
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, true);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, true);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
@@ -509,7 +511,7 @@ public final class TestCanopyCreation ex
     ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, false);
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, false);
     Path path = new Path(output, "clusteredPoints/part-m-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
     int count = 0;
@@ -536,12 +538,11 @@ public final class TestCanopyCreation ex
     // now run the Job using the run() command. Others can use runJob().
     Path output = getTestTempDirPath("output");
     String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(),
-        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
-        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
+        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
         EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1",
         optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
         optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
-    new CanopyDriver().run(args);
+    ToolRunner.run(new Configuration(), new CanopyDriver(), args);
     Path path = new Path(output, "clusteredPoints/part-m-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
     int count = 0;
@@ -549,8 +550,7 @@ public final class TestCanopyCreation ex
     WeightedVectorWritable vw = new WeightedVectorWritable();
     while (reader.next(canopyId, vw)) {
       count++;
-      System.out.println("Txt: " + canopyId.toString() + " Vec: "
-          + AbstractCluster.formatVector(vw.getVector(), null));
+      System.out.println("Txt: " + canopyId.toString() + " Vec: " + AbstractCluster.formatVector(vw.getVector(), null));
     }
     assertEquals("number of points", points.size(), count);
     reader.close();
@@ -566,7 +566,7 @@ public final class TestCanopyCreation ex
     // now run the Canopy Driver. User defined measure happens to be a Manhattan
     // subclass so results are same.
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false);
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Configuration job = new Configuration();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Thu Sep 23 03:03:21 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.io.WritableComp
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.Model;
@@ -52,29 +53,10 @@ import org.junit.Test;
 public final class TestMapReduce extends MahoutTestCase {
 
   private Collection<VectorWritable> sampleData = new ArrayList<VectorWritable>();
+
   private FileSystem fs;
-  private Configuration conf;
 
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sdx
-   *          double x-standard deviation of the samples
-   * @param sdy
-   *          double y-standard deviation of the samples
-   */
-  private void generateSamples(int num, double mx, double my, double sdx, double sdy) {
-    System.out.println("Generating " + num + " samples m=[" + mx + ", " + my + "] sd=[" + sdx + ", " + sdy + ']');
-    for (int i = 0; i < num; i++) {
-      addSample(new double[] { UncommonDistributions.rNorm(mx, sdx), UncommonDistributions.rNorm(my, sdy) });
-    }
-  }
+  private Configuration conf;
 
   private void addSample(double[] values) {
     Vector v = new DenseVector(2);
@@ -119,9 +101,10 @@ public final class TestMapReduce extends
     DirichletMapper mapper = new DirichletMapper();
     mapper.setup(state);
 
-    RecordWriter<Text,VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
-        DummyRecordWriter.build(mapper, conf, writer);
+    RecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter.build(mapper,
+                                                                                                                  conf,
+                                                                                                                  writer);
     for (VectorWritable v : sampleData) {
       mapper.map(null, v, context);
     }
@@ -137,14 +120,14 @@ public final class TestMapReduce extends
     generateSamples(100, 2, 0, 1);
     generateSamples(100, 0, 2, 1);
     generateSamples(100, 2, 2, 1);
-    DirichletState state =
-        new DirichletState(new SampledNormalDistribution(new VectorWritable(new DenseVector(2))), 20, 1);
+    DirichletState state = new DirichletState(new SampledNormalDistribution(new VectorWritable(new DenseVector(2))), 20, 1);
     DirichletMapper mapper = new DirichletMapper();
     mapper.setup(state);
 
     DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext =
-        DummyRecordWriter.build(mapper, conf, mapWriter);
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext = DummyRecordWriter.build(mapper,
+                                                                                                                     conf,
+                                                                                                                     mapWriter);
     for (VectorWritable v : sampleData) {
       mapper.map(null, v, mapContext);
     }
@@ -152,8 +135,11 @@ public final class TestMapReduce extends
     DirichletReducer reducer = new DirichletReducer();
     reducer.setup(state);
     RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
-    Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext =
-        DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
+    Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext = DummyRecordWriter.build(reducer,
+                                                                                                          conf,
+                                                                                                          reduceWriter,
+                                                                                                          Text.class,
+                                                                                                          VectorWritable.class);
     for (Text key : mapWriter.getKeys()) {
       reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
     }
@@ -169,8 +155,7 @@ public final class TestMapReduce extends
     generateSamples(100, 2, 0, 1);
     generateSamples(100, 0, 2, 1);
     generateSamples(100, 2, 2, 1);
-    DirichletState state =
-        new DirichletState(new SampledNormalDistribution(new VectorWritable(new DenseVector(2))), 20, 1.0);
+    DirichletState state = new DirichletState(new SampledNormalDistribution(new VectorWritable(new DenseVector(2))), 20, 1.0);
 
     Collection<Model<VectorWritable>[]> models = new ArrayList<Model<VectorWritable>[]>();
 
@@ -178,8 +163,9 @@ public final class TestMapReduce extends
       DirichletMapper mapper = new DirichletMapper();
       mapper.setup(state);
       DummyRecordWriter<Text, VectorWritable> mapWriter = new DummyRecordWriter<Text, VectorWritable>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext =
-          DummyRecordWriter.build(mapper, conf, mapWriter);
+      Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context mapContext = DummyRecordWriter.build(mapper,
+                                                                                                                       conf,
+                                                                                                                       mapWriter);
       for (VectorWritable v : sampleData) {
         mapper.map(null, v, mapContext);
       }
@@ -187,8 +173,11 @@ public final class TestMapReduce extends
       DirichletReducer reducer = new DirichletReducer();
       reducer.setup(state);
       RecordWriter<Text, DirichletCluster> reduceWriter = new DummyRecordWriter<Text, DirichletCluster>();
-      Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext =
-          DummyRecordWriter.build(reducer, conf, reduceWriter, Text.class, VectorWritable.class);
+      Reducer<Text, VectorWritable, Text, DirichletCluster>.Context reduceContext = DummyRecordWriter.build(reducer,
+                                                                                                            conf,
+                                                                                                            reduceWriter,
+                                                                                                            Text.class,
+                                                                                                            VectorWritable.class);
       for (Text key : mapWriter.getKeys()) {
         reducer.reduce(new Text(key), mapWriter.getValue(key), reduceContext);
       }
@@ -282,7 +271,7 @@ public final class TestMapReduce extends
         optKey(DefaultOptionCreator.NUM_CLUSTERS_OPTION), "20", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
         maxIterations.toString(), optKey(DirichletDriver.ALPHA_OPTION), "1.0", optKey(DefaultOptionCreator.OVERWRITE_OPTION),
         optKey(DefaultOptionCreator.CLUSTERING_OPTION) };
-    new DirichletDriver().run(args);
+    ToolRunner.run(new Configuration(), new DirichletDriver(), args);
     // and inspect results
     Collection<List<DirichletCluster>> clusters = new ArrayList<List<DirichletCluster>>();
     Configuration conf = new Configuration();
@@ -302,22 +291,21 @@ public final class TestMapReduce extends
     generate4Datasets();
     // Now run the driver
     int maxIterations = 3;
-    AbstractVectorModelDistribution modelDistribution =
-        new SampledNormalDistribution(new VectorWritable(new DenseVector(2)));
-    DirichletDriver.runJob(getTestTempDirPath("input"),
-                           getTestTempDirPath("output"),
-                           modelDistribution,
-                           20,
-                           maxIterations,
-                           1.0,
-                           1,
-                           false,
-                           true,
-                           0,
-                           false);
+    AbstractVectorModelDistribution modelDistribution = new SampledNormalDistribution(new VectorWritable(new DenseVector(2)));
+    Configuration conf = new Configuration();
+    DirichletDriver.run(conf,
+                        getTestTempDirPath("input"),
+                        getTestTempDirPath("output"),
+                        modelDistribution,
+                        20,
+                        maxIterations,
+                        1.0,
+                        false,
+                        true,
+                        0,
+                        false);
     // and inspect results
     List<List<DirichletCluster>> clusters = new ArrayList<List<DirichletCluster>>();
-    Configuration conf = new Configuration();
     conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, modelDistribution.asJsonString());
     conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
     conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
@@ -342,81 +330,6 @@ public final class TestMapReduce extends
     ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data4.txt"), fs, conf);
   }
 
-  /** Test the Mapper and Reducer using the Driver */
-  @Test
-  public void testDriverMnRnIterations() throws Exception {
-    generate4Datasets();
-    // Now run the driver
-    int maxIterations = 3;
-    AbstractVectorModelDistribution modelDistribution = new SampledNormalDistribution(new VectorWritable(new DenseVector(2)));
-    DirichletDriver.runJob(getTestTempDirPath("input"),
-                           getTestTempDirPath("output"),
-                           modelDistribution,
-                           20,
-                           maxIterations,
-                           1.0,
-                           2,
-                           false,
-                           true,
-                           0,
-                           false);
-    // and inspect results
-    Collection<List<DirichletCluster>> clusters = new ArrayList<List<DirichletCluster>>();
-    Configuration conf = new Configuration();
-    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, modelDistribution.asJsonString());
-    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
-    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
-    for (int i = 0; i <= maxIterations; i++) {
-      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
-      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
-    }
-    printResults(clusters, 0);
-  }
-
-  /** Test the Mapper and Reducer using the Driver */
-  @Test
-  public void testDriverMnRnIterationsAsymmetric() throws Exception {
-    generateSamples(500, 0, 0, 0.5, 1.0);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data1.txt"), fs, conf);
-    sampleData = new ArrayList<VectorWritable>();
-    generateSamples(500, 2, 0, 0.2);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data2.txt"), fs, conf);
-    sampleData = new ArrayList<VectorWritable>();
-    generateSamples(500, 0, 2, 0.3);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data3.txt"), fs, conf);
-    sampleData = new ArrayList<VectorWritable>();
-    generateSamples(500, 2, 2, 1);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data4.txt"), fs, conf);
-    // Now run the driver
-    int maxIterations = 3;
-    AbstractVectorModelDistribution modelDistribution =
-        new SampledNormalDistribution(new VectorWritable(new DenseVector(2)));
-    DirichletDriver.runJob(getTestTempDirPath("input"),
-                           getTestTempDirPath("output"),
-                           modelDistribution,
-                           20,
-                           maxIterations,
-                           1.0,
-                           2,
-                           false,
-                           true,
-                           0,
-                           false);
-    // and inspect results
-    Collection<List<DirichletCluster>> clusters = new ArrayList<List<DirichletCluster>>();
-    Configuration conf = new Configuration();
-    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, modelDistribution.asJsonString());
-    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, "20");
-    conf.set(DirichletDriver.ALPHA_0_KEY, "1.0");
-    for (int i = 0; i <= maxIterations; i++) {
-      conf.set(DirichletDriver.STATE_IN_KEY, new Path(getTestTempDirPath("output"), "clusters-" + i).toString());
-      clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
-    }
-    printResults(clusters, 0);
-  }
-
-  // =================== New Tests of Writable Implementations ====================
-
   @Test
   public void testNormalModelWritableSerialization() throws Exception {
     double[] m = { 1.1, 2.2, 3.3 };

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Thu Sep 23 03:03:21 2010
@@ -35,6 +35,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
 import org.apache.mahout.clustering.ClusteringTestUtils;
@@ -316,7 +317,7 @@ public final class TestFuzzyKmeansCluste
           optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION),
           optKey(DefaultOptionCreator.OVERWRITE_OPTION)
       };
-      new FuzzyKMeansDriver().run(args);
+      ToolRunner.run(new Configuration(), new FuzzyKMeansDriver(), args);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-00000"), conf);
       Writable key = new IntWritable();
       Writable out = new WeightedVectorWritable();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Thu Sep 23 03:03:21 2010
@@ -32,6 +32,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
 import org.apache.mahout.clustering.ClusteringTestUtils;
@@ -54,11 +55,10 @@ import org.junit.Test;
 
 public final class TestKmeansClustering extends MahoutTestCase {
 
-  public static final double[][] REFERENCE = {
-      { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
+  public static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+      { 5, 5 } };
 
-  private static final int[][] EXPECTED_NUM_POINTS =
-      { { 9 }, { 4, 5 }, { 4, 4, 1 }, { 1, 2, 1, 5 }, { 1, 1, 1, 2, 4 },
+  private static final int[][] EXPECTED_NUM_POINTS = { { 9 }, { 4, 5 }, { 4, 4, 1 }, { 1, 2, 1, 5 }, { 1, 1, 1, 2, 4 },
       { 1, 1, 1, 1, 1, 4 }, { 1, 1, 1, 1, 1, 2, 2 }, { 1, 1, 1, 1, 1, 1, 2, 1 }, { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
 
   private FileSystem fs;
@@ -274,8 +274,11 @@ public final class TestKmeansClustering 
       KMeansReducer reducer = new KMeansReducer();
       reducer.setup(clusters, measure);
       DummyRecordWriter<Text, Cluster> reducerWriter = new DummyRecordWriter<Text, Cluster>();
-      Reducer<Text, ClusterObservations, Text, Cluster>.Context reducerContext =
-          DummyRecordWriter.build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
+      Reducer<Text, ClusterObservations, Text, Cluster>.Context reducerContext = DummyRecordWriter.build(reducer,
+                                                                                                         conf,
+                                                                                                         reducerWriter,
+                                                                                                         Text.class,
+                                                                                                         ClusterObservations.class);
       for (Text key : combinerWriter.getKeys()) {
         reducer.reduce(new Text(key), combinerWriter.getValue(key), reducerContext);
       }
@@ -411,7 +414,7 @@ public final class TestKmeansClustering 
           outputPath.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
           optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
           optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
-      new KMeansDriver().run(args);
+      ToolRunner.run(new Configuration(), new KMeansDriver(), args);
 
       // now compare the expected clusters with actual
       Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
@@ -450,18 +453,17 @@ public final class TestKmeansClustering 
 
     Path outputPath = getTestTempDirPath("output");
     // now run the Canopy job
-    CanopyDriver.runJob(pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, false);
+    CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, false);
 
     // now run the KMeans job
-    KMeansDriver.runJob(pointsPath,
-                        new Path(outputPath, "clusters-0"),
-                        outputPath,
-                        new EuclideanDistanceMeasure(),
-                        0.001,
-                        10,
-                        1,
-                        true,
-                        false);
+    KMeansDriver.run(pointsPath,
+                     new Path(outputPath, "clusters-0"),
+                     outputPath,
+                     new EuclideanDistanceMeasure(),
+                     0.001,
+                     10,
+                     true,
+                     false);
 
     // now compare the expected clusters with actual
     Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java Thu Sep 23 03:03:21 2010
@@ -23,6 +23,7 @@ import java.awt.Graphics;
 import java.awt.Graphics2D;
 import java.util.List;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -48,22 +49,23 @@ class DisplayCanopy extends DisplayClust
     int cx = CLUSTERS.size() - 1;
     for (List<Cluster> clusters : CLUSTERS) {
       for (Cluster cluster : clusters) {
-        g2.setStroke(new BasicStroke(1));
-        g2.setColor(Color.BLUE);
-        double[] t1 = { T1, T1 };
-        plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
-        double[] t2 = { T2, T2 };
-        plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
-        g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
-        g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
-        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+        if (isSignificant(cluster)) {
+          g2.setStroke(new BasicStroke(1));
+          g2.setColor(Color.BLUE);
+          double[] t1 = { T1, T1 };
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
+          double[] t2 = { T2, T2 };
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
+          g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
+          g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
+          plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+        }
       }
       cx--;
     }
   }
 
   public static void main(String[] args) throws Exception {
-    //SIGNIFICANCE = 0.05;
     Path samples = new Path("samples");
     Path output = new Path("output");
     HadoopUtil.overwriteOutput(samples);
@@ -73,7 +75,7 @@ class DisplayCanopy extends DisplayClust
     writeSampleData(samples);
     //boolean b = true;
     //if (b) {
-    new CanopyDriver().buildClusters(samples, output, new ManhattanDistanceMeasure(), T1, T2, true);
+    CanopyDriver.buildClusters(new Configuration(), samples, output, new ManhattanDistanceMeasure(), T1, T2, true);
     loadClusters(output);
     //} else {
     //  List<Vector> points = new ArrayList<Vector>();

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java Thu Sep 23 03:03:21 2010
@@ -58,18 +58,17 @@ class DisplayFuzzyKMeans extends Display
     //if (b) {
       writeSampleData(samples);
       Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3, measure);
-      FuzzyKMeansDriver.runJob(samples,
-                               clusters,
-                               output,
-                               measure,
-                               threshold,
-                               numIterations,
-                               1,
-                               m,
-                               true,
-                               true,
-                               threshold,
-                               true);
+      FuzzyKMeansDriver.run(samples,
+      clusters,
+      output,
+      measure,
+      threshold,
+      numIterations,
+      m,
+      true,
+      true,
+      threshold,
+      true);
       loadClusters(output);
     //} else {
     //  List<Vector> points = new ArrayList<Vector>();

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Thu Sep 23 03:03:21 2010
@@ -52,7 +52,14 @@ class DisplayKMeans extends DisplayClust
     double distanceThreshold = 0.001;
     //if (b) {
     Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3, measure);
-    KMeansDriver.runJob(samples, clusters, output, measure, distanceThreshold, maxIter, 1, true, true);
+    KMeansDriver.run(samples,
+    clusters,
+    output,
+    measure,
+    distanceThreshold,
+    maxIter,
+    true,
+    true);
     loadClusters(output);
     //} else {
     //  List<Vector> points = new ArrayList<Vector>();

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Thu Sep 23 03:03:21 2010
@@ -20,7 +20,9 @@ package org.apache.mahout.clustering.syn
 import java.io.IOException;
 import java.util.Map;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.common.HadoopUtil;
@@ -41,12 +43,12 @@ public final class Job extends CanopyDri
   public static void main(String[] args) throws Exception {
     if (args.length > 0) {
       log.info("Running with only user-supplied arguments");
-      new Job().run(args);
+      ToolRunner.run(new Configuration(), new Job(), args);
     } else {
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
+      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
     }
   }
 
@@ -69,11 +71,11 @@ public final class Job extends CanopyDri
    * @param t2
    *          the canopy T2 threshold
    */
-  private static void job(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws IOException,
+  private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws IOException,
       InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
-    CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, true, false);
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
@@ -105,7 +107,7 @@ public final class Job extends CanopyDri
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
 
-    job(input, output, measure, t1, t2);
+    run(input, output, measure, t1, t2);
     return 0;
   }
 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Thu Sep 23 03:03:21 2010
@@ -28,6 +28,7 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Model;
 import org.apache.mahout.clustering.ModelDistribution;
 import org.apache.mahout.clustering.dirichlet.DirichletCluster;
@@ -56,13 +57,13 @@ public final class Job extends Dirichlet
   public static void main(String[] args) throws Exception {
     if (args.length > 0) {
       log.info("Running with only user-supplied arguments");
-      new Job().run(args);
+      ToolRunner.run(new Configuration(), new Job(), args);
     } else {
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
       AbstractVectorModelDistribution modelDistribution = new GaussianClusterDistribution(new VectorWritable(new RandomAccessSparseVector(60)));
-      new Job().job(new Path("testdata"), output, modelDistribution, 10, 5, 1.0, 1, true, 0);
+      new Job().run(new Path("testdata"), output, modelDistribution, 10, 5, 1.0, true, 0);
     }
   }
 
@@ -88,7 +89,6 @@ public final class Job extends Dirichlet
     addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create());
     addOption(DefaultOptionCreator.emitMostLikelyOption().create());
     addOption(DefaultOptionCreator.thresholdOption().create());
-    addOption(DefaultOptionCreator.numReducersOption().create());
 
     Map<String, String> argMap = parseArguments(args);
     if (argMap == null) {
@@ -104,7 +104,6 @@ public final class Job extends Dirichlet
     String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION);
     String distanceMeasure = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
     int numModels = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
-    int numReducers = Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
     int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
     double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
@@ -115,7 +114,7 @@ public final class Job extends Dirichlet
                                                                                                 distanceMeasure,
                                                                                                 prototypeSize);
 
-    job(input, output, modelDistribution, numModels, maxIterations, alpha0, numReducers, emitMostLikely, threshold);
+    run(input, output, modelDistribution, numModels, maxIterations, alpha0, emitMostLikely, threshold);
     return 0;
   }
 
@@ -134,32 +133,28 @@ public final class Job extends Dirichlet
    *          the maximum number of iterations
    * @param alpha0
    *          the alpha0 value for the DirichletDistribution
-   * @param numReducers
-   *          the desired number of reducers
    */
-  private void job(Path input,
+  private void run(Path input,
                    Path output,
                    ModelDistribution<VectorWritable> modelDistribution,
                    int numModels,
                    int maxIterations,
                    double alpha0,
-                   int numReducers,
                    boolean emitMostLikely,
                    double threshold) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
       NoSuchMethodException, InvocationTargetException, SecurityException, InterruptedException {
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
-    DirichletDriver.runJob(directoryContainingConvertedInput,
-                           output,
-                           modelDistribution,
-                           numModels,
-                           maxIterations,
-                           alpha0,
-                           numReducers,
-                           true,
-                           emitMostLikely,
-                           threshold,
-                           false);
+    DirichletDriver.run(directoryContainingConvertedInput,
+                        output,
+                        modelDistribution,
+                        numModels,
+                        maxIterations,
+                        alpha0,
+                        true,
+                        emitMostLikely,
+                        threshold,
+                        false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
                                                                                                             "clusteredPoints"));

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java Thu Sep 23 03:03:21 2010
@@ -22,7 +22,9 @@ import java.util.Map;
 
 import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
@@ -48,12 +50,12 @@ public final class Job extends FuzzyKMea
   public static void main(String[] args) throws Exception {
     if (args.length > 0) {
       log.info("Running with only user-supplied arguments");
-      new Job().run(args);
+      ToolRunner.run(new Configuration(), new Job(), args);
     } else {
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 1, (float) 2, 0.5);
+      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 1, (float) 2, 0.5);
     }
   }
 
@@ -65,8 +67,7 @@ public final class Job extends FuzzyKMea
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
     addOption(DefaultOptionCreator.clustersInOption()
         .withDescription("The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
-            + "If k is also specified, then a random set of vectors will be selected"
-            + " and written out to this path first")
+            + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first")
         .create());
     addOption(DefaultOptionCreator.numClustersOption()
         .withDescription("The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
@@ -106,16 +107,13 @@ public final class Job extends FuzzyKMea
     DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
 
     if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
-      clusters = RandomSeedGenerator.buildRandom(
-          input,
-          clusters,
-          Integer.parseInt(argMap.get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)),
-          measure);
+      clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(argMap
+          .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
     }
     //boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
     double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
     double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    job(input, output, measure, t1, t2, maxIterations, numReduceTasks, fuzziness, convergenceDelta);
+    run(input, output, measure, t1, t2, maxIterations, numReduceTasks, fuzziness, convergenceDelta);
     return 0;
   }
 
@@ -144,7 +142,7 @@ public final class Job extends FuzzyKMea
    * @param convergenceDelta
    *          the double convergence criteria for iterations
    */
-  private static void job(Path input,
+  private static void run(Path input,
                           Path output,
                           DistanceMeasure measure,
                           double t1,
@@ -152,27 +150,26 @@ public final class Job extends FuzzyKMea
                           int maxIterations,
                           int numReducerTasks,
                           float fuzziness,
-                          double convergenceDelta)
-    throws IOException, InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
+                          double convergenceDelta) throws IOException, InstantiationException, IllegalAccessException,
+      InterruptedException, ClassNotFoundException {
 
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
     InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, false, false);
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, false, false);
     log.info("Running FuzzyKMeans");
-    FuzzyKMeansDriver.runJob(directoryContainingConvertedInput,
-                             new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-                             output,
-                             measure,
-                             convergenceDelta,
-                             maxIterations,
-                             numReducerTasks,
-                             fuzziness,
-                             true,
-                             true,
-                             0.0,
-                             false);
+    FuzzyKMeansDriver.run(directoryContainingConvertedInput,
+    new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+    output,
+    measure,
+    convergenceDelta,
+    maxIterations,
+    fuzziness,
+    true,
+    true,
+    0.0,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Thu Sep 23 03:03:21 2010
@@ -20,7 +20,9 @@ package org.apache.mahout.clustering.syn
 import java.io.IOException;
 import java.util.Map;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
@@ -46,12 +48,12 @@ public final class Job extends KMeansDri
   public static void main(String[] args) throws Exception {
     if (args.length > 0) {
       log.info("Running with only user-supplied arguments");
-      new Job().run(args);
+      ToolRunner.run(new Configuration(), new Job(), args);
     } else {
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      new Job().job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
+      new Job().run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
     }
   }
 
@@ -71,7 +73,6 @@ public final class Job extends KMeansDri
     addOption(DefaultOptionCreator.convergenceOption().create());
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(DefaultOptionCreator.numReducersOption().create());
     addOption(DefaultOptionCreator.clusteringOption().create());
 
     Map<String, String> argMap = parseArguments(args);
@@ -87,7 +88,6 @@ public final class Job extends KMeansDri
       measureClass = SquaredEuclideanDistanceMeasure.class.getName();
     }
     double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int numReduceTasks = Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
     int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
       HadoopUtil.overwriteOutput(output);
@@ -100,7 +100,14 @@ public final class Job extends KMeansDri
           .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
     }
     boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
-    runJob(input, clusters, output, measure, convergenceDelta, maxIterations, numReduceTasks, runClustering, false);
+    run(input,
+    clusters,
+    output,
+    measure,
+    convergenceDelta,
+    maxIterations,
+    runClustering,
+    false);
     return 0;
   }
 
@@ -131,7 +138,7 @@ public final class Job extends KMeansDri
    * @throws ClassNotFoundException 
    * @throws InterruptedException 
    */
-  private void job(Path input,
+  private void run(Path input,
                    Path output,
                    DistanceMeasure measure,
                    double t1,
@@ -145,17 +152,16 @@ public final class Job extends KMeansDri
     log.info("Preparing Input");
     InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, false, false);
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, false, false);
     log.info("Running KMeans");
-    KMeansDriver.runJob(directoryContainingConvertedInput,
-                        new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-                        output,
-                        measure,
-                        convergenceDelta,
-                        maxIterations,
-                        1,
-                        true,
-                        false);
+    KMeansDriver.run(directoryContainingConvertedInput,
+    new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+    output,
+    measure,
+    convergenceDelta,
+    maxIterations,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
                                                                                                             "clusteredPoints"));

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Thu Sep 23 03:03:21 2010
@@ -22,7 +22,9 @@ import java.util.Map;
 
 import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.common.HadoopUtil;
@@ -43,12 +45,12 @@ public final class Job extends MeanShift
   public static void main(String[] args) throws Exception {
     if (args.length > 0) {
       log.info("Running with only user-supplied arguments");
-      new Job().run(args);
+      ToolRunner.run(new Configuration(), new Job(), args);
     } else {
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
+      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
     }
   }
 
@@ -114,7 +116,7 @@ public final class Job extends MeanShift
    * @param maxIterations
    *          the int maximum number of iterations
    */
-  private static void job(Path input,
+  private static void run(Path input,
                           Path output,
                           DistanceMeasure measure,
                           double t1,

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Thu Sep 23 03:03:21 2010
@@ -31,9 +31,10 @@ import org.apache.hadoop.mapreduce.lib.i
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.WeightedVectorWritable;
-import org.apache.mahout.clustering.dirichlet.DirichletCluster;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -55,16 +56,15 @@ public final class CDbwDriver extends Ab
   }
 
   public static void main(String[] args) throws Exception {
-    new CDbwDriver().run(args);
+    ToolRunner.run(new Configuration(), new CDbwDriver(), args);
   }
 
   @Override
-  public int run(String[] args)
-    throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, InterruptedException {
+  public int run(String[] args) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
+      InterruptedException {
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.numReducersOption().create());
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     if (parseArguments(args) == null) {
       return -1;
@@ -73,49 +73,16 @@ public final class CDbwDriver extends Ab
     Path input = getInputPath();
     Path output = getOutputPath();
     String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
-    int numReducers = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = ccl.loadClass(distanceMeasureClass).asSubclass(DistanceMeasure.class).newInstance();
 
-    job(input, null, output, measure, maxIterations, numReducers);
+    run(getConf(), input, null, output, measure, maxIterations);
     return 0;
   }
 
-  /**
-   * Run the job using supplied arguments
-   * 
-   * @param clustersIn
-   *          the directory pathname for input [n/a :: Cluster]
-   * @param clusteredPointsIn 
-              the directory pathname for input clustered points [clusterId :: VectorWritable]
-   * @param output
-   *          the directory pathname for output reference points [clusterId :: VectorWritable]
-   * @param measure
-   *          the DistanceMeasure to use
-   * @param numIterations
-   *          the number of iterations
-   * @param numReducers
-   *          the number of Reducers desired
-   * @throws InterruptedException 
-   */
-  public static void runJob(Path clustersIn,
-                            Path clusteredPointsIn,
-                            Path output,
-                            DistanceMeasure measure,
-                            int numIterations,
-                            int numReducers)
-    throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, InterruptedException {
-    job(clustersIn, clusteredPointsIn, output, measure, numIterations, numReducers);
-  }
-
-  private static void job(Path clustersIn,
-                          Path clusteredPointsIn,
-                          Path output,
-                          DistanceMeasure measure,
-                          int numIterations,
-                          int numReducers)
-    throws InstantiationException, IllegalAccessException, IOException, InterruptedException, ClassNotFoundException {
+  public static void run(Configuration conf, Path clustersIn, Path clusteredPointsIn, Path output, DistanceMeasure measure, int numIterations)
+      throws InstantiationException, IllegalAccessException, IOException, InterruptedException, ClassNotFoundException {
     Path stateIn = new Path(output, "representativePoints-0");
     writeInitialState(stateIn, clustersIn);
 
@@ -123,12 +90,11 @@ public final class CDbwDriver extends Ab
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
       Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
-      runIteration(clusteredPointsIn, stateIn, stateOut, measure, numReducers);
+      runIteration(clusteredPointsIn, stateIn, stateOut, measure);
       // now point the input to the old output directory
       stateIn = stateOut;
     }
 
-    Configuration conf = new Configuration();
     conf.set(STATE_IN_KEY, stateIn.toString());
     conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
     CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
@@ -139,8 +105,8 @@ public final class CDbwDriver extends Ab
     System.out.println("Separation = " + evaluator.separation());
   }
 
-  private static void writeInitialState(Path output, Path clustersIn)
-    throws InstantiationException, IllegalAccessException, IOException, SecurityException {
+  private static void writeInitialState(Path output, Path clustersIn) throws InstantiationException, IllegalAccessException,
+      IOException, SecurityException {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(output.toUri(), conf);
     for (FileStatus part : fs.listStatus(clustersIn)) {
@@ -153,10 +119,8 @@ public final class CDbwDriver extends Ab
         SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
         while (reader.next(key, value)) {
           Cluster cluster = (Cluster) value;
-          if (!(cluster instanceof DirichletCluster) || ((DirichletCluster) cluster).getTotalCount() > 0) {
-            //System.out.println("C-" + cluster.getId() + ": " + ClusterBase.formatVector(cluster.getCenter(), null));
-            writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
-          }
+          log.debug("C-" + cluster.getId() + ": " + AbstractCluster.formatVector(cluster.getCenter(), null));
+          writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
         }
         writer.close();
       }
@@ -174,11 +138,9 @@ public final class CDbwDriver extends Ab
    *          the directory pathname for output state
    * @param measure
    *          the DistanceMeasure
-   * @param numReducers
-   *          the number of Reducers desired
    */
-  private static void runIteration(Path input, Path stateIn, Path stateOut, DistanceMeasure measure, int numReducers)
-    throws IOException, InterruptedException, ClassNotFoundException {
+  private static void runIteration(Path input, Path stateIn, Path stateOut, DistanceMeasure measure) throws IOException,
+      InterruptedException, ClassNotFoundException {
     Configuration conf = new Configuration();
     conf.set(STATE_IN_KEY, stateIn.toString());
     conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
@@ -194,7 +156,6 @@ public final class CDbwDriver extends Ab
 
     job.setMapperClass(CDbwMapper.class);
     job.setReducerClass(CDbwReducer.class);
-    job.setNumReduceTasks(numReducers);
     job.setInputFormatClass(SequenceFileInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
 

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Sep 23 03:03:21 2010
@@ -46,6 +46,7 @@ import org.apache.mahout.clustering.diri
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
+import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.distance.CosineDistanceMeasure;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -57,7 +58,6 @@ import org.apache.mahout.math.VectorWrit
 import org.apache.mahout.math.hadoop.DistributedRowMatrix;
 import org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver;
 import org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob;
-import org.apache.mahout.utils.MahoutTestCase;
 import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.apache.mahout.utils.vectors.TFIDF;
 import org.apache.mahout.utils.vectors.TermEntry;
@@ -82,6 +82,7 @@ public final class TestClusterDumper ext
       "The robber wore a white fleece jacket and a baseball cap.", "The English Springer Spaniel is the best of all dogs." };
 
   private List<VectorWritable> sampleData;
+
   private String[] termDictionary;
 
   @Override
@@ -154,7 +155,7 @@ public final class TestClusterDumper ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
 
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, true, false);
+    CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"), output, measure, 8, 4, true, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -165,9 +166,16 @@ public final class TestClusterDumper ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
+    CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
+    KMeansDriver.run(getTestTempDirPath("testdata"),
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -178,20 +186,19 @@ public final class TestClusterDumper ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Path output = getTestTempDirPath("output");
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
+    CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
     // now run the Fuzzy KMeans job
-    FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"),
-                             new Path(output, "clusters-0"),
-                             output,
-                             measure,
-                             0.001,
-                             10,
-                             1,
-                             (float) 1.1,
-                             true,
-                             true,
-                             0,
-                             false);
+    FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    ((float) 1.1),
+    true,
+    true,
+    0,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -212,7 +219,7 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new SampledNormalDistribution(new VectorWritable(prototype));
-    DirichletDriver.runJob(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, 1, true, true, 0, false);
+    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -223,7 +230,7 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new GaussianClusterDistribution(new VectorWritable(prototype));
-    DirichletDriver.runJob(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, 1, true, true, 0, true);
+    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -234,7 +241,7 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new DistanceMeasureClusterDistribution(new VectorWritable(prototype));
-    DirichletDriver.runJob(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, 1, true, true, 0, true);
+    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -280,14 +287,14 @@ public final class TestClusterDumper ext
     }
     // sData = A P
     Matrix sData = a.times(p);
-  
+
     // now write sData back to file system so clustering can run against it
     Path svdData = new Path(output, "svddata");
     SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, svdData, IntWritable.class, VectorWritable.class);
     try {
       IntWritable key = new IntWritable();
       VectorWritable value = new VectorWritable();
-  
+
       for (int row = 0; row < sData.numRows(); row++) {
         key.set(row);
         value.set(sData.getRow(row));
@@ -297,9 +304,16 @@ public final class TestClusterDumper ext
       writer.close();
     }
     // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.runJob(svdData, output, measure, 8, 4, false, false);
+    CanopyDriver.run(conf, svdData, output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.runJob(svdData, new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
+    KMeansDriver.run(svdData,
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -319,7 +333,7 @@ public final class TestClusterDumper ext
     // Run EigenVerificationJob from within DistributedLanczosSolver.run(...)
     solver.run(testData, output, tmp, sampleData.size(), sampleDimension, false, desiredRank, 0.5, 0.0, false);
     Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
-  
+
     // now multiply the testdata matrix and the eigenvector matrix
     DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors, tmp, desiredRank - 1, sampleDimension);
     JobConf conf = new JobConf(config);
@@ -328,11 +342,18 @@ public final class TestClusterDumper ext
     a.configure(conf);
     DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
     sData.configure(conf);
-  
+
     // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.runJob(sData.getRowPath(), output, measure, 8, 4, false, false);
+    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.runJob(sData.getRowPath(), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
+    KMeansDriver.run(sData.getRowPath(),
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
@@ -354,7 +375,7 @@ public final class TestClusterDumper ext
     Path rawEigenvectors = new Path(output, DistributedLanczosSolver.RAW_EIGENVECTORS);
     new EigenVerificationJob().run(testData, rawEigenvectors, output, tmp, 0.5, 0.0, true, null);
     Path cleanEigenvectors = new Path(output, EigenVerificationJob.CLEAN_EIGENVECTORS);
-  
+
     // now multiply the testdata matrix and the eigenvector matrix
     DistributedRowMatrix svdT = new DistributedRowMatrix(cleanEigenvectors, tmp, desiredRank - 1, sampleDimension);
     JobConf conf = new JobConf(config);
@@ -363,11 +384,18 @@ public final class TestClusterDumper ext
     a.configure(conf);
     DistributedRowMatrix sData = a.transpose().times(svdT.transpose());
     sData.configure(conf);
-  
+
     // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.runJob(sData.getRowPath(), output, measure, 8, 4, false, false);
+    CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.runJob(sData.getRowPath(), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
+    KMeansDriver.run(sData.getRowPath(),
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1000295&r1=1000294&r2=1000295&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Thu Sep 23 03:03:21 2010
@@ -42,20 +42,21 @@ import org.apache.mahout.clustering.fuzz
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
+import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.utils.MahoutTestCase;
 import org.junit.Before;
 import org.junit.Test;
 
 public final class TestCDbwEvaluator extends MahoutTestCase {
 
-  private static final double[][] REFERENCE = {
-      { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
+  private static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+      { 5, 5 } };
 
   private Map<Integer, List<VectorWritable>> representativePoints;
+
   private Map<Integer, Cluster> clusters;
 
   @Override
@@ -151,10 +152,22 @@ public final class TestCDbwEvaluator ext
   @Test
   public void testCanopy() throws Exception { // now run the Job
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, true, false);
+    CanopyDriver.run(new Configuration(),
+                     getTestTempDirPath("testdata"),
+                     getTestTempDirPath("output"),
+                     measure,
+                     3.1,
+                     2.1,
+                     true,
+                     false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.runJob(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
+    CDbwDriver.run(new Configuration(),
+                   new Path(output, "clusters-0"),
+                   new Path(output, "clusteredPoints"),
+                   output,
+                   measure,
+                   numIterations);
     checkRefPoints(numIterations);
   }
 
@@ -162,12 +175,24 @@ public final class TestCDbwEvaluator ext
   public void testKmeans() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, false, false);
+    CanopyDriver.run(new Configuration(),
+                     getTestTempDirPath("testdata"),
+                     getTestTempDirPath("output"),
+                     measure,
+                     3.1,
+                     2.1,
+                     false,
+                     false);
     // now run the KMeans job
     Path output = getTestTempDirPath("output");
-    KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
+    KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
     int numIterations = 2;
-    CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
+    CDbwDriver.run(new Configuration(),
+                   new Path(output, "clusters-2"),
+                   new Path(output, "clusteredPoints"),
+                   output,
+                   measure,
+                   numIterations);
     checkRefPoints(numIterations);
   }
 
@@ -175,23 +200,34 @@ public final class TestCDbwEvaluator ext
   public void testFuzzyKmeans() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
-    CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, false, false);
+    CanopyDriver.run(new Configuration(),
+                     getTestTempDirPath("testdata"),
+                     getTestTempDirPath("output"),
+                     measure,
+                     3.1,
+                     2.1,
+                     false,
+                     false);
     // now run the KMeans job
     Path output = getTestTempDirPath("output");
-    FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"),
-                             new Path(output, "clusters-0"),
-                             output,
-                             measure,
-                             0.001,
-                             10,
-                             1,
-                             2,
-                             true,
-                             true,
-                             0,
-                             false);
+    FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
+    new Path(output, "clusters-0"),
+    output,
+    measure,
+    0.001,
+    10,
+    2,
+    true,
+    true,
+    0,
+    false);
     int numIterations = 2;
-    CDbwDriver.runJob(new Path(output, "clusters-4"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
+    CDbwDriver.run(new Configuration(),
+                   new Path(output, "clusters-4"),
+                   new Path(output, "clusteredPoints"),
+                   output,
+                   measure,
+                   numIterations);
     checkRefPoints(numIterations);
   }
 
@@ -210,36 +246,39 @@ public final class TestCDbwEvaluator ext
                                  false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
+    CDbwDriver.run(new Configuration(),
+                   new Path(output, "clusters-2"),
+                   new Path(output, "clusteredPoints"),
+                   output,
+                   measure,
+                   numIterations);
     checkRefPoints(numIterations);
   }
 
   @Test
   public void testDirichlet() throws Exception {
-    ModelDistribution<VectorWritable> modelDistribution =
-        new GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
-    DirichletDriver.runJob(getTestTempDirPath("testdata"),
-                           getTestTempDirPath("output"),
-                           modelDistribution,
-                           15,
-                           5,
-                           1.0,
-                           1,
-                           true,
-                           true,
-                           0,
-                           true);
+    ModelDistribution<VectorWritable> modelDistribution = new GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
+    DirichletDriver.run(getTestTempDirPath("testdata"),
+                        getTestTempDirPath("output"),
+                        modelDistribution,
+                        15,
+                        5,
+                        1.0,
+                        true,
+                        true,
+                        0,
+                        true);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.runJob(new Path(output, "clusters-5"),
-                      new Path(output, "clusteredPoints"),
-                      output,
-                      new EuclideanDistanceMeasure(),
-                      numIterations,
-                      1);
+    CDbwDriver.run(new Configuration(),
+                   new Path(output, "clusters-5"),
+                   new Path(output, "clusteredPoints"),
+                   output,
+                   new EuclideanDistanceMeasure(),
+                   numIterations);
     checkRefPoints(numIterations);
   }
-  
+
   @Test
   public void testEmptyCluster() {
     DistanceMeasure measure = new EuclideanDistanceMeasure();



Mime
View raw message