mahout-commits mailing list archives

From: gsing...@apache.org
Subject: svn commit: r632543 [2/2] - in /lucene/mahout/trunk: ./ src/main/java/org/apache/mahout/clustering/canopy/ src/main/java/org/apache/mahout/clustering/kmeans/ src/main/java/org/apache/mahout/utils/ src/test/java/org/apache/mahout/clustering/canopy/ src/...
Date: Sat, 01 Mar 2008 03:33:21 GMT
Modified: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=632543&r1=632542&r2=632543&view=diff
==============================================================================
--- lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Fri Feb 29 19:33:13 2008
@@ -16,20 +16,7 @@
  */
 package org.apache.mahout.clustering.canopy;
 
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
 import junit.framework.TestCase;
-
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
@@ -41,11 +28,24 @@
 import org.apache.mahout.utils.DistanceMeasure;
 import org.apache.mahout.utils.EuclideanDistanceMeasure;
 import org.apache.mahout.utils.ManhattanDistanceMeasure;
+import org.apache.mahout.utils.Point;
 import org.apache.mahout.utils.UserDefinedDistanceMeasure;
 
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 public class TestCanopyCreation extends TestCase {
-  static final float[][] raw = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 },
-      { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
+  static final float[][] raw = {{1, 1}, {2, 1}, {1, 2}, {2, 2},
+          {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
 
   List<Canopy> referenceManhattan;
 
@@ -78,13 +78,13 @@
   private List<Text> getFormattedPoints(List<Float[]> points) {
     List<Text> result = new ArrayList<Text>();
     for (Float[] point : points)
-      result.add(new Text(Canopy.formatPoint(point)));
+      result.add(new Text(Point.formatPoint(point)));
     return result;
   }
 
   /**
    * Verify that the given canopies are equivalent to the referenceManhattan
-   * 
+   *
    * @param canopies
    */
   private void verifyManhattanCanopies(List<Canopy> canopies) {
@@ -93,7 +93,7 @@
 
   /**
    * Verify that the given canopies are equivalent to the referenceEuclidean
-   * 
+   *
    * @param canopies
    */
   private void verifyEuclideanCanopies(List<Canopy> canopies) {
@@ -104,7 +104,7 @@
    * Verify that the given canopies are equivalent to the reference. This means
    * the number of canopies is the same, the number of points in each is the
    * same and the centroids are the same.
-   * 
+   *
    * @param canopies
    */
   private void verifyCanopies(List<Canopy> canopies, List<Canopy> reference) {
@@ -113,19 +113,19 @@
       Canopy refCanopy = reference.get(canopyIx);
       Canopy testCanopy = canopies.get(canopyIx);
       assertEquals("canopy points " + canopyIx, refCanopy.getNumPoints(),
-          testCanopy.getNumPoints());
+              testCanopy.getNumPoints());
       Float[] refCentroid = refCanopy.computeCentroid();
       Float[] testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
         assertEquals("canopy centroid " + canopyIx + "[" + pointIx + "]",
-            refCentroid[pointIx], testCentroid[pointIx]);
+                refCentroid[pointIx], testCentroid[pointIx]);
       }
     }
   }
 
   /**
    * Print the canopies to the transcript
-   * 
+   *
    * @param canopies a List<Canopy>
    */
   private void prtCanopies(List<Canopy> canopies) {
@@ -135,15 +135,15 @@
   }
 
   private void writePointsToFile(List<Float[]> points, String fileName)
-      throws IOException {
+          throws IOException {
     writePointsToFileWithPayload(points, fileName, "");
   }
 
   private void writePointsToFileWithPayload(List<Float[]> points,
-      String fileName, String payload) throws IOException {
+                                            String fileName, String payload) throws IOException {
     BufferedWriter output = new BufferedWriter(new FileWriter(fileName));
     for (Float[] point : points) {
-      output.write(Canopy.formatPoint(point));
+      output.write(Point.formatPoint(point));
       output.write(payload);
       output.write("\n");
     }
@@ -151,19 +151,33 @@
     output.close();
   }
 
+  private void rmr(String path) throws Exception {
+    File f = new File(path);
+    if (f.exists()) {
+      if (f.isDirectory()) {
+        String[] contents = f.list();
+        for (int i = 0; i < contents.length; i++)
+          rmr(f.toString() + File.separator + contents[i]);
+      }
+      f.delete();
+    }
+  }
+
   protected void setUp() throws Exception {
     super.setUp();
+    rmr("output");
+    rmr("testdata");
     referenceManhattan = populateCanopies(manhattanDistanceMeasure,
-        getPoints(raw), (float) 3.1, (float) 2.1);
+            getPoints(raw), (float) 3.1, (float) 2.1);
     manhattanCentroids = populateCentroids(referenceManhattan);
     referenceEuclidean = populateCanopies(euclideanDistanceMeasure,
-        getPoints(raw), (float) 3.1, (float) 2.1);
+            getPoints(raw), (float) 3.1, (float) 2.1);
     euclideanCentroids = populateCentroids(referenceEuclidean);
   }
 
   /**
    * Iterate through the canopies, adding their centroids to a list
-   * 
+   *
    * @param canopies a List<Canopy>
    * @return the List<Float[]>
    */
@@ -176,15 +190,15 @@
 
   /**
    * Iterate through the points, adding new canopies. Return the canopies.
-   * 
+   *
    * @param measure a DistanceMeasure to use
-   * @param points a list<Float[]> defining the points to be clustered
-   * @param t1 the T1 distance threshold
-   * @param t2 the T2 distance threshold
+   * @param points  a list<Float[]> defining the points to be clustered
+   * @param t1      the T1 distance threshold
+   * @param t2      the T2 distance threshold
    * @return the List<Canopy> created
    */
   List<Canopy> populateCanopies(DistanceMeasure measure, List<Float[]> points,
-      float t1, float t2) {
+                                float t1, float t2) {
     List<Canopy> canopies = new ArrayList<Canopy>();
     Canopy.config(measure, t1, t2);
     /**
@@ -224,7 +238,7 @@
   /**
    * Story: User can cluster points using a ManhattanDistanceMeasure and a
    * reference implementation
-   * 
+   *
    * @throws Exception
    */
   public void testReferenceManhattan() throws Exception {
@@ -234,17 +248,17 @@
     assertEquals("number of canopies", 3, referenceManhattan.size());
     for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) {
       Canopy testCanopy = referenceManhattan.get(canopyIx);
-      int[] expectedNumPoints = { 4, 4, 3 };
-      float[][] expectedCentroids = { { (float) 1.5, (float) 1.5 },
-          { (float) 4.0, (float) 4.0 },
-          { (float) 4.6666665, (float) 4.6666665 } };
+      int[] expectedNumPoints = {4, 4, 3};
+      float[][] expectedCentroids = {{(float) 1.5, (float) 1.5},
+              {(float) 4.0, (float) 4.0},
+              {(float) 4.6666665, (float) 4.6666665}};
       assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
-          testCanopy.getNumPoints());
+              testCanopy.getNumPoints());
       float[] refCentroid = expectedCentroids[canopyIx];
       Float[] testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
         assertEquals("canopy centroid " + canopyIx + "[" + pointIx + "]",
-            refCentroid[pointIx], testCentroid[pointIx]);
+                refCentroid[pointIx], testCentroid[pointIx]);
       }
     }
   }
@@ -252,7 +266,7 @@
   /**
    * Story: User can cluster points using a EuclideanDistanceMeasure and a
    * reference implementation
-   * 
+   *
    * @throws Exception
    */
   public void testReferenceEuclidean() throws Exception {
@@ -262,17 +276,17 @@
     assertEquals("number of canopies", 3, referenceManhattan.size());
     for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) {
       Canopy testCanopy = referenceEuclidean.get(canopyIx);
-      int[] expectedNumPoints = { 5, 5, 3 };
-      float[][] expectedCentroids = { { (float) 1.8, (float) 1.8 },
-          { (float) 4.2, (float) 4.2 },
-          { (float) 4.6666665, (float) 4.6666665 } };
+      int[] expectedNumPoints = {5, 5, 3};
+      float[][] expectedCentroids = {{(float) 1.8, (float) 1.8},
+              {(float) 4.2, (float) 4.2},
+              {(float) 4.6666665, (float) 4.6666665}};
       assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
-          testCanopy.getNumPoints());
+              testCanopy.getNumPoints());
       float[] refCentroid = expectedCentroids[canopyIx];
       Float[] testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
         assertEquals("canopy centroid " + canopyIx + "[" + pointIx + "]",
-            refCentroid[pointIx], testCentroid[pointIx]);
+                refCentroid[pointIx], testCentroid[pointIx]);
       }
     }
   }
@@ -280,7 +294,7 @@
   /**
    * Story: User can cluster points without instantiating them all in memory at
    * once
-   * 
+   *
    * @throws Exception
    */
   public void testIterativeManhattan() throws Exception {
@@ -299,7 +313,7 @@
   /**
    * Story: User can cluster points without instantiating them all in memory at
    * once
-   * 
+   *
    * @throws Exception
    */
   public void testIterativeEuclidean() throws Exception {
@@ -319,7 +333,7 @@
    * Story: User can produce initial canopy centers using a
    * ManhattanDistanceMeasure and a CanopyMapper/Combiner which clusters input
    * points to produce an output set of canopy centroid points.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyMapperManhattan() throws Exception {
@@ -330,30 +344,30 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
     assertEquals("Number of map results", 3, collector.getData().size());
     // now combine the mapper output
     Canopy.config(manhattanDistanceMeasure, ((float) 3.1), ((float) 2.1));
-    Map<String, List<Writable>> mapData = collector.getData();
+    Map<String, List<Text>> mapData = collector.getData();
     collector = new DummyOutputCollector();
     for (String key : mapData.keySet())
       combiner.reduce(new Text(key), mapData.get(key).iterator(), collector,
-          null);
+              null);
     // now verify the output
-    List<Writable> data = collector.getValue("centroid");
+    List<Text> data = collector.getValue("centroid");
     assertEquals("Number of centroids", 3, data.size());
     for (int i = 0; i < data.size(); i++)
-      assertEquals("Centroid error", Canopy.formatPoint(manhattanCentroids
-          .get(i)), Canopy.formatPoint(Canopy.decodePoint(data.get(i)
-          .toString())));
+      assertEquals("Centroid error", Point.formatPoint(manhattanCentroids
+              .get(i)), Point
+              .formatPoint(Point.decodePoint(data.get(i).toString())));
   }
 
   /**
    * Story: User can produce initial canopy centers using a
    * EuclideanDistanceMeasure and a CanopyMapper/Combiner which clusters input
    * points to produce an output set of canopy centroid points.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyMapperEuclidean() throws Exception {
@@ -364,30 +378,30 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
     assertEquals("Number of map results", 3, collector.getData().size());
     // now combine the mapper output
     Canopy.config(euclideanDistanceMeasure, ((float) 3.1), ((float) 2.1));
-    Map<String, List<Writable>> mapData = collector.getData();
+    Map<String, List<Text>> mapData = collector.getData();
     collector = new DummyOutputCollector();
     for (String key : mapData.keySet())
       combiner.reduce(new Text(key), mapData.get(key).iterator(), collector,
-          null);
+              null);
     // now verify the output
-    List<Writable> data = collector.getValue("centroid");
+    List<Text> data = collector.getValue("centroid");
     assertEquals("Number of centroids", 3, data.size());
     for (int i = 0; i < data.size(); i++)
-      assertEquals("Centroid error", Canopy.formatPoint(euclideanCentroids
-          .get(i)), Canopy.formatPoint(Canopy.decodePoint(data.get(i)
-          .toString())));
+      assertEquals("Centroid error", Point.formatPoint(euclideanCentroids
+              .get(i)), Point
+              .formatPoint(Point.decodePoint(data.get(i).toString())));
   }
 
   /**
    * Story: User can produce final canopy centers using a
    * ManhattanDistanceMeasure and a CanopyReducer which clusters input centroid
    * points to produce an output set of final canopy centroid points.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyReducerManhattan() throws Exception {
@@ -402,10 +416,10 @@
     assertEquals("Number of centroids", 3, keys.size());
     int i = 0;
     for (String key : keys) {
-      List<Writable> data = collector.getValue(key);
-      assertEquals("Centroid error", Canopy.formatPoint(manhattanCentroids
-          .get(i)), Canopy.formatPoint(Canopy.decodePoint(data.get(0)
-          .toString())));
+      List<Text> data = collector.getValue(key);
+      assertEquals("Centroid error", Point.formatPoint(manhattanCentroids
+              .get(i)), Point.formatPoint(Canopy.decodeCanopy(
+              data.get(0).toString()).getCenter()));
       i++;
     }
   }
@@ -414,7 +428,7 @@
    * Story: User can produce final canopy centers using a
    * EuclideanDistanceMeasure and a CanopyReducer which clusters input centroid
    * points to produce an output set of final canopy centroid points.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyReducerEuclidean() throws Exception {
@@ -429,10 +443,10 @@
     assertEquals("Number of centroids", 3, keys.size());
     int i = 0;
     for (String key : keys) {
-      List<Writable> data = collector.getValue(key);
-      assertEquals("Centroid error", Canopy.formatPoint(euclideanCentroids
-          .get(i)), Canopy.formatPoint(Canopy.decodePoint(data.get(0)
-          .toString())));
+      List<Text> data = collector.getValue(key);
+      assertEquals("Centroid error", Point.formatPoint(euclideanCentroids
+              .get(i)), Point.formatPoint(Canopy.decodeCanopy(
+              data.get(0).toString()).getCenter()));
       i++;
     }
   }
@@ -440,7 +454,7 @@
   /**
    * Story: User can produce final canopy centers using a Hadoop map/reduce job
    * and a ManhattanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyGenManhattanMR() throws Exception {
@@ -452,11 +466,12 @@
     writePointsToFile(points, "testdata/file2");
     // now run the Canopy Driver
     CanopyDriver.runJob("testdata", "output/canopies",
-        ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
 
     // verify output from sequence file
     JobConf job = new JobConf(
-        org.apache.mahout.clustering.canopy.CanopyDriver.class);
+            org.apache.mahout.clustering.canopy.CanopyDriver.class);
     FileSystem fs = FileSystem.get(job);
     Path path = new Path("output/canopies/part-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
@@ -464,10 +479,10 @@
     Text value = new Text();
     assertTrue("more to come", reader.next(key, value));
     assertEquals("1st key", "C0", key.toString());
-    assertEquals("1st value", "[1.5, 1.5, ] ", value.toString());
+    assertEquals("1st value", "C0: [1.5, 1.5, ] ", value.toString());
     assertTrue("more to come", reader.next(key, value));
     assertEquals("2nd key", "C1", key.toString());
-    assertEquals("2nd value", "[4.333333, 4.333333, ] ", value.toString());
+    assertEquals("2nd value", "C1: [4.333333, 4.333333, ] ", value.toString());
     assertFalse("more to come", reader.next(key, value));
     reader.close();
   }
@@ -475,7 +490,7 @@
   /**
    * Story: User can produce final canopy centers using a Hadoop map/reduce job
    * and a EuclideanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testCanopyGenEuclideanMR() throws Exception {
@@ -487,11 +502,12 @@
     writePointsToFile(points, "testdata/file2");
     // now run the Canopy Driver
     CanopyDriver.runJob("testdata", "output/canopies",
-        EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
 
     // verify output from sequence file
     JobConf job = new JobConf(
-        org.apache.mahout.clustering.canopy.CanopyDriver.class);
+            org.apache.mahout.clustering.canopy.CanopyDriver.class);
     FileSystem fs = FileSystem.get(job);
     Path path = new Path("output/canopies/part-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
@@ -499,10 +515,10 @@
     Text value = new Text();
     assertTrue("more to come", reader.next(key, value));
     assertEquals("1st key", "C0", key.toString());
-    assertEquals("1st value", "[1.8, 1.8, ] ", value.toString());
+    assertEquals("1st value", "C0: [1.8, 1.8, ] ", value.toString());
     assertTrue("more to come", reader.next(key, value));
     assertEquals("2nd key", "C1", key.toString());
-    assertEquals("2nd value", "[4.4333334, 4.4333334, ] ", value.toString());
+    assertEquals("2nd value", "C1: [4.4333334, 4.4333334, ] ", value.toString());
     assertFalse("more to come", reader.next(key, value));
     reader.close();
   }
@@ -510,7 +526,7 @@
   /**
    * Story: User can cluster a subset of the points using a ClusterMapper and a
    * ManhattanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusterMapperManhattan() throws Exception {
@@ -524,23 +540,23 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
-    Map<String, List<Writable>> data = collector.getData();
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
+    Map<String, List<Text>> data = collector.getData();
     assertEquals("Number of map results", canopies.size(), data.size());
     for (String canopyDef : data.keySet()) {
       Canopy canopy = Canopy.decodeCanopy(canopyDef);
-      List<Writable> pts = data.get(canopyDef);
+      List<Text> pts = data.get(canopyDef);
       for (Writable ptDef : pts)
-        assertTrue("Point not in canopy", canopy.covers(Canopy
-            .decodePoint(ptDef.toString())));
+        assertTrue("Point not in canopy", canopy.covers(Point.decodePoint(ptDef
+                .toString())));
     }
   }
 
   /**
    * Story: User can cluster a subset of the points using a ClusterMapper and a
    * EuclideanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusterMapperEuclidean() throws Exception {
@@ -554,23 +570,23 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
-    Map<String, List<Writable>> data = collector.getData();
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
+    Map<String, List<Text>> data = collector.getData();
     assertEquals("Number of map results", canopies.size(), data.size());
     for (String canopyDef : data.keySet()) {
       Canopy canopy = Canopy.decodeCanopy(canopyDef);
-      List<Writable> pts = data.get(canopyDef);
+      List<Text> pts = data.get(canopyDef);
       for (Writable ptDef : pts)
-        assertTrue("Point not in canopy", canopy.covers(Canopy
-            .decodePoint(ptDef.toString())));
+        assertTrue("Point not in canopy", canopy.covers(Point.decodePoint(ptDef
+                .toString())));
     }
   }
 
   /**
    * Story: User can cluster a subset of the points using a ClusterReducer and a
    * ManhattanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusterReducerManhattan() throws Exception {
@@ -584,9 +600,9 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
-    Map<String, List<Writable>> data = collector.getData();
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
+    Map<String, List<Text>> data = collector.getData();
     assertEquals("Number of map results", canopies.size(), data.size());
 
     // reduce the data
@@ -599,17 +615,17 @@
     data = collector.getData();
     for (String canopyDef : data.keySet()) {
       Canopy canopy = Canopy.decodeCanopy(canopyDef);
-      List<Writable> pts = data.get(canopyDef);
+      List<Text> pts = data.get(canopyDef);
       for (Writable ptDef : pts)
-        assertTrue("Point not in canopy", canopy.covers(Canopy
-            .decodePoint(ptDef.toString())));
+        assertTrue("Point not in canopy", canopy.covers(Point.decodePoint(ptDef
+                .toString())));
     }
   }
 
   /**
    * Story: User can cluster a subset of the points using a ClusterReducer and a
    * EuclideanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusterReducerEuclidean() throws Exception {
@@ -623,9 +639,9 @@
     List<Float[]> points = getPoints(raw);
     // map the data
     for (Float[] point : points)
-      mapper.map(new Text(), new Text(Canopy.formatPoint(point)), collector,
-          null);
-    Map<String, List<Writable>> data = collector.getData();
+      mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+              null);
+    Map<String, List<Text>> data = collector.getData();
 
     // reduce the data
     Reducer reducer = new IdentityReducer();
@@ -638,17 +654,17 @@
     assertEquals("Number of map results", canopies.size(), data.size());
     for (String canopyDef : data.keySet()) {
       Canopy canopy = Canopy.decodeCanopy(canopyDef);
-      List<Writable> pts = data.get(canopyDef);
+      List<Text> pts = data.get(canopyDef);
       for (Writable ptDef : pts)
-        assertTrue("Point not in canopy", canopy.covers(Canopy
-            .decodePoint(ptDef.toString())));
+        assertTrue("Point not in canopy", canopy.covers(Point.decodePoint(ptDef
+                .toString())));
     }
   }
 
   /**
    * Story: User can produce final point clustering using a Hadoop map/reduce
    * job and a ManhattanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusteringManhattanMR() throws Exception {
@@ -660,9 +676,10 @@
     writePointsToFile(points, "testdata/file2");
     // now run the Job
     CanopyClusteringJob.runJob("testdata", "output",
-        ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
     BufferedReader reader = new BufferedReader(new FileReader(
-        "output/clusters/part-00000"));
+            "output/clusters/part-00000"));
     int count = 0;
     while (reader.ready()) {
       System.out.println(reader.readLine());
@@ -676,7 +693,7 @@
   /**
    * Story: User can produce final point clustering using a Hadoop map/reduce
    * job and a EuclideanDistanceMeasure.
-   * 
+   *
    * @throws Exception
    */
   public void testClusteringEuclideanMR() throws Exception {
@@ -688,9 +705,10 @@
     writePointsToFile(points, "testdata/file2");
     // now run the Job
     CanopyClusteringJob.runJob("testdata", "output",
-        EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
     BufferedReader reader = new BufferedReader(new FileReader(
-        "output/clusters/part-00000"));
+            "output/clusters/part-00000"));
     int count = 0;
     while (reader.ready()) {
       System.out.println(reader.readLine());
@@ -706,7 +724,7 @@
    * job and a ManhattanDistanceMeasure. Input points can have extra payload
    * information following the point [...] and this information will be retained
    * in the output.
-   * 
+   *
    * @throws Exception
    */
   public void testClusteringManhattanMRWithPayload() throws Exception {
@@ -718,9 +736,10 @@
     writePointsToFileWithPayload(points, "testdata/file2", "file2");
     // now run the Job
     CanopyClusteringJob.runJob("testdata", "output",
-        ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
     BufferedReader reader = new BufferedReader(new FileReader(
-        "output/clusters/part-00000"));
+            "output/clusters/part-00000"));
     int count = 0;
     while (reader.ready()) {
       String line = reader.readLine();
@@ -738,7 +757,7 @@
    * job and a EuclideanDistanceMeasure. Input points can have extra payload
    * information following the point [...] and this information will be retained
    * in the output.
-   * 
+   *
    * @throws Exception
    */
   public void testClusteringEuclideanMRWithPayload() throws Exception {
@@ -750,9 +769,10 @@
     writePointsToFileWithPayload(points, "testdata/file2", "file2");
     // now run the Job
     CanopyClusteringJob.runJob("testdata", "output",
-        EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            EuclideanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
     BufferedReader reader = new BufferedReader(new FileReader(
-        "output/clusters/part-00000"));
+            "output/clusters/part-00000"));
     int count = 0;
     while (reader.ready()) {
       String line = reader.readLine();
@@ -768,7 +788,7 @@
   /**
    * Story: Clustering algorithm must support arbitrary user defined distance
    * measure
-   * 
+   *
    * @throws Exception
    */
   public void testUserDefinedDistanceMeasure() throws Exception {
@@ -781,11 +801,12 @@
     // now run the Canopy Driver. User defined measure happens to be a Manhattan
     // subclass so results are same.
     CanopyDriver.runJob("testdata", "output/canopies",
-        UserDefinedDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1, "dist/apache-mahout-0.1-dev.jar");
+            UserDefinedDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
 
     // verify output from sequence file
     JobConf job = new JobConf(
-        org.apache.mahout.clustering.canopy.CanopyDriver.class);
+            org.apache.mahout.clustering.canopy.CanopyDriver.class);
     FileSystem fs = FileSystem.get(job);
     Path path = new Path("output/canopies/part-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
@@ -793,10 +814,10 @@
     Text value = new Text();
     assertTrue("more to come", reader.next(key, value));
     assertEquals("1st key", "C0", key.toString());
-    assertEquals("1st value", "[1.5, 1.5, ] ", value.toString());
+    assertEquals("1st value", "C0: [1.5, 1.5, ] ", value.toString());
     assertTrue("more to come", reader.next(key, value));
     assertEquals("2nd key", "C1", key.toString());
-    assertEquals("2nd value", "[4.333333, 4.333333, ] ", value.toString());
+    assertEquals("2nd value", "C1: [4.333333, 4.333333, ] ", value.toString());
     assertFalse("more to come", reader.next(key, value));
     reader.close();
   }
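
[A note for readers following the hunks above: the body of populateCanopies falls outside the diff context, so here is a minimal sketch of the classic T1/T2 reference loop it implements, assuming a Canopy(Float[]) constructor alongside the getCenter(), addPoint() and DistanceMeasure.distance() calls these tests already exercise:

    List<Canopy> canopies = new ArrayList<Canopy>();
    Canopy.config(measure, t1, t2);
    for (Float[] point : points) {
      boolean stronglyBound = false;
      for (Canopy canopy : canopies) {
        float dist = measure.distance(canopy.getCenter(), point);
        if (dist < t1)
          canopy.addPoint(point); // loosely bound: the point joins this canopy
        stronglyBound = stronglyBound || dist < t2;
      }
      if (!stronglyBound)
        canopies.add(new Canopy(point)); // no center within t2, so seed a new canopy
    }
    return canopies;
]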

Modified: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java?rev=632543&r1=632542&r2=632543&view=diff
==============================================================================
--- lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java (original)
+++ lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java Fri Feb 29 19:33:13 2008
@@ -16,6 +16,8 @@
  * limitations under the License.
  */
 
+import org.apache.mahout.utils.Point;
+
 import java.util.ArrayList;
 import java.util.List;
 
@@ -23,7 +25,6 @@
  * This Canopy subclass maintains a list of points in the canopy so it can
  * include them in its toString method. Useful for debugging but not practical
  * for production use since it holds onto all its points.
- * 
  */
 public class VisibleCanopy extends Canopy {
   private List<Float[]> points = new ArrayList<Float[]>();
@@ -35,7 +36,7 @@
 
   /**
    * Add a point to the canopy
-   * 
+   *
    * @param point a Float[]
    */
   public void addPoint(Float[] point) {
@@ -46,13 +47,13 @@
   /**
    * Return a printable representation of this object, using the user supplied
    * identifier
-   * 
+   *
    * @return
    */
   public String toString() {
     String out = super.toString() + ": ";
     for (Float[] pt : points)
-      out = ptOut(out, pt);
+      out = Point.ptOut(out, pt);
     return out;
   }
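
[The Point helpers this commit swaps in (formatPoint, decodePoint, ptOut) ship in part 1 of the commit under src/main/java/org/apache/mahout/utils/. Judging from the "[1.5, 1.5, ] " strings the tests assert, they behave roughly like the following sketch, which is an inference from the assertions, not the committed source:

    public static String formatPoint(Float[] point) {
      StringBuilder out = new StringBuilder("[");
      for (Float f : point)
        out.append(f).append(", ");
      return out.append("] ").toString(); // e.g. {1.5f, 1.5f} -> "[1.5, 1.5, ] "
    }

    public static Float[] decodePoint(String formatted) {
      // take everything between '[' and ']' and split on the comma separators
      String body = formatted.substring(formatted.indexOf('[') + 1,
              formatted.indexOf(']')).trim();
      List<Float> values = new ArrayList<Float>();
      for (String part : body.split(",")) {
        part = part.trim();
        if (part.length() > 0)
          values.add(Float.valueOf(part));
      }
      return values.toArray(new Float[values.size()]);
    }
]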
 

Added: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=632543&view=auto
==============================================================================
--- lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (added)
+++ lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Fri Feb 29 19:33:13 2008
@@ -0,0 +1,452 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kmeans;
+
+import junit.framework.TestCase;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.canopy.DummyOutputCollector;
+import org.apache.mahout.utils.DistanceMeasure;
+import org.apache.mahout.utils.EuclideanDistanceMeasure;
+import org.apache.mahout.utils.ManhattanDistanceMeasure;
+import org.apache.mahout.utils.Point;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.List;
+
+public class TestKmeansClustering extends TestCase {
+
+  static final float[][] reference = {{1, 1}, {2, 1}, {1, 2}, {2, 2},
+          {3, 3}, {4, 4}, {5, 4}, {4, 5}, {5, 5}};
+
+  static int[][] expectedNumPoints = {{9}, {4, 5}, {4, 5, 0},
+          {1, 2, 1, 5}, {1, 1, 1, 2, 4}, {1, 1, 1, 1, 1, 4},
+          {1, 1, 1, 1, 1, 2, 2}, {1, 1, 1, 1, 1, 1, 2, 1},
+          {1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  private void rmr(String path) throws Exception {
+    File f = new File(path);
+    if (f.exists()) {
+      if (f.isDirectory()) {
+        String[] contents = f.list();
+        for (int i = 0; i < contents.length; i++)
+          rmr(f.toString() + File.separator + contents[i]);
+      }
+      f.delete();
+    }
+  }
+
+  protected void setUp() throws Exception {
+    super.setUp();
+    rmr("output");
+    rmr("testdata");
+  }
+
+  @Override
+  protected void tearDown() throws Exception {
+    super.tearDown();
+  }
+
+  /**
+   * This is the reference k-means implementation. Given its inputs it iterates
+   * over the points and clusters until their centers converge or until the
+   * maximum number of iterations is exceeded.
+   *
+   * @param points   the input List<Float[]> of points
+   * @param clusters the initial List<Cluster> of clusters
+   * @param measure  the DistanceMeasure to use
+   * @param maxIter  the maximum number of iterations
+   */
+  private void referenceKmeans(List<Float[]> points, List<Cluster> clusters,
+                               DistanceMeasure measure, int maxIter) {
+    boolean converged = false;
+    int iteration = 0;
+    while (!converged && iteration++ < maxIter) {
+      converged = iterateReference(points, clusters, measure);
+    }
+  }
+
+  /**
+   * Perform a single iteration over the points and clusters, assigning points
+   * to clusters and returning whether the clusters have converged.
+   *
+   * @param points   the List<Float[]> having the input points
+   * @param clusters the List<Cluster> clusters
+   * @param measure  a DistanceMeasure to use
+   * @return true if the clusters have all converged
+   */
+  private boolean iterateReference(List<Float[]> points,
+                                   List<Cluster> clusters, DistanceMeasure measure) {
+    boolean converged;
+    converged = true;
+    // iterate through all points, assigning each to the nearest cluster
+    for (Float[] point : points) {
+      Cluster closestCluster = null;
+      float closestDistance = Float.MAX_VALUE;
+      for (Cluster cluster : clusters) {
+        float distance = measure.distance(cluster.getCenter(), point);
+        if (closestCluster == null || closestDistance > distance) {
+          closestCluster = cluster;
+          closestDistance = distance;
+        }
+      }
+      closestCluster.addPoint(point);
+    }
+    // test for convergence
+    for (Cluster cluster : clusters) {
+      if (!cluster.computeConvergence())
+        converged = false;
+    }
+    // update the cluster centers
+    if (!converged)
+      for (Cluster cluster : clusters)
+        cluster.recomputeCenter();
+    return converged;
+  }
+
+  private List<Float[]> getPoints(float[][] raw) {
+    List<Float[]> points = new ArrayList<Float[]>();
+    for (int i = 0; i < raw.length; i++) {
+      float[] fr = raw[i];
+      Float[] fs = new Float[fr.length];
+      for (int j = 0; j < fs.length; j++)
+        fs[j] = fr[j];
+      points.add(fs);
+    }
+    return points;
+  }
+
+  /**
+   * Story: Test the reference implementation
+   *
+   * @throws Exception
+   */
+  public void testReferenceImplementation() throws Exception {
+    List<Float[]> points = getPoints(reference);
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    Cluster.config(measure, (float) 0.001);
+    // try all possible values of k
+    for (int k = 0; k < points.size(); k++) {
+      System.out.println("Test k=" + (k + 1) + ":");
+      // pick k initial cluster centers at random
+      List<Cluster> clusters = new ArrayList<Cluster>();
+      for (int i = 0; i < k + 1; i++)
+        clusters.add(new VisibleCluster(points.get(i)));
+      // iterate clusters until they converge
+      int maxIter = 10;
+      referenceKmeans(points, clusters, measure, maxIter);
+      for (int c = 0; c < clusters.size(); c++) {
+        Cluster cluster = clusters.get(c);
+        assertEquals("Cluster " + c + " test " + k, expectedNumPoints[k][c],
+                cluster.getNumPoints());
+        System.out.println(cluster.toString());
+      }
+    }
+  }
+
+  /**
+   * Story: test that the mapper will map input points to the nearest cluster
+   *
+   * @throws Exception
+   */
+  public void testKMeansMapper() throws Exception {
+    KMeansMapper mapper = new KMeansMapper();
+    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
+    Cluster.config(euclideanDistanceMeasure, (float) 0.001);
+    List<Float[]> points = getPoints(reference);
+    for (int k = 0; k < points.size(); k++) {
+      // pick k initial cluster centers at random
+      DummyOutputCollector collector = new DummyOutputCollector();
+      List<Cluster> clusters = new ArrayList<Cluster>();
+      for (int i = 0; i < k + 1; i++) {
+        Cluster cluster = new Cluster(points.get(i));
+        // add the center so the centroid will be correct upon output
+        cluster.addPoint(cluster.getCenter());
+        clusters.add(cluster);
+      }
+      mapper.config(clusters);
+      // map the data
+      for (Float[] point : points)
+        mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+                null);
+      assertEquals("Number of map results", k + 1, collector.getData().size());
+      // now verify that all points are correctly allocated
+      for (String key : collector.getKeys()) {
+        Cluster cluster = Cluster.decodeCluster(key);
+        List<Text> values = collector.getValue(key);
+        for (Writable value : values) {
+          Float[] point = Point.decodePoint(value.toString());
+          float distance = euclideanDistanceMeasure.distance(cluster
+                  .getCenter(), point);
+          for (Cluster c : clusters)
+            assertTrue("distance error", distance <= euclideanDistanceMeasure
+                    .distance(point, c.getCenter()));
+        }
+      }
+    }
+  }
+
+  /**
+   * Story: test that the combiner will produce partial cluster totals for all
+   * of the clusters and points that it sees
+   *
+   * @throws Exception
+   */
+  public void testKMeansCombiner() throws Exception {
+    KMeansMapper mapper = new KMeansMapper();
+    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
+    Cluster.config(euclideanDistanceMeasure, (float) 0.001);
+    List<Float[]> points = getPoints(reference);
+    for (int k = 0; k < points.size(); k++) {
+      // pick k initial cluster centers at random
+      DummyOutputCollector collector = new DummyOutputCollector();
+      List<Cluster> clusters = new ArrayList<Cluster>();
+      for (int i = 0; i < k + 1; i++) {
+        Cluster cluster = new Cluster(points.get(i));
+        // add the center so the centroid will be correct upon output
+        cluster.addPoint(cluster.getCenter());
+        clusters.add(cluster);
+      }
+      mapper.config(clusters);
+      // map the data
+      for (Float[] point : points)
+        mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+                null);
+
+      // now combine the data
+      KMeansCombiner combiner = new KMeansCombiner();
+      DummyOutputCollector collector2 = new DummyOutputCollector();
+      for (String key : collector.getKeys())
+        combiner.reduce(new Text(key), collector.getValue(key).iterator(),
+                collector2, null);
+
+      assertEquals("Number of map results", k + 1, collector2.getData().size());
+      // now verify that all points are accounted for
+      int count = 0;
+      Float[] total = Point.origin(2);
+      for (String key : collector2.getKeys()) {
+        List<Text> values = collector2.getValue(key);
+        assertEquals("too many values", 1, values.size());
+        String value = values.get(0).toString();
+        int ix = value.indexOf(",");
+        count += Integer.parseInt(value.substring(0, ix));
+        total = Point.sum(total, Point.decodePoint(value.substring(ix + 2)));
+      }
+      assertEquals("total points", 9, count);
+      assertEquals("point total[0]", 27, total[0].intValue());
+      assertEquals("point total[1]", 27, total[1].intValue());
+    }
+  }
+
+  /**
+   * Story: test that the reducer will sum the partial cluster totals for all of
+   * the clusters and points that it sees
+   *
+   * @throws Exception
+   */
+  public void testKMeansReducer() throws Exception {
+    KMeansMapper mapper = new KMeansMapper();
+    EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
+    Cluster.config(euclideanDistanceMeasure, (float) 0.001);
+    List<Float[]> points = getPoints(reference);
+    for (int k = 0; k < points.size(); k++) {
+      System.out.println("K = " + k);
+      // pick k initial cluster centers at random
+      DummyOutputCollector collector = new DummyOutputCollector();
+      List<Cluster> clusters = new ArrayList<Cluster>();
+      for (int i = 0; i < k + 1; i++) {
+        Cluster cluster = new Cluster(points.get(i), i);
+        // add the center so the centroid will be correct upon output
+        cluster.addPoint(cluster.getCenter());
+        clusters.add(cluster);
+      }
+      mapper.config(clusters);
+      // map the data
+      for (Float[] point : points)
+        mapper.map(new Text(), new Text(Point.formatPoint(point)), collector,
+                null);
+
+      // now combine the data
+      KMeansCombiner combiner = new KMeansCombiner();
+      DummyOutputCollector collector2 = new DummyOutputCollector();
+      for (String key : collector.getKeys())
+        combiner.reduce(new Text(key), collector.getValue(key).iterator(),
+                collector2, null);
+
+      // now reduce the data
+      KMeansReducer reducer = new KMeansReducer();
+      DummyOutputCollector collector3 = new DummyOutputCollector();
+      for (String key : collector2.getKeys())
+        reducer.reduce(new Text(key), collector2.getValue(key).iterator(),
+                collector3, null);
+
+      assertEquals("Number of map results", k + 1, collector3.getData().size());
+
+      // compute the reference result after one iteration and compare
+      List<Cluster> reference = new ArrayList<Cluster>();
+      for (int i = 0; i < k + 1; i++)
+        reference.add(new Cluster(points.get(i), i));
+      boolean converged = iterateReference(points, reference,
+              euclideanDistanceMeasure);
+      if (k == 8)
+        assertTrue("not converged? " + k, converged);
+      else
+        assertFalse("converged? " + k, converged);
+
+      // now verify that all clusters have correct centers
+      converged = true;
+      for (int i = 0; i < reference.size(); i++) {
+        Cluster ref = reference.get(i);
+        String key = ref.getIdentifier();
+        List<Text> values = collector3.getValue(key);
+        String value = values.get(0).toString();
+        Cluster cluster = Cluster.decodeCluster(value);
+        converged = converged && cluster.isConverged();
+        System.out.println("ref= " + ref.toString() + " cluster= "
+                + cluster.toString());
+        assertEquals(k + " center[" + key + "][0]", ref.getCenter()[0], cluster
+                .getCenter()[0]);
+        assertEquals(k + " center[" + key + "][1]", ref.getCenter()[1], cluster
+                .getCenter()[1]);
+      }
+      if (k == 8)
+        assertTrue("not converged? " + k, converged);
+      else
+        assertFalse("converged? " + k, converged);
+    }
+  }
+
+  /**
+   * Story: User wishes to run kmeans job on reference data
+   *
+   * @throws Exception
+   */
+  public void testKMeansMRJob() throws Exception {
+    List<Float[]> points = getPoints(reference);
+    File testData = new File("testdata");
+    if (!testData.exists())
+      testData.mkdir();
+    testData = new File("testdata/points");
+    if (!testData.exists())
+      testData.mkdir();
+    Point.writePointsToFile(points, "testdata/points/file1");
+    Point.writePointsToFile(points, "testdata/points/file2");
+    for (int k = 0; k < points.size(); k++) {
+      System.out.println("testKMeansMRJob k= " + k);
+      // pick k initial cluster centers at random
+      JobConf job = new JobConf(KMeansDriver.class);
+      FileSystem fs = FileSystem.get(job);
+      Path path = new Path("testdata/clusters/part-00000");
+      SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path,
+              Text.class, Text.class);
+      for (int i = 0; i < k + 1; i++) {
+        Cluster cluster = new Cluster(points.get(i));
+        // add the center so the centroid will be correct upon output
+        cluster.addPoint(cluster.getCenter());
+        writer.append(new Text(cluster.getIdentifier()), new Text(Cluster
+                .formatCluster(cluster)));
+      }
+      writer.close();
+
+      // now run the Job
+      String jarLocation = "dist/apache-mahout-0.1-dev.jar";
+      KMeansDriver.runJob("testdata/points", "testdata/clusters", "output",
+              EuclideanDistanceMeasure.class.getName(), "0.001", "10", jarLocation);
+
+      // now compare the expected clusters with actual
+      File outDir = new File("output/points");
+      assertTrue("output dir exists?", outDir.exists());
+      String[] outFiles = outDir.list();
+      assertEquals("output dir files?", 4, outFiles.length);
+      BufferedReader reader = new BufferedReader(new FileReader(
+              "output/points/part-00000"));
+      int[] expect = expectedNumPoints[k];
+      DummyOutputCollector collector = new DummyOutputCollector();
+      while (reader.ready()) {
+        String line = reader.readLine();
+        String[] lineParts = line.split("\t");
+        assertEquals("line parts", 2, lineParts.length);
+        String cl = line.substring(0, line.indexOf(':'));
+        collector.collect(new Text(cl), new Text(lineParts[1]));
+      }
+      reader.close();
+      if (k == 2)
+        // cluster 3 is empty so won't appear in output
+        assertEquals("clusters[" + k + "]", expect.length - 1, collector
+                .getKeys().size());
+      else
+        assertEquals("clusters[" + k + "]", expect.length, collector.getKeys()
+                .size());
+    }
+  }
+
+  /**
+   * Story: User wants to use canopy clustering to input the initial clusters
+   * for kmeans job.
+   *
+   * @throws Exception
+   */
+  public void testKMeansWithCanopyClusterInput() throws Exception {
+    List<Float[]> points = getPoints(reference);
+    File testData = new File("testdata");
+    if (!testData.exists())
+      testData.mkdir();
+    testData = new File("testdata/points");
+    if (!testData.exists())
+      testData.mkdir();
+    Point.writePointsToFile(points, "testdata/points/file1");
+    Point.writePointsToFile(points, "testdata/points/file2");
+
+    // now run the Canopy job
+    CanopyDriver.runJob("testdata/points", "testdata/canopies",
+            ManhattanDistanceMeasure.class.getName(), (float) 3.1, (float) 2.1,
+            "dist/apache-mahout-0.1-dev.jar");
+
+    // now run the KMeans job
+    String jarLocation = "dist/apache-mahout-0.1-dev.jar";
+    KMeansDriver.runJob("testdata/points", "testdata/canopies", "output",
+            EuclideanDistanceMeasure.class.getName(), "0.001", "10", jarLocation);
+
+    // now compare the expected clusters with actual
+    File outDir = new File("output/points");
+    assertTrue("output dir exists?", outDir.exists());
+    String[] outFiles = outDir.list();
+    assertEquals("output dir files?", 4, outFiles.length);
+    BufferedReader reader = new BufferedReader(new FileReader(
+            "output/points/part-00000"));
+    DummyOutputCollector collector = new DummyOutputCollector();
+    while (reader.ready()) {
+      String line = reader.readLine();
+      String[] lineParts = line.split("\t");
+      assertEquals("line parts", 2, lineParts.length);
+      String cl = line.substring(0, line.indexOf(':'));
+      collector.collect(new Text(cl), new Text(lineParts[1]));
+    }
+    reader.close();
+    assertEquals("num points[V0]", 4, collector.getValue("V0").size());
+    assertEquals("num points[V1]", 5, collector.getValue("V1").size());
+  }
+}

Propchange: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
------------------------------------------------------------------------------
    svn:eol-style = native
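
[The DummyOutputCollector these tests lean on is not part of this message, but its getData() and getValue() signatures are what the hunks above retype from List<Writable> to List<Text>. A minimal sketch of the shape the tests assume, keeping keys sorted so reducer output lines up with the centroid lists; it presumes the generic mapred OutputCollector, whereas an older, non-generic Hadoop API would take WritableComparable/Writable instead:

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.OutputCollector;

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeMap;

    public class DummyOutputCollector implements OutputCollector<Text, Text> {

      // TreeMap keeps keys in sorted order, matching the tests' iteration order
      private final Map<String, List<Text>> data = new TreeMap<String, List<Text>>();

      public void collect(Text key, Text value) throws IOException {
        List<Text> points = data.get(key.toString());
        if (points == null) {
          points = new ArrayList<Text>();
          data.put(key.toString(), points);
        }
        points.add(value);
      }

      public Map<String, List<Text>> getData() {
        return data;
      }

      public List<Text> getValue(String key) {
        return data.get(key);
      }

      public Set<String> getKeys() {
        return data.keySet();
      }
    }
]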

Added: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/VisibleCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/VisibleCluster.java?rev=632543&view=auto
==============================================================================
--- lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/VisibleCluster.java (added)
+++ lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/VisibleCluster.java Fri Feb 29 19:33:13 2008
@@ -0,0 +1,66 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import org.apache.mahout.utils.Point;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * This Cluster subclass maintains a list of points in the cluster so it can
+ * include them in its toString method. Useful for debugging but not practical
+ * for production use since it holds onto all its points.
+ */
+public class VisibleCluster extends Cluster {
+
+  private List<Float[]> points = new ArrayList<Float[]>();
+
+  @Override
+  public void recomputeCenter() {
+    super.recomputeCenter();
+    points = new ArrayList<Float[]>();
+  }
+
+  public VisibleCluster(Float[] point) {
+    super(point);
+  }
+
+  /**
+   * Add a point to the cluster
+   *
+   * @param point a Float[]
+   */
+  public void addPoint(Float[] point) {
+    super.addPoint(point);
+    points.add(point);
+  }
+
+  /**
+   * Return a printable representation of this object, using the user supplied
+   * identifier
+   *
+   * @return the formatted string, including the cluster's points
+   */
+  public String toString() {
+    String out = super.toString() + ": ";
+    for (Float[] pt : points)
+      out += Point.formatPoint(pt);
+    return out;
+  }
+
+}

Propchange: lucene/mahout/trunk/src/test/java/org/apache/mahout/clustering/kmeans/VisibleCluster.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/mahout/trunk/src/test/java/org/apache/mahout/utils/UserDefinedDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/src/test/java/org/apache/mahout/utils/UserDefinedDistanceMeasure.java?rev=632543&r1=632542&r2=632543&view=diff
==============================================================================
--- lucene/mahout/trunk/src/test/java/org/apache/mahout/utils/UserDefinedDistanceMeasure.java (original)
+++ lucene/mahout/trunk/src/test/java/org/apache/mahout/utils/UserDefinedDistanceMeasure.java Fri Feb 29 19:33:13 2008
@@ -1,7 +1,5 @@
 package org.apache.mahout.utils;
 
-import org.apache.mahout.utils.ManhattanDistanceMeasure;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -9,9 +7,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
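
[UserDefinedDistanceMeasure itself only loses a redundant same-package import in this hunk. Since the canopy test above notes that the "User defined measure happens to be a Manhattan subclass so results are same", its whole body is presumably no more than:

    public class UserDefinedDistanceMeasure extends ManhattanDistanceMeasure {
      // intentionally empty: inherits Manhattan distance, exercising only the
      // ability to plug an arbitrary DistanceMeasure class name into the jobs
    }
]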


