mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From smar...@apache.org
Subject svn commit: r1542650 - in /mahout/trunk: CHANGELOG core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
Date Sun, 17 Nov 2013 05:45:05 GMT
Author: smarthi
Date: Sun Nov 17 05:45:04 2013
New Revision: 1542650

URL: http://svn.apache.org/r1542650
Log:
MAHOUT-1351:Adding DenseVector support to AbstractCluster

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1542650&r1=1542649&r2=1542650&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Nov 17 05:45:04 2013
@@ -4,6 +4,8 @@ Release 0.9 - unreleased
 
   MAHOUT-1355: InteractionValueEncoder produces wrong traceDictionary entries (Johannes Schulte
via smarthi)
 
+  MAHOUT-1351: Adding DenseVector support to AbstractCluster (David DeBarr via smarthi)
+
   MAHOUT-1343: JSON output format support in cluster dumper (Telvis Calhoun via sslavic)
 
   MAHOUT-1333: Fixed examples bin directory permissions in distribution archives (Mike Percy
via sslavic)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java?rev=1542650&r1=1542649&r2=1542650&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java Sun
Nov 17 05:45:04 2013
@@ -60,7 +60,7 @@ public abstract class AbstractCluster im
   protected AbstractCluster(Vector point, int id2) {
     setNumObservations(0);
     setTotalObservations(0);
-    setCenter(new RandomAccessSparseVector(point));
+    setCenter(point.clone());
     setRadius(center.like());
     setS0(0);
     setS1(center.like());

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1542650&r1=1542649&r2=1542650&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
(original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
Sun Nov 17 05:45:04 2013
@@ -39,6 +39,7 @@ import org.apache.mahout.common.distance
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
@@ -76,6 +77,16 @@ public final class TestKmeansClustering 
     return points;
   }
   
+  public static List<VectorWritable> getPointsWritableDenseVector(double[][] raw) {
+    List<VectorWritable> points = Lists.newArrayList();
+    for (double[] fr : raw) {
+      Vector vec = new DenseVector(fr.length);
+      vec.assign(fr);
+      points.add(new VectorWritable(vec));
+    }
+    return points;
+  }
+  
   public static List<Vector> getPoints(double[][] raw) {
     List<Vector> points = Lists.newArrayList();
     for (double[] fr : raw) {
@@ -175,6 +186,62 @@ public final class TestKmeansClustering 
     }
   }
   
+  /** Story: User wishes to run kmeans job on reference data (DenseVector test) */
+  @Test
+  public void testKMeansSeqJobDenseVector() throws Exception {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    List<VectorWritable> points = getPointsWritableDenseVector(REFERENCE);
+    
+    Path pointsPath = getTestTempDirPath("points");
+    Path clustersPath = getTestTempDirPath("clusters");
+    Configuration conf = getConfiguration();
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs,
conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs,
conf);
+    for (int k = 1; k < points.size(); k++) {
+      System.out.println("testKMeansMRJob k= " + k);
+      // pick k initial cluster centers at random
+      Path path = new Path(clustersPath, "part-00000");
+      FileSystem fs = FileSystem.get(path.toUri(), conf);
+      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
+      try {
+        for (int i = 0; i < k + 1; i++) {
+          Vector vec = points.get(i).get();
+          
+          Kluster cluster = new Kluster(vec, i, measure);
+          // add the center so the centroid will be correct upon output
+          cluster.observe(cluster.getCenter(), 1);
+          writer.append(new Text(cluster.getIdentifier()), cluster);
+        }
+      } finally {
+        Closeables.close(writer, false);
+      }
+      // now run the Job
+      Path outputPath = getTestTempDirPath("output");
+      // KMeansDriver.runJob(pointsPath, clustersPath, outputPath,
+      // EuclideanDistanceMeasure.class.getName(), 0.001, 10, k + 1, true);
+      String[] args = {optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
+          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
+          optKey(DefaultOptionCreator.OUTPUT_OPTION), outputPath.toString(),
+          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
+          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
+          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+          optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION),
+          DefaultOptionCreator.SEQUENTIAL_METHOD};
+      ToolRunner.run(conf, new KMeansDriver(), args);
+      
+      // now compare the expected clusters with actual
+      Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
+      int[] expect = EXPECTED_NUM_POINTS[k];
+      DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
+      // The key is the clusterId, the value is the weighted vector
+      for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+          new Path(clusteredPointsPath, "part-m-0"), conf)) {
+        collector.collect(record.getFirst(), record.getSecond());
+      }
+      assertEquals("clusters[" + k + ']', expect.length, collector.getKeys().size());
+    }
+  }
+  
   /** Story: User wishes to run kmeans job on reference data */
   @Test
   public void testKMeansMRJob() throws Exception {



Mime
View raw message