mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pran...@apache.org
Subject svn commit: r1301654 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/classify/ core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/iterator/ core/src/main/java/org/apache/mahout...
Date Fri, 16 Mar 2012 17:10:30 GMT
Author: pranjan
Date: Fri Mar 16 17:10:29 2012
New Revision: 1301654

URL: http://svn.apache.org/viewvc?rev=1301654&view=rev
Log:
MAHOUT-981, MAHOUT-983. Refactored K-Means Clustering and Dirichlet Clustering to use ClusterClassificationDriver. 
Using cluster.getModel().configure() in ClusterClassificationDriver in order to configure DirichletCluster for MahalanobisDistanceMeasure. 
Added/fixed test cases by:
Using separate directories in test cases for supplying initial clusters and to store buildClusters to prevent two cluster-*-final files in the same directory.
Writing IntWritable in test cases instead of LongWritable ( As the ClusterClassificationDriver clusters records with IntWritable keys). 

Removed:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestDirichletClustering.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java Fri Mar 16 17:10:29 2012
@@ -40,6 +40,7 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.dirichlet.DirichletCluster;
 import org.apache.mahout.clustering.iterator.ClusteringPolicy;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -131,25 +132,21 @@ public class ClusterClassificationDriver
    * @throws InterruptedException
    * @throws ClassNotFoundException
    */
-  public static void run(Path input, Path clusteringOutputPath, Path output,
-      Double clusterClassificationThreshold, boolean emitMostLikely,
-      boolean runSequential) throws IOException, InterruptedException,
-      ClassNotFoundException {
+  public static void run(Path input, Path clusteringOutputPath, Path output, Double clusterClassificationThreshold,
+      boolean emitMostLikely, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+    Configuration conf = new Configuration();
     if (runSequential) {
-      classifyClusterSeq(input, clusteringOutputPath, output,
-          clusterClassificationThreshold, emitMostLikely);
+      classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
     } else {
-      Configuration conf = new Configuration();
-      classifyClusterMR(conf, input, clusteringOutputPath, output,
-          clusterClassificationThreshold, emitMostLikely);
+      classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
     }
     
   }
   
-  private static void classifyClusterSeq(Path input, Path clusters,
-      Path output, Double clusterClassificationThreshold, boolean emitMostLikely)
+  private static void classifyClusterSeq(Configuration conf, Path input,
+      Path clusters, Path output, Double clusterClassificationThreshold, boolean emitMostLikely)
       throws IOException {
-    List<Cluster> clusterModels = populateClusterModels(clusters);
+    List<Cluster> clusterModels = populateClusterModels(clusters, conf);
     ClusteringPolicy policy = ClusterClassifier
         .readPolicy(finalClustersPath(clusters));
     ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels,
@@ -164,19 +161,24 @@ public class ClusterClassificationDriver
    * 
    * @param clusterOutputPath
    *          The output path of the clustering.
+   * @param conf
+   *          The Hadoop Configuration
    * @return The list of clusters found by the clustering.
    * @throws IOException
    */
-  private static List<Cluster> populateClusterModels(Path clusterOutputPath)
+  private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf)
       throws IOException {
     List<Cluster> clusterModels = new ArrayList<Cluster>();
     Cluster cluster = null;
     Path finalClustersPath = finalClustersPath(clusterOutputPath);
     Iterator<?> it = new SequenceFileDirValueIterator<Writable>(
         finalClustersPath, PathType.LIST, PathFilters.partFilter(), null,
-        false, new Configuration());
+        false, conf);
     while (it.hasNext()) {
       cluster = (Cluster) it.next();
+      if(cluster instanceof DirichletCluster){
+    	  ((DirichletCluster) cluster).getModel().configure(conf);
+      }
       clusterModels.add(cluster);
     }
     return clusterModels;
@@ -306,5 +308,16 @@ public class ClusterClassificationDriver
           "Cluster Classification Driver Job failed processing " + input);
     }
   }
+
+  public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
+      double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential) throws IOException,
+      InterruptedException, ClassNotFoundException {
+    if (runSequential) {
+      classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    } else {
+      classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    }
+    
+  }
   
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java Fri Mar 16 17:10:29 2012
@@ -35,6 +35,7 @@ import org.apache.hadoop.io.LongWritable
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.dirichlet.DirichletCluster;
 import org.apache.mahout.clustering.iterator.ClusteringPolicy;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
@@ -47,7 +48,7 @@ import org.apache.mahout.math.VectorWrit
  * Mapper for classifying vectors into clusters.
  */
 public class ClusterClassificationMapper extends
-    Mapper<LongWritable,VectorWritable,IntWritable,WeightedVectorWritable> {
+    Mapper<IntWritable,VectorWritable,IntWritable,WeightedVectorWritable> {
   
   private static double threshold;
   private List<Cluster> clusterModels;
@@ -70,7 +71,7 @@ public class ClusterClassificationMapper
     
     if (clustersIn != null && !clustersIn.isEmpty()) {
       Path clustersInPath = new Path(clustersIn);
-      clusterModels = populateClusterModels(clustersInPath);
+      clusterModels = populateClusterModels(clustersInPath, conf);
       ClusteringPolicy policy = ClusterClassifier
           .readPolicy(finalClustersPath(clustersInPath));
       clusterClassifier = new ClusterClassifier(clusterModels, policy);
@@ -83,7 +84,7 @@ public class ClusterClassificationMapper
    * Mapper which classifies the vectors to respective clusters.
    */
   @Override
-  protected void map(LongWritable key, VectorWritable vw, Context context)
+  protected void map(IntWritable key, VectorWritable vw, Context context)
       throws IOException, InterruptedException {
     if (!clusterModels.isEmpty()) {
       Vector pdfPerCluster = clusterClassifier.classify(vw.get());
@@ -118,10 +119,9 @@ public class ClusterClassificationMapper
     context.write(clusterId, weightedVW);
   }
   
-  public static List<Cluster> populateClusterModels(Path clusterOutputPath)
+  public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf)
       throws IOException {
     List<Cluster> clusters = new ArrayList<Cluster>();
-    Configuration conf = new Configuration();
     Cluster cluster = null;
     FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
     FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath,
@@ -131,6 +131,9 @@ public class ClusterClassificationMapper
         null, false, conf);
     while (it.hasNext()) {
       cluster = (Cluster) it.next();
+      if(cluster instanceof DirichletCluster){
+        ((DirichletCluster) cluster).getModel().configure(conf);
+      }
       clusters.add(cluster);
     }
     return clusters;

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterer.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterer.java Fri Mar 16 17:10:29 2012
@@ -17,22 +17,17 @@
 
 package org.apache.mahout.clustering.dirichlet;
 
-import java.io.IOException;
-import java.util.Collection;
 import java.util.List;
 
-import com.google.common.collect.Lists;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile.Writer;
-import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.Model;
 import org.apache.mahout.clustering.ModelDistribution;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
+import com.google.common.collect.Lists;
+
 /**
  * Performs Bayesian mixture modeling.
  * <p/>
@@ -105,10 +100,6 @@ public class DirichletClusterer {
 
   private final List<Cluster[]> clusterSamples = Lists.newArrayList();
 
-  private boolean emitMostLikely;
-
-  private double threshold;
-
   /**
    * Create a new instance on the sample data with the given additional parameters
    * 
@@ -171,18 +162,14 @@ public class DirichletClusterer {
 
   /**
    * This constructor only used by DirichletClusterMapper for setting up clustering params
-   * @param emitMostLikely
-   * @param threshold
    */
-  public DirichletClusterer(boolean emitMostLikely, double threshold) {
+  public DirichletClusterer() {
     this.sampleData = null;
     this.modelFactory = null;
     this.thin = 0;
     this.burnin = 0;
     this.numClusters = 0;
     this.state = null;
-    this.emitMostLikely = emitMostLikely;
-    this.threshold = threshold;
   }
 
   /**
@@ -275,139 +262,4 @@ public class DirichletClusterer {
     return cluster;
   }
 
-  /**
-   * Emit the point to one or more clusters depending upon clusterer state
-   * 
-   * @param vector a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param context a Mapper.Context to emit to
-   */
-  public void emitPointToClusters(VectorWritable vector,
-                                  List<DirichletCluster> clusters,
-                                  Mapper<?,?,IntWritable,WeightedVectorWritable>.Context context)
-    throws IOException, InterruptedException {
-    Vector pi = new DenseVector(clusters.size());
-    for (int i = 0; i < clusters.size(); i++) {
-      pi.set(i, clusters.get(i).getModel().pdf(vector));
-    }
-    pi = pi.divide(pi.zSum());
-    if (emitMostLikely) {
-      emitMostLikelyCluster(vector, clusters, pi, context);
-    } else {
-      emitAllClusters(vector, clusters, pi, context);
-    }
-  }
-
-  /**
-   * Emit the point to the most likely cluster
-   * 
-   * @param point a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param pi the normalized pdf Vector for the point
-   * @param context a Mapper.Context to emit to
-   */
-  private void emitMostLikelyCluster(VectorWritable point,
-                                     Collection<DirichletCluster> clusters,
-                                     Vector pi,
-                                     Mapper<?,?,IntWritable,WeightedVectorWritable>.Context context)
-    throws IOException, InterruptedException {
-    int clusterId = -1;
-    double clusterPdf = 0;
-    for (int i = 0; i < clusters.size(); i++) {
-      double pdf = pi.get(i);
-      if (pdf > clusterPdf) {
-        clusterId = i;
-        clusterPdf = pdf;
-      }
-    }
-    //System.out.println(clusterId + ": " + ClusterBase.formatVector(vector.get(), null));
-    context.write(new IntWritable(clusterId), new WeightedVectorWritable(clusterPdf, point.get()));
-  }
-
-  /**
-   * Emit the point to all clusters if pdf exceeds the threshold
-   * @param point a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param pi the normalized pdf Vector for the point
-   * @param context a Mapper.Context to emit to
-   */
-  private void emitAllClusters(VectorWritable point,
-                               List<DirichletCluster> clusters,
-                               Vector pi,
-                               Mapper<?,?,IntWritable,WeightedVectorWritable>.Context context)
-    throws IOException, InterruptedException {
-    for (int i = 0; i < clusters.size(); i++) {
-      double pdf = pi.get(i);
-      if (pdf > threshold && clusters.get(i).getTotalObservations() > 0) {
-        //System.out.println(i + ": " + ClusterBase.formatVector(vector.get(), null));
-        context.write(new IntWritable(i), new WeightedVectorWritable(pdf, point.get()));
-      }
-    }
-  }
-
-  /**
-   * Emit the point to one or more clusters depending upon clusterer state
-   * 
-   * @param vector a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param writer a SequenceFile.Writer to emit to
-   */
-  public void emitPointToClusters(VectorWritable vector, List<DirichletCluster> clusters, Writer writer)
-    throws IOException {
-    Vector pi = new DenseVector(clusters.size());
-    for (int i = 0; i < clusters.size(); i++) {
-      double pdf = clusters.get(i).getModel().pdf(vector);
-      pi.set(i, pdf);
-    }
-    pi = pi.divide(pi.zSum());
-    if (emitMostLikely) {
-      emitMostLikelyCluster(vector, clusters, pi, writer);
-    } else {
-      emitAllClusters(vector, clusters, pi, writer);
-    }
-  }
-
-  /**
-   * Emit the point to all clusters if pdf exceeds the threshold
-   * 
-   * @param vector a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param pi the normalized pdf Vector for the point
-   * @param writer a SequenceFile.Writer to emit to
-   */
-  private void emitAllClusters(VectorWritable vector, List<DirichletCluster> clusters, Vector pi, Writer writer)
-    throws IOException {
-    for (int i = 0; i < clusters.size(); i++) {
-      double pdf = pi.get(i);
-      if (pdf > threshold && clusters.get(i).getTotalObservations() > 0) {
-        //System.out.println(i + ": " + ClusterBase.formatVector(vector.get(), null));
-        writer.append(new IntWritable(i), new WeightedVectorWritable(pdf, vector.get()));
-      }
-    }
-  }
-
-  /**
-   * Emit the point to the most likely cluster
-   * 
-   * @param vector a VectorWritable holding the Vector
-   * @param clusters a List of DirichletClusters
-   * @param pi the normalized pdf Vector for the point
-   * @param writer a SequenceFile.Writer to emit to
-   */
-  private static void emitMostLikelyCluster(VectorWritable vector,
-                                            Collection<DirichletCluster> clusters,
-                                            Vector pi,
-                                            Writer writer) throws IOException {
-    double maxPdf = 0;
-    int clusterId = -1;
-    for (int i = 0; i < clusters.size(); i++) {
-      double pdf = pi.get(i);
-      if (pdf > maxPdf) {
-        maxPdf = pdf;
-        clusterId = i;
-      }
-    }
-    //System.out.println(i + ": " + ClusterBase.formatVector(vector.get(), null));
-    writer.append(new IntWritable(clusterId), new WeightedVectorWritable(maxPdf, vector.get()));
-  }
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Fri Mar 16 17:10:29 2012
@@ -17,15 +17,15 @@
 
 package org.apache.mahout.clustering.dirichlet;
 
+import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
+
 import java.io.IOException;
 import java.util.List;
 
-import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
@@ -35,9 +35,11 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
 import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
 import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
+import org.apache.mahout.clustering.iterator.DirichletClusteringPolicy;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -50,6 +52,8 @@ import org.apache.mahout.math.VectorWrit
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.io.Closeables;
+
 public class DirichletDriver extends AbstractJob {
 
   public static final String STATE_IN_KEY = "org.apache.mahout.clustering.dirichlet.stateIn";
@@ -170,9 +174,11 @@ public class DirichletDriver extends Abs
       clusterData(conf,
                   input,
                   clustersOut,
-                  new Path(output, Cluster.CLUSTERED_POINTS_DIR),
-                  emitMostLikely,
-                  threshold,
+                  output,
+                  alpha0,
+                  numModels,
+                  emitMostLikely, 
+                  threshold, 
                   runSequential);
     }
   }
@@ -442,92 +448,32 @@ public class DirichletDriver extends Abs
   /**
    * Run the job using supplied arguments
    * @param conf 
-   * 
-   * @param input
+ * @param input
    *          the directory pathname for input points
-   * @param stateIn
+ * @param stateIn
    *          the directory pathname for input state
-   * @param output
+ * @param output
    *          the directory pathname for output points
-   * @param emitMostLikely
+ * @param alpha0 TODO
+ * @param numModels TODO
+ * @param emitMostLikely
    *          a boolean if true emit only most likely cluster for each point
-   * @param threshold 
+ * @param threshold 
    *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
-   * @param runSequential execute sequentially if true
+ * @param runSequential execute sequentially if true
    */
   public static void clusterData(Configuration conf,
                                  Path input,
                                  Path stateIn,
                                  Path output,
-                                 boolean emitMostLikely,
-                                 double threshold,
+                                 double alpha0,
+                                 int numModels,
+                                 boolean emitMostLikely, 
+                                 double threshold, 
                                  boolean runSequential)
     throws IOException, InterruptedException, ClassNotFoundException {
-    if (runSequential) {
-      clusterDataSeq(conf, input, stateIn, output, emitMostLikely, threshold);
-    } else {
-      clusterDataMR(conf, input, stateIn, output, emitMostLikely, threshold);
+	  ClusterClassifier.writePolicy(new DirichletClusteringPolicy(numModels, alpha0), stateIn);
+	  ClusterClassificationDriver.run(conf, input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), threshold, emitMostLikely, runSequential);
     }
-  }
-
-  private static void clusterDataSeq(Configuration conf,
-                                     Path input,
-                                     Path stateIn,
-                                     Path output,
-                                     boolean emitMostLikely,
-                                     double threshold) throws IOException {
-
-    List<DirichletCluster> clusters = DirichletClusterMapper.loadClusters(conf, stateIn);
-
-    for (DirichletCluster cluster : clusters) {
-      cluster.getModel().configure(conf);
-    }
-    
-    DirichletClusterer clusterer = new DirichletClusterer(emitMostLikely, threshold);
-    // iterate over all points, assigning each to the closest canopy and outputing that clustering
-    FileSystem fs = FileSystem.get(input.toUri(), conf);
-    FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
-    int part = 0;
-    for (FileStatus s : status) {
-      SequenceFile.Writer writer = new SequenceFile.Writer(fs,
-                                                           conf,
-                                                           new Path(output, "part-m-" + part++),
-                                                           IntWritable.class,
-                                                           WeightedVectorWritable.class);
-      try {
-        for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
-          clusterer.emitPointToClusters(value, clusters, writer);
-        }
-      } finally {
-        Closeables.closeQuietly(writer);
-      }
-    }
-
-  }
-
-  private static void clusterDataMR(Configuration conf,
-                                    Path input,
-                                    Path stateIn,
-                                    Path output,
-                                    boolean emitMostLikely,
-                                    double threshold) throws IOException, InterruptedException, ClassNotFoundException {
-    conf.set(STATE_IN_KEY, stateIn.toString());
-    conf.set(EMIT_MOST_LIKELY_KEY, Boolean.toString(emitMostLikely));
-    conf.set(THRESHOLD_KEY, Double.toString(threshold));
-    Job job = new Job(conf, "Dirichlet Driver running clusterData over input: " + input);
-    job.setOutputKeyClass(IntWritable.class);
-    job.setOutputValueClass(WeightedVectorWritable.class);
-    job.setMapperClass(DirichletClusterMapper.class);
-    job.setInputFormatClass(SequenceFileInputFormat.class);
-    job.setOutputFormatClass(SequenceFileOutputFormat.class);
-    job.setNumReduceTasks(0);
-    job.setJarByClass(DirichletDriver.class);
-
-    FileInputFormat.addInputPath(job, input);
-    FileOutputFormat.setOutputPath(job, output);
-
-    if (!job.waitForCompletion(true)) {
-      throw new InterruptedException("Dirichlet Clustering failed processing " + stateIn);
-    }
-  }
+  
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java Fri Mar 16 17:10:29 2012
@@ -33,6 +33,13 @@ public class DirichletClusteringPolicy e
     super();
   }
   
+  /**
+   * 
+   * @param k
+   *          The number of models to create from prior
+   * @param alpha0
+   *          The alpha_0 parameter to the Dirichlet Distribution.
+   */
   public DirichletClusteringPolicy(int k, double alpha0) {
     this.alpha0 = alpha0;
     this.mixture = UncommonDistributions.rDirichlet(new DenseVector(k), alpha0);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Fri Mar 16 17:10:29 2012
@@ -17,24 +17,18 @@
 package org.apache.mahout.clustering.kmeans;
 
 import java.io.IOException;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
-import com.google.common.collect.Lists;
-import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.SequenceFile.Writer;
 import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.math.Vector;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+
 /**
  * This class implements the k-means clustering algorithm. It uses {@link Kluster} as a cluster
  * representation. The class can be used as part of a clustering job to be started as map/reduce job.
@@ -118,52 +112,6 @@ public class KMeansClusterer {
     return converged;
   }
 
-  public void outputPointWithClusterInfo(Vector vector,
-                                         Iterable<Kluster> clusters,
-                                         Mapper<?,?,IntWritable,WeightedPropertyVectorWritable>.Context context)
-    throws IOException, InterruptedException {
-    AbstractCluster nearestCluster = null;
-    double nearestDistance = Double.MAX_VALUE;
-    for (AbstractCluster cluster : clusters) {
-      Vector clusterCenter = cluster.getCenter();
-      double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, vector);
-      if (distance < nearestDistance || nearestCluster == null) {
-        nearestCluster = cluster;
-        nearestDistance = distance;
-      }
-    }
-    Map<Text, Text> props = new HashMap<Text, Text>();
-    props.put(new Text("distance"), new Text(String.valueOf(nearestDistance)));
-    context.write(new IntWritable(nearestCluster.getId()), new WeightedPropertyVectorWritable(1, vector, props));
-  }
-
-  /**
-   * Iterates over all clusters and identifies the one closes to the given point. Distance measure used is
-   * configured at creation time.
-   * 
-   * @param point
-   *          a point to find a cluster for.
-   * @param clusters
-   *          a List<Cluster> to test.
-   */
-  protected void emitPointToNearestCluster(Vector point, Iterable<Kluster> clusters, Writer writer)
-    throws IOException {
-    AbstractCluster nearestCluster = null;
-    double nearestDistance = Double.MAX_VALUE;
-    for (AbstractCluster cluster : clusters) {
-      Vector clusterCenter = cluster.getCenter();
-      double distance = this.measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
-      if (log.isDebugEnabled()) {
-        log.debug("{} Cluster: {}", distance, cluster.getId());
-      }
-      if (distance < nearestDistance || nearestCluster == null) {
-        nearestCluster = cluster;
-        nearestDistance = distance;
-      }
-    }
-    writer.append(new IntWritable(nearestCluster.getId()), new WeightedVectorWritable(1, point));
-  }
-
   /**
    * This is the reference k-means implementation. Given its inputs it iterates over the points and clusters
    * until their centers converge or until the maximum number of iterations is exceeded.

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri Mar 16 17:10:29 2012
@@ -16,16 +16,15 @@
  */
 package org.apache.mahout.clustering.kmeans;
 
+import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
+
 import java.io.IOException;
 import java.util.Collection;
 
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
@@ -36,8 +35,9 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
-import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.ClassUtils;
 import org.apache.mahout.common.HadoopUtil;
@@ -47,12 +47,14 @@ import org.apache.mahout.common.distance
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
 import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+
 public class KMeansDriver extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
@@ -157,7 +159,7 @@ public class KMeansDriver extends Abstra
       clusterData(conf,
           input,
           clustersOut,
-          new Path(output, AbstractCluster.CLUSTERED_POINTS_DIR),
+          output,
           measure,
           delta,
           runSequential);
@@ -428,73 +430,10 @@ public class KMeansDriver extends Abstra
       log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure});
       log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
     }
-    if (runSequential) {
-      clusterDataSeq(conf, input, clustersIn, output, measure);
-    } else {
-      clusterDataMR(conf, input, clustersIn, output, measure, convergenceDelta);
-    }
-  }
-
-  private static void clusterDataSeq(Configuration conf,
-                                     Path input,
-                                     Path clustersIn,
-                                     Path output,
-                                     DistanceMeasure measure) throws IOException {
-
-    KMeansClusterer clusterer = new KMeansClusterer(measure);
-    Collection<Kluster> clusters = Lists.newArrayList();
-    KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
-    if (clusters.isEmpty()) {
-      throw new IllegalStateException("Clusters is empty!");
-    }
-    FileSystem fs = FileSystem.get(input.toUri(), conf);
-    FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
-    int part = 0;
-    for (FileStatus s : status) {
-      SequenceFile.Writer writer = new SequenceFile.Writer(fs,
-                                                           conf,
-                                                           new Path(output, "part-m-" + part),
-                                                           IntWritable.class,
-                                                           WeightedVectorWritable.class);
-      try {
-        for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
-          clusterer.emitPointToNearestCluster(value.get(), clusters, writer);
-        }
-      } finally {
-        Closeables.closeQuietly(writer);
-      }
-    }
-
-  }
-
-  private static void clusterDataMR(Configuration conf,
-                                    Path input,
-                                    Path clustersIn,
-                                    Path output,
-                                    DistanceMeasure measure,
-                                    String convergenceDelta)
-    throws IOException, InterruptedException, ClassNotFoundException {
-
-    conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn.toString());
-    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
-    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-
-    Job job = new Job(conf, "KMeans Driver running clusterData over input: " + input);
-    job.setInputFormatClass(SequenceFileInputFormat.class);
-    job.setOutputFormatClass(SequenceFileOutputFormat.class);
-    job.setOutputKeyClass(IntWritable.class);
-    job.setOutputValueClass(WeightedPropertyVectorWritable.class);
-
-    FileInputFormat.setInputPaths(job, input);
-    HadoopUtil.delete(conf, output);
-    FileOutputFormat.setOutputPath(job, output);
-
-    job.setMapperClass(KMeansClusterMapper.class);
-    job.setNumReduceTasks(0);
-    job.setJarByClass(KMeansDriver.class);
-
-    if (!job.waitForCompletion(true)) {
-      throw new InterruptedException("K-Means Clustering failed processing " + clustersIn);
-    }
+    Double clusterClassificationThreshold = 0.0;
+    ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
+    ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
+        clusterClassificationThreshold, true, runSequential);
   }
+ 
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Fri Mar 16 17:10:29 2012
@@ -517,9 +517,9 @@ public final class TestCanopyCreation ex
   public void testClusteringManhattanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job
     Path output = getTestTempDirPath("output");
@@ -538,9 +538,9 @@ public final class TestCanopyCreation ex
   public void testClusteringEuclideanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job using the run() command. Others can use runJob().
     Path output = getTestTempDirPath("output");
@@ -567,9 +567,9 @@ public final class TestCanopyCreation ex
   public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job using the run() command. Others can use runJob().
     Path output = getTestTempDirPath("output");

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java Fri Mar 16 17:10:29 2012
@@ -104,7 +104,7 @@ public class ClusterClassificationDriver
     
     conf = new Configuration();
     
-    ClusteringTestUtils.writePointsToFile(points,
+    ClusteringTestUtils.writePointsToFile(points, true, 
         new Path(pointsPath, "file1"), fs, conf);
     runClustering(pointsPath, conf, false);
     runClassificationWithOutlierRemoval(conf, false);

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestDirichletClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestDirichletClustering.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestDirichletClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestDirichletClustering.java Fri Mar 16 17:10:29 2012
@@ -19,20 +19,29 @@ package org.apache.mahout.clustering.dir
 
 import java.util.List;
 
-import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.dirichlet.models.DistanceMeasureClusterDistribution;
+import org.apache.mahout.clustering.dirichlet.models.DistributionDescription;
 import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
 import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.VectorWritable;
 import org.junit.Before;
 import org.junit.Test;
 
+import com.google.common.collect.Lists;
+
 public final class TestDirichletClustering extends MahoutTestCase {
 
   private List<VectorWritable> sampleData;
-
+  
   @Override
   @Before
   public void setUp() throws Exception {
@@ -138,5 +147,60 @@ public final class TestDirichletClusteri
     printResults(result, 2);
     assertNotNull(result);
   }
+  
+  @Test
+  public void testDirichletClusteringSeq() throws Exception {
+    Path output = getTestTempDirPath("output");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(new Configuration());
+    
+    generateSamples(40, 1, 1, 3);
+    generateSamples(30, 1, 0, 0.1);
+    generateSamples(30, 0, 1, 0.1);
+
+    ClusteringTestUtils.writePointsToFile(sampleData,
+            getTestTempFilePath("testdata/file1"), fs, conf);
+
+    DenseVector prototype = (DenseVector) sampleData.get(0).get();
+    
+    DistributionDescription description = new DistributionDescription(
+        DistanceMeasureClusterDistribution.class.getName(),
+        RandomAccessSparseVector.class.getName(),
+        ManhattanDistanceMeasure.class.getName(), prototype.size());
+    
+    DirichletDriver.run(conf, getTestTempDirPath("testdata"), output,
+        description, 10, 1, 1.0, true, true, 0, true);
+    
+    Path path = new Path(output, "clusteredPoints/part-m-0");
+    long count = HadoopUtil.countRecords(path, conf);
+    assertEquals("number of points", sampleData.size(), count);
+  }
+  
+  @Test
+  public void testDirichletClusteringMR() throws Exception {
+    Path output = getTestTempDirPath("output");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(new Configuration());
+    
+    generateSamples(40, 1, 1, 3);
+    generateSamples(30, 1, 0, 0.1);
 
+    ClusteringTestUtils.writePointsToFile(sampleData, true,
+            getTestTempFilePath("testdata/file1"), fs, conf);
+
+    DenseVector prototype = (DenseVector) sampleData.get(0).get();
+    
+    DistributionDescription description = new DistributionDescription(
+        DistanceMeasureClusterDistribution.class.getName(),
+        RandomAccessSparseVector.class.getName(),
+        ManhattanDistanceMeasure.class.getName(), prototype.size());
+    
+    DirichletDriver.run(conf, getTestTempDirPath("testdata"), output,
+        description, 10, 1, 1.0, true, true, 0, false);
+    
+    Path path = new Path(output, "clusteredPoints/part-m-00000");
+    long count = HadoopUtil.countRecords(path, conf);
+    assertEquals("number of points", sampleData.size(), count);
+  }
+  
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Fri Mar 16 17:10:29 2012
@@ -282,7 +282,7 @@ public final class TestMapReduce extends
     generateSamples(100, 2, 0, 0.2);
     generateSamples(100, 0, 2, 0.3);
     generateSamples(100, 2, 2, 1);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(sampleData, true, getTestTempFilePath("input/data.txt"), fs, conf);
     // Now run the driver using the run() method. Others can use runJob() as
     // before
     Integer maxIterations = 5;
@@ -410,7 +410,7 @@ public final class TestMapReduce extends
   public void testDriverIterationsMahalanobisMR() throws Exception {
     generateAsymmetricSamples(100, 0, 0, 0.5, 3.0);
     generateAsymmetricSamples(100, 0, 3, 0.3, 4.0);
-    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("input/data.txt"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(sampleData,true, getTestTempFilePath("input/data.txt"), fs, conf);
     // Now run the driver using the run() method. Others can use runJob() as
     // before
     

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Fri Mar 16 17:10:29 2012
@@ -367,8 +367,8 @@ public final class TestKmeansClustering 
     Path pointsPath = getTestTempDirPath("points");
     Path clustersPath = getTestTempDirPath("clusters");
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
     for (int k = 1; k < points.size(); k++) {
       System.out.println("testKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
@@ -423,8 +423,8 @@ public final class TestKmeansClustering 
     Path pointsPath = getTestTempDirPath("points");
     Path clustersPath = getTestTempDirPath("clusters");
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
     for (int k = 1; k < points.size(); k += 3) {
       System.out.println("testKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
@@ -481,8 +481,8 @@ public final class TestKmeansClustering 
     
     Path pointsPath = getTestTempDirPath("points");
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points, true, new Path(pointsPath, "file2"), fs, conf);
     
     Path outputPath = getTestTempDirPath("output");
     // now run the Canopy job

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java Fri Mar 16 17:10:29 2012
@@ -342,12 +342,13 @@ public final class TestClusterEvaluator 
     Configuration conf = new Configuration();
     CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true);
     // now run the KMeans job
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), output, measure,
+    Path kmeansOutput = new Path(output, "kmeans");
+	KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
         0.001, 10, true, true);
     int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
-        "clusteredPoints"), output, measure, numIterations, true);
+        "clusteredPoints"), kmeansOutput, measure, numIterations, true);
     ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
     // now print out the Results
     System.out.println("Intra-cluster density = "

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1301654&r1=1301653&r2=1301654&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Fri Mar 16 17:10:29 2012
@@ -352,12 +352,13 @@ public final class TestCDbwEvaluator ext
     CanopyDriver.run(new Configuration(), testdata, output, measure, 3.1, 2.1,
         false, 0.0, true);
     // now run the KMeans job
-    KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), output, measure,
+    Path kmeansOutput = new Path(output, "kmeans");
+	KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
         0.001, 10, true, true);
     int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
-        "clusteredPoints"), output, measure, numIterations, true);
+        "clusteredPoints"), kmeansOutput, measure, numIterations, true);
     CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     // printRepPoints(numIterations);
     // now print out the Results



Mime
View raw message