mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jeast...@apache.org
Subject svn commit: r1001013 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/cdbw/ main/java/org/apache/mahout/clustering/evaluation/ test/java/org/apache/mahout/clustering/ test/java/org/apache/mahout/clustering/cdbw/
Date Fri, 24 Sep 2010 18:31:56 GMT
Author: jeastman
Date: Fri Sep 24 18:31:56 2010
New Revision: 1001013

URL: http://svn.apache.org/viewvc?rev=1001013&view=rev
Log:
MAHOUT-236 
- Implemented ClusterEvaluator that uses Mahout In Action code for
inter-cluster density and similar code for intra-cluster density over a set of 
representative points, not the entire clustered data set.
- Generalized CDbwDriver etc to RepresentativePointsDriver so any cluster
evaluator tool can use them
- Added cluster pruning to CDbwEvaluator and ClusterEvaluator that removes
clusters which cause numerical instabilities in the evaluation
- Added unit tests. All tests run

Added:
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
Removed:
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
Modified:
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1001013&r1=1001012&r2=1001013&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Fri Sep 24 18:31:56 2010
@@ -19,6 +19,7 @@ package org.apache.mahout.clustering.cdb
 
 import java.io.IOException;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -29,6 +30,8 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
@@ -48,6 +51,8 @@ public class CDbwEvaluator {
 
   private final DistanceMeasure measure;
 
+  private boolean pruned = false;
+
   /**
    * For testing only
    * 
@@ -65,7 +70,7 @@ public class CDbwEvaluator {
     this.clusters = clusters;
     this.measure = measure;
     for (Integer cId : representativePoints.keySet()) {
-      setStDev(cId);
+      computeStd(cId);
     }
   }
 
@@ -80,22 +85,131 @@ public class CDbwEvaluator {
   public CDbwEvaluator(Configuration conf, Path clustersIn) throws ClassNotFoundException, InstantiationException,
       IllegalAccessException, IOException {
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-    measure = ccl.loadClass(conf.get(CDbwDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class).newInstance();
-    representativePoints = CDbwMapper.getRepresentativePoints(conf);
+    measure = ccl.loadClass(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class)
+        .newInstance();
+    representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
     clusters = loadClusters(conf, clustersIn);
     for (Integer cId : representativePoints.keySet()) {
-      setStDev(cId);
+      computeStd(cId);
+    }
+  }
+
+  /**
+   * Load the clusters from their sequence files
+   * 
+   * @param clustersIn 
+   *            a Path to the directory containing input cluster files
+   * @return a Map<Integer, Cluster> of the clusters keyed by clusterId
+   */
+  private static Map<Integer, Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException,
+      IllegalAccessException, IOException {
+    Map<Integer, Cluster> clusters = new HashMap<Integer, Cluster>();
+    FileSystem fs = clustersIn.getFileSystem(conf);
+    for (FileStatus part : fs.listStatus(clustersIn)) {
+      if (!part.getPath().getName().startsWith(".")) {
+        Path inPart = part.getPath();
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, conf);
+        Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
+        Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          clusters.put(cluster.getId(), cluster);
+          value = reader.getValueClass().asSubclass(Writable.class).newInstance();
+        }
+        reader.close();
+      }
     }
+    return clusters;
+  }
+
+  private void computeStd(int cI) {
+    List<VectorWritable> repPts = representativePoints.get(cI);
+    int s0 = 0;
+    Vector s1 = null;
+    Vector s2 = null;
+    for (VectorWritable vw : repPts) {
+      s0++;
+      Vector v = vw.get();
+      s1 = s1 == null ? v.clone() : s1.plus(v);
+      s2 = s2 == null ? v.times(v) : s2.plus(v.times(v));
+    }
+    if (s0 > 1) {
+      Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+      double d = std.zSum() / std.size();
+      log.debug("stDev[" + cI + "]=" + d);
+      stDevs.put(cI, d);
+    }
+  }
+
+  /**
+   * Return if the cluster is valid. Valid clusters must have at least 2 representative points,
+   * and at least one of them must be different from the cluster center. This is because the
+   * representative points extraction will duplicate the cluster center if it is empty.
+   * 
+   * @param clusterI a Cluster
+   * @return a boolean
+   */
+  private boolean invalidCluster(Cluster clusterI) {
+    List<VectorWritable> repPts = representativePoints.get(clusterI.getId());
+    if (repPts.size() < 2) {
+      return true;
+    }
+    for (VectorWritable vw : repPts) {
+      Vector vector = vw.get();
+      if (!vector.equals(clusterI.getCenter())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private void pruneInvalidClusters() {
+    if (pruned) {
+      return;
+    }
+    for (Iterator<Cluster> it = clusters.values().iterator(); it.hasNext();) {
+      Cluster cluster = it.next();
+      if (invalidCluster(cluster)) {
+        log.info("Pruning cluster Id=" + cluster.getId());
+        it.remove();
+        representativePoints.remove(cluster.getId());
+      }
+    }
+    pruned = true;
+  }
+
+  double interDensity(Vector uIJ, int cI, int cJ) {
+    List<VectorWritable> repI = representativePoints.get(cI);
+    List<VectorWritable> repJ = representativePoints.get(cJ);
+    double density = 0.0;
+    double std = (stDevs.get(cI) + stDevs.get(cJ)) / 2.0;
+    for (VectorWritable vwI : repI) {
+      if (measure.distance(uIJ, vwI.get()) <= std) {
+        density++;
+      }
+    }
+    for (VectorWritable vwJ : repJ) {
+      if (measure.distance(uIJ, vwJ.get()) <= std) {
+        density++;
+      }
+    }
+    return density / (repI.size() + repJ.size());
+  }
+
+  double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
+    return measure.distance(clusterCenter, repPoint) <= avgStd ? 1.0 : 0.0;
   }
 
   public double getCDbw() {
+    pruneInvalidClusters();
     return intraClusterDensity() * separation();
   }
 
   public double intraClusterDensity() {
+    pruneInvalidClusters();
     double avgStd = 0.0;
     for (Integer cId : representativePoints.keySet()) {
-      avgStd += getStdev(cId);
+      avgStd += stDevs.get(cId);
     }
     avgStd /= representativePoints.size();
 
@@ -106,7 +220,7 @@ public class CDbwEvaluator {
       double cSum = 0.0;
       for (VectorWritable aRepI : repI) {
         double inDensity = intraDensity(clusters.get(cId).getCenter(), aRepI.get(), avgStd);
-        double std = getStdev(cId);
+        double std = stDevs.get(cId);
         if (std > 0.0) {
           cSum += inDensity / std;
         }
@@ -119,11 +233,11 @@ public class CDbwEvaluator {
   }
 
   public double interClusterDensity() {
+    pruneInvalidClusters();
     double sum = 0.0;
     for (Map.Entry<Integer, List<VectorWritable>> entry1 : representativePoints.entrySet()) {
       Integer cI = entry1.getKey();
       List<VectorWritable> repI = entry1.getValue();
-      double stDevI = getStdev(cI);
       for (Map.Entry<Integer, List<VectorWritable>> entry2 : representativePoints.entrySet()) {
         Integer cJ = entry2.getKey();
         if (cI.equals(cJ)) {
@@ -143,8 +257,9 @@ public class CDbwEvaluator {
             }
           }
         }
-        double stDevJ = getStdev(cJ);
-        double interDensity = uIJ == null ? 0 : interDensity(uIJ, cI, cJ);
+        double stDevI = stDevs.get(cI);
+        double stDevJ = stDevs.get(cJ);
+        double interDensity = interDensity(uIJ, cI, cJ);
         double stdSum = stDevI + stDevJ;
         double density = 0.0;
         if (stdSum > 0.0) {
@@ -164,21 +279,8 @@ public class CDbwEvaluator {
     return sum;
   }
 
-  /**
-   * Handle missing stDevs when clusters are empty by returning 0
-   * @param cI
-   * @return
-   */
-  private Double getStdev(Integer cI) {
-    Double result = stDevs.get(cI);
-    if (result == null) {
-      return new Double(0);
-    } else {
-      return result;
-    }
-  }
-
   public double separation() {
+    pruneInvalidClusters();
     double minDistance = Double.MAX_VALUE;
     for (Map.Entry<Integer, List<VectorWritable>> entry1 : representativePoints.entrySet()) {
       Integer cI = entry1.getKey();
@@ -200,91 +302,4 @@ public class CDbwEvaluator {
     }
     return minDistance / (1.0 + interClusterDensity());
   }
-
-  /**
-   * Load the clusters from their sequence files
-   * 
-   * @param clustersIn 
-   *            a String pathname to the directory containing input cluster files
-   * @return a List<Cluster> of the clusters
-   */
-  private static Map<Integer, Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException,
-      IllegalAccessException, IOException {
-    Map<Integer, Cluster> clusters = new HashMap<Integer, Cluster>();
-    FileSystem fs = clustersIn.getFileSystem(conf);
-    for (FileStatus part : fs.listStatus(clustersIn)) {
-      if (!part.getPath().getName().startsWith(".")) {
-        Path inPart = part.getPath();
-        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, conf);
-        Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
-        Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
-        while (reader.next(key, value)) {
-          Cluster cluster = (Cluster) value;
-          clusters.put(cluster.getId(), cluster);
-          value = reader.getValueClass().asSubclass(Writable.class).newInstance();
-        }
-        reader.close();
-      }
-    }
-    return clusters;
-  }
-
-  double interDensity(Vector uIJ, int cI, int cJ) {
-    List<VectorWritable> repI = representativePoints.get(cI);
-    List<VectorWritable> repJ = representativePoints.get(cJ);
-    double density = 0.0;
-    double std = (getStdev(cI) + getStdev(cJ)) / 2.0;
-    for (VectorWritable vwI : repI) {
-      if (measure.distance(uIJ, vwI.get()) <= std) {
-        density++;
-      }
-    }
-    for (VectorWritable vwJ : repJ) {
-      if (measure.distance(uIJ, vwJ.get()) <= std) {
-        density++;
-      }
-    }
-    return density / (repI.size() + repJ.size());
-  }
-
-  private void setStDev(int cI) {
-    List<VectorWritable> repPts = representativePoints.get(cI);
-    //if (repPts == null) {
-    //  System.out.println();
-    //}
-    int s0 = 0;
-    Vector s1 = null;
-    Vector s2 = null;
-    for (VectorWritable vw : repPts) {
-      s0++;
-      Vector v = vw.get();
-      s1 = s1 == null ? v.clone() : s1.plus(v);
-      s2 = s2 == null ? v.times(v) : s2.plus(v.times(v));
-    }
-    if (s0 > 1) {
-      Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
-      double d = std.zSum() / std.size();
-      //System.out.println("stDev[" + cI + "]=" + d);
-      stDevs.put(cI, d);
-    }
-  }
-
-  /*
-  double minRpDistance(Iterable<VectorWritable> repI, Iterable<VectorWritable> repJ) {
-    double minDistance = Double.MAX_VALUE;
-    for (VectorWritable aRepI : repI) {
-      for (VectorWritable aRepJ : repJ) {
-        double distance = measure.distance(aRepI.get(), aRepJ.get());
-        if (distance < minDistance) {
-          minDistance = distance;
-        }
-      }
-    }
-    return minDistance;
-  }
-   */
-
-  double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
-    return measure.distance(clusterCenter, repPoint) <= avgStd ? 1.0 : 0.0;
-  }
 }

Added: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java?rev=1001013&view=auto
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java (added)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java Fri Sep 24 18:31:56 2010
@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ClusterEvaluator {
+
+  private static final Logger log = LoggerFactory.getLogger(ClusterEvaluator.class);
+
+  private final Map<Integer, List<VectorWritable>> representativePoints;
+
+  private final List<Cluster> clusters;
+
+  private final DistanceMeasure measure;
+
+  private boolean pruned = false;
+
+  /**
+   * For testing only
+   * 
+   * @param representativePoints
+   *            a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
+   * @param clusters
+   *            a List<Cluster> of the clusters
+   * @param measure
+   *            an appropriate DistanceMeasure
+   */
+  public ClusterEvaluator(Map<Integer, List<VectorWritable>> representativePoints, List<Cluster> clusters, DistanceMeasure measure) {
+    this.representativePoints = representativePoints;
+    this.clusters = clusters;
+    this.measure = measure;
+  }
+
+  /**
+   * Initialize a new instance from job information
+   * 
+   * @param conf
+   *            a Configuration with appropriate parameters
+   * @param clustersIn
+   *            a String path to the input clusters directory
+   */
+  public ClusterEvaluator(Configuration conf, Path clustersIn) throws ClassNotFoundException, InstantiationException,
+      IllegalAccessException, IOException {
+    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+    measure = ccl.loadClass(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class)
+        .newInstance();
+    representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
+    clusters = loadClusters(conf, clustersIn);
+  }
+
+  /**
+   * Load the clusters from their sequence files
+   * 
+   * @param clustersIn 
+   *            a Path to the directory containing input cluster files
+   * @return a List<Cluster> of the clusters
+   */
+  private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException,
+      IllegalAccessException, IOException {
+    List<Cluster> clusters = new ArrayList<Cluster>();
+    FileSystem fs = clustersIn.getFileSystem(conf);
+    for (FileStatus part : fs.listStatus(clustersIn)) {
+      if (!part.getPath().getName().startsWith(".")) {
+        Path inPart = part.getPath();
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, conf);
+        Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
+        Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          clusters.add(cluster);
+          value = reader.getValueClass().asSubclass(Writable.class).newInstance();
+        }
+        reader.close();
+      }
+    }
+    return clusters;
+  }
+
+  /**
+   * Return if the cluster is valid. Valid clusters must have at least 2 representative points,
+   * and at least one of them must be different from the cluster center. This is because the
+   * representative points extraction will duplicate the cluster center if it is empty.
+   * 
+   * @param clusterI a Cluster
+   * @return a boolean
+   */
+  private boolean invalidCluster(Cluster clusterI) {
+    List<VectorWritable> repPts = representativePoints.get(clusterI.getId());
+    if (repPts.size() < 2) {
+      return true;
+    }
+    for (VectorWritable vw : repPts) {
+      Vector vector = vw.get();
+      if (!vector.equals(clusterI.getCenter())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private void pruneInvalidClusters() {
+    if (pruned) {
+      return;
+    }
+    for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
+      Cluster cluster = it.next();
+      if (invalidCluster(cluster)) {
+        log.info("Pruning cluster Id=" + cluster.getId());
+        it.remove();
+        representativePoints.remove(cluster.getId());
+      }
+    }
+    pruned = true;
+  }
+
+  /**
+   * Computes the inter-cluster density as defined in "Mahout In Action"
+   * 
+   * @return the interClusterDensity
+   */
+  public double interClusterDensity() {
+    pruneInvalidClusters();
+    double max = 0;
+    double min = Double.MAX_VALUE;
+    double sum = 0;
+    int count = 0;
+    for (int i = 0; i < clusters.size(); i++) {
+      Cluster clusterI = clusters.get(i);
+      for (int j = i + 1; j < clusters.size(); j++) {
+        Cluster clusterJ = clusters.get(j);
+        double d = measure.distance(clusterI.getCenter(), clusterJ.getCenter());
+        min = Math.min(d, min);
+        max = Math.max(d, max);
+        sum += d;
+        count++;
+      }
+    }
+    double density = (sum / count - min) / (max - min);
+    log.info("Inter-Cluster Density = " + density);
+    return density;
+  }
+
+  /**
+   * Computes the intra-cluster density as the average distance of the representative points
+   * from each other
+   * 
+   * @return the intraClusterDensity of the representativePoints
+   */
+  public double intraClusterDensity() {
+    pruneInvalidClusters();
+    double avgDensity = 0;
+    for (Cluster cluster : clusters) {
+      int count = 0;
+      double max = 0;
+      double min = Double.MAX_VALUE;
+      double sum = 0;
+      List<VectorWritable> repPoints = representativePoints.get(cluster.getId());
+      for (int i = 0; i < repPoints.size(); i++) {
+        for (int j = i + 1; j < repPoints.size(); j++) {
+          double d = measure.distance(repPoints.get(i).get(), repPoints.get(j).get());
+          min = Math.min(d, min);
+          max = Math.max(d, max);
+          sum += d;
+          count++;
+        }
+      }
+      double density = (sum / count - min) / (max - min);
+      avgDensity += density;
+      log.info("Intra-Cluster Density[" + cluster.getId() + "] = " + density);
+    }
+    avgDensity = clusters.size() == 0 ? 0 : avgDensity / clusters.size();
+    log.info("Intra-Cluster Density = " + avgDensity);
+    return avgDensity;
+
+  }
+}

Added: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1001013&view=auto
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java (added)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java Fri Sep 24 18:31:56 2010
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.WeightedVectorWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class RepresentativePointsDriver extends AbstractJob {
+
+  public static final String STATE_IN_KEY = "org.apache.mahout.clustering.stateIn";
+
+  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.measure";
+
+  private static final Logger log = LoggerFactory.getLogger(RepresentativePointsDriver.class);
+
+  private RepresentativePointsDriver() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new RepresentativePointsDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
+      InterruptedException {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+    DistanceMeasure measure = ccl.loadClass(distanceMeasureClass).asSubclass(DistanceMeasure.class).newInstance();
+
+    run(getConf(), input, null, output, measure, maxIterations);
+    return 0;
+  }
+
+  public static void run(Configuration conf,
+                         Path clustersIn,
+                         Path clusteredPointsIn,
+                         Path output,
+                         DistanceMeasure measure,
+                         int numIterations) throws InstantiationException, IllegalAccessException, IOException,
+      InterruptedException, ClassNotFoundException {
+    Path stateIn = new Path(output, "representativePoints-0");
+    writeInitialState(stateIn, clustersIn);
+
+    for (int iteration = 0; iteration < numIterations; iteration++) {
+      log.info("Iteration {}", iteration);
+      // point the output to a new directory per iteration
+      Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
+      runIteration(clusteredPointsIn, stateIn, stateOut, measure);
+      // now point the input to the old output directory
+      stateIn = stateOut;
+    }
+
+    conf.set(STATE_IN_KEY, stateIn.toString());
+    conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
+  }
+
+  private static void writeInitialState(Path output, Path clustersIn) throws InstantiationException, IllegalAccessException,
+      IOException, SecurityException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    for (FileStatus part : fs.listStatus(clustersIn)) {
+      if (!part.getPath().getName().startsWith(".")) {
+        Path inPart = part.getPath();
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, conf);
+        Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
+        Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
+        Path path = new Path(output, inPart.getName());
+        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          log.debug("C-" + cluster.getId() + ": " + AbstractCluster.formatVector(cluster.getCenter(), null));
+          writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter()));
+        }
+        writer.close();
+      }
+    }
+  }
+
+  /**
+   * Run the job using supplied arguments
+   * 
+   * @param input
+   *          the directory pathname for input points
+   * @param stateIn
+   *          the directory pathname for input state
+   * @param stateOut
+   *          the directory pathname for output state
+   * @param measure
+   *          the DistanceMeasure
+   */
+  private static void runIteration(Path input, Path stateIn, Path stateOut, DistanceMeasure measure) throws IOException,
+      InterruptedException, ClassNotFoundException {
+    Configuration conf = new Configuration();
+    conf.set(STATE_IN_KEY, stateIn.toString());
+    conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
+    Job job = new Job(conf);
+    job.setJarByClass(RepresentativePointsDriver.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setMapOutputKeyClass(IntWritable.class);
+    job.setMapOutputValueClass(WeightedVectorWritable.class);
+
+    FileInputFormat.setInputPaths(job, input);
+    FileOutputFormat.setOutputPath(job, stateOut);
+
+    job.setMapperClass(RepresentativePointsMapper.class);
+    job.setReducerClass(RepresentativePointsReducer.class);
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+    job.waitForCompletion(true);
+  }
+}

Added: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java?rev=1001013&view=auto
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java (added)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsMapper.java Fri Sep 24 18:31:56 2010
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.OutputLogFilter;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.WeightedVectorWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Mapper for one iteration of representative-point selection. For each clustered point it
+ * accumulates the summed distance to the cluster's current representative points and, per
+ * cluster, remembers only the most distant point seen, which is emitted in cleanup().
+ */
+public class RepresentativePointsMapper extends Mapper<IntWritable, WeightedVectorWritable, IntWritable, WeightedVectorWritable> {
+
+  // Per-cluster representative points loaded in setup() from the STATE_IN_KEY directory.
+  private Map<Integer, List<VectorWritable>> representativePoints;
+
+  // For each cluster id, the point with the largest summed distance to that cluster's
+  // representative points; the WeightedVectorWritable weight stores that summed distance.
+  private final Map<Integer, WeightedVectorWritable> mostDistantPoints = new HashMap<Integer, WeightedVectorWritable>();
+
+  // Defaults to Euclidean; replaced in setup() by the measure named in the job conf.
+  private DistanceMeasure measure = new EuclideanDistanceMeasure();
+
+  /**
+   * Emit the single most distant point found for each cluster at the end of the map task.
+   */
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    for (Map.Entry<Integer, WeightedVectorWritable> entry : mostDistantPoints.entrySet()) {
+      context.write(new IntWritable(entry.getKey()), entry.getValue());
+    }
+    super.cleanup(context);
+  }
+
+  /**
+   * Score one clustered point: its distance summed over all representative points of its
+   * cluster. Retain it only if it is the most distant point seen so far for that cluster.
+   */
+  @Override
+  protected void map(IntWritable clusterId, WeightedVectorWritable point, Context context) throws IOException, InterruptedException {
+    int key = clusterId.get();
+    WeightedVectorWritable currentMDP = mostDistantPoints.get(key);
+
+    // NOTE(review): assumes the state contains representative points for every cluster id
+    // appearing in the input; a missing entry would NPE here — confirm upstream guarantee.
+    List<VectorWritable> refPoints = representativePoints.get(key);
+    double totalDistance = 0.0;
+    for (VectorWritable refPoint : refPoints) {
+      totalDistance += measure.distance(refPoint.get(), point.getVector());
+    }
+    if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
+      // Clone: the framework reuses the 'point' value object across map() calls.
+      mostDistantPoints.put(key, new WeightedVectorWritable(totalDistance, point.getVector().clone()));
+    }
+  }
+
+  /**
+   * Instantiate the configured DistanceMeasure by reflection and load the prior
+   * iteration's representative points from the state directory.
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      measure = ccl.loadClass(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY)).asSubclass(DistanceMeasure.class)
+          .newInstance();
+      representativePoints = getRepresentativePoints(conf);
+    } catch (NumberFormatException e) {
+      throw new IllegalStateException(e);
+    } catch (SecurityException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalArgumentException e) {
+      throw new IllegalStateException(e);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Direct-injection hook for tests: bypasses setup() and supplies state in memory.
+   */
+  public void configure(Map<Integer, List<VectorWritable>> referencePoints, DistanceMeasure measure) {
+    this.representativePoints = referencePoints;
+    this.measure = measure;
+  }
+
+  /**
+   * Read all (clusterId, point) pairs from the sequence files in the STATE_IN_KEY
+   * directory and group them by cluster id. IOExceptions are wrapped unchecked.
+   */
+  public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf) {
+    String statePath = conf.get(RepresentativePointsDriver.STATE_IN_KEY);
+    Map<Integer, List<VectorWritable>> representativePoints = new HashMap<Integer, List<VectorWritable>>();
+    try {
+      Path path = new Path(statePath);
+      FileSystem fs = FileSystem.get(path.toUri(), conf);
+      // OutputLogFilter skips Hadoop's log files/dirs when listing job output.
+      FileStatus[] status = fs.listStatus(path, new OutputLogFilter());
+      for (FileStatus s : status) {
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), conf);
+        try {
+          IntWritable key = new IntWritable(0);
+          VectorWritable point = new VectorWritable();
+          while (reader.next(key, point)) {
+            List<VectorWritable> repPoints = representativePoints.get(key.get());
+            if (repPoints == null) {
+              repPoints = new ArrayList<VectorWritable>();
+              representativePoints.put(key.get(), repPoints);
+            }
+            repPoints.add(point);
+            // Fresh instance each time: the previous one is now owned by the map.
+            point = new VectorWritable();
+          }
+        } finally {
+          reader.close();
+        }
+      }
+      return representativePoints;
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+}

Added: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java?rev=1001013&view=auto
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java (added)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsReducer.java Fri Sep 24 18:31:56 2010
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.evaluation;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.WeightedVectorWritable;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Reducer for one iteration of representative-point selection. For each cluster it picks
+ * the single most distant candidate produced by the mappers and appends it to the
+ * representative points carried over from the previous iteration (re-emitted in cleanup).
+ */
+public class RepresentativePointsReducer extends Reducer<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
+
+  // Prior iteration's representative points, loaded in setup(); re-emitted so state grows
+  // by one point per cluster per iteration.
+  private Map<Integer, List<VectorWritable>> representativePoints;
+
+  /**
+   * Re-emit all previously-selected representative points alongside the new ones.
+   */
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    for (Map.Entry<Integer, List<VectorWritable>> entry : representativePoints.entrySet()) {
+      IntWritable iw = new IntWritable(entry.getKey());
+      for (VectorWritable vw : entry.getValue()) {
+        context.write(iw, vw);
+      }
+    }
+    super.cleanup(context);
+  }
+
+  /**
+   * Write the most distant candidate point for this cluster as its new representative.
+   */
+  @Override
+  protected void reduce(IntWritable key, Iterable<WeightedVectorWritable> values, Context context) throws IOException,
+      InterruptedException {
+    // find the most distant point
+    WeightedVectorWritable mdp = null;
+    for (WeightedVectorWritable dpw : values) {
+      if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
+        // Clone the vector: Hadoop reuses the value instance across the values iteration,
+        // so retaining dpw.getVector() directly lets later elements overwrite the winner
+        // (the mapper clones for the same reason).
+        mdp = new WeightedVectorWritable(dpw.getWeight(), dpw.getVector().clone());
+      }
+    }
+    // The framework only calls reduce() with at least one value, but guard rather than NPE.
+    if (mdp != null) {
+      context.write(new IntWritable(key.get()), new VectorWritable(mdp.getVector()));
+    }
+  }
+
+  /**
+   * Load the prior iteration's representative points from the state directory.
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    try {
+      representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
+    } catch (NumberFormatException e) {
+      throw new IllegalStateException(e);
+    } catch (SecurityException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalArgumentException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Direct-injection hook for tests: bypasses setup() and supplies state in memory.
+   */
+  public void configure(Map<Integer, List<VectorWritable>> representativePoints) {
+    this.representativePoints = representativePoints;
+  }
+
+}

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1001013&r1=1001012&r2=1001013&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Fri Sep 24 18:31:56 2010
@@ -208,7 +208,16 @@ public final class TestClusterDumper ext
   public void testMeanShift() throws Exception {
     DistanceMeasure measure = new CosineDistanceMeasure();
     Path output = getTestTempDirPath("output");
-    MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 0.5, 0.01, 0.05, 10, false, true, false);
+    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+    output,
+    measure,
+    0.5,
+    0.01,
+    0.05,
+    10,
+    false,
+    true,
+    false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-1"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);

Added: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1001013&view=auto
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java (added)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java Fri Sep 24 18:31:56 2010
@@ -0,0 +1,340 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.dirichlet.DirichletDriver;
+import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
+import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.VectorWritable;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests for ClusterEvaluator: unit tests over synthetic clusters/representative points,
+ * plus end-to-end runs driving each clustering algorithm through
+ * RepresentativePointsDriver and evaluating the result.
+ */
+public final class TestClusterEvaluator extends MahoutTestCase {
+
+  // 9 two-dimensional sample points written to testdata for the end-to-end jobs.
+  private static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+      { 5, 5 } };
+
+  // Synthetic per-cluster representative points built by initData() for the unit tests.
+  private Map<Integer, List<VectorWritable>> representativePoints;
+
+  private List<Cluster> clusters;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    // Create test data
+    List<VectorWritable> sampleData = TestKmeansClustering.getPointsWritable(REFERENCE);
+    ClusteringTestUtils.writePointsToFile(sampleData, getTestTempFilePath("testdata/file1"), fs, conf);
+  }
+
+  /**
+   * Print the representative points produced by each iteration. NOTE(review): this makes
+   * no assertions — it only verifies the output is readable; consider asserting counts.
+   */
+  private void checkRefPoints(int numIterations) throws IOException {
+    for (int i = 0; i <= numIterations; i++) {
+      Path out = new Path(getTestTempDirPath("output"), "representativePoints-" + i);
+      Configuration conf = new Configuration();
+      FileSystem fs = FileSystem.get(conf);
+      for (FileStatus file : fs.listStatus(out)) {
+        // Skip hidden bookkeeping files such as .crc checksums.
+        if (!file.getPath().getName().startsWith(".")) {
+          SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
+          try {
+            Writable clusterId = new IntWritable(0);
+            VectorWritable point = new VectorWritable();
+            while (reader.next(clusterId, point)) {
+              System.out.println("\tC-" + clusterId + ": " + AbstractCluster.formatVector(point.get(), null));
+            }
+          } finally {
+            reader.close();
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Initialize synthetic data using 4 clusters dC units from origin, each having 5
+   * representative points: the cluster center plus 4 points offset dP from the center
+   * @param dC a double cluster center offset
+   * @param dP a double representative point offset
+   * @param measure the DistanceMeasure
+   */
+  private void initData(double dC, double dP, DistanceMeasure measure) {
+    clusters = new ArrayList<Cluster>();
+    clusters.add(new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure));
+    representativePoints = new HashMap<Integer, List<VectorWritable>>();
+    for (Cluster cluster : clusters) {
+      List<VectorWritable> points = new ArrayList<VectorWritable>();
+      representativePoints.put(cluster.getId(), points);
+      points.add(new VectorWritable(cluster.getCenter().clone()));
+      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { dP, dP }))));
+      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { dP, -dP }))));
+      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { -dP, -dP }))));
+      points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { -dP, dP }))));
+    }
+  }
+
+  @Test
+  public void testCluster0() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  @Test
+  public void testCluster1() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.5, measure);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    // NOTE(review): expected values are identical to testCluster0 despite dP=0.5 vs 0.25;
+    // verify this is intentional and not a copy-paste of the expected constants.
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  @Test
+  public void testCluster2() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.75, measure);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    // NOTE(review): same expected values again for dP=0.75 — confirm (see testCluster1).
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  /**
+   * An extra cluster with no representative points must be pruned by the evaluator,
+   * leaving the densities unchanged.
+   */
+  @Test
+  public void testEmptyCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure);
+    clusters.add(cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    representativePoints.put(cluster.getId(), points);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  /**
+   * A cluster with a single representative point must likewise be pruned, leaving the
+   * densities unchanged.
+   */
+  @Test
+  public void testSingleValueCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
+    clusters.add(cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { 1, 1 }))));
+    representativePoints.put(cluster.getId(), points);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  /**
+   * Representative points extraction will duplicate the cluster center if the cluster has no 
+   * assigned points. These clusters should be ignored like empty clusters above
+   */
+  @Test
+  public void testAllSameValueCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
+    clusters.add(cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    points.add(new VectorWritable(cluster.getCenter()));
+    points.add(new VectorWritable(cluster.getCenter()));
+    points.add(new VectorWritable(cluster.getCenter()));
+    representativePoints.put(cluster.getId(), points);
+    ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.33333333333333315, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("intra cluster density", 0.3656854249492381, evaluator.intraClusterDensity(), EPSILON);
+  }
+
+  @Test
+  public void testCanopy() throws Exception { // now run the Job
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    Configuration conf = new Configuration();
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, true, false);
+    int numIterations = 2;
+    Path output = getTestTempDirPath("output");
+    Path clustersIn = new Path(output, "clusters-0");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
+    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+    // now print out the Results
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+
+    checkRefPoints(numIterations);
+  }
+
+  @Test
+  public void testKmeans() throws Exception {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    // now run the Canopy job to prime kMeans canopies
+    Configuration conf = new Configuration();
+    CanopyDriver.run(conf,
+                     getTestTempDirPath("testdata"),
+                     getTestTempDirPath("output"),
+                     measure,
+                     3.1,
+                     2.1,
+                     false,
+                     false);
+    // now run the KMeans job
+    Path output = getTestTempDirPath("output");
+    KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
+    int numIterations = 2;
+    Path clustersIn = new Path(output, "clusters-2");
+    RepresentativePointsDriver.run(conf,
+                                   clustersIn,
+                                   new Path(output, "clusteredPoints"),
+                                   output,
+                                   measure,
+                                   numIterations);
+    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+    // now print out the Results
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    checkRefPoints(numIterations);
+  }
+
+  @Test
+  public void testFuzzyKmeans() throws Exception {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    // now run the Canopy job to prime kMeans canopies
+    Configuration conf = new Configuration();
+    CanopyDriver.run(conf,
+                     getTestTempDirPath("testdata"),
+                     getTestTempDirPath("output"),
+                     measure,
+                     3.1,
+                     2.1,
+                     false,
+                     false);
+    // now run the FuzzyKMeans job
+    Path output = getTestTempDirPath("output");
+    FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
+                          new Path(output, "clusters-0"),
+                          output,
+                          measure,
+                          0.001,
+                          10,
+                          2,
+                          true,
+                          true,
+                          0,
+                          false);
+    int numIterations = 2;
+    Path clustersIn = new Path(output, "clusters-4");
+    RepresentativePointsDriver.run(conf,
+                                   clustersIn,
+                                   new Path(output, "clusteredPoints"),
+                                   output,
+                                   measure,
+                                   numIterations);
+    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+    // now print out the Results
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    checkRefPoints(numIterations);
+  }
+
+  @Test
+  public void testMeanShift() throws Exception {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+                                    getTestTempDirPath("output"),
+                                    measure,
+                                    2.1,
+                                    1.0,
+                                    0.001,
+                                    10,
+                                    false,
+                                    true,
+                                    false);
+    int numIterations = 2;
+    Path output = getTestTempDirPath("output");
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-2");
+    RepresentativePointsDriver.run(conf,
+                                   clustersIn,
+                                   new Path(output, "clusteredPoints"),
+                                   output,
+                                   measure,
+                                   numIterations);
+    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+    // now print out the Results
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    checkRefPoints(numIterations);
+  }
+
+  @Test
+  public void testDirichlet() throws Exception {
+    ModelDistribution<VectorWritable> modelDistribution = new GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
+    DirichletDriver.run(getTestTempDirPath("testdata"),
+                        getTestTempDirPath("output"),
+                        modelDistribution,
+                        15,
+                        5,
+                        1.0,
+                        true,
+                        true,
+                        0,
+                        true);
+    int numIterations = 2;
+    Path output = getTestTempDirPath("output");
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-5");
+    RepresentativePointsDriver.run(conf,
+                                   clustersIn,
+                                   new Path(output, "clusteredPoints"),
+                                   output,
+                                   new EuclideanDistanceMeasure(),
+                                   numIterations);
+    ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
+    // now print out the Results
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    checkRefPoints(numIterations);
+  }
+
+}

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1001013&r1=1001012&r2=1001013&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Fri Sep 24 18:31:56 2010
@@ -38,6 +38,7 @@ import org.apache.mahout.clustering.cano
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.dirichlet.DirichletDriver;
 import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
+import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
@@ -150,6 +151,59 @@ public final class TestCDbwEvaluator ext
   }
 
   @Test
+  // A cluster with an empty representative-point list must be pruned by CDbwEvaluator,
+  // leaving all four statistics identical to the 4-cluster baseline.
+  public void testEmptyCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure);
+    clusters.put(cluster.getId(), cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    representativePoints.put(cluster.getId(), points);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+  }
+
+  @Test
+  // A cluster with only one representative point (no variance) must be pruned by
+  // CDbwEvaluator, leaving all four statistics identical to the 4-cluster baseline.
+  public void testSingleValueCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
+    clusters.put(cluster.getId(), cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { 1, 1 }))));
+    representativePoints.put(cluster.getId(), points);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+  }
+
+  /**
+   * Representative points extraction will duplicate the cluster center if the cluster has no 
+   * assigned points. Such a cluster has identical representative points (zero variance) and
+   * should be ignored like the empty clusters above, leaving the statistics unchanged.
+   */
+  @Test
+  public void testAllSameValueCluster() {
+    DistanceMeasure measure = new EuclideanDistanceMeasure();
+    initData(1, 0.25, measure);
+    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
+    clusters.put(cluster.getId(), cluster);
+    List<VectorWritable> points = new ArrayList<VectorWritable>();
+    points.add(new VectorWritable(cluster.getCenter()));
+    points.add(new VectorWritable(cluster.getCenter()));
+    points.add(new VectorWritable(cluster.getCenter()));
+    representativePoints.put(cluster.getId(), points);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
+    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+  }
+
+  @Test
   public void testCanopy() throws Exception { // now run the Job
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     CanopyDriver.run(new Configuration(),
@@ -162,13 +216,16 @@ public final class TestCDbwEvaluator ext
                      false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.run(new Configuration(),
-                   new Path(output, "clusters-0"),
-                   new Path(output, "clusteredPoints"),
-                   output,
-                   measure,
-                   numIterations);
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-0");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     checkRefPoints(numIterations);
+    // now print out the Results
+    System.out.println("CDbw = " + evaluator.getCDbw());
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    System.out.println("Separation = " + evaluator.separation());
   }
 
   @Test
@@ -187,13 +244,16 @@ public final class TestCDbwEvaluator ext
     Path output = getTestTempDirPath("output");
     KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
     int numIterations = 2;
-    CDbwDriver.run(new Configuration(),
-                   new Path(output, "clusters-2"),
-                   new Path(output, "clusteredPoints"),
-                   output,
-                   measure,
-                   numIterations);
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-2");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     checkRefPoints(numIterations);
+    // now print out the Results
+    System.out.println("CDbw = " + evaluator.getCDbw());
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    System.out.println("Separation = " + evaluator.separation());
   }
 
   @Test
@@ -211,48 +271,54 @@ public final class TestCDbwEvaluator ext
     // now run the KMeans job
     Path output = getTestTempDirPath("output");
     FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    2,
-    true,
-    true,
-    0,
-    false);
+                          new Path(output, "clusters-0"),
+                          output,
+                          measure,
+                          0.001,
+                          10,
+                          2,
+                          true,
+                          true,
+                          0,
+                          false);
     int numIterations = 2;
-    CDbwDriver.run(new Configuration(),
-                   new Path(output, "clusters-4"),
-                   new Path(output, "clusteredPoints"),
-                   output,
-                   measure,
-                   numIterations);
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-4");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     checkRefPoints(numIterations);
+    // now print out the Results
+    System.out.println("CDbw = " + evaluator.getCDbw());
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    System.out.println("Separation = " + evaluator.separation());
   }
 
   @Test
   public void testMeanShift() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"),
-                                 getTestTempDirPath("output"),
-                                 measure,
-                                 2.1,
-                                 1.0,
-                                 0.001,
-                                 10,
-                                 false,
-                                 true,
-                                 false);
+    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+                                    getTestTempDirPath("output"),
+                                    measure,
+                                    2.1,
+                                    1.0,
+                                    0.001,
+                                    10,
+                                    false,
+                                    true,
+                                    false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.run(new Configuration(),
-                   new Path(output, "clusters-2"),
-                   new Path(output, "clusteredPoints"),
-                   output,
-                   measure,
-                   numIterations);
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-2");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     checkRefPoints(numIterations);
+    // now print out the Results
+    System.out.println("CDbw = " + evaluator.getCDbw());
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    System.out.println("Separation = " + evaluator.separation());
   }
 
   @Test
@@ -270,44 +336,21 @@ public final class TestCDbwEvaluator ext
                         true);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    CDbwDriver.run(new Configuration(),
-                   new Path(output, "clusters-5"),
-                   new Path(output, "clusteredPoints"),
-                   output,
-                   new EuclideanDistanceMeasure(),
-                   numIterations);
+    Configuration conf = new Configuration();
+    Path clustersIn = new Path(output, "clusters-0");
+    RepresentativePointsDriver.run(conf,
+                                   clustersIn,
+                                   new Path(output, "clusteredPoints"),
+                                   output,
+                                   new EuclideanDistanceMeasure(),
+                                   numIterations);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
     checkRefPoints(numIterations);
-  }
-
-  @Test
-  public void testEmptyCluster() {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure);
-    clusters.put(cluster.getId(), cluster);
-    List<VectorWritable> points = new ArrayList<VectorWritable>();
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
-    assertEquals("intra cluster density", 0.7155417527999326, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.073312629199899, evaluator.getCDbw(), EPSILON);
-  }
-
-  @Test
-  public void testSingleValueCluster() {
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-    initData(1, 0.25, measure);
-    Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
-    clusters.put(cluster.getId(), cluster);
-    List<VectorWritable> points = new ArrayList<VectorWritable>();
-    points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { 1, 1 }))));
-    representativePoints.put(cluster.getId(), points);
-    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 0.0, evaluator.separation(), EPSILON);
-    assertEquals("intra cluster density", 0.7155417527999326, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 0.0, evaluator.getCDbw(), EPSILON);
+    // now print out the Results
+    System.out.println("CDbw = " + evaluator.getCDbw());
+    System.out.println("Intra-cluster density = " + evaluator.intraClusterDensity());
+    System.out.println("Inter-cluster density = " + evaluator.interClusterDensity());
+    System.out.println("Separation = " + evaluator.separation());
   }
 
 }



Mime
View raw message