Return-Path: Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: (qmail 80679 invoked from network); 28 Sep 2010 22:08:43 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 28 Sep 2010 22:08:43 -0000 Received: (qmail 6247 invoked by uid 500); 28 Sep 2010 22:08:43 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 6168 invoked by uid 500); 28 Sep 2010 22:08:42 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 6161 invoked by uid 99); 28 Sep 2010 22:08:42 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 28 Sep 2010 22:08:42 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 28 Sep 2010 22:08:40 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id B49762388993; Tue, 28 Sep 2010 22:08:19 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1002376 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Date: Tue, 28 Sep 2010 22:08:19 -0000 To: commits@mahout.apache.org From: jeastman@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100928220819.B49762388993@eris.apache.org> Author: jeastman Date: Tue Sep 28 22:08:19 2010 New Revision: 1002376 URL: http://svn.apache.org/viewvc?rev=1002376&view=rev Log: MAHOUT-513: Reviewed calculations from paper in detail, making some changes that seemed needed. All tests run but I'm still not very confident in the computed numbers Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1002376&r1=1002375&r2=1002376&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Tue Sep 28 22:08:19 2010 @@ -18,6 +18,7 @@ package org.apache.mahout.clustering.cdbw; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -39,6 +40,10 @@ import org.apache.mahout.math.function.S import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * This class calculates the CDbw metric as defined in + * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf + */ public class CDbwEvaluator { private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class); @@ -47,7 +52,7 @@ public class CDbwEvaluator { private final Map stDevs = new HashMap(); - private final Map clusters; + private final List clusters; private final DistanceMeasure measure; @@ -63,9 +68,7 @@ public class CDbwEvaluator { * @param measure * an appropriate DistanceMeasure */ - public CDbwEvaluator(Map> representativePoints, - Map clusters, - DistanceMeasure measure) { + public CDbwEvaluator(Map> representativePoints, List clusters, DistanceMeasure measure) { this.representativePoints = representativePoints; this.clusters = clusters; this.measure = measure; @@ -101,9 +104,9 @@ public class CDbwEvaluator { * a String pathname to the directory containing input cluster files * @return a List of the clusters */ - private static Map loadClusters(Configuration conf, Path clustersIn) throws InstantiationException, + private static List loadClusters(Configuration conf, Path clustersIn) throws InstantiationException, IllegalAccessException, IOException { - Map clusters = new HashMap(); + List clusters = new ArrayList(); FileSystem fs = clustersIn.getFileSystem(conf); for (FileStatus part : fs.listStatus(clustersIn)) { if (!part.getPath().getName().startsWith(".")) { @@ -113,7 +116,7 @@ public class CDbwEvaluator { Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance(); while (reader.next(key, value)) { Cluster cluster = (Cluster) value; - clusters.put(cluster.getId(), cluster); + clusters.add(cluster); value = reader.getValueClass().asSubclass(Writable.class).newInstance(); } reader.close(); @@ -122,7 +125,14 @@ public class CDbwEvaluator { return clusters; } + /** + * Compute the standard deviation of the representative points for the given cluster. + * Store these in stDevs, indexed by cI + * + * @param cI a int clusterId. + */ private void computeStd(int cI) { + // TODO: verify this approach List repPts = representativePoints.get(cI); int s0 = 0; Vector s1 = null; @@ -167,7 +177,7 @@ public class CDbwEvaluator { if (pruned) { return; } - for (Iterator it = clusters.values().iterator(); it.hasNext();) { + for (Iterator it = clusters.iterator(); it.hasNext();) { Cluster cluster = it.next(); if (invalidCluster(cluster)) { log.info("Pruning cluster Id=" + cluster.getId()); @@ -178,82 +188,116 @@ public class CDbwEvaluator { pruned = true; } + /** + * Compute the term density (eqn 2) used for inter-cluster density calculation + * + * @param uIJ the Vector midpoint between the closest representative of the clusters + * @param cI the int clusterId of the i-th cluster + * @param cJ the int clusterId of the j-th cluster + * @return a double + */ double interDensity(Vector uIJ, int cI, int cJ) { List repI = representativePoints.get(cI); List repJ = representativePoints.get(cJ); - double density = 0.0; - double std = (stDevs.get(cI) + stDevs.get(cJ)) / 2.0; + double sum = 0.0; + Double stdevI = stDevs.get(cI); + Double stdevJ = stDevs.get(cJ); + // count the number of representative points of the clusters which are within the + // average std of the two clusters from the midpoint uIJ (eqn 3) + double avgStd = (stdevI + stdevJ) / 2.0; for (VectorWritable vwI : repI) { - if (measure.distance(uIJ, vwI.get()) <= std) { - density++; + if (measure.distance(uIJ, vwI.get()) <= avgStd) { + sum++; } } for (VectorWritable vwJ : repJ) { - if (measure.distance(uIJ, vwJ.get()) <= std) { - density++; + if (measure.distance(uIJ, vwJ.get()) <= avgStd) { + sum++; } } - return density / (repI.size() + repJ.size()); - } - - double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) { - return measure.distance(clusterCenter, repPoint) <= avgStd ? 1.0 : 0.0; + int nI = repI.size(); + int nJ = repJ.size(); + return sum / (nI + nJ); } + /** + * Compute the validity index (eqn 8) + * + * @return a double + */ public double getCDbw() { pruneInvalidClusters(); return intraClusterDensity() * separation(); } + /** + * The average density within clusters is defined as the percentage of points that belong + * to the neighborhood of representative points of the considered clusters. The goal is + * the density within clusters to be significant high. (eqn 5) + * + * @return a double + */ public double intraClusterDensity() { pruneInvalidClusters(); - double avgStd = 0.0; - for (Integer cId : representativePoints.keySet()) { - avgStd += stDevs.get(cId); - } - avgStd /= representativePoints.size(); - - double sum = 0.0; - for (Map.Entry> entry : representativePoints.entrySet()) { - Integer cId = entry.getKey(); - List repI = entry.getValue(); - double cSum = 0.0; - for (VectorWritable aRepI : repI) { - double inDensity = intraDensity(clusters.get(cId).getCenter(), aRepI.get(), avgStd); - double std = stDevs.get(cId); - if (std > 0.0) { - cSum += inDensity / std; - } - } - if (repI.size() > 0) { - sum += cSum / repI.size(); + // compute the average standard deviation of the clusters + double stdev = 0.0; + for (Integer cI : representativePoints.keySet()) { + stdev += stDevs.get(cI); + } + int c = representativePoints.size(); + stdev /= c; + // accumulate the summations + double sumI = 0.0; + for (int i = 0; i < clusters.size(); i++) { + Integer cI = clusters.get(i).getId(); + List repPtsI = representativePoints.get(cI); + int r = repPtsI.size(); + double sumJ = 0.0; + // compute the term density (eqn 6) + for (int j = 0; j < r; j++) { + // compute f(x, vIJ) (eqn 7) + Vector repJ = repPtsI.get(j).get(); + double densityIJ = (measure.distance(clusters.get(i).getCenter(), repJ) <= stdev ? 1.0 : 0.0); + // accumulate sumJ + sumJ += densityIJ / stdev; } + // accumulate sumI + sumI += sumJ / r; } - return sum / representativePoints.size(); + return sumI / c; } + /** + * This function evaluates the average density in the region among clusters (eqn 1). + * The goal is the density in the area among clusters to be significant low. + * + * @return a double + */ public double interClusterDensity() { pruneInvalidClusters(); double sum = 0.0; - for (Map.Entry> entry1 : representativePoints.entrySet()) { - Integer cI = entry1.getKey(); - List repI = entry1.getValue(); - for (Map.Entry> entry2 : representativePoints.entrySet()) { - Integer cJ = entry2.getKey(); - if (cI.equals(cJ)) { + // find the closest representative points between the clusters + for (int i = 0; i < clusters.size(); i++) { + Integer cI = clusters.get(i).getId(); + List repI = representativePoints.get(cI); + for (int j = 1; j < clusters.size(); j++) { + Integer cJ = clusters.get(j).getId(); + if (i == j) { continue; } - List repJ = entry2.getValue(); - double minDistance = Double.MAX_VALUE; - Vector uIJ = null; + List repJ = representativePoints.get(cJ); + double minDistance = Double.MAX_VALUE; // the distance between the closest representative points + Vector uIJ = null; // the midpoint between the closest representative points + // find the closest representative points between the i-th and j-th clusters for (VectorWritable aRepI : repI) { for (VectorWritable aRepJ : repJ) { - Vector vI = aRepI.get(); - Vector vJ = aRepJ.get(); - double distance = measure.distance(vI, vJ); + Vector closRepI = aRepI.get(); + Vector closRepJ = aRepJ.get(); + double distance = measure.distance(closRepI, closRepJ); if (distance < minDistance) { + // set the distance and compute the midpoint minDistance = distance; - uIJ = vI.plus(vJ).divide(2); + uIJ = closRepI.plus(closRepJ).divide(2); } } } @@ -275,31 +319,42 @@ public class CDbwEvaluator { sum += density; } } - //System.out.println("interClusterDensity=" + sum); + log.info("interClusterDensity=" + sum); return sum; } + /** + * Calculate the separation of clusters (eqn 4) taking into account both the distances between the closest + * clusters and the Inter-cluster density. The goal is the distances among clusters to be high while + * the density in the area among them to be low. + * + * @return a double + */ public double separation() { pruneInvalidClusters(); - double minDistance = Double.MAX_VALUE; - for (Map.Entry> entry1 : representativePoints.entrySet()) { - Integer cI = entry1.getKey(); - List repI = entry1.getValue(); - for (Map.Entry> entry2 : representativePoints.entrySet()) { - if (cI.equals(entry2.getKey())) { + double minDistanceSum = 0; + for (int i = 0; i < clusters.size(); i++) { + Integer cI = clusters.get(i).getId(); + List closRepI = representativePoints.get(cI); + for (int j = 0; j < clusters.size(); j++) { + if (i == j) { continue; } - List repJ = entry2.getValue(); - for (VectorWritable aRepI : repI) { - for (VectorWritable aRepJ : repJ) { + // find min{d(closRepI, closRepJ)} + Integer cJ = clusters.get(j).getId(); + List closRepJ = representativePoints.get(cJ); + double minDistance = Double.MAX_VALUE; + for (VectorWritable aRepI : closRepI) { + for (VectorWritable aRepJ : closRepJ) { double distance = measure.distance(aRepI.get(), aRepJ.get()); if (distance < minDistance) { minDistance = distance; } } } + minDistanceSum += minDistance; } } - return minDistance / (1.0 + interClusterDensity()); + return minDistanceSum / (1.0 + interClusterDensity()); } } Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1002376&r1=1002375&r2=1002376&view=diff ============================================================================== --- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original) +++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Tue Sep 28 22:08:19 2010 @@ -58,7 +58,7 @@ public final class TestCDbwEvaluator ext private Map> representativePoints; - private Map clusters; + private List clusters; @Override @Before @@ -101,13 +101,13 @@ public final class TestCDbwEvaluator ext * @param measure the DistanceMeasure */ private void initData(double dC, double dP, DistanceMeasure measure) { - clusters = new HashMap(); - clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure)); - clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure)); - clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure)); - clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure)); + clusters = new ArrayList(); + clusters.add(new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure)); + clusters.add(new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure)); + clusters.add(new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure)); + clusters.add(new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure)); representativePoints = new HashMap>(); - for (Cluster cluster : clusters.values()) { + for (Cluster cluster : clusters) { List points = new ArrayList(); representativePoints.put(cluster.getId(), points); points.add(new VectorWritable(cluster.getCenter().clone())); @@ -124,9 +124,9 @@ public final class TestCDbwEvaluator ext initData(1, 0.25, measure); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 1.5, evaluator.separation(), EPSILON); + assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON); } @Test @@ -135,9 +135,9 @@ public final class TestCDbwEvaluator ext initData(1, 0.5, measure); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 1.0, evaluator.separation(), EPSILON); + assertEquals("separation", 13.656854249492381, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 0.44721359549995787, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 6.107530892134367, evaluator.getCDbw(), EPSILON); } @Test @@ -145,10 +145,10 @@ public final class TestCDbwEvaluator ext DistanceMeasure measure = new EuclideanDistanceMeasure(); initData(1, 0.75, measure); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); - assertEquals("inter cluster density", 1.017921815355728, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 0.24777966925931558, evaluator.separation(), EPSILON); + assertEquals("inter cluster density", 0.7634413615167959, evaluator.interClusterDensity(), EPSILON); + assertEquals("separation", 3.8722167199667066, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 0.07387362452083261, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 1.1544719745942431, evaluator.getCDbw(), EPSILON); } @Test @@ -156,14 +156,14 @@ public final class TestCDbwEvaluator ext DistanceMeasure measure = new EuclideanDistanceMeasure(); initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure); - clusters.put(cluster.getId(), cluster); + clusters.add(cluster); List points = new ArrayList(); representativePoints.put(cluster.getId(), points); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 1.5, evaluator.separation(), EPSILON); + assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON); } @Test @@ -171,15 +171,15 @@ public final class TestCDbwEvaluator ext DistanceMeasure measure = new EuclideanDistanceMeasure(); initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure); - clusters.put(cluster.getId(), cluster); + clusters.add(cluster); List points = new ArrayList(); points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { 1, 1 })))); representativePoints.put(cluster.getId(), points); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 1.5, evaluator.separation(), EPSILON); + assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON); } /** @@ -191,7 +191,7 @@ public final class TestCDbwEvaluator ext DistanceMeasure measure = new EuclideanDistanceMeasure(); initData(1, 0.25, measure); Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure); - clusters.put(cluster.getId(), cluster); + clusters.add(cluster); List points = new ArrayList(); points.add(new VectorWritable(cluster.getCenter())); points.add(new VectorWritable(cluster.getCenter())); @@ -199,9 +199,9 @@ public final class TestCDbwEvaluator ext representativePoints.put(cluster.getId(), points); CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure); assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON); - assertEquals("separation", 1.5, evaluator.separation(), EPSILON); + assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON); assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON); - assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON); + assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON); } @Test