Return-Path: X-Original-To: apmail-mahout-commits-archive@www.apache.org Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 27F0C761D for ; Thu, 3 Nov 2011 04:02:15 +0000 (UTC) Received: (qmail 79985 invoked by uid 500); 3 Nov 2011 04:02:15 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 79944 invoked by uid 500); 3 Nov 2011 04:02:13 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 79937 invoked by uid 99); 3 Nov 2011 04:02:11 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 03 Nov 2011 04:02:11 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 03 Nov 2011 04:02:09 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 8B3712388CBF; Thu, 3 Nov 2011 04:01:49 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1196934 - in /mahout/trunk: examples/bin/build-reuters.sh integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Date: Thu, 03 Nov 2011 04:01:49 -0000 To: commits@mahout.apache.org From: gsingers@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20111103040149.8B3712388CBF@eris.apache.org> Author: gsingers Date: Thu Nov 3 04:01:49 2011 New Revision: 1196934 URL: http://svn.apache.org/viewvc?rev=1196934&view=rev Log: MAHOUT-867: 
hook in ability to run basic evaluations into cluster dumper Modified: mahout/trunk/examples/bin/build-reuters.sh mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Modified: mahout/trunk/examples/bin/build-reuters.sh URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1196934&r1=1196933&r2=1196934&view=diff ============================================================================== --- mahout/trunk/examples/bin/build-reuters.sh (original) +++ mahout/trunk/examples/bin/build-reuters.sh Thu Nov 3 04:01:49 2011 @@ -100,12 +100,13 @@ if [ "x$clustertype" == "xkmeans" ]; the -c ${WORK_DIR}/reuters-kmeans-clusters \ -o ${WORK_DIR}/reuters-kmeans \ -dm org.apache.mahout.common.distance.CosineDistanceMeasure \ - -x 10 -k 20 -ow \ + -x 10 -k 20 -ow --clustering \ && \ $MAHOUT clusterdump \ -s ${WORK_DIR}/reuters-kmeans/clusters-*-final \ -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \ - -dt sequencefile -b 100 -n 20 + -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.CosineDistanceMeasure \ + --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints elif [ "x$clustertype" == "xfuzzykmeans" ]; then $MAHOUT seq2sparse \ -i ${WORK_DIR}/reuters-out-seqdir/ \ Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1196934&r1=1196933&r2=1196934&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java (original) +++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java Thu Nov 3 04:01:49 2011 @@ -71,6 +71,7 @@ public final class RepresentativePointsD public int run(String[] args) throws ClassNotFoundException, IOException, InterruptedException { addInputOption(); addOutputOption(); + addOption("clusteredPoints", "cp", "The path to the clustered points", true); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.methodOption().create()); @@ -85,8 +86,8 @@ public final class RepresentativePointsD boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class); - - run(getConf(), input, null, output, measure, maxIterations, runSequential); + Path clusteredPoints = new Path(getOption("clusteredPoints")); + run(getConf(), input, clusteredPoints, output, measure, maxIterations, runSequential); return 0; } Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1196934&r1=1196933&r2=1196934&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Thu Nov 3 04:01:49 2011 @@ -27,8 +27,16 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.WeightedVectorWritable; +import org.apache.mahout.clustering.cdbw.CDbwEvaluator; +import 
org.apache.mahout.clustering.evaluation.ClusterEvaluator; +import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver; +import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper; import org.apache.mahout.common.AbstractJob; +import org.apache.mahout.common.ClassUtils; +import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.Pair; +import org.apache.mahout.common.commandline.DefaultOptionCreator; +import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; @@ -51,6 +59,8 @@ import java.util.TreeMap; public final class ClusterDumper extends AbstractJob { + protected DistanceMeasure measure; + public enum OUTPUT_FORMAT { TEXT, CSV, @@ -64,6 +74,7 @@ public final class ClusterDumper extends public static final String NUM_WORDS_OPTION = "numWords"; public static final String SUBSTRING_OPTION = "substring"; public static final String SEQ_FILE_DIR_OPTION = "seqFileDir"; + public static final String EVALUATE_CLUSTERS = "evaluate"; public static final String OUTPUT_FORMAT_OPT = "outputFormat"; @@ -77,6 +88,7 @@ public final class ClusterDumper extends private int numTopFeatures = 10; private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints; private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT; + private boolean runEvaluation; public ClusterDumper(Path seqFileDir, Path pointsDir) { this.seqFileDir = seqFileDir; @@ -104,6 +116,8 @@ public final class ClusterDumper extends + "If specified, then the program will output the points associated with a cluster"); addOption(DICTIONARY_OPTION, "d", "The dictionary file"); addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text"); + addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input. 
The output will be appended to the rest of the output at the end.", false, false, null)); + addOption(DefaultOptionCreator.distanceMeasureOption().create()); if (parseArguments(args) == null) { return -1; } @@ -127,12 +141,15 @@ public final class ClusterDumper extends if (hasOption(OUTPUT_FORMAT_OPT)) { outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT)); } + runEvaluation = hasOption(EVALUATE_CLUSTERS); + String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); + measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class); init(); printClusters(null); return 0; } - public void printClusters(String[] dictionary) throws IOException { + public void printClusters(String[] dictionary) throws Exception { Configuration conf = new Configuration(); if (this.termDictionary != null) { @@ -165,6 +182,28 @@ public final class ClusterDumper extends long numWritten = clusterWriter.write(new SequenceFileDirValueIterable(new Path(seqFileDir, "part-*"), PathType.GLOB, conf)); writer.flush(); + if (runEvaluation){ + HadoopUtil.delete(conf, new Path("tmp/representative")); + int numIters = 5; + RepresentativePointsDriver.main(new String[]{ + "--input", seqFileDir.toString(), + "--output", "tmp/representative", + "--clusteredPoints", pointsDir.toString(), + "--distanceMeasure", measure.getClass().getName(), + "--maxIter", String.valueOf(numIters)// + }); + conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName()); + conf.set(RepresentativePointsDriver.STATE_IN_KEY, "tmp/representative/representativePoints-" + numIters); + ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir); + writer.append("\n"); + writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n"); + writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n"); + CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir); + 
writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n"); + writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n"); + writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n"); + writer.flush(); + } log.info("Wrote {} clusters", numWritten); } finally { if (shouldClose) {