mahout-commits mailing list archives

From: gsing...@apache.org
Subject: svn commit: r1146835 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/kmeans/ integration/src/main/java/org/apache/mahout/utils/clustering/
Date: Thu, 14 Jul 2011 18:18:50 GMT
Author: gsingers
Date: Thu Jul 14 18:18:49 2011
New Revision: 1146835

URL: http://svn.apache.org/viewvc?rev=1146835&view=rev
Log:
MAHOUT-761: kmeans can emit its distance in the clustering step

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java Thu Jul 14 18:18:49 2011
@@ -25,10 +25,10 @@ import org.apache.hadoop.io.Writable;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
-public final class WeightedVectorWritable implements Writable {
+public class WeightedVectorWritable implements Writable {
 
-  private final VectorWritable vectorWritable = new VectorWritable();
-  private double weight;
+  protected VectorWritable vectorWritable = new VectorWritable();
+  protected double weight;
 
   public WeightedVectorWritable() {
   }

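WeightedVectorWritable is opened up here (no longer final, fields made protected) so that a property-carrying subclass can extend it. The new WeightedPropertyVectorWritable referenced by the hunks below is not included in this diff; the following is only a rough sketch of what such a subclass could look like, assuming the Map<Text, Text> accessor used by ClusterDumper and a wire format that simply appends the property map after the superclass fields:

package org.apache.mahout.clustering;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Text;
import org.apache.mahout.math.Vector;

// Sketch only: the actual class committed with MAHOUT-761 may differ in
// constructor signatures and serialization details.
public class WeightedPropertyVectorWritable extends WeightedVectorWritable {

  // Arbitrary per-point annotations, e.g. "distance" -> "0.42".
  private Map<Text, Text> properties = new HashMap<Text, Text>();

  public WeightedPropertyVectorWritable() {
  }

  public WeightedPropertyVectorWritable(double weight, Vector vector, Map<Text, Text> properties) {
    super(weight, vector);
    this.properties = properties;
  }

  public Map<Text, Text> getProperties() {
    return properties;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);                 // weight + vector, unchanged from the parent
    out.writeInt(properties.size());  // followed by the property map
    for (Map.Entry<Text, Text> entry : properties.entrySet()) {
      entry.getKey().write(out);
      entry.getValue().write(out);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    int size = in.readInt();
    properties = new HashMap<Text, Text>();
    for (int i = 0; i < size; i++) {
      Text key = new Text();
      Text value = new Text();
      key.readFields(in);
      value.readFields(in);
      properties.put(key, value);
    }
  }
}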
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Thu Jul 14 18:18:49 2011
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.math.VectorWritable;
@@ -38,7 +39,7 @@ import org.apache.mahout.math.VectorWrit
  * @see KMeansDriver for more information on how to invoke this process
  */
 public class KMeansClusterMapper
-    extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
+    extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedPropertyVectorWritable> {
   
   private final Collection<Cluster> clusters = Lists.newArrayList();
   private KMeansClusterer clusterer;

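The map() body itself is unchanged by this commit and not shown here; presumably it just forwards each input vector to the clusterer, which is where the distance property gets attached (see the next hunk). A hypothetical sketch of such a map() method, assuming setup() has already populated clusters and clusterer:

  @Override
  protected void map(WritableComparable<?> key, VectorWritable point, Context context)
      throws IOException, InterruptedException {
    // Hypothetical sketch: delegate cluster assignment and output to the clusterer,
    // which now emits a WeightedPropertyVectorWritable carrying the distance to the
    // winning cluster (see KMeansClusterer.outputPointWithClusterInfo below).
    clusterer.outputPointWithClusterInfo(point.get(), clusters, context);
  }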
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Thu Jul 14 18:18:49 2011
@@ -17,15 +17,20 @@
 package org.apache.mahout.clustering.kmeans;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import com.google.common.collect.Lists;
+import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.math.Vector;
@@ -117,7 +122,7 @@ public class KMeansClusterer {
 
   public void outputPointWithClusterInfo(Vector vector,
                                          Iterable<Cluster> clusters,
-                                         Mapper<?,?,IntWritable,WeightedVectorWritable>.Context context)
+                                         Mapper<?,?,IntWritable,WeightedPropertyVectorWritable>.Context context)
     throws IOException, InterruptedException {
     AbstractCluster nearestCluster = null;
     double nearestDistance = Double.MAX_VALUE;
@@ -129,7 +134,9 @@ public class KMeansClusterer {
         nearestDistance = distance;
       }
     }
-    context.write(new IntWritable(nearestCluster.getId()), new WeightedVectorWritable(1, vector));
+    Map<Text, Text> props = new HashMap<Text, Text>();
+    props.put(new Text("distance"), new Text(String.valueOf(nearestDistance)));
+    context.write(new IntWritable(nearestCluster.getId()), new WeightedPropertyVectorWritable(1, vector, props));
   }
 
   /**

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Thu Jul 14 18:18:49 2011
@@ -36,6 +36,7 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusterObservations;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
@@ -474,7 +475,7 @@ public class KMeansDriver extends Abstra
     job.setInputFormatClass(SequenceFileInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(IntWritable.class);
-    job.setOutputValueClass(WeightedVectorWritable.class);
+    job.setOutputValueClass(WeightedPropertyVectorWritable.class);
 
     FileInputFormat.setInputPaths(job, input);
     HadoopUtil.delete(conf, output);

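Because the cluster step's output value class switches from WeightedVectorWritable to WeightedPropertyVectorWritable, downstream consumers of the clusteredPoints sequence files should expect the new type. A minimal, hypothetical reader (not part of this commit) that prints the per-point distance property might look like this:

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.WeightedPropertyVectorWritable;

public class ClusteredPointsReader {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path part = new Path(args[0]); // e.g. <output>/clusteredPoints/part-m-00000
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), part, conf);
    try {
      IntWritable clusterId = new IntWritable();
      WeightedPropertyVectorWritable point = new WeightedPropertyVectorWritable();
      while (reader.next(clusterId, point)) {
        // Each record is cluster id -> clustered point; the distance to the
        // assigned centroid now rides along in the property map.
        Map<Text, Text> props = point.getProperties();
        Text distance = props == null ? null : props.get(new Text("distance"));
        System.out.println(clusterId.get() + "\t" + distance + "\t" + point.getVector());
      }
    } finally {
      reader.close();
    }
  }
}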
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Thu Jul 14 18:18:49 2011
@@ -17,19 +17,6 @@
 
 package org.apache.mahout.utils.clustering;
 
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
 import com.google.common.base.Charsets;
 import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
@@ -38,8 +25,10 @@ import org.apache.commons.lang.StringUti
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.Pair;
@@ -52,6 +41,19 @@ import org.apache.mahout.utils.vectors.V
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
 public final class ClusterDumper extends AbstractJob {
 
   public static final String OUTPUT_OPTION = "output";
@@ -94,8 +96,8 @@ public final class ClusterDumper extends
     addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
     addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
     addOption(POINTS_DIR_OPTION, "p",
-        "The directory containing points sequence files mapping input vectors to their cluster.  "
-            + "If specified, then the program will output the points associated with a cluster");
+            "The directory containing points sequence files mapping input vectors to their cluster.  "
+                    + "If specified, then the program will output the points associated with a cluster");
     addOption(DICTIONARY_OPTION, "d", "The dictionary file");
    addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
     if (parseArguments(args) == null) {
@@ -147,7 +149,7 @@ public final class ClusterDumper extends
     }
     try {
       for (Cluster value :
-           new SequenceFileDirValueIterable<Cluster>(new Path(seqFileDir, "part-*"), PathType.GLOB, conf)) {
+              new SequenceFileDirValueIterable<Cluster>(new Path(seqFileDir, "part-*"), PathType.GLOB, conf)) {
         String fmtStr = value.asFormatString(dictionary);
         if (subString > 0 && fmtStr.length() > subString) {
           writer.write(':');
@@ -167,11 +169,24 @@ public final class ClusterDumper extends
 
         List<WeightedVectorWritable> points = clusterIdToPoints.get(value.getId());
         if (points != null) {
-          writer.write("\tWeight:  Point:\n\t");
-          for (Iterator<WeightedVectorWritable> iterator = points.iterator(); iterator.hasNext();) {
+          writer.write("\tWeight : [props - optional]:  Point:\n\t");
+          for (Iterator<WeightedVectorWritable> iterator = points.iterator(); iterator.hasNext(); ) {
             WeightedVectorWritable point = iterator.next();
             writer.write(String.valueOf(point.getWeight()));
+            if (point instanceof WeightedPropertyVectorWritable) {
+              WeightedPropertyVectorWritable tmp = (WeightedPropertyVectorWritable) point;
+              Map<Text, Text> map = tmp.getProperties();
+              writer.write(" : [");
+              for (Map.Entry<Text, Text> entry : map.entrySet()) {
+                writer.write(entry.getKey().toString());
+                writer.write("=");
+                writer.write(entry.getValue().toString());
+              }
+              writer.write("]");
+            }
+
             writer.write(": ");
+
             writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
             if (iterator.hasNext()) {
               writer.write("\n\t");
@@ -236,9 +251,9 @@ public final class ClusterDumper extends
 
  public static Map<Integer, List<WeightedVectorWritable>> readPoints(Path pointsPathDir, Configuration conf) {
    Map<Integer, List<WeightedVectorWritable>> result = new TreeMap<Integer, List<WeightedVectorWritable>>();
-    for (Pair<IntWritable,WeightedVectorWritable> record :
-         new SequenceFileDirIterable<IntWritable,WeightedVectorWritable>(
-             pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
+    for (Pair<IntWritable, WeightedVectorWritable> record :
+            new SequenceFileDirIterable<IntWritable, WeightedVectorWritable>(
+                    pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
       // value is the cluster id as an int, key is the name/id of the
       // vector, but that doesn't matter because we only care about printing
       // it
@@ -257,6 +272,7 @@ public final class ClusterDumper extends
   private static class TermIndexWeight {
     private final int index;
     private final double weight;
+
     TermIndexWeight(int index, double weight) {
       this.index = index;
       this.weight = weight;

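With the ClusterDumper change above, the points section of a dump now shows the optional property map between the weight and the vector, so a point line reads roughly as follows (values illustrative, vector portion produced by AbstractCluster.formatVector):

	Weight : [props - optional]:  Point:
	1.0 : [distance=0.4472135954999579]: <vector as formatted by AbstractCluster.formatVector(...)>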

