mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r788186 - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/kmeans/ core/src/main/java/org/apache/mahout/matrix/...
Date Wed, 24 Jun 2009 21:26:36 GMT
Author: gsingers
Date: Wed Jun 24 21:26:25 2009
New Revision: 788186

URL: http://svn.apache.org/viewvc?rev=788186&view=rev
Log:
MAHOUT-139 and minor MAHOUT-137: Vector improvements, some utilities and some minor refactoring of common code in Cluster and Canopy

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
      - copied, changed from r788103, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/AbstractVector.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/DenseVector.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/Vector.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/CosineDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/EuclideanDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/TanimotoDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedEuclideanDistanceMeasure.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedManhattanDistanceMeasure.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestDenseVector.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestOrderedIntDoubleMapping.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestSparseVector.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestVectorView.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/VectorTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/utils/DefaultDistanceMeasureTest.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=788186&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Wed Jun 24 21:26:25 2009
@@ -0,0 +1,67 @@
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.hadoop.io.Writable;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.DataInput;
+
+
+/**
+ *
+ *
+ **/
+public abstract class ClusterBase implements Writable {
+  // this canopy's canopyId
+  protected int id;
+  // the current center
+  protected Vector center = new SparseVector(0);
+
+  // the number of points in the canopy
+  protected int numPoints = 0;
+
+    // the total of all points added to the cluster
+  protected Vector pointTotal = null;
+
+  public Vector getPointTotal() {
+    return pointTotal;
+  }
+
+  public int getId() {
+    return id;
+  }
+
+  /**
+   * Return the center point
+   *
+   * @return the center of the Canopy
+   */
+  public Vector getCenter() {
+    return center;
+  }
+
+  public int getNumPoints() {
+    return numPoints;
+  }
+
+  public abstract String asFormatString();
+  /**
+   * Simply writes out the id, and that's it!
+   * @param out The {@link java.io.DataOutput}
+   * @throws IOException
+   */
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(id);
+  }
+
+  /**
+   * Reads in the id, nothing else
+   * @param in
+   * @throws IOException
+   */
+  public void readFields(DataInput in) throws IOException {
+    id = in.readInt();
+  }
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Wed Jun 24 21:26:25 2009
@@ -25,6 +25,7 @@
 import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.matrix.Vector;
 import org.apache.mahout.utils.DistanceMeasure;
+import org.apache.mahout.clustering.ClusterBase;
 
 import java.io.IOException;
 import java.io.DataOutput;
@@ -37,7 +38,7 @@
  * a point total which is the sum of all the points and is used to compute the
  * centroid when needed.
  */
-public class Canopy implements Writable {
+public class Canopy extends ClusterBase implements Writable {
 
   // keys used by Driver, Mapper, Combiner & Reducer
   public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
@@ -60,17 +61,6 @@
   // the distance measure
   private static DistanceMeasure measure;
 
-  // this canopy's canopyId
-  private int canopyId;
-
-  // the current center
-  private Vector center = new SparseVector(0);
-
-  // the number of points in the canopy
-  private int numPoints = 0;
-
-  // the total of all points added to the canopy
-  private Vector pointTotal = null;
 
   /**
    * Used w
@@ -84,7 +74,7 @@
    * @param point a point in vector space
    */
   public Canopy(Vector point) {
-    this.canopyId = nextCanopyId++;
+    this.id = nextCanopyId++;
     this.center = point.clone();
     this.pointTotal = point.clone();
     this.numPoints = 1;
@@ -97,7 +87,7 @@
    * @param canopyId an int identifying the canopy local to this process only
    */
   public Canopy(Vector point, int canopyId) {
-    this.canopyId = canopyId;
+    this.id = canopyId;
     this.center = point.clone();
     this.pointTotal = point.clone();
     this.numPoints = 1;
@@ -224,13 +214,13 @@
 
   @Override
   public void write(DataOutput out) throws IOException {
-    out.writeInt(canopyId);
+    super.write(out);
     AbstractVector.writeVector(out, computeCentroid());
   }
 
   @Override
   public void readFields(DataInput in) throws IOException {
-    canopyId = in.readInt();
+    super.readFields(in);
     this.center = AbstractVector.readVector(in);
     this.pointTotal = center.clone();
     this.numPoints = 1;
@@ -242,10 +232,15 @@
    * @param canopy
    */
   public static String formatCanopy(Canopy canopy) {
-    return "C" + canopy.canopyId + ": "
+    return "C" + canopy.id + ": "
         + canopy.computeCentroid().asFormatString();
   }
 
+  @Override
+  public String asFormatString() {
+    return formatCanopy(this);
+  }
+
   /**
    * Decodes and returns a Canopy from the formattedString
    * 
@@ -272,8 +267,8 @@
    */
   public void addPoint(Vector point) {
     numPoints++;
-    for (int i = 0; i < point.size(); i++)
-      pointTotal.set(i, point.get(i) + pointTotal.get(i));
+    pointTotal = pointTotal.plus(point);
+
   }
 
   /**
@@ -293,30 +288,10 @@
   }
 
   public String getIdentifier() {
-    return "C" + canopyId;
+    return "C" + id;
   }
 
-  public int getCanopyId() {
-    return canopyId;
-  }
 
-  /**
-   * Return the center point
-   * 
-   * @return the center of the Canopy
-   */
-  public Vector getCenter() {
-    return center;
-  }
-
-  /**
-   * Return the number of points in the Canopy
-   * 
-   * @return the number of points in the canopy.
-   */
-  public int getNumPoints() {
-    return numPoints;
-  }
 
   /**
    * Compute the centroid by averaging the pointTotals
@@ -324,10 +299,7 @@
    * @return a SparseVector (required by Mapper) which is the new centroid
    */
   public Vector computeCentroid() {
-    Vector result = new SparseVector(pointTotal.size());
-    for (int i = 0; i < pointTotal.size(); i++)
-      result.set(i, pointTotal.get(i) / numPoints);
-    return result;
+    return  pointTotal.divide(numPoints);
   }
 
   /**

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Wed Jun 24 21:26:25 2009
@@ -27,11 +27,13 @@
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.mahout.matrix.Vector;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
 
 import java.io.IOException;
 
 public class CanopyDriver {
-
+  private transient static Log log = LogFactory.getLog(CanopyDriver.class);
   private CanopyDriver() {
   }
 
@@ -61,6 +63,8 @@
    */
   public static void runJob(String input, String output,
                             String measureClassName, double t1, double t2, Class<? extends Vector> vectorClass) throws IOException {
+    log.info("Input: " + input + " Out: " + output + " Measure: " + measureClassName + " t1: " + t1
+    + " t2: " + t2 + " Vector Class: " + vectorClass.getSimpleName());
     JobClient client = new JobClient();
     JobConf conf = new JobConf(
             org.apache.mahout.clustering.canopy.CanopyDriver.class);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Wed Jun 24 21:26:25 2009
@@ -30,8 +30,9 @@
 import org.apache.mahout.matrix.SquareRootFunction;
 import org.apache.mahout.matrix.Vector;
 import org.apache.mahout.utils.DistanceMeasure;
+import org.apache.mahout.clustering.ClusterBase;
 
-public class Cluster implements Writable {
+public class Cluster extends ClusterBase implements Writable {
 
   private static final String ERROR_UNKNOWN_CLUSTER_FORMAT = "Unknown cluster format:\n";
 
@@ -52,11 +53,6 @@
 
   private static int nextClusterId = 0;
 
-  // this cluster's clusterId
-  private int clusterId;
-
-  // the current center
-  private Vector center = new SparseVector(0);
 
   // the current centroid is lazy evaluated and may be null
   private Vector centroid = null;
@@ -64,11 +60,7 @@
   // the standard deviation of the covered points
   private double std;
 
-  // the number of points in the cluster
-  private int numPoints = 0;
 
-  // the total of all points added to the cluster
-  private Vector pointTotal = null;
 
   // the total of all the points squared, used for std computation
   private Vector pointSquaredTotal = null;
@@ -89,6 +81,11 @@
         + cluster.computeCentroid().asFormatString();
   }
 
+  @Override
+  public String asFormatString() {
+    return formatCluster(this);
+  }
+
   /**
    * Decodes and returns a Cluster from the formattedString
    * 
@@ -117,14 +114,14 @@
 
   @Override
   public void write(DataOutput out) throws IOException {
-    out.writeInt(clusterId);
+    super.write(out);
     out.writeBoolean(converged);
     AbstractVector.writeVector(out, computeCentroid());
   }
 
   @Override
   public void readFields(DataInput in) throws IOException {
-    this.clusterId = in.readInt();
+    super.readFields(in);
     this.converged = in.readBoolean();
     this.center = AbstractVector.readVector(in);
     this.numPoints = 0;
@@ -204,7 +201,7 @@
     }
     //TODO: this is ugly
     String name = point.getName();
-    output.collect(new Text(name != null && name.equals("") == false ? name : point.asFormatString()), new Text(String.valueOf(nearestCluster.clusterId)));
+    output.collect(new Text(name != null && name.equals("") == false ? name : point.asFormatString()), new Text(String.valueOf(nearestCluster.id)));
   }
 
   /**
@@ -233,7 +230,7 @@
    */
   public Cluster(Vector center) {
     super();
-    this.clusterId = nextClusterId++;
+    this.id = nextClusterId++;
     this.center = center;
     this.numPoints = 0;
     this.pointTotal = center.like();
@@ -253,7 +250,7 @@
    */
   public Cluster(Vector center, int clusterId) {
     super();
-    this.clusterId = clusterId;
+    this.id = clusterId;
     this.center = center;
     this.numPoints = 0;
     this.pointTotal = center.like();
@@ -265,7 +262,7 @@
    */
   public Cluster(String clusterId) {
 
-    this.clusterId = Integer.parseInt((clusterId.substring(1)));
+    this.id = Integer.parseInt((clusterId.substring(1)));
     this.numPoints = 0;
     this.converged = clusterId.startsWith("V");
   }
@@ -277,9 +274,9 @@
 
   public String getIdentifier() {
     if (converged)
-      return "V" + clusterId;
+      return "V" + id;
     else
-      return "C" + clusterId;
+      return "C" + id;
   }
 
   /**
@@ -309,13 +306,8 @@
     }
   }
 
-  public Vector getCenter() {
-    return center;
-  }
+  
 
-  public int getNumPoints() {
-    return numPoints;
-  }
 
   /**
    * Compute the centroid and set the center to it.
@@ -337,9 +329,7 @@
     return converged;
   }
 
-  public Vector getPointTotal() {
-    return pointTotal;
-  }
+
 
   public boolean isConverged() {
     return converged;

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java Wed Jun 24 21:26:25 2009
@@ -86,7 +86,7 @@
             Canopy value = new Canopy();
             while (reader.next(key, value)) {
               // get the cluster info
-              Cluster cluster = new Cluster(value.getCenter(), value.getCanopyId());
+              Cluster cluster = new Cluster(value.getCenter(), value.getId());
               clusters.add(cluster);
               value = new Canopy();
             }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/AbstractVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/AbstractVector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/AbstractVector.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/AbstractVector.java Wed Jun 24 21:26:25 2009
@@ -17,18 +17,18 @@
 
 package org.apache.mahout.matrix;
 
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.google.gson.reflect.TypeToken;
+import org.apache.hadoop.io.WritableComparable;
+
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.reflect.Type;
 import java.util.HashMap;
 import java.util.Map;
-
-import org.apache.hadoop.io.WritableComparable;
-
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-import com.google.gson.reflect.TypeToken;
+import java.util.Iterator;
 
 /**
  * Implementations of generic capabilities like sum of elements and dot products
@@ -52,59 +52,31 @@
 
   /**
    * Subclasses must override to return an appropriately sparse or dense result
-   * 
-   * @param rows the row cardinality
+   *
+   * @param rows    the row cardinality
    * @param columns the column cardinality
    * @return a Matrix
    */
   protected abstract Matrix matrixLike(int rows, int columns);
 
-  /**
-   * Returns an iterator for traversing the Vector, but not in any particular
-   * order. The actual implementations may make some guarantees about the order
-   * in which the vector is traversed. Otherwise, the traversal order is
-   * undefined.
-   * 
-   * @see java.lang.Iterable#iterator()
-   */
-  @Override
-  public abstract java.util.Iterator<Vector.Element> iterator();
 
   @Override
-  public Vector.Element getElement(int index) {
-    return new Element(index);
-  }
-
-  public class Element implements Vector.Element {
-    private final int ind;
-
-    public Element(int ind) {
-      this.ind = ind;
-    }
-
-    @Override
-    public double get() {
-      return getQuick(ind);
-    }
+  public abstract Vector.Element getElement(int index); 
 
-    @Override
-    public int index() {
-      return ind;
-    }
 
-    @Override
-    public void set(double value) {
-      setQuick(ind, value);
-    }
-  }
 
   public abstract Vector clone();
 
   @Override
   public Vector divide(double x) {
     Vector result = clone();
-    for (int i = 0; i < result.size(); i++)
-      result.setQuick(i, getQuick(i) / x);
+    Iterator<Element> iter = result.iterateNonZero();
+    while (iter.hasNext()) {
+      Element element = iter.next();
+      int index = element.index();
+      result.setQuick(index, element.get() / x);
+    }
+
     return result;
   }
 
@@ -113,8 +85,12 @@
     if (size() != x.size())
       throw new CardinalityException();
     double result = 0;
-    for (int i = 0; i < size(); i++)
-      result += getQuick(i) * x.getQuick(i);
+    Iterator<Element> iter = iterateNonZero();
+    while (iter.hasNext()) {
+      Element element = iter.next();
+      result += element.get() * x.getQuick(element.index());
+    }
+
     return result;
   }
 
@@ -131,8 +107,11 @@
     if (size() != x.size())
       throw new CardinalityException();
     Vector result = clone();
-    for (int i = 0; i < result.size(); i++)
-      result.setQuick(i, getQuick(i) - x.getQuick(i));
+    Iterator<Vector.Element> iter = x.iterateNonZero();
+    while (iter.hasNext()){
+      Vector.Element e = iter.next();
+      result.setQuick(e.index(), getQuick(e.index()) - e.get());
+    }
     return result;
   }
 
@@ -163,8 +142,10 @@
       return divide(val);
     } else {
       double val = 0.0;
-      for (int i = 0; i < size(); i++) {
-        val += Math.pow(getQuick(i), power);
+      Iterator<Element> iter = this.iterateNonZero();
+      while (iter.hasNext()) {
+      Element element = iter.next();
+        val += Math.pow(element.get(), power);
       }
       double divFactor = Math.pow(val, 1.0 / power);
       return divide(divFactor);
@@ -206,9 +187,16 @@
   public Vector plus(Vector x) {
     if (size() != x.size())
       throw new CardinalityException();
+    //TODO: get smarter about this, if we are adding a dense to a sparse, then we should return a dense
     Vector result = clone();
-    for (int i = 0; i < result.size(); i++)
-      result.setQuick(i, getQuick(i) + x.getQuick(i));
+    Iterator<Vector.Element> iter = x.iterateNonZero();
+    while (iter.hasNext()){
+      Vector.Element e = iter.next();
+      result.setQuick(e.index(), getQuick(e.index()) + e.get());
+    }
+
+    /*for (int i = 0; i < result.size(); i++)
+      result.setQuick(i, getQuick(i) + x.getQuick(i));*/
     return result;
   }
 
@@ -223,8 +211,13 @@
   @Override
   public Vector times(double x) {
     Vector result = clone();
-    for (int i = 0; i < result.size(); i++)
-      result.setQuick(i, getQuick(i) * x);
+    Iterator<Element> iter = iterateNonZero();
+    while (iter.hasNext()) {
+      Element element = iter.next();
+      int index = element.index();
+      result.setQuick(index, element.get() * x);
+    }
+
     return result;
   }
 
@@ -233,16 +226,25 @@
     if (size() != x.size())
       throw new CardinalityException();
     Vector result = clone();
-    for (int i = 0; i < result.size(); i++)
-      result.setQuick(i, getQuick(i) * x.getQuick(i));
+    Iterator<Element> iter = result.iterateNonZero();
+    while (iter.hasNext()) {
+      Element element = iter.next();
+      int index = element.index();
+      result.setQuick(index, element.get() * x.getQuick(index));  
+    }
+
     return result;
   }
 
   @Override
   public double zSum() {
     double result = 0;
-    for (int i = 0; i < size(); i++)
-      result += getQuick(i);
+    Iterator<Element> iter = iterateNonZero();
+    while (iter.hasNext()) {
+      Element element = iter.next();
+      result += element.get();
+    }
+
     return result;
   }
 
@@ -305,10 +307,10 @@
 
   /**
    * Decodes a point from its WritableComparable<?> representation.
-   * 
+   *
    * @param writableComparable a WritableComparable<?> produced by
-   *        asWritableComparable. Note the payload remainder: it is optional,
-   *        but can be present.
+   *                           asWritableComparable. Note the payload remainder: it is optional,
+   *                           but can be present.
    * @return the n-dimensional point
    */
   public static Vector decodeVector(WritableComparable<?> writableComparable) {
@@ -317,9 +319,9 @@
 
   /**
    * Decodes a point from its string representation.
-   * 
+   *
    * @param formattedString a formatted String produced by asFormatString. Note
-   *        the payload remainder: it is optional, but can be present.
+   *                        the payload remainder: it is optional, but can be present.
    * @return the n-dimensional point
    */
   public static Vector decodeVector(String formattedString) {
@@ -360,13 +362,11 @@
    * they have the same cardinality and all of their values are the same.
    * <p/>
    * Does not compare {@link Vector#getName()}.
-   * 
-   * 
-   * @param left The left hand Vector to compare
+   *
+   * @param left  The left hand Vector to compare
    * @param right The right hand Vector
    * @return true if the two Vectors have the same cardinality and the same
    *         values
-   * 
    * @see #strictEquivalence(Vector, Vector)
    * @see Vector#equals(Object)
    */
@@ -392,9 +392,8 @@
    * Compare whether two Vector implementations are the same, including the
    * underlying implementation. Two Vectors are the same if they have the same
    * cardinality, same name and all of their values are the same.
-   * 
-   * 
-   * @param left The left hand Vector to compare
+   *
+   * @param left  The left hand Vector to compare
    * @param right The right hand Vector
    * @return true if the two Vectors have the same cardinality and the same
    *         values
@@ -409,7 +408,7 @@
     if (leftName != null && rightName != null && !leftName.equals(rightName)) {
       return false;
     } else if ((leftName != null && rightName == null)
-        || (rightName != null && leftName == null)) {
+            || (rightName != null && leftName == null)) {
       return false;
     }
 
@@ -460,7 +459,7 @@
    */
   @Override
   public void set(String label, double value) throws IndexException,
-      UnboundLabelException {
+          UnboundLabelException {
     if (bindings == null)
       throw new UnboundLabelException();
     Integer index = bindings.get(label);
@@ -500,7 +499,7 @@
 
   /**
    * Read and return a vector from the input
-   * 
+   *
    * @param in
    * @return
    * @throws IOException
@@ -527,13 +526,13 @@
 
   /**
    * Write the vector to the output
-   * 
+   *
    * @param out
    * @param vector
    * @throws IOException
    */
   public static void writeVector(DataOutput out, Vector vector)
-      throws IOException {
+          throws IOException {
     String vectorClassName = vector.getClass().getName();
     out.writeUTF(vectorClassName);
     vector.write(out);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/DenseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/DenseVector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/DenseVector.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/DenseVector.java Wed Jun 24 21:26:25 2009
@@ -21,7 +21,7 @@
 import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
-import java.util.NoSuchElementException;
+import java.util.Iterator;
 
 /**
  * Implements vector as an array of doubles
@@ -136,28 +136,72 @@
    * @see java.lang.Iterable#iterator
    */
   @Override
-  public java.util.Iterator<Vector.Element> iterator() {
-    return new Iterator();
+  public java.util.Iterator<Vector.Element> iterateNonZero() {
+    return new NonZeroIterator();
   }
 
-  private class Iterator implements java.util.Iterator<Vector.Element> {
-    private int ind;
+  @Override
+  public java.util.Iterator<Vector.Element> iterateAll() {
+    return new AllIterator();
+  }
+
+  private class NonZeroIterator implements java.util.Iterator<Vector.Element> {
 
-    private Iterator() {
-      ind = 0;
+    private Element element = new Element(0);
+    private int offset;
+
+    private NonZeroIterator() {
     }
 
     @Override
     public boolean hasNext() {
-      return ind < values.length;
+      int last = offset;
+      while (offset < values.length && values[offset] == 0){
+        offset++;
+      }
+      boolean next = true;
+      if (offset >= values.length){
+        next = false;
+      } else {
+        element.ind = offset;
+        offset++;
+      }
+      return next;
     }
 
     @Override
     public Vector.Element next() {
-      if (!hasNext()) {
+      /*if (!hasNext()) {
         throw new NoSuchElementException();
-      }
-      return new Element(ind++);
+      }*/
+      return element;
+    }
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  private class AllIterator implements java.util.Iterator<Vector.Element> {
+
+    private Element element = new Element(-1);
+
+    private AllIterator() {
+    }
+
+    @Override
+    public boolean hasNext() {
+      return element.ind + 1 < values.length;
+    }
+
+    @Override
+    public Vector.Element next() {
+      /*if (!hasNext()) {
+        throw new NoSuchElementException();
+      }*/
+      element.ind++;
+      return element;
     }
 
     @Override
@@ -166,11 +210,41 @@
     }
   }
 
+  public class Element implements Vector.Element {
+    int ind;
+
+    public Element(int ind) {
+      this.ind = ind;
+    }
+
+    @Override
+    public double get() {
+      return values[ind];
+    }
+
+    @Override
+    public int index() {
+      return ind;
+    }
+
+    @Override
+    public void set(double value) {
+      values[ind] = value;
+    }
+  }
+
+  @Override
+  public Vector.Element getElement(int index) {
+    return new Element(index);
+  }
+
   @Override
   public void write(DataOutput dataOutput) throws IOException {
     dataOutput.writeUTF(this.name==null? "": this.name);
     dataOutput.writeInt(size());
-    for (Vector.Element element : this) {
+    Iterator<Vector.Element> iter = iterateAll();
+    while (iter.hasNext()) {
+      Vector.Element element = iter.next();
       dataOutput.writeDouble(element.get());
     }
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/SparseVector.java Wed Jun 24 21:26:25 2009
@@ -21,6 +21,7 @@
 import java.io.DataOutput;
 import java.io.IOException;
 import java.util.NoSuchElementException;
+import java.util.Iterator;
 
 /**
  * Implements vector that only stores non-zero doubles
@@ -35,7 +36,6 @@
 
   private int cardinality;
 
-  public static boolean optimizeTimes = true;
 
   public SparseVector(String name, int cardinality, int size) {
     super(name);
@@ -113,9 +113,20 @@
     return new SparseVector(newCardinality);
   }
 
+  /**
+   * NOTE: this implementation reuses the Vector.Element instance for each call of next(). If you
+   * need to preserve the instance, you need to make a copy of it
+   * @return an {@link org.apache.mahout.matrix.SparseVector.NonZeroIterator} over the Elements.
+   *
+   * @see #getElement(int)
+   */
+  public java.util.Iterator<Vector.Element> iterateNonZero() {
+    return new NonZeroIterator();
+  }
+
   @Override
-  public java.util.Iterator<Vector.Element> iterator() {
-    return new Iterator();
+  public Iterator<Vector.Element> iterateAll() {
+    return new AllIterator();
   }
 
   /**
@@ -158,18 +169,41 @@
     return result;
   }
 
-  private class Iterator implements java.util.Iterator<Vector.Element> {
+  private class AllIterator implements java.util.Iterator<Vector.Element>{
     private int offset = 0;
+    private Element element = new Element(0);
 
     @Override
     public boolean hasNext() {
+      return offset < cardinality;
+    }
+
+    @Override
+    public Vector.Element next() {
+      element.ind = offset++;
+      return element;
+    }
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+
+  private class NonZeroIterator implements java.util.Iterator<Vector.Element> {
+    private int offset = 0;
+    private Element element = new Element(0);
+    @Override
+    public boolean hasNext() {
       return offset < values.getNumMappings();
     }
 
     @Override
     public Element next() {
       if (offset < values.getNumMappings()) {
-        return new Element(values.getIndices()[offset++]);
+        element.ind = values.getIndices()[offset++];
+        return element;
       }
       throw new NoSuchElementException();
     }
@@ -181,36 +215,50 @@
   }
 
   @Override
-  public double zSum() {
-    double result = 0.0;
-    for (double value : values.getValues()) {
-      result += value;
-    }
-    return result;
+  public Vector.Element getElement(int index) {
+    return new Element(index);
   }
 
-  @Override
-  public double dot(Vector x) {
-    if (size() != x.size())
-      throw new CardinalityException();
-    double result = 0.0;
-    for (int index : values.getIndices()) {
-      result += values.get(index) * x.getQuick(index);
+  public class Element implements Vector.Element {
+    int ind;
+
+    public Element(int ind) {
+      this.ind = ind;
+    }
+
+    @Override
+    public double get() {
+      return values.get(ind);
+    }
+
+    @Override
+    public int index() {
+      return ind;
+    }
+
+    @Override
+    public void set(double value) {
+      values.set(ind, value);
     }
-    return result;
   }
 
+
+ 
   @Override
   public void write(DataOutput dataOutput) throws IOException {
     dataOutput.writeUTF(this.name == null ? "" : this.name);
     dataOutput.writeInt(size());
-    dataOutput.writeInt(getNumNondefaultElements());
-    for (Vector.Element element : this) {
-      if (element.get() != 0.0d) {
-        dataOutput.writeInt(element.index());
-        dataOutput.writeDouble(element.get());
-      }
+    int nde = getNumNondefaultElements();
+    dataOutput.writeInt(nde);
+    Iterator<Vector.Element> iter = iterateNonZero();
+    int count = 0;
+    while (iter.hasNext()) {
+      Vector.Element element = iter.next();
+      dataOutput.writeInt(element.index());
+      dataOutput.writeDouble(element.get());
+      count++;
     }
+    assert(nde == count);
   }
 
   @Override
@@ -219,49 +267,14 @@
     int cardinality = dataInput.readInt();
     int size = dataInput.readInt();
     OrderedIntDoubleMapping values = new OrderedIntDoubleMapping(size);
-    for (int i = 0; i < size; i++) {
+    int i = 0;
+    for (; i < size; i++) {
       values.set(dataInput.readInt(), dataInput.readDouble());
     }
+    assert(i == size);
     this.cardinality = cardinality;
     this.values = values;
   }
 
-  @Override
-  public Vector times(double x) {
-    Vector result;
-    if (optimizeTimes) {
-      result = like();
-      for (Vector.Element element : this) {
-        double value = element.get();
-        int index = element.index();
-        result.setQuick(index, value * x);
-      }
-    } else {
-      result = clone();
-      for (int i = 0; i < result.size(); i++)
-        result.setQuick(i, getQuick(i) * x);
-    }
-    return result;
-  }
-
-  @Override
-  public Vector times(Vector x) {
-    if (size() != x.size())
-      throw new CardinalityException();
-    Vector result;
-    if (optimizeTimes) {
-      result = like();
-      for (Vector.Element element : this) {
-        double value = element.get();
-        int index = element.index();
-        result.setQuick(index, value * x.getQuick(index));
-      }
-    } else {
-      result = clone();
-      for (int i = 0; i < result.size(); i++)
-        result.setQuick(i, getQuick(i) * x.getQuick(i));
-    }
-    return result;
-  }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/Vector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/Vector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/Vector.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/Vector.java Wed Jun 24 21:26:25 2009
@@ -18,6 +18,7 @@
 package org.apache.mahout.matrix;
 
 import java.util.Map;
+import java.util.Iterator;
 
 import org.apache.hadoop.io.Writable;
 
@@ -26,8 +27,10 @@
  * <p/>
  * NOTE: All implementing classes must have a constructor that takes an int for cardinality
  * and a no-arg constructor that can be used for marshalling the Writable instance
+ * <p/>
+ * NOTE: Implementations may choose to reuse the Vector.Element in the Iterable methods
  */
-public interface Vector extends Iterable<Vector.Element>, Cloneable, Writable {
+public interface Vector extends Cloneable, Writable {
 
   /**
    * Vectors may have a name associated with them, which makes them easy to identify
@@ -117,6 +120,25 @@
   Vector clone();
 
   /**
+   * Iterates over all elements
+   *<p/>
+   * * NOTE: Implementations may choose to reuse the Element returned for performance reasons,
+   * so if you need a copy of it, you should call {@link #getElement} for the given index
+   *
+   * @return An {@link java.util.Iterator} over all elements
+   */
+  Iterator<Element> iterateAll();
+
+  /**
+   * Iterates over all non-zero elements.
+   *<p/> 
+   * NOTE: Implementations may choose to reuse the Element returned for performance reasons,
+   * so if you need a copy of it, you should call {@link #getElement} for the given index
+   * @return An {@link java.util.Iterator} over all non-zero elements
+   */
+  Iterator<Element> iterateNonZero();
+
+  /**
    * Return the value at the index defined by the label
    * 
    * @param label a String label that maps to an index
@@ -142,6 +164,12 @@
    */
   Element getElement(int index);
 
+  /**
+   * A holder for information about a specific item in the Vector.
+   * <p/>
+   * When using with an Iterator, the implementation may choose to reuse this element, so you may need to make a copy if
+   * you want to keep it
+   */
   interface Element {
     /**
      * @return the value of this vector element.

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/matrix/VectorView.java Wed Jun 24 21:26:25 2009
@@ -20,7 +20,6 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.Iterator;
 import java.util.NoSuchElementException;
 
 /**
@@ -111,17 +110,84 @@
   }
 
   @Override
-  public Iterator<Vector.Element> iterator() {
-    return new ViewIterator();
+  public java.util.Iterator<Vector.Element> iterateNonZero() {
+    return new NonZeroIterator();
   }
 
-  public class ViewIterator implements Iterator<Vector.Element> {
-    private final Iterator<Vector.Element> it;
+  @Override
+  public java.util.Iterator<Vector.Element> iterateAll() {
+    return new AllIterator();
+  }
+
+  public class NonZeroIterator implements java.util.Iterator<Vector.Element> {
+    private final java.util.Iterator<Vector.Element> it;
 
     private Vector.Element el;
 
-    public ViewIterator() {
-      it = vector.iterator();
+    public NonZeroIterator() {
+      it = vector.iterateAll();
+      buffer();
+    }
+
+    private void buffer() {
+      while (it.hasNext()) {
+        el = it.next();
+        if (isInView(el.index()) && el.get() != 0) {
+          final Vector.Element decorated = vector.getElement(el.index());
+          el = new Vector.Element() {
+            @Override
+            public double get() {
+              return decorated.get();
+            }
+
+            @Override
+            public int index() {
+              return decorated.index() - offset;
+            }
+
+            @Override
+            public void set(double value) {
+              el.set(value);
+            }
+          };
+          return;
+        }
+      }
+      el = null; // No element was found
+    }
+
+    @Override
+    public Vector.Element next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+      Vector.Element buffer = el;
+      buffer();
+      return buffer;
+    }
+
+    @Override
+    public boolean hasNext() {
+      return el != null;
+    }
+
+    /**
+     * @throws UnsupportedOperationException all the time. method not
+     *         implemented.
+     */
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  public class AllIterator implements java.util.Iterator<Vector.Element> {
+    private final java.util.Iterator<Vector.Element> it;
+
+    private Vector.Element el;
+
+    public AllIterator() {
+      it = vector.iterateAll();
       buffer();
     }
 
@@ -129,7 +195,7 @@
       while (it.hasNext()) {
         el = it.next();
         if (isInView(el.index())) {
-          final Vector.Element decorated = el;
+          final Vector.Element decorated = vector.getElement(el.index());
           el = new Vector.Element() {
             @Override
             public double get() {
@@ -177,6 +243,45 @@
     }
   }
 
+
+  @Override
+  public double dot(Vector x) {
+    if (size() != x.size())
+      throw new CardinalityException();
+    double result = 0;
+    for (int i = 0; i < size(); i++)
+      result += getQuick(i) * x.getQuick(i);
+    return result;
+  }
+
+  @Override
+  public Vector.Element getElement(int index) {
+    return new Element(index);
+  }
+
+  public class Element implements Vector.Element {
+    private final int ind;
+
+    public Element(int ind) {
+      this.ind = ind;
+    }
+
+    @Override
+    public double get() {
+      return getQuick(ind);
+    }
+
+    @Override
+    public int index() {
+      return ind;
+    }
+
+    @Override
+    public void set(double value) {
+      setQuick(ind, value);
+    }
+  }
+
   @Override
   public void write(DataOutput dataOutput) throws IOException {
     dataOutput.writeUTF(this.name == null ? "" : this.name);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/CosineDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/CosineDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/CosineDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/CosineDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -24,6 +24,7 @@
 
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 
 /**
  * This class implements a cosine distance metric by dividing the dot product
@@ -50,17 +51,17 @@
     double dotProduct = 0.0;
     double lengthSquaredp1 = 0.0;
     double lengthSquaredp2 = 0.0;
-    for (int i = 0; i < p1.length; i++) { 
+    for (int i = 0; i < p1.length; i++) {
       lengthSquaredp1 += p1[i] * p1[i];
       lengthSquaredp2 += p2[i] * p2[i];
       dotProduct += p1[i] * p2[i];
     }
     double denominator = Math.sqrt(lengthSquaredp1) * Math.sqrt(lengthSquaredp2);
-    
+
     // correct for floating-point rounding errors
-    if(denominator < dotProduct)
+    if (denominator < dotProduct)
       denominator = dotProduct;
-    
+
     return 1.0 - (dotProduct / denominator);
   }
 
@@ -68,20 +69,27 @@
   public double distance(Vector v1, Vector v2) {
     if (v1.size() != v2.size())
       throw new CardinalityException();
-	  double lengthSquaredv1 = 0.0;
-	  double lengthSquaredv2 = 0.0;
-	  for (int i = 0; i < v1.size(); i++) {
-	    lengthSquaredv1 += v1.getQuick(i) * v1.getQuick(i);
-	    lengthSquaredv2 += v2.getQuick(i) * v2.getQuick(i);
-	  }
-	  double dotProduct = v1.dot(v2);
-	  double denominator = Math.sqrt(lengthSquaredv1) * Math.sqrt(lengthSquaredv2);
-	  
-	  // correct for floating-point rounding errors
-    if(denominator < dotProduct)
+    double lengthSquaredv1 = 0.0;
+    double lengthSquaredv2 = 0.0;
+    Iterator<Vector.Element> iter = v1.iterateNonZero();
+    while (iter.hasNext()) {
+      Vector.Element elt = iter.next();
+      lengthSquaredv1 += elt.get() * elt.get();
+    }
+    iter = v2.iterateNonZero();
+    while (iter.hasNext()) {
+      Vector.Element elt = iter.next();
+      lengthSquaredv2 += elt.get() * elt.get();
+    }
+
+    double dotProduct = v1.dot(v2);
+    double denominator = Math.sqrt(lengthSquaredv1) * Math.sqrt(lengthSquaredv2);
+
+    // correct for floating-point rounding errors
+    if (denominator < dotProduct)
       denominator = dotProduct;
 
-	  return 1.0 - (dotProduct / denominator);
+    return 1.0 - (dotProduct / denominator);
   }
-  
+
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/EuclideanDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/EuclideanDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/EuclideanDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/EuclideanDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -24,6 +24,7 @@
 
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 
 /**
  * This class implements a Euclidian distance metric by summing the square root
@@ -52,6 +53,7 @@
       double delta = p2[i] - p1[i];
       result += delta * delta;
     }
+    //TODO: Do we really need to return the square root?
     return Math.sqrt(result);
   }
 
@@ -60,10 +62,14 @@
     if (v1.size() != v2.size())
       throw new CardinalityException();
     double result = 0;
-    for (int i = 0; i < v1.size(); i++) {
-      double delta = v2.getQuick(i) - v1.getQuick(i);
+    Vector vector = v1.plus(v2);
+    Iterator<Vector.Element> iter = vector.iterateNonZero();//this contains all non zero elements between the two
+    while (iter.hasNext()) {
+      Vector.Element e = iter.next();
+      double delta = v2.getQuick(e.index()) - v1.getQuick(e.index());
       result += delta * delta;
     }
+    //TODO: Do we really need to return the square root?
     return Math.sqrt(result);
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/ManhattanDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -24,6 +24,7 @@
 
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 
 /**
  * This class implements a "manhattan distance" metric by summing the absolute
@@ -58,8 +59,12 @@
     if (v1.size() != v2.size())
       throw new CardinalityException();
     double result = 0;
-    for (int i = 0; i < v1.size(); i++)
-      result += Math.abs(v2.getQuick(i) - v1.getQuick(i));
+   Vector vector = v1.plus(v2);
+   Iterator<Vector.Element> iter = vector.iterateNonZero();//this contains all non zero elements between the two
+   while (iter.hasNext()){
+      Vector.Element e = iter.next();
+      result += Math.abs(v2.getQuick(e.index()) - v1.getQuick(e.index()));
+    }
     return result;
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/TanimotoDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/TanimotoDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/TanimotoDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/TanimotoDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -21,6 +21,7 @@
 
 import java.util.HashSet;
 import java.util.Set;
+import java.util.Iterator;
 
 
 /**
@@ -52,7 +53,9 @@
     double a2 = 0.0;
     double b2 = 0.0;
 
-    for (Vector.Element feature : vector0) {
+    Iterator<Vector.Element> iter = vector0.iterateNonZero();
+    while (iter.hasNext()) {
+      Vector.Element feature = iter.next();
       if (!featuresSeen.add(feature.index())) {
 
         double a = feature.get();
@@ -69,7 +72,9 @@
     }
 
 
-    for (Vector.Element feature : vector1) {
+    iter = vector1.iterateNonZero();
+    while (iter.hasNext()) {
+      Vector.Element feature = iter.next();
       if (!featuresSeen.add(feature.index())) {
 
         double a = vector0.get(feature.index());

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedEuclideanDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedEuclideanDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedEuclideanDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedEuclideanDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -19,6 +19,8 @@
 
 import org.apache.mahout.matrix.Vector;
 
+import java.util.Iterator;
+
 /**
  * This class implements a Euclidian distance metric by summing the square root
  * of the squared differences between each coordinate,  optionally adding weights.
@@ -30,12 +32,16 @@
     double result = 0;
     Vector res = p2.minus(p1);
     if (weights == null) {
-      for (int i = 0; i < p1.size(); i++) {
-        result += res.get(i) * res.get(i);
+      Iterator<Vector.Element> iter = res.iterateNonZero();
+      while (iter.hasNext()) {
+        Vector.Element elt = iter.next();
+        result += elt.get() * elt.get();
       }
     } else {
-      for (int i = 0; i < p1.size(); i++) {
-        result += res.get(i) * res.get(i) * weights.get(i);  // todo this is where the weights goes, right?
+      Iterator<Vector.Element> iter = res.iterateNonZero();
+      while (iter.hasNext()) {
+        Vector.Element elt = iter.next();
+        result += elt.get() * elt.get() * weights.get(elt.index());
       }
     }
     return Math.sqrt(result);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedManhattanDistanceMeasure.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedManhattanDistanceMeasure.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedManhattanDistanceMeasure.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/utils/WeightedManhattanDistanceMeasure.java Wed Jun 24 21:26:25 2009
@@ -19,6 +19,8 @@
 
 import org.apache.mahout.matrix.Vector;
 
+import java.util.Iterator;
+
 /**
  * This class implements a "manhattan distance" metric by summing the absolute
  * values of the difference between each coordinate, optionally with weights.
@@ -31,13 +33,18 @@
 
     Vector res = p2.minus(p1);
     if (weights == null) {
-      for (int i = 0; i < res.size(); i++) {
-        result += Math.abs(res.get(i));
+      Iterator<Vector.Element> iter = res.iterateNonZero();
+      while (iter.hasNext()) {
+        Vector.Element elt = iter.next();
+        result += Math.abs(elt.get());
       }
+
     }
     else {
-      for (int i = 0; i < res.size(); i++) {
-        result += Math.abs(res.get(i) * weights.get(i)); // todo this is where the weights goes, right?
+      Iterator<Vector.Element> iter = res.iterateNonZero();
+      while (iter.hasNext()) {
+        Vector.Element elt = iter.next();
+        result += Math.abs(elt.get() * weights.get(elt.index()));
       }
     }
 

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestDenseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestDenseVector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestDenseVector.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestDenseVector.java Wed Jun 24 21:26:25 2009
@@ -19,6 +19,7 @@
 
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Iterator;
 
 import junit.framework.TestCase;
 
@@ -82,6 +83,42 @@
         assertEquals("set [" + i + ']', values[i], test.get(i));
   }
 
+
+  public void testIterator() throws Exception {
+    Iterator<Vector.Element> iterator = test.iterateNonZero();
+    checkIterator(iterator, values, 3);
+
+    iterator = test.iterateAll();
+    checkIterator(iterator, values, 3);
+
+    DenseVector zeros;
+    double[] doubles;
+    doubles = new double[]{0.0, 5.0, 0, 3.0};
+    zeros = new DenseVector(doubles);
+    iterator = zeros.iterateNonZero();
+    checkIterator(iterator, doubles, 2);
+    iterator = zeros.iterateAll();
+    checkIterator(iterator, doubles, doubles.length);
+
+    doubles = new double[]{0.0, 0.0, 0, 0.0};
+    zeros = new DenseVector(doubles);
+    iterator = zeros.iterateNonZero();
+    checkIterator(iterator, doubles, 0);
+    iterator = zeros.iterateAll();
+    checkIterator(iterator, doubles, doubles.length);
+
+  }
+
+  private void checkIterator(Iterator<Vector.Element> nzIter, double[] values, int expectedNum) {
+    int i = 0;
+    while (nzIter.hasNext()) {
+      Vector.Element elt = nzIter.next();
+      assertTrue((elt.index()) + " Value: " + values[elt.index() ] + " does not equal: " + elt.get(), values[elt.index()] == elt.get());
+      i++;
+    }
+    assertTrue(i + " does not equal: " + expectedNum, i == expectedNum);
+  }
+
   public void testSize() throws Exception {
     assertEquals("size", 3, test.getNumNondefaultElements());
   }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestOrderedIntDoubleMapping.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestOrderedIntDoubleMapping.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestOrderedIntDoubleMapping.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestOrderedIntDoubleMapping.java Wed Jun 24 21:26:25 2009
@@ -54,4 +54,16 @@
     assertEquals(0.0, mapping.get(5));
   }
 
+  public void testClone() throws Exception {
+    OrderedIntDoubleMapping mapping = new OrderedIntDoubleMapping(1);
+    mapping.set(0, 1.1);
+    mapping.set(5, 6.6);
+    OrderedIntDoubleMapping clone = (OrderedIntDoubleMapping) mapping.clone();
+    assertEquals(2, clone.getNumMappings());
+    assertEquals(1.1, clone.get(0));
+    assertEquals(0.0, clone.get(1));
+    assertEquals(6.6, clone.get(5));
+    assertEquals(0.0, clone.get(6));
+  }
+
 }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestSparseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestSparseVector.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestSparseVector.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestSparseVector.java Wed Jun 24 21:26:25 2009
@@ -19,13 +19,14 @@
 
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Iterator;
 
 import junit.framework.TestCase;
 
 public class TestSparseVector extends TestCase {
 
   final double[] values = { 1.1, 2.2, 3.3 };
-
+  final double[] gold = {0, 1.1, 2.2, 3.3, 0};
   final Vector test = new SparseVector(values.length + 2);
 
   public TestSparseVector(String name) {
@@ -51,6 +52,45 @@
     assertEquals("size", 5, test.size());
   }
 
+  public void testIterator() throws Exception {
+    Iterator<Vector.Element> iterator = test.iterateNonZero();
+    checkIterator(iterator, gold);
+
+    iterator = test.iterateAll();
+    checkIterator(iterator, gold);
+
+    SparseVector zeros;
+    double[] doubles;
+    doubles = new double[]{0.0, 5.0, 0, 3.0};
+    zeros = new SparseVector(doubles.length);
+    for (int i = 0; i < doubles.length; i++) {
+      zeros.setQuick(i, doubles[i]);
+    }
+    iterator = zeros.iterateNonZero();
+    checkIterator(iterator, doubles);
+    iterator = zeros.iterateAll();
+    checkIterator(iterator, doubles);
+
+    doubles = new double[]{0.0, 0.0, 0, 0.0};
+    zeros = new SparseVector(doubles.length);
+    for (int i = 0; i < doubles.length; i++) {
+      zeros.setQuick(i, doubles[i]);
+    }
+    iterator = zeros.iterateNonZero();
+    checkIterator(iterator, doubles);
+    iterator = zeros.iterateAll();
+    checkIterator(iterator, doubles);
+
+  }
+
+  private void checkIterator(Iterator<Vector.Element> nzIter, double[] values) {
+    while (nzIter.hasNext()) {
+      Vector.Element elt = nzIter.next();
+      assertTrue((elt.index()) + " Value: " + values[elt.index()]
+              + " does not equal: " + elt.get(), values[elt.index()] == elt.get());
+    }
+  }
+
   public void testCopy() throws Exception {
     Vector copy = test.clone();
     for (int i = 0; i < test.size(); i++)

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestVectorView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestVectorView.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestVectorView.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/TestVectorView.java Wed Jun 24 21:26:25 2009
@@ -19,6 +19,8 @@
 
 import junit.framework.TestCase;
 
+import java.util.Iterator;
+
 public class TestVectorView extends TestCase {
 
   private static final int cardinality = 3;
@@ -64,6 +66,38 @@
     }
   }
 
+  public void testIterator() throws Exception {
+
+    Iterator<Vector.Element> iter;
+    VectorView view;
+    double[] gold;
+    view = new VectorView(new DenseVector(values), offset, cardinality);
+    gold = new double[]{1.1, 2.2, 3.3};
+    iter = view.iterateAll();
+    checkIterator(iter, gold);
+    iter = view.iterateNonZero();
+    checkIterator(iter, gold);
+
+    view = new VectorView(new DenseVector(values), 0, cardinality);
+    gold = new double[]{0.0, 1.1, 2.2};
+    iter = view.iterateAll();
+    checkIterator(iter, gold);
+    gold = new double[]{1.1, 2.2};
+    iter = view.iterateNonZero();
+    checkIterator(iter, gold);
+    
+  }
+
+  private void checkIterator(Iterator<Vector.Element> iter, double[] gold) {
+    int i = 0;
+    while (iter.hasNext()) {
+      Vector.Element elt = iter.next();
+      assertTrue((elt.index()) + " Value: " + gold[i]
+              + " does not equal: " + elt.get(), gold[i] == elt.get());
+      i++;
+    }
+  }
+
   public void testGetUnder() {
     try {
       test.get(-1);

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/VectorTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/VectorTest.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/VectorTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/matrix/VectorTest.java Wed Jun 24 21:26:25 2009
@@ -22,6 +22,7 @@
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Random;
+import java.util.Iterator;
 
 import junit.framework.TestCase;
 
@@ -224,7 +225,9 @@
    */
   private static void doTestEnumeration(double[] apriori, Vector vector) {
     double[] test = new double[apriori.length];
-    for (Vector.Element e : vector) {
+    Iterator<Vector.Element> iter = vector.iterateNonZero();
+    while (iter.hasNext()) {
+      Vector.Element e = iter.next();
       test[e.index()] = e.get();
     }
 
@@ -250,7 +253,7 @@
     doTestEnumeration(apriori, sparse);
   }
 
-  public void testSparseVectorTimesX() {
+  /*public void testSparseVectorTimesX() {
     Random rnd = new Random(0xDEADBEEFL);
     Vector v1 = randomSparseVector(rnd);
     double x = rnd.nextDouble();
@@ -272,9 +275,9 @@
         + " ms for 10 iterations");
     for (int i = 0; i < 50000; i++)
       assertEquals("i=" + i, rRef.getQuick(i), rOpt.getQuick(i));
-  }
+  }*/
 
-  public void testSparseVectorTimesV() {
+  /*public void testSparseVectorTimesV() {
     Random rnd = new Random(0xDEADBEEFL);
     Vector v1 = randomSparseVector(rnd);
     Vector v2 = randomSparseVector(rnd);
@@ -296,7 +299,7 @@
         + " ms for 10 iterations");
     for (int i = 0; i < 50000; i++)
       assertEquals("i=" + i, rRef.getQuick(i), rOpt.getQuick(i));
-  }
+  }*/
 
   private static Vector randomSparseVector(Random rnd) {
     SparseVector v1 = new SparseVector(50000);

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/utils/DefaultDistanceMeasureTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/utils/DefaultDistanceMeasureTest.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/utils/DefaultDistanceMeasureTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/utils/DefaultDistanceMeasureTest.java Wed Jun 24 21:26:25 2009
@@ -20,6 +20,7 @@
 import junit.framework.TestCase;
 import org.apache.mahout.matrix.DenseVector;
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
 
 
 public abstract class DefaultDistanceMeasureTest extends TestCase {
@@ -30,12 +31,35 @@
 
     DistanceMeasure distanceMeasure = distanceMeasureFactory();
 
-    Vector[] vectors = {
-        new DenseVector(new double[]{1, 1, 1, 1, 1, 1}),
-        new DenseVector(new double[]{2, 2, 2, 2, 2, 2}),
-        new DenseVector(new double[]{6, 6, 6, 6, 6, 6})
+    Vector[] vectors;
+    vectors = new Vector[]{
+            new DenseVector(new double[]{1, 1, 1, 1, 1, 1}),
+            new DenseVector(new double[]{2, 2, 2, 2, 2, 2}),
+            new DenseVector(new double[]{6, 6, 6, 6, 6, 6})
     };
 
+    compare(distanceMeasure, vectors);
+
+    vectors = new Vector[3];
+    vectors[0] = new SparseVector(5);
+    vectors[0].setQuick(0, 1);
+    vectors[0].setQuick(3, 1);
+    vectors[0].setQuick(4, 1);
+
+    vectors[1] = new SparseVector(5);
+    vectors[1].setQuick(0, 2);
+    vectors[1].setQuick(3, 2);
+    vectors[1].setQuick(4, 2);
+
+    vectors[2] = new SparseVector(5);
+    vectors[2].setQuick(0, 6);
+    vectors[2].setQuick(3, 6);
+    vectors[2].setQuick(4, 6);
+
+    compare(distanceMeasure, vectors);
+  }
+
+  private void compare(DistanceMeasure distanceMeasure, Vector[] vectors) {
     double[][] distanceMatrix = new double[3][3];
 
     for (int a = 0; a < 3; a++) {
@@ -55,8 +79,6 @@
     assertEquals(0.0, distanceMatrix[2][2]);
     assertTrue(distanceMatrix[2][0] > distanceMatrix[2][1]);
     assertTrue(distanceMatrix[2][1] > distanceMatrix[2][2]);
-
-
   }
 
 }

Copied: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (from r788103, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?p2=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java&p1=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java&r1=788103&r2=788186&rev=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Wed Jun 24 21:26:25 2009
@@ -1,4 +1,4 @@
-package org.apache.mahout.utils.vectors;
+package org.apache.mahout.utils.clustering;
 
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
@@ -17,7 +17,12 @@
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.jobcontrol.Job;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+import org.apache.mahout.utils.vectors.SequenceFileVectorIterable;
+import org.apache.mahout.utils.vectors.VectorDumper;
+import org.apache.mahout.clustering.ClusterBase;
 
 import java.io.IOException;
 import java.io.BufferedWriter;
@@ -30,18 +35,18 @@
  *
  *
  **/
-public class VectorDumper {
-  private transient static Log log = LogFactory.getLog(VectorDumper.class);
+public class ClusterDumper {
+  private transient static Log log = LogFactory.getLog(ClusterDumper.class);
   private static final String LINE_SEP = System.getProperty("line.separator");
 
-  public static void main(String[] args) throws IOException {
+  public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
 
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
             abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Sequence File containing the Vectors").withShortName("s").create();
+            withDescription("The Sequence File containing the Clusters").withShortName("s").create();
     Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
             abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
             withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
@@ -69,15 +74,17 @@
         client.setConf(conf);
         FileSystem fs = FileSystem.get(path.toUri(), conf);
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-        VectorIterable iter = new SequenceFileVectorIterable(reader);
+
         Writer writer = null;
         if (cmdLine.hasOption(outputOpt)){
           writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
         } else {
           writer = new OutputStreamWriter(System.out);
         }
-        for (Vector vector : iter) {
-          writer.write(vector.asFormatString());
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+        ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
+        while (reader.next(key, value)){
+          writer.write(value.asFormatString());
           writer.write(LINE_SEP);
         }
         if (cmdLine.hasOption(outputOpt)){
@@ -97,4 +104,4 @@
     formatter.setGroup(group);
     formatter.print();
   }
-}
+}
\ No newline at end of file

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java Wed Jun 24 21:26:25 2009
@@ -18,11 +18,16 @@
  **/
 public class SequenceFileVectorIterable implements VectorIterable {
   private SequenceFile.Reader reader;
+  private boolean transpose = false;
 
   public SequenceFileVectorIterable(SequenceFile.Reader reader) {
     this.reader = reader;
   }
 
+  public SequenceFileVectorIterable(SequenceFile.Reader reader, boolean transpose) {
+    this.reader = reader;
+    this.transpose = transpose;
+  }
 
   @Override
   public Iterator<Vector> iterator() {
@@ -35,13 +40,18 @@
     }
   }
 
-  private class SeqFileIterator implements Iterator<Vector> {
+  public class SeqFileIterator implements Iterator<Vector> {
     private Writable key;
-    private Vector value;
+    private Writable value;
 
     private SeqFileIterator() throws IllegalAccessException, InstantiationException {
-      value = (Vector) reader.getValueClass().newInstance();
-      key = (Writable) reader.getKeyClass().newInstance();
+      if (transpose == false){
+        key = (Writable) reader.getKeyClass().newInstance();
+        value = (Vector) reader.getValueClass().newInstance();
+      } else {
+        value = (Vector) reader.getValueClass().newInstance();
+        key = (Writable) reader.getKeyClass().newInstance();
+      }
     }
 
     @Override
@@ -55,7 +65,15 @@
 
     @Override
     public Vector next() {
-      return value;
+      return transpose ? (Vector)key : (Vector)value;
+    }
+
+    /**
+     * Only valid when {@link #next()} is also valid
+     * @return The current Key
+     */
+    public Writable key(){
+      return transpose ? value : key;
     }
 
     @Override

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=788186&r1=788185&r2=788186&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Wed Jun 24 21:26:25 2009
@@ -43,13 +43,18 @@
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
             abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
             withDescription("The Sequence File containing the Vectors").withShortName("s").create();
+    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).
+            withDescription("If the Key is a vector, then dump that instead").withShortName("u").create();
+    Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).
+            withDescription("Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p").create();
     Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
             abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
             withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
 
-    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).create();
+    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
+            .withOption(vectorAsKeyOpt).withOption(printKeyOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -70,17 +75,28 @@
         client.setConf(conf);
         FileSystem fs = FileSystem.get(path.toUri(), conf);
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-        VectorIterable iter = new SequenceFileVectorIterable(reader);
+        SequenceFileVectorIterable vectorIterable = new SequenceFileVectorIterable(reader, cmdLine.hasOption(vectorAsKeyOpt));
         Writer writer = null;
         if (cmdLine.hasOption(outputOpt)) {
           writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
         } else {
           writer = new OutputStreamWriter(System.out);
         }
-        for (Vector vector : iter) {
+        boolean printKey = cmdLine.hasOption(printKeyOpt);
+        SequenceFileVectorIterable.SeqFileIterator iterator =
+                (SequenceFileVectorIterable.SeqFileIterator) vectorIterable.iterator();
+        int i = 0;
+        while (iterator.hasNext()) {
+          Vector vector = iterator.next();
+          if (printKey){
+            writer.write(iterator.key().toString());
+            writer.write("\t");
+          }
           writer.write(vector.asFormatString());
           writer.write(LINE_SEP);
+          i++;
         }
+        System.err.println("Dumped " + i + " Vectors");
         if (cmdLine.hasOption(outputOpt)) {
           writer.close();
         }



Mime
View raw message