mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jman...@apache.org
Subject svn commit: r898669 [3/3] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/ap...
Date Wed, 13 Jan 2010 08:01:42 GMT
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Wed Jan 13 08:01:34 2010
@@ -30,6 +30,7 @@
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.VectorWritable;
 
 class DisplayMeanShift extends DisplayDirichlet {
   private DisplayMeanShift() {
@@ -60,8 +61,8 @@
     // plot the sample data
     g2.setColor(Color.DARK_GRAY);
     dv.assign(0.03);
-    for (Vector v : sampleData)
-      plotRectangle(g2, v, dv);
+    for (VectorWritable v : sampleData)
+      plotRectangle(g2, v.get(), dv);
     int i = 0;
     for (MeanShiftCanopy canopy : canopies)
       if (canopy.getBoundPoints().size() > 0.015 * sampleData.size()) {
@@ -76,8 +77,8 @@
   private static void testReferenceImplementation() {
     // add all points to the canopies
     int nextCanopyId = 0;
-    for (Vector aRaw : sampleData) {
-      clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++), canopies);
+    for (VectorWritable aRaw : sampleData) {
+      clusterer.mergeCanopy(new MeanShiftCanopy(aRaw.get(), nextCanopyId++), canopies);
     }
     boolean done = false;
     while (!done) {// shift canopies to their centroids

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java Wed Jan 13 08:01:34 2010
@@ -36,6 +36,7 @@
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -75,20 +76,19 @@
       String input = cmdLine.getValue(inputOpt, "testdata").toString();
       String output = cmdLine.getValue(outputOpt, "output").toString();
       String vectorClassName = cmdLine.getValue(vectorOpt, "org.apache.mahout.math.SparseVector").toString();
-      Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
-      runJob(input, output, vectorClass);
+      runJob(input, output);
     } catch (OptionException e) {
       LOG.error("Exception parsing command line: ", e);
       CommandLineUtil.printHelp(group);
     }
   }
 
-  public static void runJob(String input, String output, Class<? extends Vector> vectorClass) throws IOException {
+  public static void runJob(String input, String output) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(InputDriver.class);
 
     conf.setOutputKeyClass(Text.class);
-    conf.setOutputValueClass(vectorClass);
+    conf.setOutputValueClass(VectorWritable.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
     FileInputFormat.setInputPaths(conf, new Path(input));
     FileOutputFormat.setOutputPath(conf, new Path(output));

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Wed Jan 13 08:01:34 2010
@@ -101,7 +101,7 @@
         double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
         double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
 
-        runJob(input, output, measureClass, t1, t2, vectorClass);
+        runJob(input, output, measureClass, t1, t2);
       } catch (OptionException e) {
         LOG.error("Exception", e);
         CommandLineUtil.printHelp(group);
@@ -131,8 +131,7 @@
    *          the canopy T2 threshold
    */
   private static void runJob(String input, String output,
-      String measureClassName, double t1, double t2,
-      Class<? extends Vector> vectorClass) throws IOException {
+      String measureClassName, double t1, double t2) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
 
@@ -143,9 +142,9 @@
       dfs.delete(outPath, true);
     String directoryContainingConvertedInput = output
         + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
-    InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+    InputDriver.runJob(input, directoryContainingConvertedInput);
     CanopyClusteringJob.runJob(directoryContainingConvertedInput, output,
-        measureClassName, t1, t2, vectorClass);
+        measureClassName, t1, t2);
   }
 
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Wed Jan 13 08:01:34 2010
@@ -42,6 +42,7 @@
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -139,7 +140,7 @@
     }
     fs.mkdirs(outPath);
     final String directoryContainingConvertedInput = output + DIRECTORY_CONTAINING_CONVERTED_INPUT;
-    InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+    InputDriver.runJob(input, directoryContainingConvertedInput);
     DirichletDriver.runJob(directoryContainingConvertedInput, output + "/state", modelFactory,
         numModels, maxIterations, alpha_0, numReducers);
     printResults(output + "/state", modelFactory, maxIterations, numModels,
@@ -156,7 +157,7 @@
    */
   public static void printResults(String output, String modelDistribution,
       int numIterations, int numModels, double alpha_0) {
-    List<List<DirichletCluster<Vector>>> clusters = new ArrayList<List<DirichletCluster<Vector>>>();
+    List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
     JobConf conf = new JobConf(KMeansDriver.class);
     conf.set(DirichletDriver.MODEL_FACTORY_KEY, modelDistribution);
     conf.set(DirichletDriver.NUM_CLUSTERS_KEY, Integer.toString(numModels));
@@ -175,12 +176,12 @@
    * @param significant the minimum number of samples to enable printing a model
    */
   private static void printResults(
-      List<List<DirichletCluster<Vector>>> clusters, int significant) {
+      List<List<DirichletCluster<VectorWritable>>> clusters, int significant) {
     int row = 0;
-    for (List<DirichletCluster<Vector>> r : clusters) {
+    for (List<DirichletCluster<VectorWritable>> r : clusters) {
       System.out.print("sample[" + row++ + "]= ");
       for (int k = 0; k < r.size(); k++) {
-        Model<Vector> model = r.get(k).getModel();
+        Model<VectorWritable> model = r.get(k).getModel();
         if (model.count() > significant) {
           int total = (int) r.get(k).getTotalCount();
           System.out.print("m" + k + '(' + total + ')' + model.toString()

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java Wed Jan 13 08:01:34 2010
@@ -23,16 +23,17 @@
 import org.apache.mahout.clustering.dirichlet.models.NormalModel;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * An implementation of the ModelDistribution interface suitable for testing the
  * DirichletCluster algorithm. Uses a Normal Distribution
  */
-public class NormalScModelDistribution implements ModelDistribution<Vector> {
+public class NormalScModelDistribution implements ModelDistribution<VectorWritable> {
 
   @Override
-  public Model<Vector>[] sampleFromPrior(int howMany) {
-    Model<Vector>[] result = new NormalModel[howMany];
+  public Model<VectorWritable>[] sampleFromPrior(int howMany) {
+    Model<VectorWritable>[] result = new NormalModel[howMany];
     for (int i = 0; i < howMany; i++) {
       DenseVector mean = new DenseVector(60);
       for (int j = 0; j < 60; j++)
@@ -43,8 +44,8 @@
   }
 
   @Override
-  public Model<Vector>[] sampleFromPosterior(Model<Vector>[] posterior) {
-    Model<Vector>[] result = new NormalModel[posterior.length];
+  public Model<VectorWritable>[] sampleFromPosterior(Model<VectorWritable>[] posterior) {
+    Model<VectorWritable>[] result = new NormalModel[posterior.length];
     for (int i = 0; i < posterior.length; i++) {
       NormalModel m = (NormalModel) posterior[i];
       result[i] = m.sample();

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Wed Jan 13 08:01:34 2010
@@ -100,7 +100,7 @@
       String className = cmdLine.getValue(vectorClassOpt, "org.apache.mahout.math.SparseVector").toString();
       Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
 
-      runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations, vectorClass);
+      runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
     } catch (OptionException e) {
       LOG.error("Exception", e);
       CommandLineUtil.printHelp(group);
@@ -127,8 +127,7 @@
    * @param maxIterations the int maximum number of iterations
    */
   private static void runJob(String input, String output, String measureClass,
-      double t1, double t2, double convergenceDelta, int maxIterations,
-      Class<? extends Vector> vectorClass) throws IOException {
+      double t1, double t2, double convergenceDelta, int maxIterations) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
 
@@ -140,14 +139,14 @@
     final String directoryContainingConvertedInput = output
         + DIRECTORY_CONTAINING_CONVERTED_INPUT;
     System.out.println("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, vectorClass);
+    InputDriver.runJob(input, directoryContainingConvertedInput);
     System.out.println("Running Canopy to get initial clusters");
     CanopyDriver.runJob(directoryContainingConvertedInput, output
         + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, measureClass,
-        t1, t2, vectorClass);
+        t1, t2);
     System.out.println("Running KMeans");
     KMeansDriver.runJob(directoryContainingConvertedInput, output
         + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output,
-        measureClass, convergenceDelta, maxIterations, 1, vectorClass);
+        measureClass, convergenceDelta, maxIterations, 1);
   }
 }

Modified: lucene/mahout/trunk/math/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/pom.xml?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/pom.xml (original)
+++ lucene/mahout/trunk/math/pom.xml Wed Jan 13 08:01:34 2010
@@ -108,11 +108,6 @@
 
   <dependencies>
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-core</artifactId>
-    </dependency>
-
-    <dependency>
       <groupId>concurrent</groupId>
       <artifactId>concurrent</artifactId>
       <version>1.3.4</version>

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java Wed Jan 13 08:01:34 2010
@@ -31,9 +31,9 @@
 /** A few universal implementations of convenience functions */
 public abstract class AbstractMatrix implements Matrix {
 
-  private Map<String, Integer> columnLabelBindings;
+  protected Map<String, Integer> columnLabelBindings;
 
-  private Map<String, Integer> rowLabelBindings;
+  protected Map<String, Integer> rowLabelBindings;
 
   @Override
   public double get(String rowLabel, String columnLabel) throws IndexException,
@@ -460,71 +460,4 @@
     return result;
   }
 
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    // read the label bindings
-    int colSize = in.readInt();
-    if (colSize > 0) {
-      columnLabelBindings = new HashMap<String, Integer>();
-      for (int i = 0; i < colSize; i++) {
-        columnLabelBindings.put(in.readUTF(), in.readInt());
-      }
-    }
-    int rowSize = in.readInt();
-    if (rowSize > 0) {
-      rowLabelBindings = new HashMap<String, Integer>();
-      for (int i = 0; i < rowSize; i++) {
-        rowLabelBindings.put(in.readUTF(), in.readInt());
-      }
-    }
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    // write the label bindings
-    if (columnLabelBindings == null) {
-      out.writeInt(0);
-    } else {
-      out.writeInt(columnLabelBindings.size());
-      for (Map.Entry<String, Integer> stringIntegerEntry : columnLabelBindings.entrySet()) {
-        out.writeUTF(stringIntegerEntry.getKey());
-        out.writeInt(stringIntegerEntry.getValue());
-      }
-    }
-    if (rowLabelBindings == null) {
-      out.writeInt(0);
-    } else {
-      out.writeInt(rowLabelBindings.size());
-      for (Map.Entry<String, Integer> stringIntegerEntry : rowLabelBindings.entrySet()) {
-        out.writeUTF(stringIntegerEntry.getKey());
-        out.writeInt(stringIntegerEntry.getValue());
-      }
-    }
-  }
-
-  /** Reads a typed Matrix instance from the input stream */
-  public static Matrix readMatrix(DataInput in) throws IOException {
-    String matrixClassName = in.readUTF();
-    Matrix matrix;
-    try {
-      matrix = Class.forName(matrixClassName).asSubclass(Matrix.class)
-          .newInstance();
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-    matrix.readFields(in);
-    return matrix;
-  }
-
-  /** Writes a typed Matrix instance to the output stream */
-  public static void writeMatrix(DataOutput out, Matrix matrix)
-      throws IOException {
-    out.writeUTF(matrix.getClass().getName());
-    matrix.write(out);
-  }
-
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/AbstractVector.java Wed Jan 13 08:01:34 2010
@@ -20,11 +20,6 @@
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.reflect.TypeToken;
-import org.apache.hadoop.io.WritableComparable;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
 import java.lang.reflect.Type;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -332,17 +327,6 @@
   }
 
   /**
-   * Decodes a point from its WritableComparable<?> representation.
-   *
-   * @param writableComparable a WritableComparable<?> produced by asWritableComparable. Note the payload remainder: it
-   *                           is optional, but can be present.
-   * @return the n-dimensional point
-   */
-  public static Vector decodeVector(WritableComparable<?> writableComparable) {
-    return decodeVector(writableComparable.toString());
-  }
-
-  /**
    * Decodes a point from its string representation.
    *
    * @param formattedString a formatted String produced by asFormatString. Note the payload remainder: it is optional,
@@ -506,40 +490,4 @@
     set(index, value);
   }
 
-  // cache most recent vector instance class name
-  private static String instanceClassName;
-
-  // cache most recent vector instance class
-  private static Class<? extends Vector> instanceClass;
-
-  /** Read and return a vector from the input */
-  public static Vector readVector(DataInput in) throws IOException {
-    String vectorClassName = in.readUTF();
-    Vector vector;
-    try {
-      if (!vectorClassName.equals(instanceClassName)) {
-        instanceClassName = vectorClassName;
-        instanceClass = Class.forName(vectorClassName).asSubclass(Vector.class);
-      }
-      vector = instanceClass.newInstance();
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-    vector.readFields(in);
-    return vector;
-  }
-
-  /** Write the vector to the output */
-  public static void writeVector(DataOutput out, Vector vector)
-      throws IOException {
-    String vectorClassName = vector.getClass().getName();
-    out.writeUTF(vectorClassName);
-    vector.write(out);
-
-  }
-
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseMatrix.java Wed Jan 13 08:01:34 2010
@@ -17,24 +17,21 @@
 
 package org.apache.mahout.math;
 
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
 
 /** Matrix of doubles implemented using a 2-d array */
 public class DenseMatrix extends AbstractMatrix {
 
-  private double[][] values;
+  protected double[][] values;
 
   public DenseMatrix() {
     super();
   }
 
-  private int columnSize() {
+  protected int columnSize() {
     return values[0].length;
   }
 
-  private int rowSize() {
+  protected int rowSize() {
     return values.length;
   }
 
@@ -153,30 +150,5 @@
     }
     return new DenseVector(values[row]);
   }
-
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    super.readFields(in);
-    int rows = in.readInt();
-    int columns = in.readInt();
-    this.values = new double[rows][columns];
-    for (int row = 0; row < rows; row++) {
-      for (int column = 0; column < columns; column++) {
-        this.values[row][column] = in.readDouble();
-      }
-    }
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    super.write(out);
-    out.writeInt(rowSize());
-    out.writeInt(columnSize());
-    for (double[] row : values) {
-      for (double value : row) {
-        out.writeDouble(value);
-      }
-    }
-  }
-
+  
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/DenseVector.java Wed Jan 13 08:01:34 2010
@@ -27,8 +27,8 @@
 /** Implements vector as an array of doubles */
 public class DenseVector extends AbstractVector {
 
-  private double[] values;
-  private double lengthSquared = -1.0;
+  protected double[] values;
+  protected double lengthSquared = -1.0;
 
   /** For serialization purposes only */
   public DenseVector() {
@@ -58,6 +58,21 @@
     this.values = new double[cardinality];
   }
 
+  /**
+   * Copy-constructor (for use in turning a SparseVector into a dense one, for example)
+   * @param vector
+   */
+  public DenseVector(Vector vector) {
+    super(vector.getName());
+    values = new double[vector.size()];
+    Iterator<Vector.Element> it = vector.iterateNonZero();
+    Vector.Element e = null;
+    while(it.hasNext()) {
+      e = it.next();
+      values[e.index()] = e.get();
+    }
+  }
+
   @Override
   protected Matrix matrixLike(int rows, int columns) {
     return new DenseMatrix(rows, columns);
@@ -227,28 +242,6 @@
     return new Element(index);
   }
 
-  @Override
-  public void write(DataOutput dataOutput) throws IOException {
-    dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
-    dataOutput.writeInt(size());
-    Iterator<Vector.Element> iter = iterateAll();
-    while (iter.hasNext()) {
-      Vector.Element element = iter.next();
-      dataOutput.writeDouble(element.get());
-    }
-  }
-
-  @Override
-  public void readFields(DataInput dataInput) throws IOException {
-    this.setName(dataInput.readUTF());
-    double[] values = new double[dataInput.readInt()];
-    for (int i = 0; i < values.length; i++) {
-      values[i] = dataInput.readDouble();
-    }
-    this.values = values;
-    lengthSquared = -1.0;
-  }
-
   /**
    * Indicate whether the two objects are the same or not. Two {@link org.apache.mahout.math.Vector}s can be equal
    * even if the underlying implementation is not equal.

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Matrix.java Wed Jan 13 08:01:34 2010
@@ -17,12 +17,10 @@
 
 package org.apache.mahout.math;
 
-import org.apache.hadoop.io.Writable;
-
 import java.util.Map;
 
 /** The basic interface including numerous convenience functions */
-public interface Matrix extends Cloneable, Writable {
+public interface Matrix extends Cloneable {
 
   /** @return a formatted String suitable for output */
   String asFormatString();

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/MatrixView.java Wed Jan 13 08:01:34 2010
@@ -146,21 +146,4 @@
         cardinality[COL]);
   }
 
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    super.readFields(in);
-    this.offset = new int[]{in.readInt(), in.readInt()};
-    this.cardinality = new int[]{in.readInt(), in.readInt()};
-    this.matrix = readMatrix(in);
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    super.write(out);
-    out.writeInt(offset[ROW]);
-    out.writeInt(offset[COL]);
-    out.writeInt(cardinality[ROW]);
-    out.writeInt(cardinality[COL]);
-    writeMatrix(out, this.matrix);
-  }
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java Wed Jan 13 08:01:34 2010
@@ -171,26 +171,4 @@
     return new DenseVector(d);
   }
 
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    super.readFields(in);
-    this.cardinality = new int[]{in.readInt(), in.readInt()};
-    int colSize = in.readInt();
-    this.columns = new Vector[colSize];
-    for (int col = 0; col < colSize; col++) {
-      columns[col] = AbstractVector.readVector(in);
-    }
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    super.write(out);
-    out.writeInt(cardinality[ROW]);
-    out.writeInt(cardinality[COL]);
-    out.writeInt(columns.length);
-    for (Vector col : columns) {
-      AbstractVector.writeVector(out, col);
-    }
-  }
-
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseMatrix.java Wed Jan 13 08:01:34 2010
@@ -176,28 +176,4 @@
     return res;
   }
 
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    super.readFields(in);
-    this.cardinality = new int[]{in.readInt(), in.readInt()};
-    int rowsize = in.readInt();
-    this.rows = new HashMap<Integer, Vector>();
-    for (int row = 0; row < rowsize; row++) {
-      int key = in.readInt();
-      rows.put(key, AbstractVector.readVector(in));
-    }
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    super.write(out);
-    out.writeInt(cardinality[ROW]);
-    out.writeInt(cardinality[COL]);
-    out.writeInt(rows.size());
-    for (Map.Entry<Integer, Vector> integerVectorEntry : rows.entrySet()) {
-      out.writeInt(integerVectorEntry.getKey());
-      AbstractVector.writeVector(out, integerVectorEntry.getValue());
-    }
-  }
-
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java Wed Jan 13 08:01:34 2010
@@ -167,26 +167,4 @@
     return rows[row];
   }
 
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    super.readFields(in);
-    this.cardinality = new int[]{in.readInt(), in.readInt()};
-    int rowsize = in.readInt();
-    this.rows = new Vector[rowsize];
-    for (int row = 0; row < rowsize; row++) {
-      rows[row] = AbstractVector.readVector(in);
-    }
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    super.write(out);
-    out.writeInt(cardinality[ROW]);
-    out.writeInt(cardinality[COL]);
-    out.writeInt(rows.length);
-    for (Vector row : rows) {
-      AbstractVector.writeVector(out, row);
-    }
-  }
-
 }

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/SparseVector.java Wed Jan 13 08:01:34 2010
@@ -32,10 +32,10 @@
 /** Implements vector that only stores non-zero doubles */
 public class SparseVector extends AbstractVector {
 
-  private OpenIntDoubleHashMap values;
+  protected OpenIntDoubleHashMap values;
 
-  private int cardinality;
-  private double lengthSquared = -1.0;
+  protected int cardinality;
+  protected double lengthSquared = -1.0;
 
   /** For serialization purposes only. */
   public SparseVector() {
@@ -278,43 +278,6 @@
     }
   }
 
-
-  @Override
-  public void write(DataOutput dataOutput) throws IOException {
-    dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
-    dataOutput.writeInt(size());
-    int nde = getNumNondefaultElements();
-    dataOutput.writeInt(nde);
-    Iterator<Vector.Element> iter = iterateNonZero();
-    int count = 0;
-    while (iter.hasNext()) {
-      Vector.Element element = iter.next();
-      dataOutput.writeInt(element.index());
-      dataOutput.writeDouble(element.get());
-      count++;
-    }
-    assert (nde == count);
-  }
-
-  @Override
-  public void readFields(DataInput dataInput) throws IOException {
-    this.setName(dataInput.readUTF());
-    this.cardinality = dataInput.readInt();
-    int size = dataInput.readInt();
-    OpenIntDoubleHashMap values = new OpenIntDoubleHashMap((int) (size * 1.5));
-    int i = 0;
-    while (i < size) {
-      int index = dataInput.readInt();
-      double value = dataInput.readDouble();
-      values.put(index, value);
-      i++;
-    }
-    assert (i == size);
-    this.values = values;
-    this.lengthSquared = -1.0;
-  }
-
-
   @Override
   public double getLengthSquared() {
     if (lengthSquared >= 0.0) {

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/Vector.java Wed Jan 13 08:01:34 2010
@@ -17,7 +17,6 @@
 
 package org.apache.mahout.math;
 
-import org.apache.hadoop.io.Writable;
 
 import java.util.Iterator;
 import java.util.Map;
@@ -27,7 +26,7 @@
  * constructor that takes an int for cardinality and a no-arg constructor that can be used for marshalling the Writable
  * instance <p/> NOTE: Implementations may choose to reuse the Vector.Element in the Iterable methods
  */
-public interface Vector extends Cloneable, Writable {
+public interface Vector extends Cloneable {
 
   /**
    * Vectors may have a name associated with them, which makes them easy to identify

Modified: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java (original)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/VectorView.java Wed Jan 13 08:01:34 2010
@@ -280,22 +280,6 @@
   }
 
   @Override
-  public void write(DataOutput dataOutput) throws IOException {
-    dataOutput.writeUTF(this.getName() == null ? "" : this.getName());
-    dataOutput.writeInt(offset);
-    dataOutput.writeInt(cardinality);
-    writeVector(dataOutput, vector);
-  }
-
-  @Override
-  public void readFields(DataInput dataInput) throws IOException {
-    this.setName(dataInput.readUTF());
-    this.offset = dataInput.readInt();
-    this.cardinality = dataInput.readInt();
-    this.vector = readVector(dataInput);
-  }
-
-  @Override
   public boolean equals(Object o) {
     return this == o || (o instanceof Vector && equivalent(this, (Vector) o));
 

Modified: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java (original)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/MatrixTest.java Wed Jan 13 08:01:34 2010
@@ -18,21 +18,6 @@
 package org.apache.mahout.math;
 
 import junit.framework.TestCase;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.mahout.math.AbstractMatrix;
-import org.apache.mahout.math.CardinalityException;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.IndexException;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.NegateFunction;
-import org.apache.mahout.math.PlusFunction;
-import org.apache.mahout.math.UnboundLabelException;
-import org.apache.mahout.math.Vector;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -640,20 +625,4 @@
     Matrix mm = AbstractMatrix.decodeMatrix(json);
     assertEquals("Fee", m.get(0, 1), mm.get("Fee", "Bar"));
   }
-
-  public void testMatrixWritable() throws IOException {
-    Matrix m = matrixFactory(new double[][]{{1, 3, 4}, {5, 2, 3},
-        {1, 4, 2}});
-    DataOutputBuffer out = new DataOutputBuffer();
-    m.write(out);
-    out.close();
-
-    DataInputStream in = new DataInputStream(new ByteArrayInputStream(out
-        .getData()));
-    Matrix m2 = m.like();
-    m2.readFields(in);
-    in.close();
-    assertEquals("row size", m.size()[ROW], m2.size()[ROW]);
-    assertEquals("col size", m.size()[COL], m2.size()[COL]);
-  }
 }

Modified: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java (original)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/TestMatrixView.java Wed Jan 13 08:01:34 2010
@@ -18,22 +18,6 @@
 package org.apache.mahout.math;
 
 import junit.framework.TestCase;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.mahout.math.AbstractMatrix;
-import org.apache.mahout.math.CardinalityException;
-import org.apache.mahout.math.DenseMatrix;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.IndexException;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.MatrixView;
-import org.apache.mahout.math.NegateFunction;
-import org.apache.mahout.math.PlusFunction;
-import org.apache.mahout.math.UnboundLabelException;
-import org.apache.mahout.math.Vector;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -522,20 +506,6 @@
     }
   }
 
-  public void testMatrixWritable() throws IOException {
-    DataOutputBuffer out = new DataOutputBuffer();
-    test.write(out);
-    out.close();
-
-    DataInputStream in = new DataInputStream(new ByteArrayInputStream(out
-        .getData()));
-    Matrix m2 = test.clone();
-    m2.readFields(in);
-    in.close();
-    assertEquals("row size", test.size()[ROW], m2.size()[ROW]);
-    assertEquals("col size", test.size()[COL], m2.size()[COL]);
-  }
-
   public void testLabelBindings() {
     assertNull("row bindings", test.getRowLabelBindings());
     assertNull("col bindings", test.getColumnLabelBindings());

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java Wed Jan 13 08:01:34 2010
@@ -20,6 +20,7 @@
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 
 import java.util.Iterator;
 import java.io.IOException;
@@ -82,7 +83,7 @@
     @Override
     public Vector next() {
       
-      return transpose ? (Vector)key : (Vector)value;
+      return (transpose ? (VectorWritable)key : (VectorWritable)value).get();
     }
 
     /**

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Wed Jan 13 08:01:34 2010
@@ -20,6 +20,7 @@
 import org.apache.mahout.math.Vector;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.mahout.math.VectorWritable;
 
 import java.io.IOException;
 
@@ -42,7 +43,7 @@
         break;
       }
       if (point != null) {
-        writer.append(new LongWritable(recNum++), point);
+        writer.append(new LongWritable(recNum++), new VectorWritable(point));
       }
 
     }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Wed Jan 13 08:01:34 2010
@@ -44,6 +44,7 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * This class converts a set of input documents in the sequence file format to
@@ -157,8 +158,8 @@
    * the speed of your disk read
    * 
    * @param minSupport
-   * @param filePath
-   * @param dictionaryPath
+   * @param wordCountPath
+   * @param dictionaryPathBase
    * @throws IOException
    */
   private static List<Path> createDictionaryChunks(int minSupport,
@@ -267,9 +268,9 @@
         .setJobName("DictionaryVectorizer Vector generator to group Partial Vectors");
     
     conf.setMapOutputKeyClass(Text.class);
-    conf.setMapOutputValueClass(SparseVector.class);
+    conf.setMapOutputValueClass(VectorWritable.class);
     conf.setOutputKeyClass(Text.class);
-    conf.setOutputValueClass(SparseVector.class);
+    conf.setOutputValueClass(VectorWritable.class);
     
     FileInputFormat.setInputPaths(conf,
         getCommaSeparatedPaths(partialVectorPaths));
@@ -346,7 +347,7 @@
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(Text.class);
     conf.setOutputKeyClass(Text.class);
-    conf.setOutputValueClass(SparseVector.class);
+    conf.setOutputValueClass(VectorWritable.class);
     DistributedCache
         .setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
     FileInputFormat.setInputPaths(conf, new Path(input));
@@ -371,7 +372,9 @@
    * Count the frequencies of words in parallel using Map/Reduce. The input
    * documents have to be in {@link SequenceFile} format
    * 
-   * @param params
+   * @param input
+   * @param output
+   * @param analyzer
    * @throws IOException
    * @throws InterruptedException
    * @throws ClassNotFoundException

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorGenerator.java Wed Jan 13 08:01:34 2010
@@ -42,21 +42,24 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * Converts a document in to a SparseVector
  */
 public class PartialVectorGenerator extends MapReduceBase implements
-    Reducer<Text,Text,Text,SparseVector> {
+    Reducer<Text,Text,Text, VectorWritable> {
   private Analyzer analyzer;
   private Map<String,Integer> dictionary = new HashMap<String,Integer>();
   private FileSystem fs; // local filesystem
   private URI[] localFiles; // local filenames from the distributed cache
-  
+
+  private VectorWritable vectorWritable = new VectorWritable();
+
   @Override
   public void reduce(Text key,
                      Iterator<Text> values,
-                     OutputCollector<Text,SparseVector> output,
+                     OutputCollector<Text,VectorWritable> output,
                      Reporter reporter) throws IOException {
     
     if (values.hasNext()) {
@@ -71,6 +74,7 @@
       int count = 0;
       while ((token = ts.next(token)) != null) {
         String tk = new String(token.termBuffer(), 0, token.termLength());
+        if(dictionary.containsKey(tk) == false) continue;
         if (termFrequency.containsKey(tk) == false) {
           count += tk.length() + 1;
           termFrequency.put(tk, new MutableInt(0));
@@ -88,8 +92,8 @@
         vector.setQuick(dictionary.get(tk).intValue(), pair.getValue()
             .doubleValue());
       }
-      
-      output.collect(key, vector);
+      vectorWritable.set(vector);
+      output.collect(key, vectorWritable);
     }
   }
   

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/PartialVectorMerger.java Wed Jan 13 08:01:34 2010
@@ -26,26 +26,30 @@
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.math.SparseVector;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * Converts a document in to a SparseVector
  */
 public class PartialVectorMerger extends MapReduceBase implements
-    Reducer<Text,SparseVector,Text,SparseVector> {
-  
+    Reducer<Text,VectorWritable,Text, VectorWritable> {
+
+  private VectorWritable vectorWritable = new VectorWritable();
+
   @Override
   public void reduce(Text key,
-                     Iterator<SparseVector> values,
-                     OutputCollector<Text,SparseVector> output,
+                     Iterator<VectorWritable> values,
+                     OutputCollector<Text,VectorWritable> output,
                      Reporter reporter) throws IOException {
     
     SparseVector vector =
         new SparseVector(key.toString(), Integer.MAX_VALUE, 10);
     while (values.hasNext()) {
-      SparseVector value = values.next();
-      value.addTo(vector);
+      VectorWritable value = values.next();
+      value.get().addTo(vector);
     }
-    output.collect(key, vector);
+    vectorWritable.set(vector);
+    output.collect(key, vectorWritable);
     
   }
   

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java Wed Jan 13 08:01:34 2010
@@ -25,6 +25,7 @@
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.mahout.math.SparseVector;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
 
 import java.io.File;
@@ -39,7 +40,7 @@
     Path path = new Path(tmpFile.getAbsolutePath());
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, SparseVector.class);
+    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
     SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
     RandomVectorIterable iter = new RandomVectorIterable(50);
     writer.write(iter);

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java Wed Jan 13 08:01:34 2010
@@ -26,6 +26,7 @@
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.SparseVector;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.utils.vectors.RandomVectorIterable;
 
 import java.io.File;
@@ -44,7 +45,7 @@
     Path path = new Path(tmpFile.getAbsolutePath());
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, SparseVector.class);
+    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
     SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
     RandomVectorIterable iter = new RandomVectorIterable(50);
     writer.write(iter);
@@ -52,7 +53,7 @@
 
     SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, path, conf);
     LongWritable key = new LongWritable();
-    SparseVector value = new SparseVector();
+    VectorWritable value = new VectorWritable();
     int count = 0;
     while (seqReader.next(key, value)){
       count++;

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=898669&r1=898668&r2=898669&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Wed Jan 13 08:01:34 2010
@@ -19,17 +19,25 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.io.StringReader;
 import java.net.URISyntaxException;
-import java.util.Random;
+import java.util.*;
 
 import junit.framework.TestCase;
 
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang.mutable.MutableInt;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.math.SparseVector;
 
 /**
  * Test the dictionary Vector
@@ -132,5 +140,112 @@
       "output/wordcount", new StandardAnalyzer(), 2, 100);
     
     
-  } 
+  }
+
+  public void testPerf() throws Exception {
+    Analyzer analyzer = new SimpleAnalyzer();
+    String key = "key";
+    String value = "";
+    for(String doc : DOCS) value += doc + " ";
+    Map<String, Integer> dictionary = new HashMap<String,Integer>();
+
+    TokenStream ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+
+    Token token = new Token();
+    int count = 0;
+    while ((token = ts.next(token)) != null) {
+      String tk = new String(token.termBuffer(), 0, token.termLength());
+      if(dictionary.containsKey(tk)) continue;
+      dictionary.put(tk, count++);
+    }
+
+
+    long vectorOnlyTotal = 0;
+    long total = 0;
+
+    Random rand = new Random(12345);
+    String[] docs = generateRandomText(1000);
+
+    for(int i=0; i<21000; i++) {
+
+      long time = System.nanoTime();
+
+      value = docs[rand.nextInt(docs.length)];
+      ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+
+      SparseVector vector;
+      Map<String,MutableInt> termFrequency = new HashMap<String,MutableInt>();
+
+      token = new Token();
+      ts.reset();
+      while ((token = ts.next(token)) != null) {
+        String tk = new String(token.termBuffer(), 0, token.termLength());
+        if(dictionary.containsKey(tk) == false) continue;
+        if (termFrequency.containsKey(tk) == false) {
+          count += tk.length() + 1;
+          termFrequency.put(tk, new MutableInt(0));
+        }
+        termFrequency.get(tk).increment();
+      }
+
+      vector =
+          new SparseVector(key.toString(), Integer.MAX_VALUE, termFrequency.size());
+
+      for (Map.Entry<String,MutableInt> pair : termFrequency.entrySet()) {
+        String tk = pair.getKey();
+        if (dictionary.containsKey(tk) == false) continue;
+        vector.setQuick(dictionary.get(tk).intValue(), pair.getValue()
+            .doubleValue());
+      }
+      total += (i<1000?0:1)*(System.nanoTime() - time);
+
+      time = System.nanoTime();
+
+
+      value = docs[rand.nextInt(docs.length)];
+      ts = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+      
+      vector =
+          new SparseVector(key.toString(), Integer.MAX_VALUE, 10);
+
+      token = new Token();
+      ts.reset();
+      while ((token = ts.next(token)) != null) {
+        String tk = new String(token.termBuffer(), 0, token.termLength());
+        if(dictionary.containsKey(tk) == false) continue;
+        int tokenKey = dictionary.get(tk);
+        vector.setQuick(tokenKey, vector.getQuick(tokenKey) + 1);
+      }
+      vectorOnlyTotal += (i<1000?0:1)*(System.nanoTime() - time);
+
+
+    }
+
+    System.out.println("With map: " + (total / 1e6) + "ms/KVect, with vector only: " + (vectorOnlyTotal/1e6) + "ms/KVect");
+
+  }
+  private static final String [] DOCS = {
+        "The quick red fox jumped over the lazy brown dogs.",
+        "Mary had a little lamb whose fleece was white as snow.",
+        "Moby Dick is a story of a whale and a man obsessed.",
+        "The robber wore a black fleece jacket and a baseball cap.",
+        "The English Springer Spaniel is the best of all dogs."
+    };
+
+  public static String[] generateRandomText(int docs) throws Exception {
+    String[] s = new String[docs];
+    Random r = new Random(1234);
+    for(int i=0; i<s.length; i++) {
+      String str = DOCS[i % DOCS.length];
+      String[] tokens = str.split(" ");
+      String[] other = DOCS[r.nextInt(DOCS.length)].split(" ");
+      List<String> l = new ArrayList<String>();
+      for(String t : tokens) {
+        l.add(r.nextBoolean() ? t : other[r.nextInt(other.length)]);
+      }
+      s[i] = StringUtils.join(l.toArray(new String[l.size()]), " ");
+    }
+    return s;
+  }
+  
 }



Mime
View raw message