mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From robina...@apache.org
Subject svn commit: r907466 - in /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils: clustering/ClusterDumper.java vectors/VectorDumper.java vectors/VectorHelper.java
Date Sun, 07 Feb 2010 19:49:16 GMT
Author: robinanil
Date: Sun Feb  7 19:49:16 2010
New Revision: 907466

URL: http://svn.apache.org/viewvc?rev=907466&view=rev
Log:
MAHOUT-278 Cluster dumper reads DictionaryVectorizer dictionary chunks

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=907466&r1=907465&r2=907466&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sun Feb  7 19:49:16 2010
@@ -17,6 +17,22 @@
 
 package org.apache.mahout.utils.clustering;
 
+import java.io.File;
+import java.io.FileWriter;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -34,40 +50,26 @@
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.jobcontrol.Job;
 import org.apache.mahout.clustering.ClusterBase;
-import org.apache.mahout.math.Vector;
 import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.math.Vector;
 import org.apache.mahout.utils.vectors.VectorHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FileWriter;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
 public final class ClusterDumper {
-
-  private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
-
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(ClusterDumper.class);
+  
   private final String seqFileDir;
   private final String pointsDir;
   private String termDictionary;
+  private String dictionaryFormat;
   private String outputFile;
   private int subString = Integer.MAX_VALUE;
-  private Map<String, List<String>> clusterIdToPoints = null;
+  private Map<String,List<String>> clusterIdToPoints = null;
   private boolean useJSON = false;
-
+  
   public ClusterDumper(String seqFileDir, String pointsDir) throws IOException {
     this.seqFileDir = seqFileDir;
     this.pointsDir = pointsDir;
@@ -77,36 +79,49 @@
   private void init() throws IOException {
     if (this.pointsDir != null) {
       JobConf conf = new JobConf(Job.class);
-      //read in the points
+      // read in the points
       clusterIdToPoints = readPoints(this.pointsDir, conf);
     } else {
       clusterIdToPoints = Collections.emptyMap();
     }
   }
-
-  public void printClusters() throws IOException, InstantiationException, IllegalAccessException {
+  
+  public void printClusters() throws IOException,
+                             InstantiationException,
+                             IllegalAccessException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
     client.setConf(conf);
     
     String[] dictionary = null;
     if (this.termDictionary != null) {
-      dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
+      if (dictionaryFormat.equals("text")) {
+        dictionary = VectorHelper.loadTermDictionary(new File(
+            this.termDictionary));
+      } else if (dictionaryFormat.equals("sequencefile")) {
+        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(),
+          conf);
+        dictionary = VectorHelper.loadTermDictionary(conf, fs,
+          this.termDictionary);
+      } else {
+        throw new IllegalArgumentException("Invalid dictionary format");
+      }
     }
     
     Writer writer = null;
-    if (this.outputFile != null){
+    if (this.outputFile != null) {
       writer = new FileWriter(this.outputFile);
     } else {
       writer = new OutputStreamWriter(System.out);
     }
     
-    File[] seqFileList = new File(this.seqFileDir).listFiles(new FilenameFilter(){
-      @Override
-      public boolean accept(File file, String name) {
-        return name.endsWith(".crc") == false;
-      }
-    });        
+    File[] seqFileList = new File(this.seqFileDir)
+        .listFiles(new FilenameFilter() {
+          @Override
+          public boolean accept(File file, String name) {
+            return name.endsWith(".crc") == false;
+          }
+        });
     for (File seqFile : seqFileList) {
       if (!seqFile.isFile()) {
         continue;
@@ -117,11 +132,14 @@
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
       Writable key = (Writable) reader.getKeyClass().newInstance();
       ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
-      while (reader.next(key, value)){
+      while (reader.next(key, value)) {
         Vector center = value.getCenter();
-        String fmtStr = useJSON ? center.asFormatString() : VectorHelper.vectorToString(center, dictionary);
-        writer.append("Id: ").append(String.valueOf(value.getId())).append(":").append("name:")
-                .append(center.getName()).append(":").append(fmtStr.substring(0, Math.min(subString, fmtStr.length()))).append('\n');
+        String fmtStr = useJSON ? center.asFormatString() : VectorHelper
+            .vectorToString(center, dictionary);
+        writer.append("Id: ").append(String.valueOf(value.getId())).append(":")
+            .append("name:").append(center.getName()).append(":").append(
+              fmtStr.substring(0, Math.min(subString, fmtStr.length())))
+            .append('\n');
         
         if (dictionary != null) {
           String topTerms = getTopFeatures(center, dictionary, 10);
@@ -130,13 +148,15 @@
           writer.write('\n');
         }
         
-        List<String> points = clusterIdToPoints.get(String.valueOf(value.getId()));
-        if (points != null){
+        List<String> points = clusterIdToPoints.get(String.valueOf(value
+            .getId()));
+        if (points != null) {
           writer.write("\tPoints: ");
-          for (Iterator<String> iterator = points.iterator(); iterator.hasNext();) {
+          for (Iterator<String> iterator = points.iterator(); iterator
+              .hasNext();) {
             String point = iterator.next();
             writer.append(point);
-            if (iterator.hasNext()){
+            if (iterator.hasNext()) {
               writer.append(", ");
             }
           }
@@ -146,63 +166,100 @@
       }
       reader.close();
     }
-    if (this.outputFile != null){
+    if (this.outputFile != null) {
       writer.flush();
       writer.close();
-    } 
+    }
   }
   
   public String getOutputFile() {
     return outputFile;
   }
+  
   public void setOutputFile(String outputFile) {
     this.outputFile = outputFile;
   }
+  
   public int getSubString() {
     return subString;
   }
+  
   public void setSubString(int subString) {
     this.subString = subString;
   }
-  public Map<String, List<String>> getClusterIdToPoints() {
+  
+  public Map<String,List<String>> getClusterIdToPoints() {
     return clusterIdToPoints;
   }
+  
   public String getTermDictionary() {
     return termDictionary;
   }
-  public void setTermDictionary(String termDictionary) {
+  
+  public void setTermDictionary(String termDictionary, String dictionaryType) {
     this.termDictionary = termDictionary;
+    this.dictionaryFormat = dictionaryType;
   }
   
-  
-  public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
+  public static void main(String[] args) throws IOException,
+                                        IllegalAccessException,
+                                        InstantiationException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false).withArgument(
-            abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The directory containing Sequence Files for the Clusters").withShortName("s").create();
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
-    Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
-            abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
-            withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
-    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).
-            withDescription("Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries").withShortName("j").create();
-    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
-            abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The directory containing points sequence files mapping input vectors to their cluster.  " +
-                    "If specified, then the program will output the points associated with a cluster").withShortName("p").create();
-    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
-        abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).
-        withDescription("The dictionary file. ").withShortName("d").create();
-    Option helpOpt = obuilder.withLongName("help").
-            withDescription("Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).create();
-
+    
+    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false)
+        .withArgument(
+          abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1)
+              .create()).withDescription(
+          "The directory containing Sequence Files for the Clusters")
+        .withShortName("s").create();
+    Option outputOpt = obuilder.withLongName("output").withRequired(false)
+        .withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The output file.  If not specified, dumps to the console")
+        .withShortName("o").create();
+    Option substringOpt = obuilder
+        .withLongName("substring")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("substring").withMinimum(1).withMaximum(1).create())
+        .withDescription("The number of chars of the asFormatString() to print")
+        .withShortName("b").create();
+    Option centroidJSonOpt = obuilder
+        .withLongName("json")
+        .withRequired(false)
+        .withDescription(
+          "Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries")
+        .withShortName("j").create();
+    Option pointsOpt = obuilder
+        .withLongName("pointsDir")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The directory containing points sequence files mapping input vectors to their cluster.  "
+              + "If specified, then the program will output the points associated with a cluster")
+        .withShortName("p").create();
+    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false)
+        .withArgument(
+          abuilder.withName("dictionary").withMinimum(1).withMaximum(1)
+              .create()).withDescription("The dictionary file. ")
+        .withShortName("d").create();
+    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
+      false).withArgument(
+      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
+          .create()).withDescription(
+      "The dictionary file type (text|sequencefile)").withShortName("dt")
+        .create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
+    
+    Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(
+      seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(
+      pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).withOption(
+      dictTypeOpt).create();
     
     try {
       Parser parser = new Parser();
@@ -215,21 +272,21 @@
       if (!cmdLine.hasOption(seqOpt)) {
         return;
       }
-      String seqFileDir = cmdLine.getValue(seqOpt).toString();      
+      String seqFileDir = cmdLine.getValue(seqOpt).toString();
       String termDictionary = null;
       if (cmdLine.hasOption(dictOpt)) {
         termDictionary = cmdLine.getValue(dictOpt).toString();
       }
-
+      
       String pointsDir = null;
       if (cmdLine.hasOption(pointsOpt)) {
         pointsDir = cmdLine.getValue(pointsOpt).toString();
       }
       String outputFile = null;
-      if (cmdLine.hasOption(outputOpt)){
+      if (cmdLine.hasOption(outputOpt)) {
         outputFile = cmdLine.getValue(outputOpt).toString();
       }
-
+      
       int sub = -1;
       if (cmdLine.hasOption(substringOpt)) {
         sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
@@ -238,34 +295,41 @@
       if (cmdLine.hasOption(centroidJSonOpt)) {
         clusterDumper.setUseJSON(true);
       }
-
+      
       if (outputFile != null) {
         clusterDumper.setOutputFile(outputFile);
       }
+      
+      String dictionaryType = "text";
+      if (cmdLine.hasOption(dictTypeOpt)) {
+        dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
+      }
+      
       if (termDictionary != null) {
-        clusterDumper.setTermDictionary(termDictionary);
+        clusterDumper.setTermDictionary(termDictionary, dictionaryType);
       }
       if (sub > 0) {
         clusterDumper.setSubString(sub);
-      }      
-      clusterDumper.printClusters();      
+      }
+      clusterDumper.printClusters();
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-
+  
   private void setUseJSON(boolean json) {
     this.useJSON = json;
   }
-
-  private static Map<String, List<String>> readPoints(String pointsPathDir, JobConf conf) throws IOException {
-    SortedMap<String, List<String>> result = new TreeMap<String, List<String>>();
+  
+  private static Map<String,List<String>> readPoints(String pointsPathDir,
+                                                     JobConf conf) throws IOException {
+    SortedMap<String,List<String>> result = new TreeMap<String,List<String>>();
     
-    File[] children = new File(pointsPathDir).listFiles(new FilenameFilter(){
+    File[] children = new File(pointsPathDir).listFiles(new FilenameFilter() {
       @Override
       public boolean accept(File file, String name) {
-        return name.endsWith(".crc") == false; 
+        return name.endsWith(".crc") == false;
       }
     });
     
@@ -281,7 +345,9 @@
         Text key = (Text) reader.getKeyClass().newInstance();
         Text value = (Text) reader.getValueClass().newInstance();
         while (reader.next(key, value)) {
-          //value is the cluster id as an int, key is the name/id of the vector, but that doesn't matter because we only care about printing it
+          // value is the cluster id as an int, key is the name/id of the
+          // vector, but that doesn't matter because we only care about printing
+          // it
           String clusterId = value.toString();
           List<String> pointList = result.get(clusterId);
           if (pointList == null) {
@@ -294,14 +360,12 @@
         log.error("Exception", e);
       } catch (IllegalAccessException e) {
         log.error("Exception", e);
-      }      
+      }
     }
     
     return result;
   }
-
-
-
+  
   static class TermIndexWeight {
     public int index = -1;
     public double weight = 0;
@@ -309,48 +373,50 @@
     TermIndexWeight(int index, double weight) {
       this.index = index;
       this.weight = weight;
-    }    
+    }
   }
-
-  private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
-
+  
+  private static String getTopFeatures(Vector vector,
+                                       String[] dictionary,
+                                       int numTerms) {
+    
     List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
     
     Iterator<Vector.Element> iter = vector.iterateNonZero();
-      while (iter.hasNext()) {
-        Vector.Element elt = iter.next();     
-        vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+    while (iter.hasNext()) {
+      Vector.Element elt = iter.next();
+      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
+    }
+    
+    // Sort results in reverse order (ie weight in descending order)
+    Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
+      @Override
+      public int compare(TermIndexWeight one, TermIndexWeight two) {
+        return Double.compare(two.weight, one.weight);
       }
-      
-      // Sort results in reverse order (ie weight in descending order)
-      Collections.sort(vectorTerms, new Comparator<TermIndexWeight> () {
-        @Override
-        public int compare(TermIndexWeight one, TermIndexWeight two) {
-          return Double.compare(two.weight, one.weight);
-        }
-      });
-      
-      List<String> topTerms = new LinkedList<String>();
-      
-      for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
-        int index = vectorTerms.get(i).index;
-        String dictTerm = dictionary[index];
-        if (dictTerm == null) {
-          log.error("Dictionary entry missing for {}", index);
-          continue;
-        }
-        topTerms.add(dictTerm); 
+    });
+    
+    List<String> topTerms = new LinkedList<String>();
+    
+    for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
+      int index = vectorTerms.get(i).index;
+      String dictTerm = dictionary[index];
+      if (dictTerm == null) {
+        log.error("Dictionary entry missing for {}", index);
+        continue;
       }
-      
-      StringBuilder sb = new StringBuilder();
-      for (Iterator<String> iterator = topTerms.iterator(); iterator.hasNext();) {
-        String term = iterator.next();
-        sb.append(term);
-        if (iterator.hasNext()){
-          sb.append(", ");
-        }
+      topTerms.add(dictTerm);
+    }
+    
+    StringBuilder sb = new StringBuilder();
+    for (Iterator<String> iterator = topTerms.iterator(); iterator.hasNext();) {
+      String term = iterator.next();
+      sb.append(term);
+      if (iterator.hasNext()) {
+        sb.append(", ");
       }
-      return sb.toString();
+    }
+    return sb.toString();
   }
   
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=907466&r1=907465&r2=907466&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Sun Feb  7 19:49:16 2010
@@ -17,6 +17,12 @@
 
 package org.apache.mahout.utils.vectors;
 
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -36,77 +42,110 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
-
 /**
- * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link org.apache.mahout.math.Vector}s
- * and dump out the results using {@link org.apache.mahout.math.Vector#asFormatString()} to either the console
+ * Can read in a {@link org.apache.hadoop.io.SequenceFile} of
+ * {@link org.apache.mahout.math.Vector}s and dump out the results using
+ * {@link org.apache.mahout.math.Vector#asFormatString()} to either the console
  * or to a file.
  */
 public final class VectorDumper {
-
+  
   private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
-
-  private VectorDumper() {
-  }
-
+  
+  private VectorDumper() {}
+  
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
-            abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Sequence File containing the Vectors").withShortName("s").create();
-    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).
-            withDescription("If the Key is a vector, then dump that instead").withShortName("u").create();
-    Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).
-            withDescription("Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p").create();
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
-    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
-            abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).
-            withDescription("The dictionary file. ").withShortName("d").create();
-    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).
-            withDescription("Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries").withShortName("j").create();
-    Option helpOpt = obuilder.withLongName("help").
-            withDescription("Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(dictOpt).withOption(centroidJSonOpt)
-            .withOption(vectorAsKeyOpt).withOption(printKeyOpt).create();
-
+    
+    Option seqOpt = obuilder.withLongName("seqFile").withRequired(false)
+        .withArgument(
+          abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Sequence File containing the Vectors")
+        .withShortName("s").create();
+    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false)
+        .withDescription("If the Key is a vector, then dump that instead")
+        .withShortName("u").create();
+    Option printKeyOpt = obuilder
+        .withLongName("printKey")
+        .withRequired(false)
+        .withDescription(
+          "Print out the key as well, delimited by a tab (or the value if useKey is true)")
+        .withShortName("p").create();
+    Option outputOpt = obuilder.withLongName("output").withRequired(false)
+        .withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The output file.  If not specified, dumps to the console")
+        .withShortName("o").create();
+    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false)
+        .withArgument(
+          abuilder.withName("dictionary").withMinimum(1).withMaximum(1)
+              .create()).withDescription("The dictionary file. ")
+        .withShortName("d").create();
+    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
+      false).withArgument(
+      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
+          .create()).withDescription(
+      "The dictionary file type (text|sequencefile)").withShortName("dt")
+        .create();
+    Option centroidJSonOpt = obuilder
+        .withLongName("json")
+        .withRequired(false)
+        .withDescription(
+          "Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries")
+        .withShortName("j").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
+    
+    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(
+      outputOpt).withOption(dictTypeOpt).withOption(dictOpt).withOption(
+      centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(printKeyOpt)
+        .create();
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
-
+        
         printHelp(group);
         return;
       }
-
+      
       if (cmdLine.hasOption(seqOpt)) {
         Path path = new Path(cmdLine.getValue(seqOpt).toString());
         System.out.println("Input Path: " + path);
         JobClient client = new JobClient();
         JobConf conf = new JobConf(Job.class);
         client.setConf(conf);
-
+        
+        FileSystem fs = FileSystem.get(path.toUri(), conf);
+        
+        String dictionaryType = "text";
+        if (cmdLine.hasOption(dictTypeOpt)) {
+          dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
+        }
+        
         String[] dictionary = null;
         if (cmdLine.hasOption(dictOpt)) {
-          dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
+          if (dictionaryType.equals("text")) {
+            dictionary = VectorHelper.loadTermDictionary(new File(cmdLine
+                .getValue(dictOpt).toString()));
+          } else if (dictionaryType.equals("sequencefile")) {
+            dictionary = VectorHelper.loadTermDictionary(conf, fs, cmdLine
+                .getValue(dictOpt).toString());
+          } else {
+            throw new OptionException(dictTypeOpt);
+          }
         }
         boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
-        FileSystem fs = FileSystem.get(path.toUri(), conf);
+        
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-        SequenceFileVectorIterable vectorIterable = new SequenceFileVectorIterable(reader, cmdLine.hasOption(vectorAsKeyOpt));
+        SequenceFileVectorIterable vectorIterable = new SequenceFileVectorIterable(
+            reader, cmdLine.hasOption(vectorAsKeyOpt));
         Writer writer;
         if (cmdLine.hasOption(outputOpt)) {
           writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
@@ -114,8 +153,8 @@
           writer = new OutputStreamWriter(System.out);
         }
         boolean printKey = cmdLine.hasOption(printKeyOpt);
-        SequenceFileVectorIterable.SeqFileIterator iterator =
-                (SequenceFileVectorIterable.SeqFileIterator) vectorIterable.iterator();
+        SequenceFileVectorIterable.SeqFileIterator iterator = (SequenceFileVectorIterable.SeqFileIterator) vectorIterable
+            .iterator();
         int i = 0;
         while (iterator.hasNext()) {
           Vector vector = iterator.next();
@@ -123,7 +162,9 @@
             writer.write(iterator.key().toString());
             writer.write("\t");
           }
-          String fmtStr = useJSON ? vector.asFormatString() : (dictionary != null ? VectorHelper.vectorToString(vector, dictionary) : vector.asFormatString());
+          String fmtStr = useJSON ? vector.asFormatString()
+              : (dictionary != null ? VectorHelper.vectorToString(vector,
+                dictionary) : vector.asFormatString());
           writer.write(fmtStr);
           writer.write('\n');
           i++;
@@ -134,14 +175,14 @@
         }
         System.err.println("Dumped " + i + " Vectors");
       }
-
+      
     } catch (OptionException e) {
       log.error("Exception", e);
       printHelp(group);
     }
-
+    
   }
-
+  
   private static void printHelp(Group group) {
     HelpFormatter formatter = new HelpFormatter();
     formatter.setGroup(group);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=907466&r1=907465&r2=907466&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sun Feb  7 19:49:16 2010
@@ -1,4 +1,5 @@
 package org.apache.mahout.utils.vectors;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,9 +17,6 @@
  * limitations under the License.
  */
 
-import org.apache.mahout.common.FileLineIterator;
-import org.apache.mahout.math.Vector;
-
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -26,20 +24,34 @@
 import java.util.Iterator;
 import java.util.regex.Pattern;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
+
 public class VectorHelper {
   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-  private VectorHelper() {
-  }
-
-
+  
+  private VectorHelper() {}
+  
   /**
-   * Create a String from a vector that fills in the values with the appropriate value from
a dictionary where each the ith entry is the term for the ith vector cell..
+   * Create a String from a vector that fills in the values with the appropriate
+   * value from a dictionary, where the ith entry is the term for the ith
+   * vector cell.
+   * 
    * @param vector
-   * @param dictionary The dictionary.  See
+   * @param dictionary
+   *          The dictionary. See
    * @return The String
    */
-  public static String vectorToString(Vector vector, String [] dictionary){
+  public static String vectorToString(Vector vector, String[] dictionary) {
     StringBuilder bldr = new StringBuilder(2048);
     String name = vector.getName();
     if (name != null && name.length() > 0) {
@@ -49,42 +61,77 @@
     Iterator<Vector.Element> iter = vector.iterateNonZero();
     boolean first = true;
     while (iter.hasNext()) {
-      if (first){
+      if (first) {
         first = false;
       } else {
         bldr.append(", ");
       }
       Vector.Element elt = (Vector.Element) iter.next();
       bldr.append(elt.index()).append(':').append(dictionary[elt.index()]);
-
+      
     }
     return bldr.toString();
   }
-
-
+  
   /**
-   * Read in a dictionary file.  Format is:
-   * <pre>term DocFreq Index</pre>
+   * Read in a dictionary file. Format is:
+   * 
+   * <pre>
+   * term DocFreq Index
+   * </pre>
+   * 
    * @param dictFile
    * @return
    * @throws IOException
    */
-  public static String [] loadTermDictionary(File dictFile) throws IOException {
+  public static String[] loadTermDictionary(File dictFile) throws IOException {
     return loadTermDictionary(new FileInputStream(dictFile));
   }
-
+  
+  /**
+   * Read a dictionary stored in {@link SequenceFile} chunks generated by {@link DictionaryVectorizer}.
+   * @param conf
+   * @param fs
+   * @param filePattern {@code <PATH TO DICTIONARY>/dictionary.file-*}
+   * @return
+   * @throws IOException
+   */
+  public static String[] loadTermDictionary(Configuration conf,
+                                            FileSystem fs,
+                                            String filePattern) throws IOException {
+    FileStatus[] dictionaryFiles = fs.globStatus(new Path(filePattern));
+    OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<String>();
+    Text key = new Text();
+    IntWritable value = new IntWritable();
+    for (FileStatus fileStatus : dictionaryFiles) {
+      Path path = fileStatus.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // key is the term, value is its id
+      while (reader.next(key, value)) {
+        dict.put(key.toString(), value.get());
+      }
+    }
+    String[] dictionary = new String[dict.size()];
+    for (String feature : dict.keys()) {
+      dictionary[dict.get(feature)] = feature;
+    }
+    return dictionary;
+  }
+  
   /**
-   * Read in a dictionary file.  Format is:
-   * First line is the number of entries
-   * <pre>term DocFreq Index</pre>
+   * Read in a dictionary file. Format is: the first line is the number of entries, followed by lines of the form:
+   * 
+   * <pre>
+   * term DocFreq Index
+   * </pre>
    */
-  public static String [] loadTermDictionary(InputStream is) throws IOException {
+  public static String[] loadTermDictionary(InputStream is) throws IOException {
     FileLineIterator it = new FileLineIterator(is);
-
+    
     int numEntries = Integer.parseInt(it.next());
-    //System.out.println(numEntries);
-    String [] result = new String[numEntries];
-
+    // System.out.println(numEntries);
+    String[] result = new String[numEntries];
+    
     while (it.hasNext()) {
       String line = it.next();
       if (line.startsWith("#")) {
@@ -94,7 +141,7 @@
       if (tokens.length < 3) {
         continue;
       }
-      int index = Integer.parseInt(tokens[2]);//tokens[1] is the doc freq
+      int index = Integer.parseInt(tokens[2]);// tokens[1] is the doc freq
       result[index] = tokens[0];
     }
     return result;



Mime
View raw message