mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From robina...@apache.org
Subject svn commit: r909696 - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/lda/ utils/src/main/java/org/apache/mahout/clustering/ utils/src/main/java/org/apache/mahout/clustering/lda/
Date Sat, 13 Feb 2010 02:17:03 GMT
Author: robinanil
Date: Sat Feb 13 02:17:01 2010
New Revision: 909696

URL: http://svn.apache.org/viewvc?rev=909696&view=rev
Log:
Moved LDAPrintTopics to utils; added functionality to read DictionaryVectorizer dictionary.file-*


Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
      - copied, changed from r909603, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Removed:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java

Copied: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (from r909603, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?p2=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java&p1=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java&r1=909603&r2=909696&rev=909696&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java	(original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java	Sat Feb 13 02:17:01 2010
@@ -22,6 +22,7 @@
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.PriorityQueue;
@@ -43,31 +44,30 @@
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.utils.vectors.VectorHelper;
 
 /**
  * Class to print out the top K words for each topic.
  */
 public class LDAPrintTopics {
   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-  private LDAPrintTopics() {
-  }
-
+  
+  private LDAPrintTopics() {}
+  
   private static class StringDoublePair implements Comparable<StringDoublePair> {
     private final double score;
     private final String word;
-
+    
     StringDoublePair(double score, String word) {
       this.score = score;
       this.word = word;
     }
-
+    
     @Override
     public int compareTo(StringDoublePair other) {
       return Double.compare(score, other.score);
     }
-
+    
     @Override
     public boolean equals(Object o) {
       if (!(o instanceof StringDoublePair)) {
@@ -76,20 +76,22 @@
       StringDoublePair other = (StringDoublePair) o;
       return score == other.score && word.equals(other.word);
     }
-
+    
     @Override
     public int hashCode() {
       return (int) Double.doubleToLongBits(score) ^ word.hashCode();
     }
-
+    
   }
-
-  public static List<List<String>> topWordsForTopics(String dir, Configuration job,
-                                                     List<String> wordList, int numWordsToPrint) throws IOException {
+  
+  public static List<List<String>> topWordsForTopics(String dir,
+                                                     Configuration job,
+                                                     List<String> wordList,
+                                                     int numWordsToPrint) throws IOException {
     FileSystem fs = new Path(dir).getFileSystem(job);
-
+    
     List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
-
+    
     IntPairWritable key = new IntPairWritable();
     DoubleWritable value = new DoubleWritable();
     for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
@@ -98,7 +100,7 @@
       while (reader.next(key, value)) {
         int topic = key.getX();
         int word = key.getY();
-
+        
         ensureQueueSize(queues, topic);
         if (word >= 0 && topic >= 0) {
           double score = value.get();
@@ -108,7 +110,7 @@
       }
       reader.close();
     }
-
+    
     List<List<String>> result = new ArrayList<List<String>>();
     for (int i = 0; i < queues.size(); ++i) {
       result.add(i, new LinkedList<String>());
@@ -116,20 +118,23 @@
         result.get(i).add(0, sdp.word); // prepend
       }
     }
-
+    
     return result;
   }
-
+  
   // Expands the queue list to have a Queue for topic K
-  private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int k) {
+  private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues,
+                                      int k) {
     for (int i = queues.size(); i <= k; ++i) {
       queues.add(new PriorityQueue<StringDoublePair>());
     }
   }
-
+  
   // Adds the word if the queue is below capacity, or the score is high enough
-  private static void maybeEnqueue(Queue<StringDoublePair> q, String word,
-                                   double score, int numWordsToPrint) {
+  private static void maybeEnqueue(Queue<StringDoublePair> q,
+                                   String word,
+                                   double score,
+                                   int numWordsToPrint) {
     if (q.size() >= numWordsToPrint && score > q.peek().score) {
       q.poll();
     }
@@ -137,84 +142,94 @@
       q.add(new StringDoublePair(score, word));
     }
   }
-
-  // Reads dictionary in created by the vector Driver in util
-  private static List<String> readDictionary(File path) throws IOException {
-    FileLineIterator it = new FileLineIterator(path);
-
-    List<String> result = new ArrayList<String>();
-
-    // skip 2 lines
-    it.next();
-    it.next();
-    while (it.hasNext()) {
-      String line = it.next();
-      String[] parts = TAB_PATTERN.split(line);
-      String word = parts[0];
-      int index = Integer.parseInt(parts[2]);
-      if (index != result.size()) {
-        throw new IllegalArgumentException();
-      }
-      result.add(word);
-    }
-
-    return result;
-  }
-
+  
   public static void main(String[] args) throws Exception {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-            "Path to an LDA output (a state)").withShortName("i").create();
-
-    Option dictOpt = obuilder.withLongName("dict").withRequired(true).withArgument(
-            abuilder.withName("dict").withMinimum(1).withMaximum(1).create()).withDescription(
-            "Dictionary to read in, in the same format as one created by org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
-
-    Option outOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-            "Output directory to write top words").withShortName("o").create();
-
-    Option wordOpt = obuilder.withLongName("words").withRequired(false).withArgument(
-            abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create()).withDescription(
-            "Number of words to print").withShortName("w").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(outOpt).withOption(
-            wordOpt).withOption(inputOpt).create();
+    
+    Option inputOpt = obuilder.withLongName("input").withRequired(true)
+        .withArgument(
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("Path to an LDA output (a state)").withShortName("i")
+        .create();
+    
+    Option dictOpt = obuilder.withLongName("dict").withRequired(true)
+        .withArgument(
+          abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "Dictionary to read in, in the same format as one created by "
+              + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
+          "d").create();
+    
+    Option outOpt = obuilder.withLongName("output").withRequired(true)
+        .withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("Output directory to write top words").withShortName(
+          "o").create();
+    
+    Option wordOpt = obuilder.withLongName("words").withRequired(false)
+        .withArgument(
+          abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
+            "20").create()).withDescription("Number of words to print")
+        .withShortName("w").create();
+    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
+      false).withArgument(
+      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
+          .create()).withDescription(
+      "The dictionary file type (text|sequencefile)").withShortName("dt")
+        .create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
+    
+    Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(
+      outOpt).withOption(wordOpt).withOption(inputOpt).withOption(dictTypeOpt)
+        .create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       String input = cmdLine.getValue(inputOpt).toString();
       File output = new File(cmdLine.getValue(outOpt).toString());
-      File dict = new File(cmdLine.getValue(dictOpt).toString());
+      String dictFile = cmdLine.getValue(dictOpt).toString();
       int numWords = 20;
       if (cmdLine.hasOption(wordOpt)) {
         numWords = Integer.parseInt(cmdLine.getValue(wordOpt).toString());
       }
-
-      List<String> wordList = readDictionary(dict);
-
       Configuration config = new Configuration();
-      List<List<String>> topWords = topWordsForTopics(input, config, wordList, numWords);
-
+      
+      String dictionaryType = "text";
+      if (cmdLine.hasOption(dictTypeOpt)) {
+        dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
+      }
+      
+      List<String> wordList;
+      if (dictionaryType.equals("text")) {
+        wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(
+            dictFile)));
+      } else if (dictionaryType.equals("sequencefile")) {
+        FileSystem fs = FileSystem.get(new Path(dictFile).toUri(), config);
+        wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs,
+          dictFile));
+      } else {
+        throw new IllegalArgumentException("Invalid dictionary format");
+      }
+      
+      List<List<String>> topWords = topWordsForTopics(input, config, wordList,
+        numWords);
+      
       if (!output.exists()) {
         if (!output.mkdirs()) {
           throw new IOException("Could not create directory: " + output);
         }
       }
-
+      
       for (int i = 0; i < topWords.size(); ++i) {
         List<String> topK = topWords.get(i);
         File out = new File(output, "topic-" + i);
@@ -226,11 +241,11 @@
         }
         writer.close();
       }
-
+      
     } catch (OptionException e) {
       CommandLineUtil.printHelp(group);
       throw e;
     }
   }
-
+  
 }



Mime
View raw message