mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r894684 - in /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils: clustering/ClusterDumper.java vectors/lucene/ClusterLabels.java
Date Wed, 30 Dec 2009 17:27:11 GMT
Author: gsingers
Date: Wed Dec 30 17:27:11 2009
New Revision: 894684

URL: http://svn.apache.org/viewvc?rev=894684&view=rev
Log:
MAHOUT-163: New features for ClusterDumper and add in ClusterLabel capability for Lucene index
using Log Likelihood

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
  (with props)
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=894684&r1=894683&r2=894684&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
(original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Wed Dec 30 17:27:11 2009
@@ -42,6 +42,7 @@
 
 import java.io.File;
 import java.io.FileWriter;
+import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
@@ -61,9 +62,120 @@
   private static final String LINE_SEP = System.getProperty("line.separator");
   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
 
-  private ClusterDumper() {
+  String seqFileDir;
+  String pointsDir;
+  String termDictionary;
+  String outputFile;
+  int subString = Integer.MAX_VALUE;
+  Map<String, List<String>> clusterIdToPoints = null;
+  
+  public ClusterDumper(String seqFileDir, String pointsDir) throws IOException {
+    this.seqFileDir = seqFileDir;
+    this.pointsDir = pointsDir;
+    init();
+  }
+  
+  private void init() throws IOException {
+    if (this.pointsDir != null) {
+      JobConf conf = new JobConf(Job.class);
+      //read in the points
+      clusterIdToPoints = readPoints(this.pointsDir, conf);
+    } else {
+      clusterIdToPoints = Collections.emptyMap();
+    }
   }
 
+  public void printClusters() throws IOException, InstantiationException, IllegalAccessException
{
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(Job.class);
+    client.setConf(conf);
+    
+    ArrayList<String> dictionary = null;
+    if (this.termDictionary != null) {
+      dictionary = getTermDict(this.termDictionary);
+    }
+    
+    Writer writer = null;
+    if (this.outputFile != null){
+      writer = new FileWriter(this.outputFile);
+    } else {
+      writer = new OutputStreamWriter(System.out);
+    }
+    
+    File[] seqFileList = new File(this.seqFileDir).listFiles(new FilenameFilter(){
+      @Override
+      public boolean accept(File file, String name) {
+        return name.endsWith(".crc") == false;
+      }
+    });        
+    for (File seqFile : seqFileList) {
+      if (!seqFile.isFile()) {
+        continue;
+      }
+      Path path = new Path(seqFile.getAbsolutePath());
+      System.out.println("Input Path: " + path);
+      FileSystem fs = FileSystem.get(path.toUri(), conf);
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      Writable key = (Writable) reader.getKeyClass().newInstance();
+      ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
+      while (reader.next(key, value)){
+        Vector center = value.getCenter();
+        String fmtStr = center.asFormatString();
+        writer.append("Id: ").append(String.valueOf(value.getId())).append(":").append("name:")
+                .append(center.getName()).append(":").append(fmtStr.substring(0, Math.min(subString,
fmtStr.length()))).append(LINE_SEP);
+        
+        if (dictionary != null) {
+          String topTerms = getTopFeatures(center, dictionary, 10);
+          writer.write("\tTop Terms: ");
+          writer.write(topTerms);
+          writer.write(LINE_SEP);
+        }
+        
+        List<String> points = clusterIdToPoints.get(String.valueOf(value.getId()));
+        if (points != null){
+          writer.write("\tPoints: ");
+          for (Iterator<String> iterator = points.iterator(); iterator.hasNext();)
{
+            String point = iterator.next();
+            writer.append(point);
+            if (iterator.hasNext()){
+              writer.append(", ");
+            }
+          }
+          writer.write(LINE_SEP);
+        }
+        writer.flush();
+      }
+      reader.close();
+    }
+    if (this.outputFile != null){
+      writer.flush();
+      writer.close();
+    } 
+  }
+  
+  public String getOutputFile() {
+    return outputFile;
+  }
+  public void setOutputFile(String outputFile) {
+    this.outputFile = outputFile;
+  }
+  public int getSubString() {
+    return subString;
+  }
+  public void setSubString(int subString) {
+    this.subString = subString;
+  }
+  public Map<String, List<String>> getClusterIdToPoints() {
+    return clusterIdToPoints;
+  }
+  public String getTermDictionary() {
+    return termDictionary;
+  }
+  public void setTermDictionary(String termDictionary) {
+    this.termDictionary = termDictionary;
+  }
+  
+  
   public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException
{
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
@@ -80,7 +192,8 @@
             withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
     Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
             abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The directory contaning points sequence files mapping input
vectors to their cluster.  If specified, then the program will output the points associated
with a cluster").withShortName("p").create();
+            withDescription("The directory containing points sequence files mapping input
vectors to their cluster.  " +
+                    "If specified, then the program will output the points associated with
a cluster").withShortName("p").create();
     Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
         abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).
         withDescription("The dictionary file. ").withShortName("d").create();
@@ -89,102 +202,65 @@
 
     Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(dictOpt).create();
 
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
       if (cmdLine.hasOption(helpOpt)) {
-
         CommandLineUtil.printHelp(group);
         return;
       }
-
-      ArrayList<String> dictionary = null;
+      if (!cmdLine.hasOption(seqOpt)) {
+        return;
+      }
+      String seqFileDir = cmdLine.getValue(seqOpt).toString();      
+      String termDictionary = null;
       if (cmdLine.hasOption(dictOpt)) {
-        dictionary = getTermDict(cmdLine.getValue(dictOpt).toString());
+        termDictionary = cmdLine.getValue(dictOpt).toString();
       }
-      
-      if (cmdLine.hasOption(seqOpt)) {
-        JobClient client = new JobClient();
-        JobConf conf = new JobConf(Job.class);
-        client.setConf(conf);
-        Map<String, List<String>> clusterIdToPoints;
-        if (cmdLine.hasOption(pointsOpt)) {
-          //read in the points
-          clusterIdToPoints = readPoints(cmdLine.getValue(pointsOpt).toString(), conf);
-        } else {
-          clusterIdToPoints = Collections.emptyMap();
-        }
-        Writer writer;
-        if (cmdLine.hasOption(outputOpt)){
-          writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
-        } else {
-          writer = new OutputStreamWriter(System.out);
-        }
-        int sub = Integer.MAX_VALUE;
-        if (cmdLine.hasOption(substringOpt)) {
-          sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
-        }
-        
-        String seqDir = cmdLine.getValue(seqOpt).toString();        
-        File[] seqFileList = new File(seqDir).listFiles();        
-        for (File seqFile : seqFileList) {          
-          if (!seqFile.isFile()) {
-            continue;
-          }          
-          Path path = new Path(seqFile.getAbsolutePath());
-          System.out.println("Input Path: " + path);
-          FileSystem fs = FileSystem.get(path.toUri(), conf);
-          SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-          Writable key = (Writable) reader.getKeyClass().newInstance();
-          ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
-          while (reader.next(key, value)){
-            Vector center = value.getCenter();
-            String fmtStr = center.asFormatString();
-            writer.append(String.valueOf(value.getId())).append(":").append("name:")
-                    .append(center.getName()).append(":").append(fmtStr.substring(0, Math.min(sub,
fmtStr.length()))).append(LINE_SEP);
-            
-            if (dictionary != null) {
-              String topTerms = getTopFeatures(center, dictionary, 10);
-              writer.write("\tTop Terms: ");
-              writer.write(topTerms);
-              writer.write(LINE_SEP);
-            }
-            
-            List<String> points = clusterIdToPoints.get(String.valueOf(value.getId()));
-            if (points != null){
-              writer.write("\tPoints: ");
-              for (Iterator<String> iterator = points.iterator(); iterator.hasNext();)
{
-                String point = iterator.next();
-                writer.append(point);
-                if (iterator.hasNext()){
-                  writer.append(", ");
-                }
-              }
-              writer.write(LINE_SEP);
-            }
-            writer.flush();
-          }
-          reader.close();
-        }
-        if (cmdLine.hasOption(outputOpt)){
-          writer.flush();
-          writer.close();
-        }
+
+      String pointsDir = null;
+      if (cmdLine.hasOption(pointsOpt)) {
+        pointsDir = cmdLine.getValue(pointsOpt).toString();
+      }
+      String outputFile = null;
+      if (cmdLine.hasOption(outputOpt)){
+        outputFile = cmdLine.getValue(outputOpt).toString();
       }
 
+      int sub = -1;
+      if (cmdLine.hasOption(substringOpt)) {
+        sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
+      }
+      
+      ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
+      
+      if (outputFile != null) {
+        clusterDumper.setOutputFile(outputFile);
+      }
+      if (termDictionary != null) {
+        clusterDumper.setTermDictionary(termDictionary);
+      }
+      if (sub > 0) {
+        clusterDumper.setSubString(sub);
+      }      
+      clusterDumper.printClusters();      
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
-
   }
 
   private static Map<String, List<String>> readPoints(String pointsPathDir, JobConf
conf) throws IOException {
     Map<String, List<String>> result = new HashMap<String, List<String>>();
     
-    File[] children = new File(pointsPathDir).listFiles();
+    File[] children = new File(pointsPathDir).listFiles(new FilenameFilter(){
+      @Override
+      public boolean accept(File file, String name) {
+        return name.endsWith(".crc") == false; 
+      }
+    });
     
     for (File file : children) {
       if (!file.isFile()) {
@@ -294,4 +370,4 @@
       return sb.toString();
   }
   
-}
\ No newline at end of file
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=894684&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
(added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
Wed Dec 30 17:27:11 2009
@@ -0,0 +1,425 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Get labels for the cluster using Log Likelihood Ratio (LLR).
+ * <p/>
+ * "The most useful way to think of this (LLR) is as the percentage of in-cluster
+ * documents that have the feature (term) versus the percentage out, keeping in
+ * mind that both percentages are uncertain since we have only a sample of all
+ * possible documents." - Ted Dunning
+ * <p/>
+ * More about LLR can be found at :
+ * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html
+ */
+public class ClusterLabels {
+
+  class TermInfoClusterInOut implements Comparable<TermInfoClusterInOut> {
+    public String term;
+    public int inClusterDF;
+    public int outClusterDF;
+    public double logLikelihoodRatio;
+
+    public TermInfoClusterInOut(String term, int inClusterDF, int outClusterDF) {
+      this.term = term;
+      this.inClusterDF = inClusterDF;
+      this.outClusterDF = outClusterDF;
+    }
+
+    @Override
+    public int compareTo(TermInfoClusterInOut that) {
+      int res = -Double.compare(logLikelihoodRatio, that.logLikelihoodRatio);
+      if (res == 0) {
+        res = term.compareTo(that.term);
+      }
+      return res;
+    }
+    /*
+    * new Comparator<TermInfoClusterInOut>() {
+              // sort in descending order on LLR value
+              @Override
+              public int compare(TermInfoClusterInOut arg0, TermInfoClusterInOut arg1) {
+                return new Double(arg1.logLikelihoodRatio).compareTo(arg0.logLikelihoodRatio);
+              }
+            }
+    *
+    * */
+
+    public int getInClusterDiff() {
+      return this.inClusterDF - this.outClusterDF;
+    }
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(ClusterLabels.class);
+  private static final String LINE_SEP = System.getProperty("line.separator");
+
+  String seqFileDir;
+  String pointsDir;
+  String indexDir;
+  String contentField;
+  String idField;
+  Map<String, List<String>> clusterIdToPoints = null;
+  String output;
+  public static final int DEFAULT_MIN_IDS = 50;
+  public static final int DEFAULT_MAX_LABELS = 25;
+  private int minNumIds = DEFAULT_MIN_IDS;
+  private int maxLabels = DEFAULT_MAX_LABELS;
+
+  public ClusterLabels(String seqFileDir, String pointsDir, String indexDir, String contentField,
int minNumIds, int maxLabels) throws IOException {
+    this.seqFileDir = seqFileDir;
+    this.pointsDir = pointsDir;
+    this.indexDir = indexDir;
+    this.contentField = contentField;
+    this.minNumIds = minNumIds;
+    this.maxLabels = maxLabels;
+    init();
+  }
+
+  private void init() throws IOException {
+    ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
+    this.clusterIdToPoints = clusterDumper.getClusterIdToPoints();
+  }
+
+  public void getLabels() throws CorruptIndexException, IOException {
+
+    Writer writer = null;
+    if (this.output != null) {
+      writer = new FileWriter(this.output);
+    } else {
+      writer = new OutputStreamWriter(System.out);
+    }
+
+    for (String clusterID : clusterIdToPoints.keySet()) {
+      List<TermInfoClusterInOut> termInfos = getClusterLabels(clusterID, clusterIdToPoints.get(clusterID));
+      if (termInfos != null) {
+        writer.write(LINE_SEP);
+        writer.write("Top labels for Cluster " + clusterID);
+        writer.write(LINE_SEP);
+        writer.write("Term \t\t LLR \t\t In-ClusterDF \t\t Out-ClusterDF ");
+        writer.write(LINE_SEP);
+        for (TermInfoClusterInOut termInfo : termInfos) {
+          writer.write(termInfo.term + "\t\t" + termInfo.logLikelihoodRatio + "\t\t" + termInfo.inClusterDF
+ "\t\t" + termInfo.outClusterDF);
+          writer.write(LINE_SEP);
+        }
+      }
+    }
+    writer.flush();
+    if (this.output != null) {
+      writer.close();
+    }
+  }
+
+  /**
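+   * Get the list of labels, sorted by best score (highest log-likelihood ratio first).
+   *
+   * @param clusterID the id of the cluster; used in the log messages
+   * @param ids the ids of the documents that belong to the cluster
+   * @return at most maxLabels terms, or null if the cluster has fewer than minNumIds documents
+   * @throws CorruptIndexException propagated from the Lucene IndexReader calls
+   * @throws IOException propagated from the Lucene IndexReader calls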
+   */
+  protected List<TermInfoClusterInOut> getClusterLabels(String clusterID, List<String>
ids) throws CorruptIndexException, IOException {
+
+    if (ids.size() < minNumIds) {
+      log.info("Skipping small cluster " + clusterID);
+      return null;
+    }
+
+    log.info("Processing Cluster " + clusterID + " with " + ids.size() + " documents");
+
+    IndexReader reader = IndexReader.open(this.indexDir, false);
+
+    log.info("# of documents in the index " + reader.numDocs());
+
+    Set<String> idSet = new HashSet<String>();
+    idSet.addAll(ids);
+
+    int numDocs = reader.numDocs();
+
+    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
+
+    log.info("Populating term infos from the index");
+
+    /**
+     * This code is the same as that of CachedTermInfo, with one major change, which is to get the document frequency.
+     *
+     * Since we have deleted the documents out of the cluster, the document frequency for a term should
+     * only include the in-cluster documents. The document frequency obtained from TermEnum reflects the
+     * frequency in the entire index. To get the in-cluster frequency, we need to query the index to get
+     * the term frequencies in each document. The number of results of this call will be the in-cluster
+     * document frequency.
+     */
+
+    TermEnum te = reader.terms(new Term(contentField, ""));
+    int count = 0;
+
+    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
+    do {
+      Term term = te.term();
+      if (term == null || term.field().equals(contentField) == false) {
+        break;
+      }
+      OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
+
+      // Generate bitset for the term
+      TermDocs termDocs = reader.termDocs(term);
+
+      while (termDocs.next()) {
+        termBitset.set(termDocs.doc());
+      }
+
+      // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
+      // This modifies the termBitset, but that's fine as we are not using it anywhere else.
+      termBitset.and(clusterDocBitset);
+      int inclusterDF = (int) termBitset.cardinality();
+
+      TermEntry entry = new TermEntry(term.text(), count++, inclusterDF);
+      termEntryMap.put(entry.term, entry);
+    } while (te.next());
+    te.close();
+
+    List<TermInfoClusterInOut> clusteredTermInfo = new LinkedList<TermInfoClusterInOut>();
+
+    int clusterSize = ids.size();
+    int corpusSize = numDocs;
+
+    for (TermEntry termEntry : termEntryMap.values()) {
+      int corpusDF = reader.terms(new Term(this.contentField, termEntry.term)).docFreq();
+      int outDF = corpusDF - termEntry.docFreq;
+      int inDF = termEntry.docFreq;
+      TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.term, inDF,
outDF);
+      double llr = scoreDocumentFrequencies(inDF, outDF, clusterSize, corpusSize);
+      termInfoCluster.logLikelihoodRatio = llr;
+      clusteredTermInfo.add(termInfoCluster);
+    }
+
+    Collections.sort(clusteredTermInfo);
+    // Cleanup
+    reader.close();
+    termEntryMap.clear();
+
+    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
+  }
+
+
+  private OpenBitSet getClusterDocBitset(IndexReader reader, Set<String> idSet, String
idField) throws CorruptIndexException, IOException {
+    int numDocs = reader.numDocs();
+
+    OpenBitSet bitset = new OpenBitSet(numDocs);
+
+    FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField),
Collections.emptySet());
+
+    for (int i = 0; i < numDocs; i++) {
+      String id = null;
+      // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
+      if (idField == null) {
+        id = Integer.toString(i);
+      } else {
+        id = reader.document(i, idFieldSelector).get(idField);
+      }
+      if (idSet.contains(id)) {
+        bitset.set(i);
+      }
+    }
+    log.info("Created bitset for in-cluster documents : " + bitset.cardinality());
+    return bitset;
+  }
+
+  private double scoreDocumentFrequencies(int inDF, int outDF, int clusterSize, int corpusSize)
{
+    int k12 = clusterSize - inDF;
+    int k22 = corpusSize - clusterSize - outDF;
+
+    return logLikelihoodRatio(inDF, k12, outDF, k22);
+  }
+
+
+  private double entropy(int ... elements) {
+    double sum = 0;
+    for (int element : elements) {
+      sum += element;
+    }
+    double result = 0.0;
+    for (int x : elements) {
+      if (x < 0) {
+        throw new IllegalArgumentException("Should not have negative count for entropy computation:
(" + x + ")");
+      }
+      int zeroFlag = (x == 0 ? 1 : 0);
+      result += x * Math.log((x + zeroFlag) / sum);
+    }
+    return -result;
+  }
+
+  /**
+   * Calculate the log-likelihood ratio for the given 2x2 matrix of counts.
+   */
+  private double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+    double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
+    double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
+    double matrixEntropy = entropy(k11, k12, k21, k22);
+    return 2 * (matrixEntropy - rowEntropy - columnEntropy);
+  }
+
+  public String getIdField() {
+    return idField;
+  }
+
+  public void setIdField(String idField) {
+    this.idField = idField;
+  }
+
+  public String getOutput() {
+    return output;
+  }
+
+  public void setOutput(String output) {
+    this.output = output;
+  }
+
+  /**
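+   * Parses the command-line options and writes the cluster labels; IOExceptions are logged, not thrown.
+   * @param args the command-line arguments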
+   */
+  public static void main(String[] args) {
+
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
+            abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Lucene index directory").withShortName("d").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The output file. If not specified, the result is printed on
console.").withShortName("o").create();
+
+    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
+            abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
+            withDescription("The content field in the index").withShortName("f").create();
+
+    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
+            abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
+            withDescription("The field for the document ID in the index.  If null, then the
Lucene internal doc " +
+                    "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+
+    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true).withArgument(
+            abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).
+            withDescription("The directory containing Sequence Files for the Clusters").withShortName("s").create();
+
+    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument(
+            abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).
+            withDescription("The directory containing points sequence files mapping input
vectors to their cluster.  ").withShortName("p").create();
+    Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument(
+            abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).
+            withDescription("The minimum number of points required in a cluster to print
the labels for").withShortName("m").create();
+    Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false).withArgument(
+            abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create()).
+            withDescription("The maximum number of labels to print per cluster").withShortName("x").create();
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
+            .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt).withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String seqFileDir = cmdLine.getValue(seqOpt).toString();
+      String pointsDir = cmdLine.getValue(pointsOpt).toString();
+      String indexDir = cmdLine.getValue(indexOpt).toString();
+      String contentField = cmdLine.getValue(fieldOpt).toString();
+
+
+      String idField = null;
+      String output = null;
+
+      if (cmdLine.hasOption(idFieldOpt)) {
+        idField = cmdLine.getValue(idFieldOpt).toString();
+      }
+      if (cmdLine.hasOption(outputOpt)) {
+        output = cmdLine.getValue(outputOpt).toString();
+      }
+      int maxLabels = DEFAULT_MAX_LABELS;
+      if (cmdLine.hasOption(maxLabelsOpt)) {
+        maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
+      }
+      int minSize = DEFAULT_MIN_IDS;
+      if (cmdLine.hasOption(minClusterSizeOpt)) {
+        minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
+      }
+      ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField,
minSize, maxLabels);
+
+      if (idField != null) {
+        clusterLabel.setIdField(idField);
+      }
+      if (output != null) {
+        clusterLabel.setOutput(output);
+      }
+
+      clusterLabel.getLabels();
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    } catch (IOException e) {
+      log.error("Exception", e);
+    }
+  }
+
+}

Propchange: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message