mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r992920 - /mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
Date Mon, 06 Sep 2010 01:16:39 GMT
Author: drew
Date: Mon Sep  6 01:16:38 2010
New Revision: 992920

URL: http://svn.apache.org/viewvc?rev=992920&view=rev
Log:
Throws IllegalArgumentException if input directory does not exist (instead of exiting silently).
Added whitespace for readability.

Modified:
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=992920&r1=992919&r2=992920&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Mon Sep  6 01:16:38 2010
@@ -56,74 +56,82 @@ import org.slf4j.LoggerFactory;
 
 public final class Driver {
   private static final Logger log = LoggerFactory.getLogger(Driver.class);
-  
+
   private Driver() { }
-  
+
   public static void main(String[] args) throws IOException {
+
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
       abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
         .withDescription("The Lucene directory").withShortName("d").create();
-    
+
     Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
       abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
         .withShortName("o").create();
-    
+
     Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
       abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
       "The field in the index").withShortName("f").create();
-    
+
     Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
       abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
       "The field in the index containing the index.  If null, then the Lucene internal doc "
           + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-    
+
     Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
       abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
       "The output of the dictionary").withShortName("t").create();
-    
+
     Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
       abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
       "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-    
+
     Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
       abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
       "The delimiter for outputing the dictionary").withShortName("l").create();
+
     Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
       abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
       "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  "
           + "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
+
     Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
       abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
       "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
         .withShortName("m").create();
-    
+
     Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
       abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
       "The VectorWriter to use, either seq "
           + "(SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)")
         .withShortName("e").create();
+
     Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
       abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
       "The minimum document frequency.  Default is 1").withShortName("md").create();
+
     Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
       abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
       "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
           + "  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
+
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
+
     Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
       outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
         .withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
         .withOption(weightOpt).withOption(minDFOpt).create();
+
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
         
         CommandLineUtil.printHelp(group);
@@ -132,107 +140,121 @@ public final class Driver {
       // Springify all this
       if (cmdLine.hasOption(inputOpt)) { // Lucene case
         File file = new File(cmdLine.getValue(inputOpt).toString());
-        if (file.exists() && file.isDirectory()) {
-          long maxDocs = Long.MAX_VALUE;
-          if (cmdLine.hasOption(maxOpt)) {
-            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
-          }
-          if (maxDocs < 0) {
-            throw new IllegalArgumentException("maxDocs must be >= 0");
-          }
-          Directory dir = FSDirectory.open(file);
-          IndexReader reader = IndexReader.open(dir, true);
-          Weight weight;
-          if (cmdLine.hasOption(weightOpt)) {
-            String wString = cmdLine.getValue(weightOpt).toString();
-            if (wString.equalsIgnoreCase("tf")) {
-              weight = new TF();
-            } else if (wString.equalsIgnoreCase("tfidf")) {
-              weight = new TFIDF();
-            } else {
-              throw new OptionException(weightOpt);
-            }
-          } else {
+        if (!file.isDirectory()) {
+          throw new IllegalArgumentException("Lucene directory: " + file.getName() + 
+              " does not exist or is not a directory");
+        }
+
+        long maxDocs = Long.MAX_VALUE;
+        if (cmdLine.hasOption(maxOpt)) {
+          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
+        }
+        if (maxDocs < 0) {
+          throw new IllegalArgumentException("maxDocs must be >= 0");
+        }
+
+        Directory dir = FSDirectory.open(file);
+        IndexReader reader = IndexReader.open(dir, true);
+
+        Weight weight;
+        if (cmdLine.hasOption(weightOpt)) {
+          String wString = cmdLine.getValue(weightOpt).toString();
+          if (wString.equalsIgnoreCase("tf")) {
+            weight = new TF();
+          } else if (wString.equalsIgnoreCase("tfidf")) {
             weight = new TFIDF();
+          } else {
+            throw new OptionException(weightOpt);
           }
-          String field = cmdLine.getValue(fieldOpt).toString();
-          int minDf = 1;
-          if (cmdLine.hasOption(minDFOpt)) {
-            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
-          }
-          int maxDFPercent = 99;
-          if (cmdLine.hasOption(maxDFPercentOpt)) {
-            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
-          }
-          TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-          VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-          double norm = LuceneIterable.NO_NORMALIZING;
-          if (cmdLine.hasOption(powerOpt)) {
-            String power = cmdLine.getValue(powerOpt).toString();
-            if (power.equals("INF")) {
-              norm = Double.POSITIVE_INFINITY;
-            } else {
-              norm = Double.parseDouble(power);
-            }
-          }
-          String idField = null;
-          if (cmdLine.hasOption(idFieldOpt)) {
-            idField = cmdLine.getValue(idFieldOpt).toString();
-          }
-          LuceneIterable iterable;
-          if (norm == LuceneIterable.NO_NORMALIZING) {
-            iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
+        } else {
+          weight = new TFIDF();
+        }
+
+        String field = cmdLine.getValue(fieldOpt).toString();
+
+        int minDf = 1;
+        if (cmdLine.hasOption(minDFOpt)) {
+          minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
+        }
+
+        int maxDFPercent = 99;
+        if (cmdLine.hasOption(maxDFPercentOpt)) {
+          maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
+        }
+
+        TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
+        VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+
+        double norm = LuceneIterable.NO_NORMALIZING;
+        if (cmdLine.hasOption(powerOpt)) {
+          String power = cmdLine.getValue(powerOpt).toString();
+          if (power.equals("INF")) {
+            norm = Double.POSITIVE_INFINITY;
           } else {
-            iterable = new LuceneIterable(reader, idField, field, mapper, norm);
+            norm = Double.parseDouble(power);
           }
-          String outFile = cmdLine.getValue(outputOpt).toString();
-          log.info("Output File: {}", outFile);
-          
-          VectorWriter vectorWriter;
-          if (cmdLine.hasOption(outWriterOpt)) {
-            String outWriter = cmdLine.getValue(outWriterOpt).toString();
-            if (outWriter.equals("file")) {
-              BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
-              vectorWriter = new JWriterVectorWriter(writer);
-            } else {
-              vectorWriter = getSeqFileWriter(outFile);
-            }
+        }
+
+        String idField = null;
+        if (cmdLine.hasOption(idFieldOpt)) {
+          idField = cmdLine.getValue(idFieldOpt).toString();
+        }
+
+        LuceneIterable iterable;
+        if (norm == LuceneIterable.NO_NORMALIZING) {
+          iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
+        } else {
+          iterable = new LuceneIterable(reader, idField, field, mapper, norm);
+        }
+
+        String outFile = cmdLine.getValue(outputOpt).toString();
+        log.info("Output File: {}", outFile);
+
+        VectorWriter vectorWriter;
+        if (cmdLine.hasOption(outWriterOpt)) {
+          String outWriter = cmdLine.getValue(outWriterOpt).toString();
+          if (outWriter.equals("file")) {
+            BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+            vectorWriter = new JWriterVectorWriter(writer);
           } else {
             vectorWriter = getSeqFileWriter(outFile);
           }
-          
-          long numDocs = vectorWriter.write(iterable, maxDocs);
-          vectorWriter.close();
-          log.info("Wrote: {} vectors", numDocs);
-          
-          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
-              : "\t";
-          File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
-          log.info("Dictionary Output file: {}", dictOutFile);
-          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
-              new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
-          JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
-          tiWriter.write(termInfo);
-          tiWriter.close();
-          writer.close();
+        } else {
+          vectorWriter = getSeqFileWriter(outFile);
         }
+
+        long numDocs = vectorWriter.write(iterable, maxDocs);
+        vectorWriter.close();
+        log.info("Wrote: {} vectors", numDocs);
+
+        String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+        
+        File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
+        log.info("Dictionary Output file: {}", dictOutFile);
+        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+            new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+        JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
+        tiWriter.write(termInfo);
+        tiWriter.close();
+        writer.close();
+
       }
-      
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
     Path path = new Path(outFile);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     // TODO: Make this parameter driven
+
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
       VectorWritable.class);
-    
+
     return new SequenceFileVectorWriter(seqWriter);
   }
-  
+
 }



Mime
View raw message