Return-Path: Delivered-To: apmail-mahout-commits-archive@www.apache.org Received: (qmail 3859 invoked from network); 6 Sep 2010 01:17:01 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 6 Sep 2010 01:17:01 -0000 Received: (qmail 70126 invoked by uid 500); 6 Sep 2010 01:17:01 -0000 Delivered-To: apmail-mahout-commits-archive@mahout.apache.org Received: (qmail 70003 invoked by uid 500); 6 Sep 2010 01:17:00 -0000 Mailing-List: contact commits-help@mahout.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@mahout.apache.org Delivered-To: mailing list commits@mahout.apache.org Received: (qmail 69996 invoked by uid 99); 6 Sep 2010 01:17:00 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 06 Sep 2010 01:17:00 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 06 Sep 2010 01:16:59 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 0F9D223888EA; Mon, 6 Sep 2010 01:16:39 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r992920 - /mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Date: Mon, 06 Sep 2010 01:16:39 -0000 To: commits@mahout.apache.org From: drew@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100906011639.0F9D223888EA@eris.apache.org> Author: drew Date: Mon Sep 6 01:16:38 2010 New Revision: 992920 URL: http://svn.apache.org/viewvc?rev=992920&view=rev Log: Throws IllegalArgumentException if input directory does not exist (instead of exiting silently). Added whitespace for readability. Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=992920&r1=992919&r2=992920&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Mon Sep 6 01:16:38 2010 @@ -56,74 +56,82 @@ import org.slf4j.LoggerFactory; public final class Driver { private static final Logger log = LoggerFactory.getLogger(Driver.class); - + private Driver() { } - + public static void main(String[] args) throws IOException { + DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); - + Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument( abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) .withDescription("The Lucene directory").withShortName("d").create(); - + Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument( abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file") .withShortName("o").create(); - + Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument( abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription( "The field in the index").withShortName("f").create(); - + Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument( abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription( "The field in the index containing the index. If null, then the Lucene internal doc " + "id is used which is prone to error if the underlying index changes").withShortName("i").create(); - + Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument( abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription( "The output of the dictionary").withShortName("t").create(); - + Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument( abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription( "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create(); - + Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument( abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription( "The delimiter for outputing the dictionary").withShortName("l").create(); + Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument( abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription( "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. " + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create(); + Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument( abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription( "The maximum number of vectors to output. If not specified, then it will loop over all docs") .withShortName("m").create(); - + Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument( abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription( "The VectorWriter to use, either seq " + "(SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)") .withShortName("e").create(); + Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument( abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription( "The minimum document frequency. Default is 1").withShortName("md").create(); + Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument( abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription( "The max percentage of docs for the DF. Can be used to remove really high frequency terms." + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create(); + Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); + Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption( outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt) .withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt) .withOption(weightOpt).withOption(minDFOpt).create(); + try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); - + if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); @@ -132,107 +140,121 @@ public final class Driver { // Springify all this if (cmdLine.hasOption(inputOpt)) { // Lucene case File file = new File(cmdLine.getValue(inputOpt).toString()); - if (file.exists() && file.isDirectory()) { - long maxDocs = Long.MAX_VALUE; - if (cmdLine.hasOption(maxOpt)) { - maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); - } - if (maxDocs < 0) { - throw new IllegalArgumentException("maxDocs must be >= 0"); - } - Directory dir = FSDirectory.open(file); - IndexReader reader = IndexReader.open(dir, true); - Weight weight; - if (cmdLine.hasOption(weightOpt)) { - String wString = cmdLine.getValue(weightOpt).toString(); - if (wString.equalsIgnoreCase("tf")) { - weight = new TF(); - } else if (wString.equalsIgnoreCase("tfidf")) { - weight = new TFIDF(); - } else { - throw new OptionException(weightOpt); - } - } else { + if (!file.isDirectory()) { + throw new IllegalArgumentException("Lucene directory: " + file.getName() + + " does not exist or is not a directory"); + } + + long maxDocs = Long.MAX_VALUE; + if (cmdLine.hasOption(maxOpt)) { + maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); + } + if (maxDocs < 0) { + throw new IllegalArgumentException("maxDocs must be >= 0"); + } + + Directory dir = FSDirectory.open(file); + IndexReader reader = IndexReader.open(dir, true); + + Weight weight; + if (cmdLine.hasOption(weightOpt)) { + String wString = cmdLine.getValue(weightOpt).toString(); + if (wString.equalsIgnoreCase("tf")) { + weight = new TF(); + } else if (wString.equalsIgnoreCase("tfidf")) { weight = new TFIDF(); + } else { + throw new OptionException(weightOpt); } - String field = cmdLine.getValue(fieldOpt).toString(); - int minDf = 1; - if (cmdLine.hasOption(minDFOpt)) { - minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString()); - } - int maxDFPercent = 99; - if (cmdLine.hasOption(maxDFPercentOpt)) { - maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()); - } - TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent); - VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); - double norm = LuceneIterable.NO_NORMALIZING; - if (cmdLine.hasOption(powerOpt)) { - String power = cmdLine.getValue(powerOpt).toString(); - if (power.equals("INF")) { - norm = Double.POSITIVE_INFINITY; - } else { - norm = Double.parseDouble(power); - } - } - String idField = null; - if (cmdLine.hasOption(idFieldOpt)) { - idField = cmdLine.getValue(idFieldOpt).toString(); - } - LuceneIterable iterable; - if (norm == LuceneIterable.NO_NORMALIZING) { - iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING); + } else { + weight = new TFIDF(); + } + + String field = cmdLine.getValue(fieldOpt).toString(); + + int minDf = 1; + if (cmdLine.hasOption(minDFOpt)) { + minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString()); + } + + int maxDFPercent = 99; + if (cmdLine.hasOption(maxDFPercentOpt)) { + maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()); + } + + TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent); + VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); + + double norm = LuceneIterable.NO_NORMALIZING; + if (cmdLine.hasOption(powerOpt)) { + String power = cmdLine.getValue(powerOpt).toString(); + if (power.equals("INF")) { + norm = Double.POSITIVE_INFINITY; } else { - iterable = new LuceneIterable(reader, idField, field, mapper, norm); + norm = Double.parseDouble(power); } - String outFile = cmdLine.getValue(outputOpt).toString(); - log.info("Output File: {}", outFile); - - VectorWriter vectorWriter; - if (cmdLine.hasOption(outWriterOpt)) { - String outWriter = cmdLine.getValue(outWriterOpt).toString(); - if (outWriter.equals("file")) { - BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); - vectorWriter = new JWriterVectorWriter(writer); - } else { - vectorWriter = getSeqFileWriter(outFile); - } + } + + String idField = null; + if (cmdLine.hasOption(idFieldOpt)) { + idField = cmdLine.getValue(idFieldOpt).toString(); + } + + LuceneIterable iterable; + if (norm == LuceneIterable.NO_NORMALIZING) { + iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING); + } else { + iterable = new LuceneIterable(reader, idField, field, mapper, norm); + } + + String outFile = cmdLine.getValue(outputOpt).toString(); + log.info("Output File: {}", outFile); + + VectorWriter vectorWriter; + if (cmdLine.hasOption(outWriterOpt)) { + String outWriter = cmdLine.getValue(outWriterOpt).toString(); + if (outWriter.equals("file")) { + BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); + vectorWriter = new JWriterVectorWriter(writer); } else { vectorWriter = getSeqFileWriter(outFile); } - - long numDocs = vectorWriter.write(iterable, maxDocs); - vectorWriter.close(); - log.info("Wrote: {} vectors", numDocs); - - String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() - : "\t"; - File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString()); - log.info("Dictionary Output file: {}", dictOutFile); - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream(dictOutFile), Charset.forName("UTF8"))); - JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field); - tiWriter.write(termInfo); - tiWriter.close(); - writer.close(); + } else { + vectorWriter = getSeqFileWriter(outFile); } + + long numDocs = vectorWriter.write(iterable, maxDocs); + vectorWriter.close(); + log.info("Wrote: {} vectors", numDocs); + + String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t"; + + File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString()); + log.info("Dictionary Output file: {}", dictOutFile); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(dictOutFile), Charset.forName("UTF8"))); + JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field); + tiWriter.write(termInfo); + tiWriter.close(); + writer.close(); + } - } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } } - + private static VectorWriter getSeqFileWriter(String outFile) throws IOException { Path path = new Path(outFile); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); // TODO: Make this parameter driven + SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, VectorWritable.class); - + return new SequenceFileVectorWriter(seqWriter); } - + }