mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r800084 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/arff/ main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/arff/ test/java/org/apache/mahout/utils/vectors/lucene/
Date Sun, 02 Aug 2009 14:19:56 GMT
Author: gsingers
Date: Sun Aug  2 14:19:55 2009
New Revision: 800084

URL: http://svn.apache.org/viewvc?rev=800084&view=rev
Log:
MAHOUT-155: Driver, converter for ARFF, also some updates to Lucene

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
      - copied, changed from r799873, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
      - copied, changed from r799870, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
Removed:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=800084&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java Sun Aug  2 14:19:55 2009
@@ -0,0 +1,70 @@
+package org.apache.mahout.utils.vectors.arff;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+
+
+/**
+ * An interface for representing the model of an ARFF file: its relation name, attribute labels and
+ * types, nominal values, and date formats.  Implementations can decide how best to store the model;
+ * some approaches will be fine for smaller files, while larger ones may require a more scalable one.
+ *
+ **/
+public interface ARFFModel {
+  public static final DateFormat DEFAULT_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+  public static final String ARFF_SPARSE = "{";//indicates the vector is sparse
+  public static final String ARFF_COMMENT = "%";
+  public static final String ATTRIBUTE = "@attribute";
+  public static final String DATA = "@data";
+  public static final String RELATION = "@relation";
+
+
+  String getRelation();
+
+  void setRelation(String relation);
+
+  /**
+   * The vector attributes (labels in Mahout speak)
+   * @return the map
+   */
+  Map<String, Integer> getLabelBindings();
+
+  Integer getNominalValue(String nominal);
+
+  void addNominal(String nominal, int idx);
+
+  DateFormat getDateFormat(Integer idx);
+
+  void addDateFormat(Integer idx, DateFormat format);
+
+  Integer getLabelIndex(String label);
+
+  void addLabel(String label, Integer idx);
+
+  ARFFType getARFFType(Integer idx);
+
+  void addType(Integer idx, ARFFType type);
+
+  /**
+   * The count of the number of words seen
+   * @return the count
+   */
+  long getWordCount();
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java?rev=800084&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java Sun Aug  2 14:19:55 2009
@@ -0,0 +1,20 @@
+package org.apache.mahout.utils.vectors.arff;
+
+public enum ARFFType {
+  NUMERIC("numeric"), NOMINAL("{"), DATE("date"), STRING("string");
+
+  private String indicator;
+  ARFFType(String indicator) {
+    this.indicator = indicator;
+  }
+
+  public String getIndicator() {
+    return indicator;
+  }
+
+  
+
+  public String getLabel(String line) {
+    return line.substring(ARFFModel.ATTRIBUTE.length(), line.length() - indicator.length()).trim();
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=800084&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Sun Aug  2 14:19:55 2009
@@ -0,0 +1,216 @@
+package org.apache.mahout.utils.vectors.arff;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.text.DateFormat;
+import java.util.Iterator;
+
+
+/**
+ * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link org.apache.mahout.matrix.Vector}s
+ * <p/>
+ * Attribute type handling:
+ * <ul>
+ * <li>Numeric -> As is</li>
+ * <li>Nominal -> ordinal(value), e.g. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''} will convert -inf-0.5 -> 0 and 0.5-inf -> 1</li>
+ * <li>Dates -> Convert to time as a long</li>
+ * <li>Strings -> Create a map of String -> long</li>
+ * </ul>
+ * <p/>
+ * <p/>
+ * <p/>
+ * <p/>
+ * NOTE: This class does not set the label bindings on every vector.  If you want the label
+ * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector.
+ */
+public class ARFFVectorIterable implements VectorIterable {
+
+  protected BufferedReader buff;
+  protected boolean inData;
+  protected MapBackedARFFModel model;
+
+
+  public ARFFVectorIterable(File file) throws IOException {
+    this(new FileReader(file));
+  }
+
+  public ARFFVectorIterable(File file, Charset encoding) throws IOException {
+    this(new InputStreamReader(new FileInputStream(file), encoding));
+  }
+
+  public ARFFVectorIterable(String arff) throws IOException {
+    this(new StringReader(arff));
+  }
+
+  public ARFFVectorIterable(Reader reader) throws IOException {
+    if (reader instanceof BufferedReader) {
+      buff = (BufferedReader) reader;
+    } else {
+      buff = new BufferedReader(reader);
+    }
+    //grab the attributes, then start the iterator at the first line of data
+    String line = null;
+    int labelNumber = 0;
+    inData = false;
+    model = new MapBackedARFFModel();
+
+    while ((line = buff.readLine()) != null) {
+      line = line.trim();
+      String lower = line.toLowerCase();
+      ARFFType type;
+      Integer labelNumInt = new Integer(labelNumber);
+      if (lower.startsWith(ARFFModel.ARFF_COMMENT)) {
+        continue;
+      } else if (lower.startsWith(ARFFModel.RELATION)) {
+        model.setRelation(line.substring(ARFFModel.RELATION.length()).trim());
+      } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) {
+        String label;
+        if (lower.indexOf(ARFFType.NUMERIC.getIndicator()) != -1) {
+          label = ARFFType.NUMERIC.getLabel(line);
+          type = ARFFType.NUMERIC;
+        } else if (lower.indexOf(ARFFType.STRING.getIndicator()) != -1) {
+          label = ARFFType.STRING.getLabel(line);
+          type = ARFFType.STRING;
+          //TODO: create a map so we know which
+
+        } else if (lower.indexOf(ARFFType.NOMINAL.getIndicator()) != -1) {
+          label = ARFFType.NOMINAL.getLabel(line);
+          type = ARFFType.NOMINAL;
+          //@ATTRIBUTE class        {Iris-setosa,Iris-versicolor,Iris-virginica}
+          int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
+          String [] classes = line.substring(classIdx + 1, line.length() - 1).split(",");
+          for (int i = 0; i < classes.length; i++) {
+            model.addNominal(classes[i].trim(), i);
+          }
+
+        } else if (lower.indexOf(ARFFType.DATE.getIndicator()) != -1) {
+          label = ARFFType.DATE.getLabel(line);
+          type = ARFFType.DATE;
+          //TODO: DateFormatter map
+          DateFormat format = ARFFModel.DEFAULT_DATE_FORMAT;
+          int idx = lower.indexOf(ARFFType.DATE.getIndicator());
+          String[] split = line.split(" ");
+          if (split.length >= 4) {//we have a date format
+            String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
+            if (formStr.startsWith("\"")) {
+              formStr = formStr.substring(1, formStr.length() - 1);
+            }
+            format = new SimpleDateFormat(formStr);
+          }
+          model.addDateFormat(labelNumInt, format);
+          //@attribute <name> date [<date-format>]
+        } else {
+          throw new UnsupportedOperationException("Invalid attribute: " + line);
+        }
+        model.addLabel(label, labelNumInt);
+        model.addType(labelNumInt, type);
+        labelNumber++;
+      } else if (lower.startsWith(ARFFModel.DATA)) {
+        inData = true;
+        break;//skip it
+      }
+    }
+
+  }
+
+
+  /*public ARFFVectorIterable(SequenceFile seqFile){
+
+  }*/
+
+  @Override
+  public Iterator<Vector> iterator() {
+    return new ARFFIterator();
+  }
+
+  class ARFFIterator implements Iterator<Vector> {
+    String line = null;
+
+    @Override
+    public boolean hasNext() {
+      boolean result = false;
+      try {
+        while ((line = buff.readLine()) != null) {
+          line = line.trim();
+          if (line.equals("") == false && line.startsWith(ARFFModel.ARFF_COMMENT) == false) {
+            break;
+          }
+        }
+        if (line != null) {
+          result = true;
+        }
+      } catch (IOException e) {
+        result = false;
+      }
+      return result;
+    }
+
+    @Override
+    public Vector next() {
+      Vector result = null;
+      if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
+        line = line.substring(1, line.length() - 1);
+        String[] splits = line.split(",");
+        result = new SparseVector(model.getLabelSize());
+        for (int i = 0; i < splits.length; i++) {
+          String[] data = splits[i].split(" ");//first is index, second is the value
+          int idx = Integer.parseInt(data[0]);
+
+          result.setQuick(idx, model.getValue(data[1], idx));
+        }
+      } else {
+        result = new DenseVector(model.getLabelSize());
+        String[] splits = line.split(",");
+        for (int i = 0; i < splits.length; i++) {
+          result.setQuick(i, model.getValue(splits[i], i));
+        }
+      }
+      //result.setLabelBindings(labelBindings);
+      return result;
+    }
+
+
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException("remove not supported");
+    }
+  }
+
+  /**
+   * Returns info about the ARFF content that was parsed.
+   * @return the model
+   */
+  public MapBackedARFFModel getModel() {
+    return model;
+  }
+}

Copied: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java (from r799873, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java?p2=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java&p1=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java&r1=799873&r2=800084&rev=800084&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java Sun Aug  2 14:19:55 2009
@@ -1,4 +1,4 @@
-package org.apache.mahout.utils.vectors.lucene;
+package org.apache.mahout.utils.vectors.arff;
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,16 +29,9 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
 import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.utils.CommandLineUtil;
-import org.apache.mahout.utils.vectors.TF;
-import org.apache.mahout.utils.vectors.TFIDF;
-import org.apache.mahout.utils.vectors.TermInfo;
-import org.apache.mahout.utils.vectors.Weight;
-import org.apache.mahout.utils.vectors.io.JWriterTermInfoWriter;
+import org.apache.mahout.utils.strings.StringUtil;
 import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
 import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
 import org.apache.mahout.utils.vectors.io.VectorWriter;
@@ -49,9 +42,11 @@
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
+import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.nio.charset.Charset;
+import java.util.Map;
 
 
 /**
@@ -60,63 +55,40 @@
  **/
 public class Driver {
   private transient static Logger log = LoggerFactory.getLogger(Driver.class);
-  //TODO: This assumes LuceneIterable, make it generic.
 
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
 
-    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-            abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Lucene directory").withShortName("d").create();
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+            withDescription("The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted").withShortName("d").create();
 
     Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
             abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output file").withShortName("o").create();
+            withDescription("The output directory.  Files will have the same name as the input, but with the extension .mvc").withShortName("o").create();
 
-    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-            abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
-            withDescription("The field in the index").withShortName("f").create();
-
-    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-            abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
-            withDescription("The field in the index containing the index.  If null, then the Lucene internal doc " +
-                    "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+            abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+            withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
 
     Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
             abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output of the dictionary").withShortName("t").create();
-
-    Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
-            abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).
-            withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+            withDescription("The file to output the label bindings").withShortName("t").create();
 
     Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
             abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
             withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
-    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
-            abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).
-            withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  " +
-                    "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
-    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-            abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
-            withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
 
     Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
             abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).
             withDescription("The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)").withShortName("e").create();
-    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
-            abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).
-            withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();
-    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-            abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).
-            withDescription("The max percentage of docs for the DF.  Can be used to remove really high frequency terms.  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
+
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
-            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
-            .withOption(weightOpt).withOption(minDFOpt).create();
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
+            .withOption(helpOpt).withOption(dictOutOpt).withOption(outWriterOpt).withOption(delimiterOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -127,91 +99,36 @@
         CommandLineUtil.printHelp(group);
         return;
       }
-      //Springify all this
       if (cmdLine.hasOption(inputOpt)) {//Lucene case
-        File file = new File(cmdLine.getValue(inputOpt).toString());
-        if (file.exists() && file.isDirectory()) {
-          long maxDocs = Long.MAX_VALUE;
-          if (cmdLine.hasOption(maxOpt)) {
-            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
-          }
-          if (maxDocs < 0) {
-            throw new IllegalArgumentException("maxDocs must be >= 0");
-          }
-          Directory dir = FSDirectory.open(file);
-          IndexReader reader = IndexReader.open(dir, true);
-          Weight weight = null;
-          if (cmdLine.hasOption(weightOpt)) {
-            String wString = cmdLine.getValue(weightOpt).toString();
-            if (wString.equalsIgnoreCase("tf")) {
-              weight = new TF();
-            } else if (wString.equalsIgnoreCase("tfidf")) {
-              weight = new TFIDF();
-            } else {
-              throw new OptionException(weightOpt);
-            }
-          } else {
-            weight = new TFIDF();
-          }
-          String field = cmdLine.getValue(fieldOpt).toString();
-          int minDf = 1;
-          if (cmdLine.hasOption(minDFOpt)) {
-            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
-          }
-          int maxDFPercent = 99;
-          if (cmdLine.hasOption(maxDFPercentOpt)) {
-            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
-          }
-          TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
-          VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-          LuceneIteratable iteratable = null;
-          String power = null;
-          double norm = -1;
-          if (cmdLine.hasOption(powerOpt)) {
-            power = cmdLine.getValue(powerOpt).toString();
-            if (power.equals("INF")) {
-              norm = Double.POSITIVE_INFINITY;
-            } else {
-              norm = Double.parseDouble(power);
-            }
-          }
-          String idField = null;
-          if (cmdLine.hasOption(idFieldOpt)) {
-            idField = cmdLine.getValue(idFieldOpt).toString();
-          }
-          if (norm == LuceneIteratable.NO_NORMALIZING) {
-            iteratable = new LuceneIteratable(reader, idField, field, mapper, LuceneIteratable.NO_NORMALIZING);
-          } else {
-            iteratable = new LuceneIteratable(reader, idField, field, mapper, norm);
-          }
-          String outFile = cmdLine.getValue(outputOpt).toString();
-          log.info("Output File: " + outFile);
-
-          VectorWriter vectorWriter;
-          if (cmdLine.hasOption(outWriterOpt)) {
-            String outWriter = cmdLine.getValue(outWriterOpt).toString();
-            if (outWriter.equals("file")) {
-              BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
-              vectorWriter = new JWriterVectorWriter(writer);
-            } else {
-              vectorWriter = getSeqFileWriter(outFile);
+        File input = new File(cmdLine.getValue(inputOpt).toString());
+        long maxDocs = Long.MAX_VALUE;
+        if (cmdLine.hasOption(maxOpt)) {
+          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
+        }
+        if (maxDocs < 0) {
+          throw new IllegalArgumentException("maxDocs must be >= 0");
+        }
+        String outDir = cmdLine.getValue(outputOpt).toString();
+        log.info("Output Dir: " + outDir);
+        String outWriter = null;
+        if (cmdLine.hasOption(outWriterOpt)) {
+          outWriter = cmdLine.getValue(outWriterOpt).toString();
+        }
+        String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+        File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
+        if (input.exists() && input.isDirectory()) {
+          File[] files = input.listFiles(new FilenameFilter() {
+            @Override
+            public boolean accept(File file, String name) {
+              return name.endsWith(".arff");
             }
-          } else {
-            vectorWriter = getSeqFileWriter(outFile);
+          });
+          for (int i = 0; i < files.length; i++) {
+            File file = files[i];
+            writeFile(dictOut, delimiter, outWriter, outDir, file, maxDocs);
           }
-
-          long numDocs = vectorWriter.write(iteratable, maxDocs);
-          vectorWriter.close();
-          log.info("Wrote: " + numDocs + " vectors");
-
-          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
-          File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
-          log.info("Dictionary Output file: " + dictOutFile);
-          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
-          JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
-          tiWriter.write(termInfo);
-          tiWriter.close();
-          writer.close();
+        } else {
+          writeFile(dictOut, delimiter, outWriter, outDir, input, maxDocs);
         }
       }
 
@@ -221,17 +138,44 @@
     }
   }
 
+  private static void writeFile(File dictOut, String delimiter, String outWriter, String outDir, File file, long maxDocs) throws IOException {
+    ARFFVectorIterable iteratable = new ARFFVectorIterable(file);
+    String outFile = outDir + "/" + file.getName() + ".mvc";
+
+    VectorWriter vectorWriter;
+    if (outWriter != null ) {
+      if (outWriter.equals("file")) {
+        BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+        vectorWriter = new JWriterVectorWriter(writer);
+      } else {
+        vectorWriter = getSeqFileWriter(outFile);
+      }
+    } else {
+      vectorWriter = getSeqFileWriter(outFile);
+    }
+
+    long numDocs = vectorWriter.write(iteratable, maxDocs);
+    vectorWriter.close();
+    log.info("Wrote: " + numDocs + " vectors");
+    
+    log.info("Dictionary Output file: " + dictOut);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOut), Charset.forName("UTF8")));
+    Map<String, Integer> labels = iteratable.getModel().getLabelBindings();
+    for (Map.Entry<String, Integer> entry : labels.entrySet()) {
+      writer.append(entry.getKey()).append(delimiter).append(String.valueOf(entry.getValue())).append(StringUtil.LINE_SEP);
+    }
+    writer.close();
+  }
+
   private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
     VectorWriter sfWriter;
     Path path = new Path(outFile);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    //TODO: Make this parameter driven
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, SparseVector.class);
-
     sfWriter = new SequenceFileVectorWriter(seqWriter);
     return sfWriter;
   }
 
 
-}
+}
\ No newline at end of file

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=800084&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Sun Aug  2 14:19:55 2009
@@ -0,0 +1,232 @@
+package org.apache.mahout.utils.vectors.arff;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Collections;
+
+
+/**
+ * Holds ARFF information in {@link java.util.Map}.
+ */
+public class MapBackedARFFModel implements ARFFModel {
+
+  protected long wordCount = 1;
+
+  protected String relation;
+
+  private Map<String, Integer> labelBindings;
+  private Map<Integer, ARFFType> typeMap; //key is the vector index, value is the type
+  private Map<Integer, DateFormat> dateMap;
+  private Map<String, Integer> nominalMap;
+  private Map<String, Long> words;
+
+  public MapBackedARFFModel() {
+    labelBindings = new HashMap<String, Integer>();
+    words = new HashMap<String, Long>();
+    typeMap = new HashMap<Integer, ARFFType>();
+    dateMap = new HashMap<Integer, DateFormat>();
+    nominalMap = new HashMap<String, Integer>();
+  }
+
+  public MapBackedARFFModel(Map<String, Long> words, long wordCount) { // seeds the word-to-id map and the id counter
+    this.words = words; // stored by reference, not copied
+    this.wordCount = wordCount;
+  } // NOTE(review): labelBindings, typeMap, dateMap and nominalMap are never assigned on this path and stay null
+
+  public String getRelation() {
+    return relation;
+  }
+
+  public void setRelation(String relation) {
+    this.relation = relation;
+  }
+
+  /**
+   * Convert one raw ARFF field into a double, dispatching on the {@link ARFFType} registered for its index.
+   *
+   * @param data The raw field text; double quotes are removed and it is trimmed before conversion
+   * @param idx  The index used to look up the type (and, for dates, the format) of the field
+   * @return The converted value; stays 0 if the type matches none of the cases below
+   */
+  protected double getValue(String data, int idx) {
+    double result = 0;
+    ARFFType type = typeMap.get(idx); // null when no type was added for idx; the switch below then throws NullPointerException
+    data = data.replaceAll("\"", "");
+    data = data.trim();
+    switch (type) {
+      case NUMERIC: {
+        result = processNumeric(data);
+        break;
+      }
+      case DATE: {
+        result = processDate(data, idx);
+        break;
+      }
+      case STRING: {
+        // quotes were already stripped above; processString strips them again
+        result = processString(data);
+        break;
+      }
+      case NOMINAL: {
+        result = processNominal(data);
+        break;
+      }
+
+
+    }
+    return result;
+  }
+
+  protected double processNominal(String data) {
+    double result;
+    Integer ord = nominalMap.get(data);
+    if (ord == null) {
+      throw new RuntimeException("Invalid nominal: " + data);
+    }
+    result = ord;
+    return result;
+  }
+
+  /**
+   * Map a String value to a numeric id, assigning the next free id the first time a value is seen
+   *
+   * @param data The String value; any double quotes are removed before the lookup
+   * @return The id bound to the value in the words map, widened to a double
+   */
+  // Every distinct value is kept in the in-memory words map, so how well this scales is an open question
+  protected double processString(String data) {
+    double result;
+    data = data.replaceAll("\"", "");
+    // map it to a long: reuse the existing id, or take the current wordCount and advance it
+    Long theLong = words.get(data);
+    if (theLong == null) {
+      theLong = wordCount++;
+      words.put(data, theLong);
+    }
+    result = theLong;
+    return result;
+  }
+
+  protected double processNumeric(String data) {
+    return Double.parseDouble(data);
+  }
+
+  protected double processDate(String data, int idx) { // parses a date field into epoch milliseconds, returned as a double
+    double result;
+    DateFormat format = dateMap.get(idx); // format registered for this index, if any
+    if (format == null) {
+      format = DEFAULT_DATE_FORMAT; // not declared in this class; presumably inherited from ARFFModel (confirm)
+    }
+    Date date = null;
+    try {
+      date = format.parse(data);
+      result = date.getTime();// long to double is exact for magnitudes up to 2^53, far beyond realistic epoch-millisecond values
+    } catch (ParseException e) {
+      throw new RuntimeException(e); // unparseable input surfaces as an unchecked exception with the ParseException as cause
+    }
+    return result;
+  }
+
+  /**
+   * The vector attributes (labels in Mahout speak), keyed by label with the bound index as the value
+   * @return the backing map itself, not an unmodifiable view, so changes made by callers affect this model
+   */
+  public Map<String, Integer> getLabelBindings() {
+    return labelBindings;
+  }
+
+  /**
+   * The map of types encountered
+   * @return the map
+   */
+  public Map<Integer, ARFFType> getTypeMap() {
+    return Collections.unmodifiableMap(typeMap);
+  }
+
+  /**
+   * Map of Date formatters used
+   * @return the map
+   */
+  public Map<Integer, DateFormat> getDateMap() {
+    return Collections.unmodifiableMap(dateMap);
+  }
+
+  /**
+   * Map nominals to ids
+   * @return the map
+   */
+  public Map<String, Integer> getNominalMap() {
+    return Collections.unmodifiableMap(nominalMap);
+  }
+
+  /**
+   * Unmodifiable view of the map of words to the long id used for those words
+   * @return The view; it rejects modification through this reference but still reflects words added to the model later
+   */
+  public Map<String, Long> getWords() {
+    return Collections.unmodifiableMap(words);
+  }
+
+  public Integer getNominalValue(String nominal){
+    return nominalMap.get(nominal);
+  }
+
+  public void addNominal(String nominal, int idx) {
+    nominalMap.put(nominal, idx);
+  }
+
+  public DateFormat getDateFormat(Integer idx){
+    return dateMap.get(idx);
+  }
+
+  public void addDateFormat(Integer idx, DateFormat format) {
+    dateMap.put(idx, format);
+  }
+
+  public Integer getLabelIndex(String label){
+    return labelBindings.get(label);
+  }
+
+  public void addLabel(String label, Integer idx) {
+    labelBindings.put(label, idx);
+  }
+
+  public ARFFType getARFFType(Integer idx){
+    return typeMap.get(idx);
+  }
+
+  public void addType(Integer idx, ARFFType type) {
+    typeMap.put(idx, type);
+  }
+
+  /**
+   * The id counter for String values, i.e. the id the next previously unseen String will receive
+   * @return the counter; it starts at 1 unless another value was passed to the two-argument constructor
+   */
+  public long getWordCount() {
+    return wordCount;
+  }
+
+  public int getLabelSize() {
+    return labelBindings.size();
+  }
+}

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=800084&r1=800083&r2=800084&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Sun Aug  2 14:19:55 2009
@@ -60,7 +60,6 @@
  **/
 public class Driver {
   private transient static Logger log = LoggerFactory.getLogger(Driver.class);
-  //TODO: This assumes LuceneIterable, make it generic.
 
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
@@ -164,7 +163,7 @@
           }
           TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
           VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-          LuceneIteratable iteratable = null;
+          LuceneIterable iterable = null;
           String power = null;
           double norm = -1;
           if (cmdLine.hasOption(powerOpt)) {
@@ -179,10 +178,10 @@
           if (cmdLine.hasOption(idFieldOpt)) {
             idField = cmdLine.getValue(idFieldOpt).toString();
           }
-          if (norm == LuceneIteratable.NO_NORMALIZING) {
-            iteratable = new LuceneIteratable(reader, idField, field, mapper, LuceneIteratable.NO_NORMALIZING);
+          if (norm == LuceneIterable.NO_NORMALIZING) {
+            iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
           } else {
-            iteratable = new LuceneIteratable(reader, idField, field, mapper, norm);
+            iterable = new LuceneIterable(reader, idField, field, mapper, norm);
           }
           String outFile = cmdLine.getValue(outputOpt).toString();
           log.info("Output File: " + outFile);
@@ -200,7 +199,7 @@
             vectorWriter = getSeqFileWriter(outFile);
           }
 
-          long numDocs = vectorWriter.write(iteratable, maxDocs);
+          long numDocs = vectorWriter.write(iterable, maxDocs);
           vectorWriter.close();
           log.info("Wrote: " + numDocs + " vectors");
 

Copied: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (from r799870, lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?p2=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java&p1=lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java&r1=799870&r2=800084&rev=800084&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Sun Aug  2 14:19:55 2009
@@ -32,7 +32,7 @@
  *
  *
  **/
-public class LuceneIteratable implements VectorIterable {
+public class LuceneIterable implements VectorIterable {
 
 
   private IndexReader indexReader;
@@ -45,7 +45,7 @@
 
   public static final double NO_NORMALIZING = -1.0;
 
-  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper) {
+  public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper) {
     this(reader, idField, field, mapper, NO_NORMALIZING);
   }
 
@@ -57,7 +57,7 @@
    * @param mapper The {@link org.apache.mahout.utils.vectors.lucene.VectorMapper} for creating {@link org.apache.mahout.matrix.Vector}s from Lucene's TermVectors.
    * @param normPower The normalization value.  Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
    */
-  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
+  public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
     if (normPower != NO_NORMALIZING && normPower < 0){
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=800084&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java Sun Aug  2 14:19:55 2009
@@ -0,0 +1,202 @@
+package org.apache.mahout.utils.vectors.arff;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.strings.StringUtil;
+
+import java.util.Iterator;
+import java.util.Map;
+import java.text.DateFormat;
+
+
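+/**
+ * Tests {@link ARFFVectorIterable} against small in-memory ARFF documents covering
+ * dense rows, sparse rows, and non-numeric (nominal, string, date) attributes.
+ **/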
+public class ARFFVectorIterableTest extends TestCase {
+
+  public void testValues() throws Exception {
+    StringBuilder builder = new StringBuilder();
+    builder.append("%comments").append(StringUtil.LINE_SEP).append("@RELATION Mahout").append(StringUtil.LINE_SEP)
+            .append("@ATTRIBUTE foo numeric").append(StringUtil.LINE_SEP)
+            .append("@ATTRIBUTE bar numeric").append(StringUtil.LINE_SEP)
+            .append("@ATTRIBUTE timestamp DATE \"yyyy-MM-dd HH:mm:ss\"").append(StringUtil.LINE_SEP)
+            .append("@ATTRIBUTE junk string").append(StringUtil.LINE_SEP)
+            .append("@ATTRIBUTE theNominal {c,b,a}").append(StringUtil.LINE_SEP)
+            .append("@DATA").append(StringUtil.LINE_SEP)
+            .append("1,2, \"2009-01-01 5:55:55\", foo, c").append(StringUtil.LINE_SEP)
+            .append("2,3").append(StringUtil.LINE_SEP)
+            .append("{0 5,1 23}").append(StringUtil.LINE_SEP);
+    ARFFVectorIterable iterable = new ARFFVectorIterable(builder.toString());
+    assertTrue(iterable.getModel().getRelation() + " is not equal to " + "Mahout", iterable.getModel().getRelation().equals("Mahout") == true);
+    Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
+    assertNotNull(bindings);
+    Iterator<Vector> iter = iterable.iterator();
+    assertTrue(iter.hasNext());
+    Vector next = iter.next();
+    assertNotNull(next);
+    assertTrue("Wrong instanceof", next instanceof DenseVector);
+    assertEquals("", next.get(0), 1.0);
+    assertEquals("", next.get(1), 2.0);
+    assertTrue(iter.hasNext());
+    next = iter.next();
+    assertNotNull(next);
+    assertTrue("Wrong instanceof", next instanceof DenseVector);
+    assertEquals("", next.get(0), 2.0);
+    assertEquals("", next.get(1), 3.0);
+
+    assertTrue(iter.hasNext());
+    next = iter.next();
+    assertNotNull(next);
+    assertTrue("Wrong instanceof", next instanceof SparseVector);
+    assertEquals("", next.get(0), 5.0);
+    assertEquals("", next.get(1), 23.0);
+
+    assertFalse(iter.hasNext());
+  }
+
+  public void testDense() throws Exception {
+    ARFFVectorIterable iterable = new ARFFVectorIterable(SAMPLE_DENSE_ARFF);
+    int count = 0;
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not dense", vector instanceof DenseVector);
+      count++;
+    }
+    assertTrue(count + " does not equal: " + 10, count == 10);
+  }
+
+  public void testSparse() throws Exception { // every row of the sparse sample must yield a SparseVector, 10 rows in total
+    ARFFVectorIterable iterable = new ARFFVectorIterable(SAMPLE_SPARSE_ARFF);
+    int count = 0;
+    for (Vector vector : iterable) {
+      assertTrue("Vector is not dense", vector instanceof SparseVector); // NOTE(review): message says dense, but the check is for SparseVector
+      count++;
+    }
+    assertTrue(count + " does not equal: " + 10, count == 10);
+  }
+
+  public void testNonNumeric() throws Exception { // checks vectors and model maps built from nominal, string and date attributes
+    try {
+      ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF);
+      int count = 0;
+      for (Vector vector : iterable) {
+        assertTrue("Vector is not dense", vector instanceof SparseVector); // NOTE(review): message says dense, but the check is for SparseVector
+        count++;
+      }
+      assertTrue(count + " does not equal: " + 10, count == 10);
+      Map<String, Integer> nominalMap = iterable.getModel().getNominalMap();
+      assertNotNull(nominalMap);
+      assertTrue("nominalMap Size: " + nominalMap.size() + " is not: " + 2, nominalMap.size() == 2); // the sample declares the nominal values c and d
+      Map<Integer, ARFFType> integerARFFTypeMap = iterable.getModel().getTypeMap();
+      assertNotNull("Type map null", integerARFFTypeMap);
+      assertTrue("integerARFFTypeMap Size: " + integerARFFTypeMap.size() + " is not: " + 5, integerARFFTypeMap.size() == 5); // the sample declares five attributes
+      Map<String, Long> words = iterable.getModel().getWords();
+      assertNotNull("words null", words);
+      assertTrue("words Size: " + words.size() + " is not: " + 10, words.size() == 10); // the string column holds ten distinct names
+      System.out.println("Words: " + words);
+      Map<Integer, DateFormat> integerDateFormatMap = iterable.getModel().getDateMap();
+      assertNotNull("date format null", integerDateFormatMap);
+      assertTrue("integerDateFormatMap Size: " + integerDateFormatMap.size() + " is not: " + 1, integerDateFormatMap.size() == 1); // the sample declares one date attribute
+    } catch (UnsupportedOperationException e) { // NOTE(review): swallowed silently; if thrown, the test passes without running the remaining assertions
+
+    }
+
+  }
+
+
+  public static final String SAMPLE_DENSE_ARFF = "   % Comments\n" +
+          "   % \n" +
+          "   % Comments go here" +
+          "   % \n" +
+          "   @RELATION Mahout\n" +
+          "\n" +
+          "   @ATTRIBUTE foo  NUMERIC\n" +
+          "   @ATTRIBUTE bar   NUMERIC\n" +
+          "   @ATTRIBUTE hockey  NUMERIC\n" +
+          "   @ATTRIBUTE football   NUMERIC\n" +
+          "  \n" +
+          "\n" +
+          "\n" +
+          "   @DATA\n" +
+          "   23.1,3.23,1.2,0.2\n" +
+          "   2.9,3.0,1.2,0.2\n" +
+          "   2.7,3.2,1.3,0.2\n" +
+          "   2.6,3.1,1.23,0.2\n" +
+          "   23.0,3.6,1.2,0.2\n" +
+          "   23.2,3.9,1.7,0.2\n" +
+          "   2.6,3.2,1.2,0.3\n" +
+          "   23.0,3.2,1.23,0.2\n" +
+          "   2.2,2.9,1.2,0.2\n" +
+          "   2.9,3.1,1.23,0.1\n";
+
+
+  public static final String SAMPLE_SPARSE_ARFF = "   % Comments\n" + // sparse-format sample: five numeric attributes, ten data rows
+          "   % \n" +
+          "   % Comments go here" + // no trailing newline here, so the next literal continues this comment line
+          "   % \n" +
+          "   @RELATION Mahout\n" +
+          "\n" +
+          "   @ATTRIBUTE foo  NUMERIC\n" +
+          "   @ATTRIBUTE bar   NUMERIC\n" +
+          "   @ATTRIBUTE hockey  NUMERIC\n" +
+          "   @ATTRIBUTE football   NUMERIC\n" +
+          "   @ATTRIBUTE tennis   NUMERIC\n" +
+          "  \n" +
+          "\n" +
+          "\n" +
+          "   @DATA\n" +
+          "   {1 23.1,2 3.23,3 1.2,4 0.2}\n" +
+          "   {0 2.9}\n" +
+          "   {0 2.7,2 3.2,3 1.3,4 0.2}\n" +
+          "   {1 2.6,2 3.1,3 1.23,4 0.2}\n" +
+          "   {1 23.0,2 3.6,3 1.2,4 0.2}\n" +
+          "   {0 23.2,1 3.9,3 1.7,4 0.2}\n" +
+          "   {0 2.6,1 3.2,2 1.2,4 0.3}\n" +
+          "   {1 23.0,2 3.2,3 1.23}\n" +
+          "   {1 2.2,2 2.94 0.2}\n" + // NOTE(review): the second entry reads 2.94 0.2, possibly a missing comma (2.9,4 0.2); confirm against the parser
+          "   {1 2.9,2 3.1}\n";
+
+  public static final String NON_NUMERIC_ARFF = "   % Comments\n" +
+          "   % \n" +
+          "   % Comments go here" +
+          "   % \n" +
+          "   @RELATION Mahout\n" +
+          "\n" +
+          "   @ATTRIBUTE junk  NUMERIC\n" +
+          "   @ATTRIBUTE foo  NUMERIC\n" +
+          "   @ATTRIBUTE bar   {c,d}\n" +
+          "   @ATTRIBUTE hockey  string\n" +
+          "   @ATTRIBUTE football   date \"yyyy-MM-dd\"\n" +
+          "  \n" +
+          "\n" +
+          "\n" +
+          "   @DATA\n" +
+          "   {2 c,3 gretzky,4 1973-10-23}\n" +
+          "   {1 2.9,2 d,3 orr,4 1973-11-23}\n" +
+          "   {2 c,3 bossy,4 1981-10-23}\n" +
+          "   {1 2.6,2 c,3 lefleur,4 1989-10-23}\n" +
+          "   {3 esposito,4 1973-04-23}\n" +
+          "   {1 23.2,2 d,3 chelios,4 1999-2-23}\n" +
+          "   {3 richard,4 1973-10-12}\n" +
+          "   {3 howe,4 1983-06-23}\n" +
+          "   {0 2.2,2 d,3 messier,4 2008-11-23}\n" +
+          "   {2 c,3 roy,4 1973-10-13}\n";
+}

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=800084&r1=800083&r2=800084&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Sun Aug  2 14:19:55 2009
@@ -67,7 +67,7 @@
     Weight weight = new TFIDF();
     TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
     VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-    LuceneIteratable iterable = new LuceneIteratable(reader, "id", "content", mapper);
+    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);
 
     //TODO: do something more meaningful here
     for (Vector vector : iterable) {



Mime
View raw message