mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1177027 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/text/ test/java/org/apache/mahout/text/
Date Wed, 28 Sep 2011 19:37:45 GMT
Author: srowen
Date: Wed Sep 28 19:37:45 2011
New Revision: 1177027

URL: http://svn.apache.org/viewvc?rev=1177027&view=rev
Log:
MAHOUT-799 remove CSV filter that wasn't working

Removed:
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromCsvFilter.java
Modified:
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java Wed Sep 28 19:37:45 2011
@@ -26,6 +26,7 @@ import org.apache.mahout.common.iterator
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.util.Map;
 
 /**
@@ -37,8 +38,9 @@ public final class PrefixAdditionFilter 
                               String keyPrefix,
                               Map<String, String> options, 
                               ChunkedWriter writer,
+                              Charset charset,
                               FileSystem fs) {
-    super(conf, keyPrefix, options, writer, fs);
+    super(conf, keyPrefix, options, writer, charset, fs);
   }
 
   @Override
@@ -47,7 +49,8 @@ public final class PrefixAdditionFilter 
     ChunkedWriter writer = getWriter();
     if (fst.isDir()) {
       String dirPath = getPrefix() + Path.SEPARATOR + current.getName() + Path.SEPARATOR + fst.getPath().getName();
-      fs.listStatus(fst.getPath(), new PrefixAdditionFilter(getConf(), dirPath, getOptions(), writer, fs));
+      fs.listStatus(fst.getPath(),
+                    new PrefixAdditionFilter(getConf(), dirPath, getOptions(), writer, getCharset(), fs));
     } else {
       InputStream in = null;
       try {

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Wed Sep 28 19:37:45 2011
@@ -19,7 +19,7 @@ package org.apache.mahout.text;
 
 import java.io.IOException;
 import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
+import java.nio.charset.Charset;
 import java.util.Map;
 
 import com.google.common.collect.Maps;
@@ -45,42 +45,10 @@ public class SequenceFilesFromDirectory 
   private static final String PREFIX_ADDITION_FILTER = PrefixAdditionFilter.class.getName();
   
   private static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
-  static final String[] FILE_FILTER_CLASS_OPTION = {"fileFilterClass","filter"};
+  private static final String[] FILE_FILTER_CLASS_OPTION = {"fileFilterClass","filter"};
   private static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
-  static final String[] CHARSET_OPTION = {"charset", "c"};
+  private static final String[] CHARSET_OPTION = {"charset", "c"};
 
-  public static void run(Configuration conf,
-                         String keyPrefix,
-                         Map<String, String> options,
-                         Path input,
-                         Path output)
-    throws InstantiationException, IllegalAccessException, InvocationTargetException, IOException,
-           NoSuchMethodException, ClassNotFoundException {
-    FileSystem fs = FileSystem.get(input.toUri(), conf);
-    ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);
-
-    try {
-      SequenceFilesFromDirectoryFilter pathFilter;
-      String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
-      if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
-        pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, fs);
-      } else {
-        Class<? extends SequenceFilesFromDirectoryFilter> pathFilterClass =
-            Class.forName(fileFilterClassName).asSubclass(SequenceFilesFromDirectoryFilter.class);
-        Constructor<? extends SequenceFilesFromDirectoryFilter> constructor =
-            pathFilterClass.getConstructor(Configuration.class,
-                                           String.class,
-                                           Map.class,
-                                           ChunkedWriter.class,
-                                           FileSystem.class);
-        pathFilter = constructor.newInstance(conf, keyPrefix, options, writer, fs);
-      }
-      fs.listStatus(input, pathFilter);
-    } finally {
-      Closeables.closeQuietly(writer);
-    }
-  }
-  
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new SequenceFilesFromDirectory(), args);
   }
@@ -89,9 +57,7 @@ public class SequenceFilesFromDirectory 
    * callback main after processing hadoop parameters
    */
   @Override
-  public int run(String[] args)
-    throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, NoSuchMethodException,
-           InvocationTargetException {
+  public int run(String[] args) throws Exception {
     addOptions();    
     
     if (parseArguments(args) == null) {
@@ -107,7 +73,32 @@ public class SequenceFilesFromDirectory 
     }
     String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
 
-    run(getConf(), keyPrefix, options, input, output);
+    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(input.toUri(), conf);
+    ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);
+
+    try {
+      SequenceFilesFromDirectoryFilter pathFilter;
+      String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
+      if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
+        pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
+      } else {
+        Class<? extends SequenceFilesFromDirectoryFilter> pathFilterClass =
+            Class.forName(fileFilterClassName).asSubclass(SequenceFilesFromDirectoryFilter.class);
+        Constructor<? extends SequenceFilesFromDirectoryFilter> constructor =
+            pathFilterClass.getConstructor(Configuration.class,
+                                           String.class,
+                                           Map.class,
+                                           ChunkedWriter.class,
+                                           Charset.class,
+                                           FileSystem.class);
+        pathFilter = constructor.newInstance(conf, keyPrefix, options, writer, fs);
+      }
+      fs.listStatus(input, pathFilter);
+    } finally {
+      Closeables.closeQuietly(writer);
+    }
     return 0;
   }
 

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java Wed Sep 28 19:37:45 2011
@@ -32,7 +32,7 @@ import java.util.Map;
 /**
  * Implement this interface if you wish to extend SequenceFilesFromDirectory with your own parsing logic.
  */
-public abstract class SequenceFilesFromDirectoryFilter extends SequenceFilesFromDirectory implements PathFilter {
+public abstract class SequenceFilesFromDirectoryFilter implements PathFilter {
   private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromDirectoryFilter.class);
 
   private final String prefix;
@@ -40,26 +40,20 @@ public abstract class SequenceFilesFromD
   private final Charset charset;
   private final FileSystem fs;
   private final Map<String, String> options;
-
-  protected SequenceFilesFromDirectoryFilter() {
-    this.prefix = null;
-    this.writer = null;
-    this.charset = null;
-    this.fs = null;
-    this.options = null;
-  }
+  private final Configuration conf;
 
   protected SequenceFilesFromDirectoryFilter(Configuration conf,
                                              String keyPrefix,
                                              Map<String, String> options,
                                              ChunkedWriter writer,
+                                             Charset charset,
                                              FileSystem fs) {
     this.prefix = keyPrefix;
     this.writer = writer;
-    this.charset = Charset.forName(options.get(SequenceFilesFromDirectory.CHARSET_OPTION[0]));
+    this.charset = charset;
     this.fs = fs;
     this.options = options;
-    setConf(conf);
+    this.conf = conf;
   }
 
   protected final String getPrefix() {
@@ -81,6 +75,10 @@ public abstract class SequenceFilesFromD
   protected final Map<String, String> getOptions() {
     return options;
   }
+  
+  protected final Configuration getConf() {
+    return conf;
+  }
 
   @Override
   public final boolean accept(Path current) {

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java Wed Sep 28 19:37:45 2011
@@ -43,10 +43,6 @@ public final class TestSequenceFilesFrom
       {"test3", "This is the third text."}
   };
 
-  private enum ParserType {
-    TEXT, CSV
-  }
-  
   /**
    * Story converting text files to SequenceFile
    */
@@ -66,46 +62,15 @@ public final class TestSequenceFilesFrom
     // prepare input files
     createFilesFromArrays(conf, inputDir, DATA1);
 
-    String prefix = "UID";
-    SequenceFilesFromDirectory.main(new String[] {"--input",
-        inputDir.toString(), "--output", outputDir.toString(), "--chunkSize",
-        "64", "--charset",
-        Charsets.UTF_8.name(), "--keyPrefix", prefix});
+    SequenceFilesFromDirectory.main(new String[] {
+        "--input", inputDir.toString(),
+        "--output", outputDir.toString(),
+        "--chunkSize", "64",
+        "--charset", Charsets.UTF_8.name(),
+        "--keyPrefix", "UID"});
     
     // check output chunk files
-    checkChunkFiles(conf, outputDir, DATA1, prefix, ParserType.TEXT);
-  }
-
-  /**
-   * Story converting a TSV file to SequenceFile
-   */
-  @Test
-  public void testSequnceFileFromDirectoryTsv() throws Exception {
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-
-    // create
-    Path tmpDir = this.getTestTempDirPath();
-    Path inputDir = new Path(tmpDir, "inputDir");
-    fs.mkdirs(inputDir);
-    Path outputDir = new Path(tmpDir, "outputDir");
-    
-    // prepare input TSV file
-    createTsvFilesFromArrays(conf, inputDir, DATA1);
-    
-    // convert it to SequenceFile
-    String prefix = "UID";
-    int chunkSizeInMB = 64;
-    int keyColumn = 0;
-    int valueColumn = 1;
-    SequenceFilesFromCsvFilter.main(new String[] {"--input", inputDir.toString(),
-        "--output", outputDir.toString(), "--charset", Charsets.UTF_8.name(),
-        "--chunkSize", Integer.toString(chunkSizeInMB), "--keyPrefix", prefix,
-        "--keyColumn", Integer.toString(keyColumn), "--valueColumn",
-        Integer.toString(valueColumn)});
-    
-    // check output chunk files
-    checkChunkFiles(conf, outputDir, DATA1, prefix, ParserType.CSV);
+    checkChunkFiles(conf, outputDir, DATA1, "UID");
   }
 
   private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
@@ -120,23 +85,10 @@ public final class TestSequenceFilesFrom
     }
   }
 
-  private static void createTsvFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    OutputStreamWriter writer = new OutputStreamWriter(fs.create(new Path(inputDir, "inputTsvFile")));
-    try {
-      for (String[] aData : data) {
-        writer.write(aData[0] + '\t' + aData[1] + '\n');
-      }
-    } finally {
-      Closeables.closeQuietly(writer);
-    }
-  }
-
   private static void checkChunkFiles(Configuration conf,
                                       Path outputDir,
                                       String[][] data,
-                                      String prefix,
-                                      ParserType inputType) throws IOException {
+                                      String prefix) throws IOException {
     FileSystem fs = FileSystem.get(conf);
     
     // output exists?
@@ -147,11 +99,7 @@ public final class TestSequenceFilesFrom
 
     Map<String,String> fileToData = Maps.newHashMap();
     for (String[] aData : data) {
-      if (inputType == ParserType.CSV) {
-        fileToData.put(prefix + aData[0], aData[1]);
-      } else {
-        fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
-      }
+      fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
     }
 
     // read a chunk to check content



Mime
View raw message