avro-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject svn commit: r1550605 - in /avro/trunk: ./ lang/java/mapred/src/main/java/org/apache/avro/mapred/ lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/ lang/java/mapred/src/test/java/org/apache/avro/mapred/
Date Fri, 13 Dec 2013 00:58:52 GMT
Author: cutting
Date: Fri Dec 13 00:58:52 2013
New Revision: 1550605

URL: http://svn.apache.org/r1550605
Log:
AVRO-1234. Java: Permit AvroInputFormat to process files whose names don't end in .avro. 
Contributed by Dave Beech & Sandy Ryza.

Added:
    avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java
  (with props)
Modified:
    avro/trunk/CHANGES.txt
    avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroAsTextInputFormat.java
    avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroInputFormat.java
    avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/TetherInputFormat.java

Modified: avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/avro/trunk/CHANGES.txt?rev=1550605&r1=1550604&r2=1550605&view=diff
==============================================================================
--- avro/trunk/CHANGES.txt (original)
+++ avro/trunk/CHANGES.txt Fri Dec 13 00:58:52 2013
@@ -35,6 +35,9 @@ Trunk (not yet released)
     AVRO-1398. Increase default sync interval from 16k to 64k.
     (Rob Turner via cutting)
 
+    AVRO-1234. Java: Permit AvroInputFormat to process files whose
+    names don't end in .avro.  (Dave Beech & Sandy Ryza via cutting)
+
   BUG FIXES
 
     AVRO-1368. Fix SpecificDatumWriter to, when writing a string

Modified: avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroAsTextInputFormat.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroAsTextInputFormat.java?rev=1550605&r1=1550604&r2=1550605&view=diff
==============================================================================
--- avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroAsTextInputFormat.java (original)
+++ avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroAsTextInputFormat.java Fri Dec 13 00:58:52 2013
@@ -39,16 +39,25 @@ import org.apache.hadoop.mapred.Reporter
  * <p>
  * This {@link org.apache.hadoop.mapred.InputFormat} is useful for applications
  * that wish to process Avro data using tools like MapReduce Streaming.
+ * 
+ * By default, when pointed at a directory, this will silently skip over any
+ * files in it that do not have .avro extension. To instead include all files,
+ * set the avro.mapred.ignore.inputs.without.extension property to false.
  */
 public class AvroAsTextInputFormat extends FileInputFormat<Text, Text> {
 
   @Override
   protected FileStatus[] listStatus(JobConf job) throws IOException {
-    List<FileStatus> result = new ArrayList<FileStatus>();
-    for (FileStatus file : super.listStatus(job))
-      if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
-        result.add(file);
-    return result.toArray(new FileStatus[0]);
+    if (job.getBoolean(AvroInputFormat.IGNORE_FILES_WITHOUT_EXTENSION_KEY,
+        AvroInputFormat.IGNORE_INPUTS_WITHOUT_EXTENSION_DEFAULT)) {
+      List<FileStatus> result = new ArrayList<FileStatus>();
+      for (FileStatus file : super.listStatus(job))
+        if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
+          result.add(file);
+      return result.toArray(new FileStatus[0]);
+    } else {
+      return super.listStatus(job);
+    }
   }
   
   @Override

Modified: avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroInputFormat.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroInputFormat.java?rev=1550605&r1=1550604&r2=1550605&view=diff
==============================================================================
--- avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroInputFormat.java (original)
+++ avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/AvroInputFormat.java Fri Dec 13 00:58:52 2013
@@ -31,17 +31,36 @@ import org.apache.hadoop.mapred.FileSpli
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.RecordReader;
 
-/** An {@link org.apache.hadoop.mapred.InputFormat} for Avro data files */
+/**
+ * An {@link org.apache.hadoop.mapred.InputFormat} for Avro data files.
+ * 
+ * By default, when pointed at a directory, this will silently skip over any
+ * files in it that do not have .avro extension. To instead include all files,
+ * set the avro.mapred.ignore.inputs.without.extension property to false.
+ */
 public class AvroInputFormat<T>
   extends FileInputFormat<AvroWrapper<T>, NullWritable> {
 
+  /** Whether to silently ignore input files without the .avro extension */
+  public static final String IGNORE_FILES_WITHOUT_EXTENSION_KEY =
+      "avro.mapred.ignore.inputs.without.extension";
+  
+  /** Default of whether to silently ignore input files without the .avro
+   * extension. */
+  public static final boolean IGNORE_INPUTS_WITHOUT_EXTENSION_DEFAULT = true;
+  
   @Override
   protected FileStatus[] listStatus(JobConf job) throws IOException {
-    List<FileStatus> result = new ArrayList<FileStatus>();
-    for (FileStatus file : super.listStatus(job))
-      if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
-        result.add(file);
-    return result.toArray(new FileStatus[0]);
+    if (job.getBoolean(IGNORE_FILES_WITHOUT_EXTENSION_KEY,
+        IGNORE_INPUTS_WITHOUT_EXTENSION_DEFAULT)) {
+      List<FileStatus> result = new ArrayList<FileStatus>();
+      for (FileStatus file : super.listStatus(job))
+        if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
+          result.add(file);
+      return result.toArray(new FileStatus[0]);
+    } else {
+      return super.listStatus(job);
+    }
   }
 
   @Override

Modified: avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/TetherInputFormat.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/TetherInputFormat.java?rev=1550605&r1=1550604&r2=1550605&view=diff
==============================================================================
--- avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/TetherInputFormat.java (original)
+++ avro/trunk/lang/java/mapred/src/main/java/org/apache/avro/mapred/tether/TetherInputFormat.java Fri Dec 13 00:58:52 2013
@@ -31,19 +31,31 @@ import org.apache.hadoop.mapred.FileSpli
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.RecordReader;
 
+import org.apache.avro.mapred.AvroInputFormat;
 import org.apache.avro.mapred.AvroOutputFormat;
 
-/** An {@link org.apache.hadoop.mapred.InputFormat} for tethered Avro input. */
+/**
+ * An {@link org.apache.hadoop.mapred.InputFormat} for tethered Avro input.
+ * 
+ * By default, when pointed at a directory, this will silently skip over any
+ * files in it that do not have .avro extension. To instead include all files,
+ * set the avro.mapred.ignore.inputs.without.extension property to false.
+ * */
 class TetherInputFormat
   extends FileInputFormat<TetherData, NullWritable> {
 
   @Override
   protected FileStatus[] listStatus(JobConf job) throws IOException {
-    List<FileStatus> result = new ArrayList<FileStatus>();
-    for (FileStatus file : super.listStatus(job))
-      if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
-        result.add(file);
-    return result.toArray(new FileStatus[0]);
+    if (job.getBoolean(AvroInputFormat.IGNORE_FILES_WITHOUT_EXTENSION_KEY,
+        AvroInputFormat.IGNORE_INPUTS_WITHOUT_EXTENSION_DEFAULT)) {
+      List<FileStatus> result = new ArrayList<FileStatus>();
+      for (FileStatus file : super.listStatus(job))
+        if (file.getPath().getName().endsWith(AvroOutputFormat.EXT))
+          result.add(file);
+      return result.toArray(new FileStatus[0]);
+    } else {
+      return super.listStatus(job);
+    }
   }
 
   @Override

Added: avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java?rev=1550605&view=auto
==============================================================================
--- avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java (added)
+++ avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java Fri Dec 13 00:58:52 2013
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.  See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package org.apache.avro.mapred;
+
+import java.io.File;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestAvroInputFormat {
+  
+  private static final String TEST_DIR = System.getProperty("test.dir", ".") +
+      File.separator + TestAvroInputFormat.class.getName();
+  private JobConf conf;
+  private FileSystem fs;
+  private Path inputDir;
+  
+  @Before
+  public void setUp() throws Exception {
+    conf = new JobConf();
+    fs = FileSystem.getLocal(conf);
+    inputDir = new Path(TEST_DIR);
+  }
+  
+  
+  @After
+  public void tearDown() throws Exception {
+    fs.delete(inputDir, true);
+  }
+  
+  @SuppressWarnings("rawtypes")
+  @Test
+  public void testIgnoreFilesWithoutExtension() throws Exception {
+    fs.mkdirs(inputDir);
+    Path avroFile = new Path(inputDir, "somefile.avro");
+    Path textFile = new Path(inputDir, "someotherfile.txt");
+    fs.create(avroFile).close();
+    fs.create(textFile).close();
+    
+    FileInputFormat.setInputPaths(conf, inputDir);
+
+    
+    AvroInputFormat inputFormat = new AvroInputFormat();
+    FileStatus[] statuses = inputFormat.listStatus(conf);
+    Assert.assertEquals(1, statuses.length);
+    Assert.assertEquals("somefile.avro", statuses[0].getPath().getName());
+    
+    conf.setBoolean(AvroInputFormat.IGNORE_FILES_WITHOUT_EXTENSION_KEY, false);
+    statuses = inputFormat.listStatus(conf);
+    Assert.assertEquals(2, statuses.length);
+    Set<String> names = new HashSet<String>();
+    names.add(statuses[0].getPath().getName());
+    names.add(statuses[1].getPath().getName());
+    Assert.assertTrue(names.contains("somefile.avro"));
+    Assert.assertTrue(names.contains("someotherfile.txt"));
+  }
+}

Propchange: avro/trunk/lang/java/mapred/src/test/java/org/apache/avro/mapred/TestAvroInputFormat.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message