From: cdouglas@apache.org
To: common-commits@hadoop.apache.org
Reply-To: common-dev@hadoop.apache.org
Date: Fri, 08 May 2015 21:31:51 -0000
Message-Id: <63d354327f254ef4ab6d76547cf3e7f8@git.apache.org>
X-Mailer: ASF-Git Admin Mailer
Subject: [2/2] hadoop git commit: MAPREDUCE-2094. LineRecordReader should not seek into non-splittable, compressed streams.

MAPREDUCE-2094. LineRecordReader should not seek into non-splittable, compressed streams.

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2edcf931
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2edcf931
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2edcf931

Branch: refs/heads/trunk
Commit: 2edcf931d7843cddcf3da5666a73d6ee9a10d00d
Parents: ec2748d
Author: Chris Douglas
Authored: Fri May 8 14:24:57 2015 -0700
Committer: Chris Douglas
Committed: Fri May 8 14:31:11 2015 -0700

----------------------------------------------------------------------
 .../hadoop-mapreduce-client-core/pom.xml          |  1 +
 .../apache/hadoop/mapred/FileInputFormat.java     | 17 ++++++++++++-----
 .../apache/hadoop/mapred/LineRecordReader.java    |  7 +++++++
 .../mapreduce/lib/input/FileInputFormat.java      | 19 +++++++++++++------
 .../mapreduce/lib/input/LineRecordReader.java     |  9 ++++++++-
 .../hadoop/mapred/TestLineRecordReader.java       |  7 +++++++
 .../lib/input/TestLineRecordReader.java           |  7 +++++++
 ...estSafeguardSplittingUnsplittableFiles.txt.gz  |  1 +
 8 files changed, 56 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
index 19ce44b..c524b60 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
@@ -93,6 +93,7 @@
             .gitattributes
             src/test/resources/recordSpanningMultipleSplits.txt
             src/test/resources/testBOM.txt
+            src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
index 5e45b49..c6cbd50 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
@@ -57,9 +57,12 @@ import com.google.common.collect.Iterables;
  * FileInputFormat is the base class for all file-based
  * InputFormats. This provides a generic implementation of
  * {@link #getSplits(JobConf, int)}.
- * Subclasses of FileInputFormat can also override the
- * {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
- * not split-up and are processed as a whole by {@link Mapper}s.
+ *
+ * Implementations of FileInputFormat can also override the
+ * {@link #isSplitable(FileSystem, Path)} method to prevent input files
+ * from being split-up in certain situations. Implementations that may
+ * deal with non-splittable files must override this method, since
+ * the default implementation assumes splitting is always possible.
  */
 @InterfaceAudience.Public
 @InterfaceStability.Stable
@@ -116,9 +119,13 @@ public abstract class FileInputFormat implements InputFormat {
   }
 
   /**
-   * Is the given filename splitable? Usually, true, but if the file is
+   * Is the given filename splittable? Usually, true, but if the file is
    * stream compressed, it will not be.
-   * 
+   * 
+   * The default implementation in FileInputFormat always returns
+   * true. Implementations that may deal with non-splittable files must
+   * override this method.
+   * 
    * FileInputFormat implementations can override this and return
    * false to ensure that individual input files are never split-up
    * so that {@link Mapper}s process entire files.
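The updated javadoc above says that InputFormats which may be handed non-splittable files (plain gzip being the usual case) must override isSplitable(). For reference, a minimal sketch of such an override against the old mapred API follows; it mirrors the codec check that TextInputFormat performs, but the class name and key/value types here are illustrative only and are not part of this commit:

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapred.*;

// Illustrative InputFormat that may be pointed at gzip (non-splittable) inputs.
public class WholeGzipTextInputFormat
    extends FileInputFormat<LongWritable, Text> {

  @Override
  protected boolean isSplitable(FileSystem fs, Path file) {
    // Allow splitting only when the codec (if any) supports it; plain gzip
    // does not, so such files are handed to a single mapper as one split.
    CompressionCodec codec =
        new CompressionCodecFactory(fs.getConf()).getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
  }

  @Override
  public RecordReader<LongWritable, Text> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    reporter.setStatus(split.toString());
    return new LineRecordReader(job, (FileSplit) split);
  }
}

With an override like this in place, getSplits() never produces a split that starts inside a gzip stream, so the new IOException introduced below is never hit in normal operation.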
+ throw new IOException("Cannot seek in " + + codec.getClass().getSimpleName() + " compressed stream"); + } + in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter); filePosition = fileIn; http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java index a3ffe01..f5cd5d1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java @@ -51,13 +51,16 @@ import com.google.common.collect.Lists; /** * A base class for file-based {@link InputFormat}s. - * + * *

FileInputFormat is the base class for all file-based * InputFormats. This provides a generic implementation of * {@link #getSplits(JobContext)}. - * Subclasses of FileInputFormat can also override the - * {@link #isSplitable(JobContext, Path)} method to ensure input-files are - * not split-up and are processed as a whole by {@link Mapper}s. + * + * Implementations of FileInputFormat can also override the + * {@link #isSplitable(JobContext, Path)} method to prevent input files + * from being split-up in certain situations. Implementations that may + * deal with non-splittable files must override this method, since + * the default implementation assumes splitting is always possible. */ @InterfaceAudience.Public @InterfaceStability.Stable @@ -146,9 +149,13 @@ public abstract class FileInputFormat extends InputFormat { } /** - * Is the given filename splitable? Usually, true, but if the file is + * Is the given filename splittable? Usually, true, but if the file is * stream compressed, it will not be. - * + * + * The default implementation in FileInputFormat always returns + * true. Implementations that may deal with non-splittable files must + * override this method. + * * FileInputFormat implementations can override this and return * false to ensure that individual input files are never split-up * so that {@link Mapper}s process entire files. http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java index 42e94ad..5af8f43 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java @@ -86,7 +86,7 @@ public class LineRecordReader extends RecordReader { CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null!=codec) { - isCompressedInput = true; + isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = @@ -99,6 +99,13 @@ public class LineRecordReader extends RecordReader { end = cIn.getAdjustedEnd(); filePosition = cIn; } else { + if (start != 0) { + // So we have a split that is only part of a file stored using + // a Compression codec that cannot be split. 
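The guard added to both LineRecordReader implementations above turns a silent correctness problem into a fast failure: previously a split with a non-zero start over a non-splittable codec was decompressed from the beginning of the file, which could yield duplicate or garbled records. A standalone sketch of the new behavior follows; the class and file path are made up for illustration, and the path must point at an existing gzip file for the open and codec lookup to succeed:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;

public class SeekIntoGzipSplitDemo {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    Path file = new Path("file:///tmp/sample.txt.gz");  // assumed to exist
    // A split that begins in the middle of the gzip stream.
    FileSplit split = new FileSplit(file, 10, 100, (String[]) null);
    // With this patch, constructing the reader fails immediately with
    //   java.io.IOException: Cannot seek in GzipCodec compressed stream
    new LineRecordReader(job, split);
  }
}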
+ throw new IOException("Cannot seek in " + + codec.getClass().getSimpleName() + " compressed stream"); + } + in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); filePosition = fileIn; http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java index 4c94e59..cbbbeaa 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java @@ -127,6 +127,13 @@ public class TestLineRecordReader { testSplitRecords("blockEndingInCR.txt.bz2", 136494); } + @Test(expected=IOException.class) + public void testSafeguardSplittingUnsplittableFiles() throws IOException { + // The LineRecordReader must fail when trying to read a file that + // was compressed using an unsplittable file format + testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2); + } + // Use the LineRecordReader to read records from the file public ArrayList readRecords(URL testFileUrl, int splitSize) throws IOException { http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java index 52fdc09..8b385a0 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java @@ -110,6 +110,13 @@ public class TestLineRecordReader { testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498); } + @Test(expected=IOException.class) + public void testSafeguardSplittingUnsplittableFiles() throws IOException { + // The LineRecordReader must fail when trying to read a file that + // was compressed using an unsplittable file format + testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2); + } + // Use the LineRecordReader to read records from the file public ArrayList readRecords(URL testFileUrl, int splitSize) throws IOException { http://git-wip-us.apache.org/repos/asf/hadoop/blob/2edcf931/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz ---------------------------------------------------------------------- diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz new file mode 100644 index 0000000..557db03 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz @@ -0,0 +1 @@ +Hello World
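The committed test resource decompresses to the single line "Hello World". If the fixture ever needs to be regenerated, a sketch along these lines would produce an equivalent (though not necessarily byte-identical) gzip file; the class name is illustrative only:

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

public class RegenerateFixture {
  public static void main(String[] args) throws IOException {
    // Write "Hello World" into a gzip-compressed text file in the working directory.
    try (Writer w = new OutputStreamWriter(
        new GZIPOutputStream(
            new FileOutputStream("TestSafeguardSplittingUnsplittableFiles.txt.gz")),
        StandardCharsets.UTF_8)) {
      w.write("Hello World\n");
    }
  }
}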