Return-Path: X-Original-To: apmail-hadoop-common-commits-archive@www.apache.org Delivered-To: apmail-hadoop-common-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 103F017956 for ; Tue, 6 Jan 2015 21:20:09 +0000 (UTC) Received: (qmail 54907 invoked by uid 500); 6 Jan 2015 21:20:10 -0000 Delivered-To: apmail-hadoop-common-commits-archive@hadoop.apache.org Received: (qmail 54829 invoked by uid 500); 6 Jan 2015 21:20:10 -0000 Mailing-List: contact common-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: common-dev@hadoop.apache.org Delivered-To: mailing list common-commits@hadoop.apache.org Received: (qmail 54820 invoked by uid 99); 6 Jan 2015 21:20:09 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 06 Jan 2015 21:20:09 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 9F2DE9D1774; Tue, 6 Jan 2015 21:20:09 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jlowe@apache.org To: common-commits@hadoop.apache.org Message-Id: <1c741e5481c64cbca6ef77264e21e002@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hadoop git commit: HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created stream is equal to start of split. Contributed by Ankit Kamboj Date: Tue, 6 Jan 2015 21:20:09 +0000 (UTC) Repository: hadoop Updated Branches: refs/heads/trunk cd7d78914 -> d02fb5375 HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created stream is equal to start of split. 
Contributed by Ankit Kamboj Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/d02fb537 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/d02fb537 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/d02fb537 Branch: refs/heads/trunk Commit: d02fb53750bc592c23ba470ae82eb6f47d9a00ec Parents: cd7d789 Author: Jason Lowe Authored: Tue Jan 6 21:19:10 2015 +0000 Committer: Jason Lowe Committed: Tue Jan 6 21:19:10 2015 +0000 ---------------------------------------------------------------------- hadoop-common-project/hadoop-common/CHANGES.txt | 3 +++ .../apache/hadoop/io/compress/BZip2Codec.java | 2 +- .../hadoop/mapred/TestLineRecordReader.java | 21 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/d02fb537/hadoop-common-project/hadoop-common/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index e7a2061..49438aa 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -677,6 +677,9 @@ Release 2.7.0 - UNRELEASED HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils and ShellBasedIdMapping (vinayakumarb) + HADOOP-11445. 
Bzip2Codec: Data block is skipped when position of newly + created stream is equal to start of split (Ankit Kamboj via jlowe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/d02fb537/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java ---------------------------------------------------------------------- diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java index 91178ec..2c5a7be 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java @@ -225,7 +225,7 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec { // ........................................^^[We align at wrong position!] 
// ...........................................................^^[While this pos is correct] - if (in.getPos() <= start) { + if (in.getPos() < start) { ((Seekable)seekableIn).seek(start); in = new BZip2CompressionInputStream(seekableIn, start, end, readMode); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/d02fb537/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java index a7a87c9..4c94e59 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java @@ -106,6 +106,27 @@ public class TestLineRecordReader { testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498); } + //This test ensures record reader doesn't lose records when it starts + //exactly at the starting byte of a bz2 compressed block + @Test + public void testBzip2SplitStartAtBlockMarker() throws IOException { + //136504 in blockEndingInCR.txt.bz2 is the byte at which the bz2 block ends + //In the following test cases record readers should iterate over all the records + //and should not miss any record. + + //Start next split at just the start of the block. + testSplitRecords("blockEndingInCR.txt.bz2", 136504); + + //Start next split a byte forward in next block. + testSplitRecords("blockEndingInCR.txt.bz2", 136505); + + //Start next split 3 bytes forward in next block. 
+ testSplitRecords("blockEndingInCR.txt.bz2", 136508); + + //Start next split 10 bytes from behind the end marker. + testSplitRecords("blockEndingInCR.txt.bz2", 136494); + } + // Use the LineRecordReader to read records from the file public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {