hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From prasan...@apache.org
Subject svn commit: r1656881 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/io/orc/ test/resources/
Date Tue, 03 Feb 2015 18:34:36 GMT
Author: prasanthj
Date: Tue Feb  3 18:34:35 2015
New Revision: 1656881

URL: http://svn.apache.org/r1656881
Log:
HIVE-9471: Bad seek in uncompressed ORC, at row-group boundary. (Mithun Radhakrishnan reviewed
by Prasanth Jayachandran)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
    hive/trunk/ql/src/test/resources/orc-file-has-null.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/InStream.java Tue Feb  3 18:34:35
2015
@@ -98,6 +98,12 @@ abstract class InStream extends InputStr
 
     public void seek(long desired) {
       for(int i = 0; i < bytes.length; ++i) {
+        if (desired == 0 && bytes[i].remaining() == 0) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Attempting seek into empty stream (" + name + "). Skipping stream.");
+          }
+          return;
+        }
         if (offsets[i] <= desired &&
             desired - offsets[i] < bytes[i].remaining()) {
           currentOffset = desired;

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/IntegerWriter.java Tue Feb  3
18:34:35 2015
@@ -40,6 +40,11 @@ interface IntegerWriter {
   void write(long value) throws IOException;
 
   /**
+   * Suppress underlying stream.
+   */
+  void suppress();
+
+  /**
    * Flush the buffer
    * @throws IOException
    */

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java Tue Feb
 3 18:34:35 2015
@@ -1557,32 +1557,36 @@ class RecordReaderImpl implements Record
       StreamName name = new StreamName(columnId,
           OrcProto.Stream.Kind.DICTIONARY_DATA);
       InStream in = streams.get(name);
-      if (in.available() > 0) {
-        dictionaryBuffer = new DynamicByteArray(64, in.available());
-        dictionaryBuffer.readAll(in);
-        // Since it's the start of a stripe, invalidate the cache.
-        dictionaryBufferInBytesCache = null;
+      if (in != null) { // Guard against empty dictionary stream.
+        if (in.available() > 0) {
+          dictionaryBuffer = new DynamicByteArray(64, in.available());
+          dictionaryBuffer.readAll(in);
+          // Since it's the start of a stripe, invalidate the cache.
+          dictionaryBufferInBytesCache = null;
+        }
+        in.close();
       } else {
         dictionaryBuffer = null;
       }
-      in.close();
 
       // read the lengths
       name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
       in = streams.get(name);
-      IntegerReader lenReader = createIntegerReader(encodings.get(columnId)
-          .getKind(), in, false);
-      int offset = 0;
-      if (dictionaryOffsets == null ||
-          dictionaryOffsets.length < dictionarySize + 1) {
-        dictionaryOffsets = new int[dictionarySize + 1];
+      if (in != null) { // Guard against empty LENGTH stream.
+        IntegerReader lenReader = createIntegerReader(encodings.get(columnId)
+            .getKind(), in, false);
+        int offset = 0;
+        if (dictionaryOffsets == null ||
+            dictionaryOffsets.length < dictionarySize + 1) {
+          dictionaryOffsets = new int[dictionarySize + 1];
+        }
+        for (int i = 0; i < dictionarySize; ++i) {
+          dictionaryOffsets[i] = offset;
+          offset += (int) lenReader.next();
+        }
+        dictionaryOffsets[dictionarySize] = offset;
+        in.close();
       }
-      for(int i=0; i < dictionarySize; ++i) {
-        dictionaryOffsets[i] = offset;
-        offset += (int) lenReader.next();
-      }
-      dictionaryOffsets[dictionarySize] = offset;
-      in.close();
 
       // set up the row reader
       name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriter.java Tue
Feb  3 18:34:35 2015
@@ -31,7 +31,7 @@ class RunLengthIntegerWriter implements
   static final int MIN_DELTA = -128;
   static final int MAX_LITERAL_SIZE = 128;
   private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
-  private final PositionedOutputStream output;
+  private final OutStream output;
   private final boolean signed;
   private final long[] literals = new long[MAX_LITERAL_SIZE];
   private int numLiterals = 0;
@@ -40,7 +40,7 @@ class RunLengthIntegerWriter implements
   private int tailRunLength = 0;
   private SerializationUtils utils;
 
-  RunLengthIntegerWriter(PositionedOutputStream output,
+  RunLengthIntegerWriter(OutStream output,
                          boolean signed) {
     this.output = output;
     this.signed = signed;
@@ -135,6 +135,11 @@ class RunLengthIntegerWriter implements
   }
 
   @Override
+  public void suppress() {
+    this.output.suppress();
+  }
+
+  @Override
   public void getPosition(PositionRecorder recorder) throws IOException {
     output.getPosition(recorder);
     recorder.addPosition(numLiterals);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
Tue Feb  3 18:34:35 2015
@@ -138,7 +138,7 @@ class RunLengthIntegerWriterV2 implement
   private int fixedRunLength = 0;
   private int variableRunLength = 0;
   private final long[] literals = new long[MAX_SCOPE];
-  private final PositionedOutputStream output;
+  private final OutStream output;
   private final boolean signed;
   private EncodingType encoding;
   private int numLiterals;
@@ -160,11 +160,11 @@ class RunLengthIntegerWriterV2 implement
   private SerializationUtils utils;
   private boolean alignedBitpacking;
 
-  RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
+  RunLengthIntegerWriterV2(OutStream output, boolean signed) {
     this(output, signed, true);
   }
 
-  RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
+  RunLengthIntegerWriterV2(OutStream output, boolean signed,
       boolean alignedBitpacking) {
     this.output = output;
     this.signed = signed;
@@ -818,6 +818,11 @@ class RunLengthIntegerWriterV2 implement
     }
   }
 
+  @Override
+  public void suppress() {
+    this.output.suppress();
+  }
+
   private void initializeLiterals(long val) {
     literals[numLiterals++] = val;
     fixedRunLength = 1;

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java Tue Feb  3 18:34:35
2015
@@ -630,7 +630,7 @@ class WriterImpl implements Writer, Memo
       return rowIndexEntry;
     }
 
-    IntegerWriter createIntegerWriter(PositionedOutputStream output,
+    IntegerWriter createIntegerWriter(OutStream output,
                                       boolean signed, boolean isDirectV2,
                                       StreamFactory writer) {
       if (isDirectV2) {
@@ -882,7 +882,7 @@ class WriterImpl implements Writer, Memo
                       StreamFactory writer,
                       boolean nullable) throws IOException {
       super(columnId, inspector, writer, nullable);
-      PositionedOutputStream out = writer.createStream(id,
+      OutStream out = writer.createStream(id,
           OrcProto.Stream.Kind.DATA);
       this.isDirectV2 = isNewWriteFormat(writer);
       this.writer = createIntegerWriter(out, true, isDirectV2, writer);
@@ -1162,6 +1162,14 @@ class WriterImpl implements Writer, Memo
         // Write the dictionary by traversing the red-black tree writing out
         // the bytes and lengths; and creating the map from the original order
         // to the final sorted order.
+        if (dictionary.size() == 0) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Empty dictionary. Suppressing dictionary stream.");
+          }
+          stringOutput.suppress();
+          lengthOutput.suppress();
+        }
+
         dictionary.visit(new StringRedBlackTree.Visitor() {
           private int currentId = 0;
           @Override
@@ -1467,7 +1475,7 @@ class WriterImpl implements Writer, Memo
                    StreamFactory writer,
                    boolean nullable) throws IOException {
       super(columnId, inspector, writer, nullable);
-      PositionedOutputStream out = writer.createStream(id,
+      OutStream out = writer.createStream(id,
           OrcProto.Stream.Kind.DATA);
       this.isDirectV2 = isNewWriteFormat(writer);
       this.writer = createIntegerWriter(out, true, isDirectV2, writer);

Modified: hive/trunk/ql/src/test/resources/orc-file-has-null.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-has-null.out?rev=1656881&r1=1656880&r2=1656881&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-has-null.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-has-null.out Tue Feb  3 18:34:35 2015
@@ -48,7 +48,7 @@ Stripes:
       Entry 2:count: 1000 hasNull: false min: RG3 max: RG3 sum: 3000 positions: 0,2,125,0,0,66,488
       Entry 3:count: 0 hasNull: true positions: 0,4,125,0,0,136,488
       Entry 4:count: 0 hasNull: true positions: 0,6,125,0,0,136,488
-  Stripe: offset: 424 data: 156 rows: 5000 tail: 60 index: 119
+  Stripe: offset: 424 data: 156 rows: 5000 tail: 55 index: 119
     Stream: column 0 section ROW_INDEX start: 424 length 17
     Stream: column 1 section ROW_INDEX start: 441 length 63
     Stream: column 2 section ROW_INDEX start: 504 length 39
@@ -56,8 +56,6 @@ Stripes:
     Stream: column 1 section LENGTH start: 656 length 32
     Stream: column 2 section PRESENT start: 688 length 11
     Stream: column 2 section DATA start: 699 length 0
-    Stream: column 2 section LENGTH start: 699 length 0
-    Stream: column 2 section DICTIONARY_DATA start: 699 length 0
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DICTIONARY_V2[0]
@@ -67,15 +65,15 @@ Stripes:
       Entry 2:count: 0 hasNull: true positions: 0,2,120,0,0,0,0
       Entry 3:count: 0 hasNull: true positions: 0,4,115,0,0,0,0
       Entry 4:count: 0 hasNull: true positions: 0,6,110,0,0,0,0
-  Stripe: offset: 759 data: 186 rows: 5000 tail: 60 index: 148
-    Stream: column 0 section ROW_INDEX start: 759 length 17
-    Stream: column 1 section ROW_INDEX start: 776 length 63
-    Stream: column 2 section ROW_INDEX start: 839 length 68
-    Stream: column 1 section DATA start: 907 length 113
-    Stream: column 1 section LENGTH start: 1020 length 32
-    Stream: column 2 section DATA start: 1052 length 24
-    Stream: column 2 section LENGTH start: 1076 length 6
-    Stream: column 2 section DICTIONARY_DATA start: 1082 length 11
+  Stripe: offset: 754 data: 186 rows: 5000 tail: 60 index: 148
+    Stream: column 0 section ROW_INDEX start: 754 length 17
+    Stream: column 1 section ROW_INDEX start: 771 length 63
+    Stream: column 2 section ROW_INDEX start: 834 length 68
+    Stream: column 1 section DATA start: 902 length 113
+    Stream: column 1 section LENGTH start: 1015 length 32
+    Stream: column 2 section DATA start: 1047 length 24
+    Stream: column 2 section LENGTH start: 1071 length 6
+    Stream: column 2 section DICTIONARY_DATA start: 1077 length 11
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DICTIONARY_V2[1]
@@ -85,16 +83,14 @@ Stripes:
       Entry 2:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions:
0,198,464
       Entry 3:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions:
0,330,440
       Entry 4:count: 1000 hasNull: false min: STRIPE-3 max: STRIPE-3 sum: 8000 positions:
0,462,416
-  Stripe: offset: 1153 data: 156 rows: 5000 tail: 60 index: 119
-    Stream: column 0 section ROW_INDEX start: 1153 length 17
-    Stream: column 1 section ROW_INDEX start: 1170 length 63
-    Stream: column 2 section ROW_INDEX start: 1233 length 39
-    Stream: column 1 section DATA start: 1272 length 113
-    Stream: column 1 section LENGTH start: 1385 length 32
-    Stream: column 2 section PRESENT start: 1417 length 11
-    Stream: column 2 section DATA start: 1428 length 0
-    Stream: column 2 section LENGTH start: 1428 length 0
-    Stream: column 2 section DICTIONARY_DATA start: 1428 length 0
+  Stripe: offset: 1148 data: 156 rows: 5000 tail: 55 index: 119
+    Stream: column 0 section ROW_INDEX start: 1148 length 17
+    Stream: column 1 section ROW_INDEX start: 1165 length 63
+    Stream: column 2 section ROW_INDEX start: 1228 length 39
+    Stream: column 1 section DATA start: 1267 length 113
+    Stream: column 1 section LENGTH start: 1380 length 32
+    Stream: column 2 section PRESENT start: 1412 length 11
+    Stream: column 2 section DATA start: 1423 length 0
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DICTIONARY_V2[0]
@@ -105,6 +101,6 @@ Stripes:
       Entry 3:count: 0 hasNull: true positions: 0,4,115,0,0,0,0
       Entry 4:count: 0 hasNull: true positions: 0,6,110,0,0,0,0
 
-File length: 1736 bytes
+File length: 1728 bytes
 Padding length: 0 bytes
 Padding ratio: 0%



Mime
View raw message