drill-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From j...@apache.org
Subject [1/9] drill git commit: DRILL-2031: Parquet bit column reader fix
Date Fri, 27 Feb 2015 08:01:55 GMT
Repository: drill
Updated Branches:
  refs/heads/master d72d6030e -> 74517f5d9


DRILL-2031: Parquet bit column reader fix

Removes attempted optimized bit reader, left a comment where it was removed as a pointer back
to this change set. For now the higher level Parquet interface will be used to ensure reading
is correct, a fix of the optimized reader can be pursued at a later date if is is necessary.

Fixed two bugs in the test framework necessary to verify the fix against the complex parquet
reader (which uses a similar interface, but it ensures that none of the other columns are
read incorrectly).

Fix conflict between parquet bit reader fix and vector reallocation changes.


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/1d2ed349
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/1d2ed349
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/1d2ed349

Branch: refs/heads/master
Commit: 1d2ed349699a326165c721257937905e3043418c
Parents: d72d603
Author: Jason Altekruse <altekrusejason@gmail.com>
Authored: Mon Jan 19 15:38:46 2015 -0800
Committer: Jason Altekruse <altekrusejason@gmail.com>
Committed: Thu Feb 26 16:40:53 2015 -0800

----------------------------------------------------------------------
 .../store/parquet/columnreaders/BitReader.java  | 65 ++++++--------------
 .../java/org/apache/drill/DrillTestWrapper.java |  2 +-
 .../drill/exec/HyperVectorValueIterator.java    |  2 +-
 .../physical/impl/writer/TestParquetWriter.java |  6 ++
 4 files changed, 26 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/1d2ed349/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BitReader.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BitReader.java
b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BitReader.java
index 9aabc9c..7416463 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BitReader.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BitReader.java
@@ -21,6 +21,7 @@ import io.netty.buffer.ByteBuf;
 
 import org.apache.drill.common.exceptions.ExecutionSetupException;
 import org.apache.drill.exec.vector.BaseDataValueVector;
+import org.apache.drill.exec.vector.BitVector;
 import org.apache.drill.exec.vector.ValueVector;
 
 import parquet.column.ColumnDescriptor;
@@ -29,10 +30,6 @@ import parquet.hadoop.metadata.ColumnChunkMetaData;
 
 final class BitReader extends ColumnReader {
 
-  private byte currentByte;
-  private byte nextByte;
-  private ByteBuf bytebuf;
-
   BitReader(ParquetRecordReader parentReader, int allocateSize, ColumnDescriptor descriptor,
ColumnChunkMetaData columnChunkMetaData,
             boolean fixedLength, ValueVector v, SchemaElement schemaElement) throws ExecutionSetupException
{
     super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
@@ -44,49 +41,23 @@ final class BitReader extends ColumnReader {
     recordsReadInThisIteration = Math.min(pageReader.currentPage.getValueCount()
         - pageReader.valuesRead, recordsToReadInThisPass - valuesReadInCurrentPass);
 
-    readStartInBytes = pageReader.readPosInBytes;
-    readLengthInBits = recordsReadInThisIteration * dataTypeLengthInBits;
-    readLength = (int) Math.ceil(readLengthInBits / 8.0);
-
-    bytebuf = pageReader.pageDataByteArray;
-    // standard read, using memory mapping
-    if (pageReader.bitShift == 0) {
-      ((BaseDataValueVector) valueVec).getData().writeBytes(bytebuf,
-          (int) readStartInBytes, (int) readLength);
-    } else { // read in individual values, because a bitshift is necessary with where the
last page or batch ended
-
-      vectorData = ((BaseDataValueVector) valueVec).getData();
-      nextByte = bytebuf.getByte((int) Math.max(0, Math.ceil(pageReader.valuesRead / 8.0)
- 1));
-      readLengthInBits = recordsReadInThisIteration + pageReader.bitShift;
-
-      int i = 0;
-      // read individual bytes with appropriate shifting
-      for (; i < (int) readLength; i++) {
-        currentByte = nextByte;
-        currentByte = (byte) (currentByte >>> pageReader.bitShift);
-        // mask the bits about to be added from the next byte
-        currentByte = (byte) (currentByte & ParquetRecordReader.startBitMasks[pageReader.bitShift
- 1]);
-        // if we are not on the last byte
-        if ((int) Math.ceil(pageReader.valuesRead / 8.0) + i < pageReader.byteLength)
{
-          // grab the next byte from the buffer, shift and mask it, and OR it with the leftover
bits
-          nextByte = bytebuf.getByte((int) Math.ceil(pageReader.valuesRead / 8.0) + i);
-          currentByte = (byte) (currentByte | nextByte
-              << (8 - pageReader.bitShift)
-              & ParquetRecordReader.endBitMasks[8 - pageReader.bitShift - 1]);
-        }
-        vectorData.setByte(valuesReadInCurrentPass / 8 + i, currentByte);
-      }
-      vectorData.setIndex(0, (valuesReadInCurrentPass / 8)
-          + (int) readLength - 1);
-      vectorData.capacity(vectorData.writerIndex() + 1);
-    }
-
-    // check if the values in this page did not end on a byte boundary, store a number of
bits the next page must be
-    // shifted by to read all of the values into the vector without leaving space
-    if (readLengthInBits % 8 != 0) {
-      pageReader.bitShift = (int) readLengthInBits % 8;
-    } else {
-      pageReader.bitShift = 0;
+    // A more optimized reader for bit columns was removed to fix the bug
+    // DRILL-2031. It attempted to copy large runs of values directly from the
+    // decompressed parquet stream into a BitVector. This was complicated by
+    // parquet not always breaking a page on a row number divisible by 8. In
+    // this case the batch would have to be cut off early or we would have to
+    // copy the next page byte-by-byte with a bit shift to move the values into
+    // the correct position (to make the value vector one contiguous buffer of
+    // data). As page boundaries do not line up across columns, cutting off a
+    // batch at every page boundary of a bit column could be costly with many
+    // such pages, so we opted to try to shift the bits when necessary.
+    //
+    // In the end, this was too much complexity for not enough performance
+    // benefit, for now this reader has been moved to use the higher level value
+    // by value reader provided by the parquet library.
+    for (int i = 0; i < recordsReadInThisIteration; i++){
+      ((BitVector)valueVec).getMutator().setSafe(i + valuesReadInCurrentPass,
+            pageReader.valueReader.readBoolean() ? 1 : 0 );
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/drill/blob/1d2ed349/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java b/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java
index f06203e..75a91b3 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/DrillTestWrapper.java
@@ -299,7 +299,7 @@ public class DrillTestWrapper {
    */
   protected void compareOrderedResults() throws Exception {
     if (highPerformanceComparison) {
-      if (baselineQueryType != null) {
+      if (baselineQueryType == null) {
         throw new Exception("Cannot do a high performance comparison without using a baseline
file");
       }
       compareResultsHyperVector();

http://git-wip-us.apache.org/repos/asf/drill/blob/1d2ed349/exec/java-exec/src/test/java/org/apache/drill/exec/HyperVectorValueIterator.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/HyperVectorValueIterator.java
b/exec/java-exec/src/test/java/org/apache/drill/exec/HyperVectorValueIterator.java
index d214b7c..9ad72eb 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/HyperVectorValueIterator.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/HyperVectorValueIterator.java
@@ -80,7 +80,7 @@ public class HyperVectorValueIterator implements Iterator<Object>
{
 
   @Override
   public Object next() {
-    if (currVec == null || indexInCurrentVector == currVec.getValueCapacity()) {
+    if (currVec == null || indexInCurrentVector == currVec.getAccessor().getValueCount())
{
       currVec = hyperVector.getValueVectors()[indexInVectorList];
       indexInVectorList++;
       indexInCurrentVector = 0;

http://git-wip-us.apache.org/repos/asf/drill/blob/1d2ed349/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
index 6aa3288..7298f28 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/writer/TestParquetWriter.java
@@ -204,6 +204,12 @@ public class TestParquetWriter extends BaseTestQuery {
     runTestAndValidate("*", "*", inputTable, "nullable_test");
   }
 
+  @Ignore("Binary file too large for version control, TODO - make available on S3 bucket
or similar service")
+  @Test
+  public void testBitError_Drill_2031() throws Exception {
+    compareParquetReadersHyperVector("*", "dfs.`/tmp/wide2/0_0_3.parquet`");
+  }
+
   @Test
   public void testDecimal() throws Exception {
     String selection = "cast(salary as decimal(8,2)) as decimal8, cast(salary as decimal(15,2))
as decimal15, " +


Mime
View raw message