parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject git commit: PARQUET-62: Fix binary dictionary write bug.
Date Wed, 20 Aug 2014 21:02:12 GMT
Repository: incubator-parquet-mr
Updated Branches:
  refs/heads/master 45e581028 -> 54bb98327


PARQUET-62: Fix binary dictionary write bug.

The binary dictionary writers keep track of written values in memory to
deduplicate and write dictionary pages periodically. If the written
values are changed by the caller, then this corrupts the dictionary
without an error message. This adds a defensive copy to fix the problem.

Author: Ryan Blue <rblue@cloudera.com>

Closes #29 from rdblue/PARQUET-62-fix-dictionary-bug and squashes the following commits:

42b6920 [Ryan Blue] PARQUET-62: Fix binary dictionary write bug.


Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/54bb9832
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/54bb9832
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/54bb9832

Branch: refs/heads/master
Commit: 54bb983271adcbcc1519ab9e2288871d69093708
Parents: 45e5810
Author: Ryan Blue <rblue@cloudera.com>
Authored: Wed Aug 20 14:02:01 2014 -0700
Committer: julien <julien@twitter.com>
Committed: Wed Aug 20 14:02:01 2014 -0700

----------------------------------------------------------------------
 .../dictionary/DictionaryValuesWriter.java      | 10 +++++--
 .../values/dictionary/TestDictionary.java       | 31 +++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/54bb9832/parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesWriter.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesWriter.java
b/parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesWriter.java
index 9c41114..0e76c47 100644
--- a/parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesWriter.java
+++ b/parquet-column/src/main/java/parquet/column/values/dictionary/DictionaryValuesWriter.java
@@ -34,6 +34,7 @@ import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import it.unimi.dsi.fastutil.objects.ObjectIterator;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Iterator;
 
 import parquet.Log;
@@ -265,7 +266,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter {
         int id = binaryDictionaryContent.getInt(v);
         if (id == -1) {
           id = binaryDictionaryContent.size();
-          binaryDictionaryContent.put(v, id);
+          binaryDictionaryContent.put(copy(v), id);
           // length as int (4 bytes) + actual bytes
           dictionaryByteSize += 4 + v.length();
         }
@@ -319,6 +320,11 @@ public abstract class DictionaryValuesWriter extends ValuesWriter {
         plainValuesWriter.writeBytes(reverseDictionary[id]);
       }
     }
+
+    protected static Binary copy(Binary binary) {
+      return Binary.fromByteArray(
+          Arrays.copyOf(binary.getBytes(), binary.length()));
+    }
   }
 
   /**
@@ -343,7 +349,7 @@ public abstract class DictionaryValuesWriter extends ValuesWriter {
         int id = binaryDictionaryContent.getInt(value);
         if (id == -1) {
           id = binaryDictionaryContent.size();
-          binaryDictionaryContent.put(value, id);
+          binaryDictionaryContent.put(copy(value), id);
           dictionaryByteSize += length;
         }
         encodedValues.add(id);

http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/54bb9832/parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java
b/parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java
index 64196d4..a5d6e1f 100644
--- a/parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java
+++ b/parquet-column/src/test/java/parquet/column/values/dictionary/TestDictionary.java
@@ -20,6 +20,7 @@ import static parquet.column.Encoding.PLAIN;
 import static parquet.column.Encoding.PLAIN_DICTIONARY;
 import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
 import static parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
+import static parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
 import static parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
 import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
 
@@ -94,7 +95,26 @@ public class TestDictionary {
 
     //simulate cutting the page
     cw.reset();
-    assertEquals(0,cw.getBufferedSize());
+    assertEquals(0, cw.getBufferedSize());
+  }
+
+  @Test
+  public void testBinaryDictionaryChangedValues() throws IOException {
+    int COUNT = 100;
+    ValuesWriter cw = new PlainBinaryDictionaryValuesWriter(200, 10000);
+    writeRepeatedWithReuse(COUNT, cw, "a");
+    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
+    writeRepeatedWithReuse(COUNT, cw, "b");
+    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
+    // now we will fall back
+    writeDistinct(COUNT, cw, "c");
+    BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);
+
+    DictionaryValuesReader cr = initDicReader(cw, BINARY);
+    checkRepeated(COUNT, bytes1, cr, "a");
+    checkRepeated(COUNT, bytes2, cr, "b");
+    BinaryPlainValuesReader cr2 = new BinaryPlainValuesReader();
+    checkDistinct(COUNT, bytes3, cr2, "c");
   }
 
   @Test
@@ -466,6 +486,15 @@ public class TestDictionary {
     }
   }
 
+  private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
+    Binary reused = Binary.fromString(prefix + "0");
+    for (int i = 0; i < COUNT; i++) {
+      Binary content = Binary.fromString(prefix + i % 10);
+      System.arraycopy(content.getBytes(), 0, reused.getBytes(), 0, reused.length());
+      cw.writeBytes(reused);
+    }
+  }
+
   private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding)
       throws IOException {
     BytesInput bytes = BytesInput.copy(cw.getBytes());


Mime
View raw message