parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject [parquet-mr] branch master updated: PARQUET-1510: Fix notEq for optional columns with null values. (#603)
Date Mon, 28 Jan 2019 16:50:57 GMT
This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new d9a1962  PARQUET-1510: Fix notEq for optional columns with null values. (#603)
d9a1962 is described below

commit d9a19621370608f4431394cc36bddc063d59cc5a
Author: Ryan Blue <rdblue@users.noreply.github.com>
AuthorDate: Mon Jan 28 08:50:52 2019 -0800

    PARQUET-1510: Fix notEq for optional columns with null values. (#603)
    
    Dictionaries cannot contain null values, so a notEq filter cannot
    conclude that a block cannot match using only the dictionary. Instead,
    it must also check whether the block may have at least one null value.
    If there are no null values, then the existing check is correct.
---
 .../filter2/dictionarylevel/DictionaryFilter.java       |  5 ++++-
 .../filter2/dictionarylevel/DictionaryFilterTest.java   | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
index ecd1043..52e1458 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java
@@ -189,7 +189,10 @@ public class DictionaryFilter implements FilterPredicate.Visitor<Boolean> {
 
     try {
       Set<T> dictSet = expandDictionary(meta);
-      if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value)) {
+      boolean mayContainNull = (meta.getStatistics() == null
+          || !meta.getStatistics().isNumNullsSet()
+          || meta.getStatistics().getNumNulls() > 0);
+      if (dictSet != null && dictSet.size() == 1 && dictSet.contains(value) && !mayContainNull) {
         return BLOCK_CANNOT_MATCH;
       }
     } catch (IOException e) {
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
index 39db6d4..6af4437 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java
@@ -88,6 +88,7 @@ public class DictionaryFilterTest {
       "message test { "
           + "required binary binary_field; "
           + "required binary single_value_field; "
+          + "optional binary optional_single_value_field; "
           + "required fixed_len_byte_array(17) fixed_field (DECIMAL(40,4)); "
           + "required int32 int32_field; "
           + "required int64 int64_field; "
@@ -165,6 +166,11 @@ public class DictionaryFilterTest {
               ALPHABET.substring(index, index+1) : UUID.randomUUID().toString())
           .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]);
 
+      // 10% of the time, leave the field null
+      if (index % 10 > 0) {
+        group.append("optional_single_value_field", "sharp");
+      }
+
       writer.write(group);
     }
     writer.close();
@@ -256,7 +262,7 @@ public class DictionaryFilterTest {
   @SuppressWarnings("deprecation")
   private void testDictionaryEncodedColumnsV1() throws Exception {
     Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
-        "binary_field", "single_value_field", "int32_field", "int64_field",
+        "binary_field", "single_value_field", "optional_single_value_field", "int32_field", "int64_field",
         "double_field", "float_field", "int96_field"));
     for (ColumnChunkMetaData column : ccmd) {
       String name = column.getPath().toDotString();
@@ -281,7 +287,7 @@ public class DictionaryFilterTest {
 
   private void testDictionaryEncodedColumnsV2() throws Exception {
     Set<String> dictionaryEncodedColumns = new HashSet<String>(Arrays.asList(
-        "binary_field", "single_value_field", "fixed_field", "int32_field",
+        "binary_field", "single_value_field", "optional_single_value_field", "fixed_field", "int32_field",
         "int64_field", "double_field", "float_field", "int96_field"));
     for (ColumnChunkMetaData column : ccmd) {
       EncodingStats encStats = column.getEncodingStats();
@@ -355,6 +361,7 @@ public class DictionaryFilterTest {
   @Test
   public void testNotEqBinary() throws Exception {
     BinaryColumn sharp = binaryColumn("single_value_field");
+    BinaryColumn sharpAndNull = binaryColumn("optional_single_value_field");
     BinaryColumn b = binaryColumn("binary_field");
 
     assertTrue("Should drop block with only the excluded value",
@@ -363,6 +370,12 @@ public class DictionaryFilterTest {
     assertFalse("Should not drop block with any other value",
         canDrop(notEq(sharp, Binary.fromString("applause")), ccmd, dictionaries));
 
+    assertFalse("Should not drop block with only the excluded value and null",
+        canDrop(notEq(sharpAndNull, Binary.fromString("sharp")), ccmd, dictionaries));
+
+    assertFalse("Should not drop block with any other value",
+        canDrop(notEq(sharpAndNull, Binary.fromString("applause")), ccmd, dictionaries));
+
     assertFalse("Should not drop block with a known value",
         canDrop(notEq(b, Binary.fromString("x")), ccmd, dictionaries));
 


Mime
View raw message