parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject parquet-mr git commit: PARQUET-791: Add missing column support for UserDefinedPredicate
Date Thu, 08 Dec 2016 17:07:42 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 98c27699c -> 71cff7c59


PARQUET-791: Add missing column support for UserDefinedPredicate

This extends the fix from #354 to UserDefinedPredicate.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #389 from viirya/PARQUET-791 and squashes the following commits:

d6be37d [Liang-Chi Hsieh] Address comment.
7e929c3 [Liang-Chi Hsieh] PARQUET-791: Add missing column support for UserDefinedPredicate.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/71cff7c5
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/71cff7c5
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/71cff7c5

Branch: refs/heads/master
Commit: 71cff7c5940b7101ff098601850d46b7a4698180
Parents: 98c2769
Author: Liang-Chi Hsieh <viirya@gmail.com>
Authored: Thu Dec 8 09:07:37 2016 -0800
Committer: Ryan Blue <blue@apache.org>
Committed: Thu Dec 8 09:07:37 2016 -0800

----------------------------------------------------------------------
 .../statisticslevel/StatisticsFilter.java       | 23 ++++--
 .../statisticslevel/TestStatisticsFilter.java   | 74 +++++++++++++++++++-
 2 files changed, 91 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/71cff7c5/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index b37297a..ac7132e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -328,18 +328,31 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
     Column<T> filterColumn = ud.getColumn();
     ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
     U udp = ud.getUserDefinedPredicate();
+
+    if (columnChunk == null) {
+      // the column isn't in this file so all values are null.
+      // lets run the udp with null value to see if it keeps null or not.
+      if (inverted) {
+        return udp.keep(null);
+      } else {
+        return !udp.keep(null);
+      }
+    }
+
     Statistics<T> stats = columnChunk.getStatistics();
 
     if (stats.isEmpty()) {
       // we have no statistics available, we cannot drop any chunks
-      return false;
+      return BLOCK_MIGHT_MATCH;
     }
 
     if (isAllNulls(columnChunk)) {
-      // there is no min max, there is nothing
-      // else we can say about this chunk, we
-      // cannot drop it.
-      return false;
+      // lets run the udp with null value to see if it keeps null or not.
+      if (inverted) {
+        return udp.keep(null);
+      } else {
+        return !udp.keep(null);
+      }
     }
 
     org.apache.parquet.filter2.predicate.Statistics<T> udpStats =

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/71cff7c5/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index b47ed69..d8b4407 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -83,6 +83,7 @@ public class TestStatisticsFilter {
   private static final IntColumn intColumn = intColumn("int.column");
   private static final DoubleColumn doubleColumn = doubleColumn("double.column");
   private static final BinaryColumn missingColumn = binaryColumn("missing");
+  private static final IntColumn missingColumn2 = intColumn("missing.int");
 
   private static final IntStatistics intStats = new IntStatistics();
   private static final IntStatistics nullIntStats = new IntStatistics();
@@ -269,7 +270,10 @@ public class TestStatisticsFilter {
 
     @Override
     public boolean keep(Integer value) {
-      throw new RuntimeException("this method should not be called");
+      if (value == null) {
+        return true;
+      }
+      throw new RuntimeException("this method should not be called with value != null");
     }
 
     @Override
@@ -283,11 +287,27 @@ public class TestStatisticsFilter {
     }
   }
 
+  public static class DropNullUdp extends SevensAndEightsUdp {
+    @Override
+    public boolean keep(Integer value) {
+      if (value == null) {
+        return false;
+      }
+      throw new RuntimeException("this method should not be called with value != null");
+    }
+  }
+
   @Test
   public void testUdp() {
     FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
     FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));
 
+    FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
+    FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));
+
+    FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
+    FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
+
     IntStatistics seven = new IntStatistics();
     seven.setMinMax(7, 7);
 
@@ -320,6 +340,58 @@ public class TestStatisticsFilter {
     assertFalse(canDrop(invPred, Arrays.asList(
         getIntColumnMeta(neither, 177L),
         getDoubleColumnMeta(doubleStats, 177L))));
+
+    // udpDropMissingColumn drops null column.
+    assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(seven, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(eight, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(neither, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    // invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
+    assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(seven, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(eight, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+        getIntColumnMeta(neither, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    // udpKeepMissingColumn keeps null column.
+    assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(seven, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(eight, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(neither, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    // invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
+    assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(seven, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(eight, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+        getIntColumnMeta(neither, 177L),
+        getDoubleColumnMeta(doubleStats, 177L))));
   }
 
   @Test


Mime
View raw message