parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ziva...@apache.org
Subject parquet-mr git commit: PARQUET-1217: Incorrect handling of missing values in Statistics
Date Tue, 27 Feb 2018 13:20:52 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 8bbc6cb95 -> b82d96218


PARQUET-1217: Incorrect handling of missing values in Statistics

In parquet-format every value in Statistics is optional while parquet-mr does not properly
handle these scenarios:
- null_count is set but min/max or min_value/max_value are not: filtering may fail with NPE
or incorrect filtering occurs
  fix: check if min/max is set before comparing to the related values
- null_count is not set: filtering handles null_count as if it would be 0 -> incorrect
filtering may occur
  fix: introduce new method in Statistics object to check if num_nulls is set; check if num_nulls
is set by the new method before using its value for filtering

Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>

Closes #458 from gszadovszky/PARQUET-1217 and squashes the following commits:

9d14090 [Gabor Szadovszky] Updates according to rdblue's comments
116d1d3 [Gabor Szadovszky] PARQUET-1217: Updates according to zi's comments
c264b50 [Gabor Szadovszky] PARQUET-1217: fix handling of unset nullCount
2ec2fb1 [Gabor Szadovszky] PARQUET-1217: Incorrect handling of missing values in Statistics


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/b82d9621
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/b82d9621
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/b82d9621

Branch: refs/heads/master
Commit: b82d96218bfd37f6df95a2e8d7675d091ab61970
Parents: 8bbc6cb
Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>
Authored: Tue Feb 27 14:19:14 2018 +0100
Committer: Zoltan Ivanfi <zi@cloudera.com>
Committed: Tue Feb 27 14:19:14 2018 +0100

----------------------------------------------------------------------
 .../cli/commands/ParquetMetadataCommand.java    |  4 +-
 .../parquet/cli/commands/ShowPagesCommand.java  |  2 +-
 .../parquet/column/statistics/Statistics.java   | 78 ++++++++++++++++++--
 .../column/statistics/TestStatistics.java       |  1 +
 .../statisticslevel/StatisticsFilter.java       | 42 ++++++++++-
 .../converter/ParquetMetadataConverter.java     | 19 +++--
 .../statisticslevel/TestStatisticsFilter.java   | 64 +++++++++++++++-
 .../converter/TestParquetMetadataConverter.java | 33 +++++++++
 .../hadoop/TestColumnChunkPageWriteStore.java   |  4 +-
 .../parquet/hadoop/TestParquetFileWriter.java   | 48 ++++++------
 10 files changed, 249 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
----------------------------------------------------------------------
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
index 0bd77a3..54fe657 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
@@ -168,12 +168,12 @@ public class ParquetMetadataCommand extends BaseCommand {
     if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
       console.info(String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
           name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
-          humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()),
+          humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
           minMaxAsString(stats, type.getOriginalType())));
     } else {
       console.info(String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
           name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
-          stats == null ? "" : String.valueOf(stats.getNumNulls()),
+          stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
           minMaxAsString(stats, type.getOriginalType())));
     }
   }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
----------------------------------------------------------------------
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
index beda452..4d0e2c9 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
@@ -191,7 +191,7 @@ public class ShowPagesCommand extends BaseCommand {
       String enc = encodingAsString(page.getValueEncoding(), false);
       long totalSize = page.getCompressedSize();
       int count = page.getValueCount();
-      long numNulls = page.getStatistics().getNumNulls();
+      String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls())
: "";
       float perValue = ((float) totalSize) / count;
       String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType());
       return String.format("%3d-%-3d  %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s",

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 00d0bbf..a087c5f 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -35,6 +35,44 @@ import org.apache.parquet.schema.Type;
  */
 public abstract class Statistics<T extends Comparable<T>> {
 
+  /**
+   * Builder class to build Statistics objects. Used to read the statistics from the Parquet
file.
+   */
+  public static class Builder {
+    private final PrimitiveType type;
+    private byte[] min;
+    private byte[] max;
+    private long numNulls = -1;
+
+    private Builder(PrimitiveType type) {
+      this.type = type;
+    }
+
+    public Builder withMin(byte[] min) {
+      this.min = min;
+      return this;
+    }
+
+    public Builder withMax(byte[] max) {
+      this.max = max;
+      return this;
+    }
+
+    public Builder withNumNulls(long numNulls) {
+      this.numNulls = numNulls;
+      return this;
+    }
+
+    public Statistics<?> build() {
+      Statistics<?> stats = createStats(type);
+      if (min != null && max != null) {
+        stats.setMinMaxFromBytes(min, max);
+      }
+      stats.num_nulls = this.numNulls;
+      return stats;
+    }
+  }
+
   private final PrimitiveType type;
   private final PrimitiveComparator<T> comparator;
   private boolean hasNonNullValue;
@@ -110,6 +148,17 @@ public abstract class Statistics<T extends Comparable<T>>
{
   }
 
   /**
+   * Returns a builder to create new statistics object. Used to read the statistics from
the parquet file.
+   *
+   * @param type
+   *          type of the column
+   * @return builder to create new statistics object
+   */
+  public static Builder getBuilder(PrimitiveType type) {
+    return new Builder(type);
+  }
+
+  /**
    * updates statistics min and max using the passed value
    * @param value value to use to update min and max
    */
@@ -217,7 +266,9 @@ public abstract class Statistics<T extends Comparable<T>>
{
    * Abstract method to set min and max values from byte arrays.
    * @param minBytes byte array to set the min value to
    * @param maxBytes byte array to set the max value to
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
    */
+  @Deprecated
   abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
 
   /**
@@ -310,9 +361,13 @@ public abstract class Statistics<T extends Comparable<T>>
{
 
   @Override
   public String toString() {
-    if (this.hasNonNullValue())
-      return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(),
this.getNumNulls());
-    else if (!this.isEmpty())
+    if (this.hasNonNullValue()) {
+      if (isNumNullsSet()) {
+        return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(),
this.getNumNulls());
+      } else {
+        return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
+      }
+    } else if (!this.isEmpty())
       return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
     else
       return "no stats for this column";
@@ -335,7 +390,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
 
   /**
    * Returns the null count
-   * @return null count
+   * @return null count or {@code -1} if the null count is not set
    */
   public long getNumNulls() {
     return num_nulls;
@@ -343,8 +398,12 @@ public abstract class Statistics<T extends Comparable<T>>
{
 
   /**
    * Sets the number of nulls to the parameter value
-   * @param nulls null count to set the count to
+   *
+   * @param nulls
+   *          null count to set the count to
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
    */
+  @Deprecated
   public void setNumNulls(long nulls) {
     num_nulls = nulls;
   }
@@ -355,7 +414,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
    * @return true if object is empty, false otherwise
    */
   public boolean isEmpty() {
-    return !hasNonNullValue && num_nulls == 0;
+    return !hasNonNullValue && !isNumNullsSet();
   }
 
   /**
@@ -366,6 +425,13 @@ public abstract class Statistics<T extends Comparable<T>>
{
   }
 
   /**
+   * @return whether numNulls is set and can be used
+   */
+  public boolean isNumNullsSet() {
+    return num_nulls >= 0;
+  }
+
+  /**
    * Sets the page/column as having a valid non-null value
    * kind of misnomer here
    */

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index 8ca1ca6..5e5d5fd 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -42,6 +42,7 @@ public class TestStatistics {
   @Test
   public void testNumNulls() {
     IntStatistics stats = new IntStatistics();
+    assertTrue(stats.isNumNullsSet());
     assertEquals(stats.getNumNulls(), 0);
 
     stats.incrementNumNulls();

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index f168a60..446c8a3 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -40,7 +40,6 @@ import org.apache.parquet.filter2.predicate.Operators.UserDefined;
 import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 
-import static org.apache.parquet.Preconditions.checkArgument;
 import static org.apache.parquet.Preconditions.checkNotNull;
 
 /**
@@ -122,6 +121,10 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
     }
 
     if (value == null) {
+      // We don't know anything about the nulls in this chunk
+      if (!stats.isNumNullsSet()) {
+        return BLOCK_MIGHT_MATCH;
+      }
       // we are looking for records where v eq(null)
       // so drop if there are no nulls in this chunk
       return !hasNulls(meta);
@@ -133,6 +136,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // drop if value < min || value > max
     return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0;
   }
@@ -166,12 +174,17 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return isAllNulls(meta);
     }
 
-    if (hasNulls(meta)) {
+    if (stats.isNumNullsSet() && hasNulls(meta)) {
       // we are looking for records where v notEq(someNonNull)
       // but this chunk contains nulls, we cannot drop it
       return BLOCK_MIGHT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // drop if this is a column where min = max = value
     return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value)
== 0;
   }
@@ -201,6 +214,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = lt.getValue();
 
     // drop if value <= min
@@ -232,6 +250,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = ltEq.getValue();
 
     // drop if value < min
@@ -263,6 +286,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = gt.getValue();
 
     // drop if value >= max
@@ -294,6 +322,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = gtEq.getValue();
 
     // drop if value > max
@@ -355,6 +388,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       }
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
       new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(),
stats.genericGetMax(),
         stats.comparator());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index c4e5da3..0daabb6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -401,7 +401,8 @@ public class ParquetMetadataConverter {
   static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
       (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder)
{
     // create stats object based on the column type
-    org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type);
+    org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
+        org.apache.parquet.column.statistics.Statistics.getBuilder(type);
 
     if (formatStats != null) {
       // Use the new V2 min-max statistics over the former one if it is filled
@@ -409,9 +410,12 @@ public class ParquetMetadataConverter {
         byte[] min = formatStats.min_value.array();
         byte[] max = formatStats.max_value.array();
         if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
-          stats.setMinMaxFromBytes(min, max);
+          statsBuilder.withMin(min);
+          statsBuilder.withMax(max);
+        }
+        if (formatStats.isSetNull_count()) {
+          statsBuilder.withNumNulls(formatStats.null_count);
         }
-        stats.setNumNulls(formatStats.null_count);
       } else {
         boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
         boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax())
: false;
@@ -424,13 +428,16 @@ public class ParquetMetadataConverter {
         if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName())
&&
             (sortOrdersMatch || maxEqualsMin)) {
           if (isSet) {
-            stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array());
+            statsBuilder.withMin(formatStats.min.array());
+            statsBuilder.withMax(formatStats.max.array());
+          }
+          if (formatStats.isSetNull_count()) {
+            statsBuilder.withNumNulls(formatStats.null_count);
           }
-          stats.setNumNulls(formatStats.null_count);
         }
       }
     }
-    return stats;
+    return statsBuilder.build();
   }
 
   public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index d8b4407..6fdec2a 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -22,7 +22,6 @@ import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 
-import org.apache.parquet.io.api.Binary;
 import org.junit.Test;
 
 import org.apache.parquet.column.Encoding;
@@ -39,6 +38,7 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
 
 import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
 import static org.apache.parquet.io.api.Binary.fromString;
@@ -62,7 +62,8 @@ import static org.apache.parquet.filter2.statisticslevel.StatisticsFilter.canDro
 
 public class TestStatisticsFilter {
 
-  private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount)
{
+  private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?>
stats,
+      long valueCount) {
     return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
         PrimitiveTypeName.INT32,
         CompressionCodecName.GZIP,
@@ -71,7 +72,8 @@ public class TestStatisticsFilter {
         0L, 0L, valueCount, 0L, 0L);
   }
 
-  private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount)
{
+  private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?>
stats,
+      long valueCount) {
     return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
         PrimitiveTypeName.DOUBLE,
         CompressionCodecName.GZIP,
@@ -87,13 +89,16 @@ public class TestStatisticsFilter {
 
   private static final IntStatistics intStats = new IntStatistics();
   private static final IntStatistics nullIntStats = new IntStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
   private static final DoubleStatistics doubleStats = new DoubleStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
 
   static {
     intStats.setMinMax(10, 100);
     doubleStats.setMinMax(10, 100);
 
-    nullIntStats.setMinMax(0, 0);
     nullIntStats.setNumNulls(177);
   }
 
@@ -105,6 +110,9 @@ public class TestStatisticsFilter {
       getIntColumnMeta(nullIntStats, 177L), // column of all nulls
       getDoubleColumnMeta(doubleStats, 177L));
 
+  private static final List<ColumnChunkMetaData> missingMinMaxColumnMetas = Arrays.asList(
+      getIntColumnMeta(emptyIntStats, 177L),                // missing min/max values and
numNulls => stats is empty
+      getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null
values
 
   @Test
   public void testEqNonNull() {
@@ -116,6 +124,9 @@ public class TestStatisticsFilter {
     // drop columns of all nulls when looking for non-null value
     assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
     assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas));
+    assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -137,6 +148,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertFalse(canDrop(eq(missingColumn, null), columnMetas));
+
+    assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
+    assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -163,6 +177,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
+    assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -192,6 +209,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
+
+    assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
+    assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -205,6 +225,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas));
+    assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -218,6 +241,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas));
+    assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -231,6 +257,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas));
+    assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -244,6 +273,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas));
+    assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -297,6 +329,26 @@ public class TestStatisticsFilter {
     }
   }
 
+  public static class AllPositiveUdp extends UserDefinedPredicate<Double> {
+    @Override
+    public boolean keep(Double value) {
+      if (value == null) {
+        return true;
+      }
+      throw new RuntimeException("this method should not be called with value != null");
+    }
+
+    @Override
+    public boolean canDrop(Statistics<Double> statistics) {
+      return statistics.getMin() <= 0.0;
+    }
+
+    @Override
+    public boolean inverseCanDrop(Statistics<Double> statistics) {
+      return statistics.getMin() > 0.0;
+    }
+  }
+
   @Test
   public void testUdp() {
     FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
@@ -308,6 +360,8 @@ public class TestStatisticsFilter {
     FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
     FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2,
SevensAndEightsUdp.class)));
 
+    FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);
+
     IntStatistics seven = new IntStatistics();
     seven.setMinMax(7, 7);
 
@@ -392,6 +446,8 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
         getIntColumnMeta(neither, 177L),
         getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index b83da5d..6cce32f 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -658,6 +658,7 @@ public class TestParquetMetadataConverter {
         binaryType);
 
     Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
+    Assert.assertTrue(convertedStats.isNumNullsSet());
     Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
     if (helper == StatsHelper.V1) {
       assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
@@ -670,6 +671,38 @@ public class TestParquetMetadataConverter {
   }
 
   @Test
+  public void testMissingValuesFromStats() {
+    ParquetMetadataConverter converter = new ParquetMetadataConverter();
+    PrimitiveType type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
+
+    org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics();
+    Statistics<?> stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats,
type);
+    assertFalse(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertTrue(stats.isEmpty());
+    assertEquals(-1, stats.getNumNulls());
+
+    formatStats.clear();
+    formatStats.setMin(BytesUtils.intToBytes(-100));
+    formatStats.setMax(BytesUtils.intToBytes(100));
+    stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+    assertFalse(stats.isNumNullsSet());
+    assertTrue(stats.hasNonNullValue());
+    assertFalse(stats.isEmpty());
+    assertEquals(-1, stats.getNumNulls());
+    assertEquals(-100, stats.genericGetMin());
+    assertEquals(100, stats.genericGetMax());
+
+    formatStats.clear();
+    formatStats.setNull_count(2000);
+    stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+    assertTrue(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertFalse(stats.isEmpty());
+    assertEquals(2000, stats.getNumNulls());
+  }
+
+  @Test
   public void testSkippedV2Stats() {
     testSkippedV2Stats(
         Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""),

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index 87574cd..0b7b951 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 import org.apache.parquet.schema.Types;
 import org.apache.parquet.bytes.HeapByteBufferAllocator;
 
@@ -92,7 +93,8 @@ public class TestColumnChunkPageWriteStore {
     int v = 3;
     BytesInput definitionLevels = BytesInput.fromInt(d);
     BytesInput repetitionLevels = BytesInput.fromInt(r);
-    Statistics<?> statistics = new BinaryStatistics();
+    Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
+        .build();
     BytesInput data = BytesInput.fromInt(v);
     int rowCount = 5;
     int nullCount = 1;

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index 4243e9b..c73e569 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -27,7 +27,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.parquet.Version;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
-import org.apache.parquet.hadoop.util.HadoopOutputFile;
 import org.junit.Assume;
 import org.junit.Rule;
 import org.junit.Test;
@@ -48,6 +47,7 @@ import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
 
 import java.io.File;
 import java.io.IOException;
@@ -95,8 +95,8 @@ public class TestParquetFileWriter {
   private static final byte[] BYTES4 = { 3, 4, 5, 6 };
   private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;
 
-  private static final BinaryStatistics STATS1 = new BinaryStatistics();
-  private static final BinaryStatistics STATS2 = new BinaryStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
 
   private String writeSchema;
 
@@ -145,24 +145,24 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
     w.end(new HashMap<String, String>());
@@ -225,15 +225,15 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
@@ -242,10 +242,10 @@ public class TestParquetFileWriter {
 
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
 
@@ -330,15 +330,15 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
@@ -347,10 +347,10 @@ public class TestParquetFileWriter {
 
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
 


Mime
View raw message