parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ziva...@apache.org
Subject [parquet-mr] branch parquet-1.8.x updated: PARQUET-1217: Incorrect handling of missing values in Statistics
Date Tue, 24 Apr 2018 13:19:27 GMT
This is an automated email from the ASF dual-hosted git repository.

zivanfi pushed a commit to branch parquet-1.8.x
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/parquet-1.8.x by this push:
     new 0fe55e1  PARQUET-1217: Incorrect handling of missing values in Statistics
0fe55e1 is described below

commit 0fe55e1d3d90bed53182d3c0cfd64e5a66a0ada8
Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>
AuthorDate: Tue Feb 27 14:19:14 2018 +0100

    PARQUET-1217: Incorrect handling of missing values in Statistics
    
    In parquet-format every value in Statistics is optional while parquet-mr does not properly
handle these scenarios:
    - null_count is set but min/max or min_value/max_value are not: filtering may fail with
NPE or incorrect filtering occurs
      fix: check if min/max is set before comparing to the related values
    - null_count is not set: filtering handles null_count as if it would be 0 -> incorrect
filtering may occur
      fix: introduce new method in Statistics object to check if num_nulls is set; check if
num_nulls is set by the new method before using its value for filtering
    
    Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>
    
    Closes #458 from gszadovszky/PARQUET-1217 and squashes the following commits:
    
    9d14090 [Gabor Szadovszky] Updates according to rdblue's comments
    116d1d3 [Gabor Szadovszky] PARQUET-1217: Updates according to zi's comments
    c264b50 [Gabor Szadovszky] PARQUET-1217: fix handling of unset nullCount
    2ec2fb1 [Gabor Szadovszky] PARQUET-1217: Incorrect handling of missing values in Statistics
    
    This change is based on b82d96218bfd37f6df95a2e8d7675d091ab61970 but is not a clean cherry-pick.
---
 .../parquet/column/statistics/Statistics.java      | 68 +++++++++++++++++++++-
 .../parquet/column/statistics/TestStatistics.java  |  1 +
 .../filter2/statisticslevel/StatisticsFilter.java  | 42 ++++++++++++-
 .../format/converter/ParquetMetadataConverter.java | 12 ++--
 .../statisticslevel/TestStatisticsFilter.java      | 64 ++++++++++++++++++--
 .../converter/TestParquetMetadataConverter.java    | 35 +++++++++++
 .../hadoop/TestColumnChunkPageWriteStore.java      |  6 +-
 .../parquet/hadoop/TestParquetFileWriter.java      | 54 ++++++++---------
 8 files changed, 240 insertions(+), 42 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 30153c0..26c14c1 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -31,6 +31,44 @@ import java.util.Arrays;
  */
 public abstract class Statistics<T extends Comparable<T>> {
 
+  /**
+   * Builder class to build Statistics objects. Used to read the statistics from the Parquet
file.
+   */
+  public static class Builder {
+    private final PrimitiveTypeName type;
+    private byte[] min;
+    private byte[] max;
+    private long numNulls = -1;
+
+    private Builder(PrimitiveTypeName type) {
+      this.type = type;
+    }
+
+    public Builder withMin(byte[] min) {
+      this.min = min;
+      return this;
+    }
+
+    public Builder withMax(byte[] max) {
+      this.max = max;
+      return this;
+    }
+
+    public Builder withNumNulls(long numNulls) {
+      this.numNulls = numNulls;
+      return this;
+    }
+
+    public Statistics<?> build() {
+      Statistics<?> stats = getStatsBasedOnType(type);
+      if (min != null && max != null) {
+        stats.setMinMaxFromBytes(min, max);
+      }
+      stats.num_nulls = this.numNulls;
+      return stats;
+    }
+  }
+
   private boolean hasNonNullValue;
   private long num_nulls;
 
@@ -68,6 +106,17 @@ public abstract class Statistics<T extends Comparable<T>>
{
   }
 
   /**
+   * Returns a builder to create new statistics object. Used to read the statistics from
the parquet file.
+   *
+   * @param type
+   *          type of the column
+   * @return builder to create new statistics object
+   */
+  public static Builder getBuilder(PrimitiveTypeName type) {
+    return new Builder(type);
+  }
+
+  /**
    * updates statistics min and max using the passed value
    * @param value value to use to update min and max
    */
@@ -172,7 +221,9 @@ public abstract class Statistics<T extends Comparable<T>>
{
    * Abstract method to set min and max values from byte arrays.
    * @param minBytes byte array to set the min value to
    * @param maxBytes byte array to set the max value to
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
    */
+  @Deprecated
   abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
 
   abstract public T genericGetMin();
@@ -221,7 +272,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
 
   /**
    * Returns the null count
-   * @return null count
+   * @return null count or {@code -1} if the null count is not set
    */
   public long getNumNulls() {
     return num_nulls;
@@ -229,8 +280,12 @@ public abstract class Statistics<T extends Comparable<T>>
{
 
   /**
    * Sets the number of nulls to the parameter value
-   * @param nulls null count to set the count to
+   *
+   * @param nulls
+   *          null count to set the count to
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
    */
+  @Deprecated
   public void setNumNulls(long nulls) {
     num_nulls = nulls;
   }
@@ -241,7 +296,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
    * @return true if object is empty, false otherwise
    */
   public boolean isEmpty() {
-    return !hasNonNullValue && num_nulls == 0;
+    return !hasNonNullValue && !isNumNullsSet();
   }
 
   /**
@@ -252,6 +307,13 @@ public abstract class Statistics<T extends Comparable<T>>
{
   }
  
   /**
+   * @return whether numNulls is set and can be used
+   */
+  public boolean isNumNullsSet() {
+    return num_nulls >= 0;
+  }
+
+  /**
    * Sets the page/column as having a valid non-null value
    * kind of misnomer here
    */ 
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index 128acb4..cf4bf59 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -37,6 +37,7 @@ public class TestStatistics {
   @Test
   public void testNumNulls() {
     IntStatistics stats = new IntStatistics();
+    assertTrue(stats.isNumNullsSet());
     assertEquals(stats.getNumNulls(), 0);
 
     stats.incrementNumNulls();
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index ac7132e..531c091 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -40,7 +40,6 @@ import org.apache.parquet.filter2.predicate.Operators.UserDefined;
 import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 
-import static org.apache.parquet.Preconditions.checkArgument;
 import static org.apache.parquet.Preconditions.checkNotNull;
 
 /**
@@ -122,6 +121,10 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
     }
 
     if (value == null) {
+      // We don't know anything about the nulls in this chunk
+      if (!stats.isNumNullsSet()) {
+        return BLOCK_MIGHT_MATCH;
+      }
       // we are looking for records where v eq(null)
       // so drop if there are no nulls in this chunk
       return !hasNulls(meta);
@@ -133,6 +136,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // drop if value < min || value > max
     return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax())
> 0;
   }
@@ -166,12 +174,17 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return isAllNulls(meta);
     }
 
-    if (hasNulls(meta)) {
+    if (stats.isNumNullsSet() && hasNulls(meta)) {
       // we are looking for records where v notEq(someNonNull)
       // but this chunk contains nulls, we cannot drop it
       return BLOCK_MIGHT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     // drop if this is a column where min = max = value
     return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax())
== 0;
   }
@@ -201,6 +214,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = lt.getValue();
 
     // drop if value <= min
@@ -232,6 +250,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = ltEq.getValue();
 
     // drop if value < min
@@ -263,6 +286,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = gt.getValue();
 
     // drop if value >= max
@@ -294,6 +322,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       return BLOCK_CANNOT_MATCH;
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     T value = gtEq.getValue();
 
     // drop if value >= max
@@ -355,6 +388,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean>
{
       }
     }
 
+    if (!stats.hasNonNullValue()) {
+      // stats does not contain min/max values, we cannot drop any chunks
+      return BLOCK_MIGHT_MATCH;
+    }
+
     org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
         new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(),
stats.genericGetMax());
 
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index cc43008..9df5660 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -337,7 +337,8 @@ public class ParquetMetadataConverter {
   static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
       (String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder)
{
     // create stats object based on the column type
-    org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type);
+    org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
+        org.apache.parquet.column.statistics.Statistics.getBuilder(type);
     // If there was no statistics written to the footer, create an empty Statistics object
and return
 
     // NOTE: See docs in CorruptStatistics for explanation of why this check is needed
@@ -347,11 +348,14 @@ public class ParquetMetadataConverter {
     if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy,
type) &&
         SortOrder.SIGNED == typeSortOrder) {
       if (statistics.isSetMax() && statistics.isSetMin()) {
-        stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array());
+        statsBuilder.withMin(statistics.min.array());
+        statsBuilder.withMax(statistics.max.array());
+      }
+      if (statistics.isSetNull_count()) {
+        statsBuilder.withNumNulls(statistics.null_count);
       }
-      stats.setNumNulls(statistics.null_count);
     }
-    return stats;
+    return statsBuilder.build();
   }
 
   public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index d8b4407..a0551a4 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -22,7 +22,6 @@ import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 
-import org.apache.parquet.io.api.Binary;
 import org.junit.Test;
 
 import org.apache.parquet.column.Encoding;
@@ -39,6 +38,7 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
 
 import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
 import static org.apache.parquet.io.api.Binary.fromString;
@@ -62,7 +62,8 @@ import static org.apache.parquet.filter2.statisticslevel.StatisticsFilter.canDro
 
 public class TestStatisticsFilter {
 
-  private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount)
{
+  private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?>
stats,
+      long valueCount) {
     return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
         PrimitiveTypeName.INT32,
         CompressionCodecName.GZIP,
@@ -71,7 +72,8 @@ public class TestStatisticsFilter {
         0L, 0L, valueCount, 0L, 0L);
   }
 
-  private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount)
{
+  private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?>
stats,
+      long valueCount) {
     return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
         PrimitiveTypeName.DOUBLE,
         CompressionCodecName.GZIP,
@@ -87,13 +89,16 @@ public class TestStatisticsFilter {
 
   private static final IntStatistics intStats = new IntStatistics();
   private static final IntStatistics nullIntStats = new IntStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(PrimitiveTypeName.INT32).build();
   private static final DoubleStatistics doubleStats = new DoubleStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(PrimitiveTypeName.DOUBLE).withNumNulls(100).build();
 
   static {
     intStats.setMinMax(10, 100);
     doubleStats.setMinMax(10, 100);
 
-    nullIntStats.setMinMax(0, 0);
     nullIntStats.setNumNulls(177);
   }
 
@@ -105,6 +110,9 @@ public class TestStatisticsFilter {
       getIntColumnMeta(nullIntStats, 177L), // column of all nulls
       getDoubleColumnMeta(doubleStats, 177L));
 
+  private static final List<ColumnChunkMetaData> missingMinMaxColumnMetas = Arrays.asList(
+      getIntColumnMeta(emptyIntStats, 177L),                // missing min/max values and
numNulls => stats is empty
+      getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null
values
 
   @Test
   public void testEqNonNull() {
@@ -116,6 +124,9 @@ public class TestStatisticsFilter {
     // drop columns of all nulls when looking for non-null value
     assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
     assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas));
+    assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -137,6 +148,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertFalse(canDrop(eq(missingColumn, null), columnMetas));
+
+    assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
+    assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -163,6 +177,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
+    assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -192,6 +209,9 @@ public class TestStatisticsFilter {
         getDoubleColumnMeta(doubleStats, 177L))));
 
     assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
+
+    assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
+    assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -205,6 +225,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas));
+    assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -218,6 +241,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas));
+    assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -231,6 +257,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas));
+    assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -244,6 +273,9 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
 
     assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
+
+    assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas));
+    assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas));
   }
 
   @Test
@@ -297,6 +329,26 @@ public class TestStatisticsFilter {
     }
   }
 
+  public static class AllPositiveUdp extends UserDefinedPredicate<Double> {
+    @Override
+    public boolean keep(Double value) {
+      if (value == null) {
+        return true;
+      }
+      throw new RuntimeException("this method should not be called with value != null");
+    }
+
+    @Override
+    public boolean canDrop(Statistics<Double> statistics) {
+      return statistics.getMin() <= 0.0;
+    }
+
+    @Override
+    public boolean inverseCanDrop(Statistics<Double> statistics) {
+      return statistics.getMin() > 0.0;
+    }
+  }
+
   @Test
   public void testUdp() {
     FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
@@ -308,6 +360,8 @@ public class TestStatisticsFilter {
     FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
     FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2,
SevensAndEightsUdp.class)));
 
+    FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);
+
     IntStatistics seven = new IntStatistics();
     seven.setMinMax(7, 7);
 
@@ -392,6 +446,8 @@ public class TestStatisticsFilter {
     assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
         getIntColumnMeta(neither, 177L),
         getDoubleColumnMeta(doubleStats, 177L))));
+
+    assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
   }
 
   @Test
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index 35c35c1..f009e7f 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -22,7 +22,9 @@ import static java.util.Collections.emptyList;
 import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart;
 import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED;
 import static org.apache.parquet.format.Type.INT32;
@@ -558,10 +560,43 @@ public class TestParquetMetadataConverter {
             .as(OriginalType.UTF8).named("b"));
 
     Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
+    Assert.assertTrue(convertedStats.isNumNullsSet());
     Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
     Assert.assertEquals("Should have correct min (unsigned sort)",
         Binary.fromString("A"), convertedStats.genericGetMin());
     Assert.assertEquals("Should have correct max (unsigned sort)",
         Binary.fromString("z"), convertedStats.genericGetMax());
   }
+
+  @Test
+  public void testMissingValuesFromStats() {
+    ParquetMetadataConverter converter = new ParquetMetadataConverter();
+    PrimitiveTypeName type = PrimitiveTypeName.INT32;
+
+    org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics();
+    Statistics<?> stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats,
type);
+    assertFalse(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertTrue(stats.isEmpty());
+    assertEquals(-1, stats.getNumNulls());
+
+    formatStats.clear();
+    formatStats.setMin(BytesUtils.intToBytes(-100));
+    formatStats.setMax(BytesUtils.intToBytes(100));
+    stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+    assertFalse(stats.isNumNullsSet());
+    assertTrue(stats.hasNonNullValue());
+    assertFalse(stats.isEmpty());
+    assertEquals(-1, stats.getNumNulls());
+    assertEquals(-100, stats.genericGetMin());
+    assertEquals(100, stats.genericGetMax());
+
+    formatStats.clear();
+    formatStats.setNull_count(2000);
+    stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+    assertTrue(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertFalse(stats.isEmpty());
+    assertEquals(2000, stats.getNumNulls());
+  }
 }
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index bb80521..a83247f 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -54,12 +54,12 @@ import org.apache.parquet.column.page.DataPageV2;
 import org.apache.parquet.column.page.PageReadStore;
 import org.apache.parquet.column.page.PageReader;
 import org.apache.parquet.column.page.PageWriter;
-import org.apache.parquet.column.statistics.BinaryStatistics;
 import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 import org.apache.parquet.schema.Types;
 
 public class TestColumnChunkPageWriteStore {
@@ -90,7 +90,7 @@ public class TestColumnChunkPageWriteStore {
     int v = 3;
     BytesInput definitionLevels = BytesInput.fromInt(d);
     BytesInput repetitionLevels = BytesInput.fromInt(r);
-    Statistics<?> statistics = new BinaryStatistics();
+    Statistics<?> statistics = Statistics.getBuilder(PrimitiveTypeName.BINARY).build();
     BytesInput data = BytesInput.fromInt(v);
     int rowCount = 5;
     int nullCount = 1;
@@ -155,13 +155,13 @@ public class TestColumnChunkPageWriteStore {
 
     BytesInput fakeData = BytesInput.fromInt(34);
     int fakeCount = 3;
-    BinaryStatistics fakeStats = new BinaryStatistics();
 
     ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(
         compressor(UNCOMPRESSED), schema);
 
     for (ColumnDescriptor col : schema.getColumns()) {
       PageWriter pageWriter = store.getPageWriter(col);
+      Statistics<?> fakeStats = Statistics.getStatsBasedOnType(col.getType());
       pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
     }
 
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index ff3b017..24307b6 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -46,7 +46,6 @@ import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
-
 import java.io.File;
 import java.io.IOException;
 import java.util.*;
@@ -57,6 +56,7 @@ import static org.junit.Assert.*;
 import static org.apache.parquet.column.Encoding.BIT_PACKED;
 import static org.apache.parquet.column.Encoding.PLAIN;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
 import static org.apache.parquet.schema.Type.Repetition.*;
 import static org.apache.parquet.hadoop.TestUtils.enforceEmptyDir;
 
@@ -93,8 +93,8 @@ public class TestParquetFileWriter {
   private static final byte[] BYTES4 = { 3, 4, 5, 6 };
   private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;
 
-  private static final BinaryStatistics STATS1 = new BinaryStatistics();
-  private static final BinaryStatistics STATS2 = new BinaryStatistics();
+  private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS
= org.apache.parquet.column.statistics.Statistics
+      .getBuilder(PrimitiveTypeName.BINARY).build();
 
   private String writeSchema;
 
@@ -143,24 +143,24 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
     w.end(new HashMap<String, String>());
@@ -223,15 +223,15 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
@@ -240,10 +240,10 @@ public class TestParquetFileWriter {
 
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
 
@@ -328,15 +328,15 @@ public class TestParquetFileWriter {
     w.startBlock(3);
     w.startColumn(C1, 5, CODEC);
     long c1Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c1Ends = w.getPos();
     w.startColumn(C2, 6, CODEC);
     long c2Starts = w.getPos();
-    w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
-    w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     long c2Ends = w.getPos();
     w.endBlock();
@@ -345,10 +345,10 @@ public class TestParquetFileWriter {
 
     w.startBlock(4);
     w.startColumn(C1, 7, CODEC);
-    w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.startColumn(C2, 8, CODEC);
-    w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
     w.endColumn();
     w.endBlock();
 
@@ -635,8 +635,10 @@ public class TestParquetFileWriter {
     byte[] bytes4 = { 3, 4, 5, 6};
     CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
 
-    BinaryStatistics stats1 = new BinaryStatistics();
-    BinaryStatistics stats2 = new BinaryStatistics();
+    org.apache.parquet.column.statistics.Statistics<?> stats1 = org.apache.parquet.column.statistics.Statistics
+        .getStatsBasedOnType(BINARY);
+    org.apache.parquet.column.statistics.Statistics<?> stats2 = org.apache.parquet.column.statistics.Statistics
+        .getStatsBasedOnType(INT64);
 
     ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
     w.start();

-- 
To stop receiving notification emails like this one, please contact
zivanfi@apache.org.

Mime
View raw message