parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From b...@apache.org
Subject parquet-mr git commit: PARQUET-372: Do not write stats larger than 4k.
Date Thu, 05 May 2016 20:54:33 GMT
Repository: parquet-mr
Updated Branches:
  refs/heads/master 39a3cd0f4 -> c3f3830f7


PARQUET-372: Do not write stats larger than 4k.

This updates the stats conversion to check whether the min and max
values for page stats are larger than 4k. If so, no statistics for a
page are written.

Author: Ryan Blue <blue@apache.org>

Closes #275 from rdblue/PARQUET-372-fix-min-max-for-long-values and squashes the following
commits:

61e05d9 [Ryan Blue] PARQUET-372: Add comment to explain not truncating values.
fbbc1c4 [Ryan Blue] PARQUET-372: Do not write stats larger than 4k.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/c3f3830f
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/c3f3830f
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/c3f3830f

Branch: refs/heads/master
Commit: c3f3830f771f26a537d2930b00b270451bbc5627
Parents: 39a3cd0
Author: Ryan Blue <blue@apache.org>
Authored: Thu May 5 13:54:28 2016 -0700
Committer: Ryan Blue <blue@apache.org>
Committed: Thu May 5 13:54:28 2016 -0700

----------------------------------------------------------------------
 .../column/statistics/BinaryStatistics.java     |  25 +++
 .../column/statistics/BooleanStatistics.java    |   5 +
 .../column/statistics/DoubleStatistics.java     |   5 +
 .../column/statistics/FloatStatistics.java      |   5 +
 .../column/statistics/IntStatistics.java        |   5 +
 .../column/statistics/LongStatistics.java       |   5 +
 .../parquet/column/statistics/Statistics.java   |   8 +
 .../converter/ParquetMetadataConverter.java     |   8 +-
 .../converter/TestParquetMetadataConverter.java | 159 ++++++++++++++++++-
 .../parquet/statistics/TestStatistics.java      |   9 ++
 10 files changed, 232 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
index e8439f0..c319b4a 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
@@ -68,6 +68,11 @@ public class BinaryStatistics extends Statistics<Binary> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || ((min.length() + max.length()) < size);
+  }
+
+  @Override
   public String toString() {
     if (this.hasNonNullValue())
       return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(),
this.getNumNulls());
@@ -77,11 +82,19 @@ public class BinaryStatistics extends Statistics<Binary> {
       return "no stats for this column";
   }
 
+  /**
+   * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
+   */
+  @Deprecated
   public void updateStats(Binary min_value, Binary max_value) {
     if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
     if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
   }
 
+  /**
+   * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
+   */
+  @Deprecated
   public void initializeStats(Binary min_value, Binary max_value) {
       min = min_value.copy();
       max = max_value.copy();
@@ -98,14 +111,26 @@ public class BinaryStatistics extends Statistics<Binary> {
     return max;
   }
 
+  /**
+   * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
+   */
+  @Deprecated
   public Binary getMax() {
     return max;
   }
 
+  /**
+   * @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
+   */
+  @Deprecated
   public Binary getMin() {
     return min;
   }
 
+  /**
+   * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
+   */
+  @Deprecated
   public void setMinMax(Binary min, Binary max) {
     this.max = max;
     this.min = min;

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java
index 1d02c74..22c2393 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java
@@ -62,6 +62,11 @@ public class BooleanStatistics extends Statistics<Boolean> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || (2 < size);
+  }
+
+  @Override
   public String toString() {
     if (this.hasNonNullValue())
       return String.format("min: %b, max: %b, num_nulls: %d", min, max, this.getNumNulls());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java
index 9d94439..d67a550 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java
@@ -62,6 +62,11 @@ public class DoubleStatistics extends Statistics<Double> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || (16 < size);
+  }
+
+  @Override
   public String toString() {
     if(this.hasNonNullValue())
       return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java
index c164cf5..dffc207 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java
@@ -62,6 +62,11 @@ public class FloatStatistics extends Statistics<Float> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || (8 < size);
+  }
+
+  @Override
   public String toString() {
     if (this.hasNonNullValue())
       return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java
index 8deb28a..a5d7ba1 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java
@@ -62,6 +62,11 @@ public class IntStatistics extends Statistics<Integer> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || (8 < size);
+  }
+
+  @Override
   public String toString() {
     if (this.hasNonNullValue())
       return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java
index a8c177e..f7971ef 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java
@@ -62,6 +62,11 @@ public class LongStatistics extends Statistics<Long> {
   }
 
   @Override
+  public boolean isSmallerThan(long size) {
+    return !hasNonNullValue() || (16 < size);
+  }
+
+  @Override
   public String toString() {
     if (this.hasNonNullValue())
       return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 5424414..30153c0 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -191,6 +191,14 @@ public abstract class Statistics<T extends Comparable<T>>
{
   abstract public byte[] getMinBytes();
 
   /**
+   * Abstract method to return whether the min and max values fit in the given
+   * size.
+   * @param size a size in bytes
+   * @return true iff the min and max values are less than size bytes
+   */
+  abstract public boolean isSmallerThan(long size);
+
+  /**
    * toString() to display min, max, num_nulls in a string
    */
   abstract public String toString();

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 75b07fd..9eb471f 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -77,6 +77,7 @@ public class ParquetMetadataConverter {
 
   public static final MetadataFilter NO_FILTER = new NoFilter();
   public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
+  public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k
 
   private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);
 
@@ -284,7 +285,11 @@ public class ParquetMetadataConverter {
   public static Statistics toParquetStatistics(
       org.apache.parquet.column.statistics.Statistics statistics) {
     Statistics stats = new Statistics();
-    if (!statistics.isEmpty()) {
+    // Don't write stats larger than the max size rather than truncating. The
+    // rationale is that some engines may use the minimum value in the page as
+    // the true minimum for aggregations and there is no way to mark that a
+    // value has been truncated and is a lower bound and not in the page.
+    if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) {
       stats.setNull_count(statistics.getNumNulls());
       if (statistics.hasNonNullValue()) {
         stats.setMax(statistics.getMaxBytes());
@@ -293,6 +298,7 @@ public class ParquetMetadataConverter {
     }
     return stats;
   }
+
   /**
    * @deprecated Replaced by {@link #fromParquetStatistics(
    * String createdBy, Statistics statistics, PrimitiveTypeName type)}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index b9cfde7..3c888c3 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -45,12 +45,21 @@ import java.util.Set;
 import java.util.TreeSet;
 
 import com.google.common.collect.Sets;
+import org.apache.parquet.Version;
+import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.statistics.BinaryStatistics;
+import org.apache.parquet.column.statistics.BooleanStatistics;
+import org.apache.parquet.column.statistics.DoubleStatistics;
+import org.apache.parquet.column.statistics.FloatStatistics;
+import org.apache.parquet.column.statistics.IntStatistics;
+import org.apache.parquet.column.statistics.LongStatistics;
+import org.apache.parquet.column.statistics.Statistics;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnPath;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.io.api.Binary;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -357,5 +366,153 @@ public class TestParquetMetadataConverter {
     assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
     assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
     assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
-  }  
+  }
+
+  @Test
+  public void testBinaryStats() {
+    // make fake stats and verify the size check
+    BinaryStatistics stats = new BinaryStatistics();
+    stats.incrementNumNulls(3004);
+    byte[] min = new byte[904];
+    byte[] max = new byte[2388];
+    stats.updateStats(Binary.fromConstantByteArray(min));
+    stats.updateStats(Binary.fromConstantByteArray(max));
+    long totalLen = min.length + max.length;
+    Assert.assertFalse("Should not be smaller than min + max size",
+        stats.isSmallerThan(totalLen));
+    Assert.assertTrue("Should be smaller than min + max size + 1",
+        stats.isSmallerThan(totalLen + 1));
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertArrayEquals("Min should match", min, formatStats.getMin());
+    Assert.assertArrayEquals("Max should match", max, formatStats.getMax());
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+
+    // convert to empty stats because the values are too large
+    stats.setMinMaxFromBytes(max, max);
+
+    formatStats = ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertFalse("Min should not be set", formatStats.isSetMin());
+    Assert.assertFalse("Max should not be set", formatStats.isSetMax());
+    Assert.assertFalse("Num nulls should not be set",
+        formatStats.isSetNull_count());
+
+    Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics(
+        Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY);
+
+    Assert.assertTrue(roundTripStats.isEmpty());
+  }
+
+  @Test
+  public void testIntegerStats() {
+    // make fake stats and verify the size check
+    IntStatistics stats = new IntStatistics();
+    stats.incrementNumNulls(3004);
+    int min = Integer.MIN_VALUE;
+    int max = Integer.MAX_VALUE;
+    stats.updateStats(min);
+    stats.updateStats(max);
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertEquals("Min should match",
+        min, BytesUtils.bytesToInt(formatStats.getMin()));
+    Assert.assertEquals("Max should match",
+        max, BytesUtils.bytesToInt(formatStats.getMax()));
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+  }
+
+  @Test
+  public void testLongStats() {
+    // make fake stats and verify the size check
+    LongStatistics stats = new LongStatistics();
+    stats.incrementNumNulls(3004);
+    long min = Long.MIN_VALUE;
+    long max = Long.MAX_VALUE;
+    stats.updateStats(min);
+    stats.updateStats(max);
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertEquals("Min should match",
+        min, BytesUtils.bytesToLong(formatStats.getMin()));
+    Assert.assertEquals("Max should match",
+        max, BytesUtils.bytesToLong(formatStats.getMax()));
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+  }
+
+  @Test
+  public void testFloatStats() {
+    // make fake stats and verify the size check
+    FloatStatistics stats = new FloatStatistics();
+    stats.incrementNumNulls(3004);
+    float min = Float.MIN_VALUE;
+    float max = Float.MAX_VALUE;
+    stats.updateStats(min);
+    stats.updateStats(max);
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertEquals("Min should match",
+        min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())),
+        0.000001);
+    Assert.assertEquals("Max should match",
+        max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())),
+        0.000001);
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+  }
+
+  @Test
+  public void testDoubleStats() {
+    // make fake stats and verify the size check
+    DoubleStatistics stats = new DoubleStatistics();
+    stats.incrementNumNulls(3004);
+    double min = Double.MIN_VALUE;
+    double max = Double.MAX_VALUE;
+    stats.updateStats(min);
+    stats.updateStats(max);
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertEquals("Min should match",
+        min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())),
+        0.000001);
+    Assert.assertEquals("Max should match",
+        max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())),
+        0.000001);
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+  }
+
+  @Test
+  public void testBooleanStats() {
+    // make fake stats and verify the size check
+    BooleanStatistics stats = new BooleanStatistics();
+    stats.incrementNumNulls(3004);
+    boolean min = Boolean.FALSE;
+    boolean max = Boolean.TRUE;
+    stats.updateStats(min);
+    stats.updateStats(max);
+
+    org.apache.parquet.format.Statistics formatStats =
+        ParquetMetadataConverter.toParquetStatistics(stats);
+
+    Assert.assertEquals("Min should match",
+        min, BytesUtils.bytesToBool(formatStats.getMin()));
+    Assert.assertEquals("Max should match",
+        max, BytesUtils.bytesToBool(formatStats.getMax()));
+    Assert.assertEquals("Num nulls should match",
+        3004, formatStats.getNull_count());
+  }
 }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c3f3830f/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
index 5bc060d..d157cc3 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java
@@ -282,6 +282,15 @@ public class TestStatistics {
       PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
       Statistics stats = getStatisticsFromPageHeader(page);
 
+      if (stats.isEmpty()) {
+        // stats are empty if num nulls = 0 and there are no non-null values
+        // this happens if stats are not written (e.g., when stats are too big)
+        System.err.println(String.format(
+            "No stats written for page=%s col=%s",
+            page, Arrays.toString(desc.getPath())));
+        return;
+      }
+
       long numNulls = 0;
       ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null);
       for (int i = 0; i < reader.getTotalValueCount(); i += 1) {


Mime
View raw message