Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id CE090200C2A for ; Thu, 19 Jan 2017 02:27:18 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id CCBC5160B44; Thu, 19 Jan 2017 01:27:18 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 04B11160B5C for ; Thu, 19 Jan 2017 02:27:16 +0100 (CET) Received: (qmail 54165 invoked by uid 500); 19 Jan 2017 01:27:16 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 52892 invoked by uid 99); 19 Jan 2017 01:27:13 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 19 Jan 2017 01:27:13 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 3E44CF401A; Thu, 19 Jan 2017 01:27:13 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: blue@apache.org To: commits@parquet.apache.org Date: Thu, 19 Jan 2017 01:27:45 -0000 Message-Id: <8a26c984b69845b3bd142ce5f0aa949d@git.apache.org> In-Reply-To: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> References: <25406da3dfe343a9a44d6bc62fd223d9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [34/50] [abbrv] parquet-mr git commit: PARQUET-372: Do not write stats larger than 4k. archived-at: Thu, 19 Jan 2017 01:27:19 -0000 PARQUET-372: Do not write stats larger than 4k. This updates the stats conversion to check whether the min and max values for page stats are larger than 4k. If so, no statistics for a page are written. Author: Ryan Blue Closes #275 from rdblue/PARQUET-372-fix-min-max-for-long-values and squashes the following commits: 61e05d9 [Ryan Blue] PARQUET-372: Add comment to explain not truncating values. fbbc1c4 [Ryan Blue] PARQUET-372: Do not write stats larger than 4k. Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/e3ee35ad Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/e3ee35ad Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/e3ee35ad Branch: refs/heads/parquet-1.8.x Commit: e3ee35aded09d5a0cdec38fd4f7e5efccc370bbc Parents: 45e673f Author: Ryan Blue Authored: Thu May 5 13:54:28 2016 -0700 Committer: Ryan Blue Committed: Mon Jan 9 16:54:54 2017 -0800 ---------------------------------------------------------------------- .../column/statistics/BinaryStatistics.java | 25 +++ .../column/statistics/BooleanStatistics.java | 5 + .../column/statistics/DoubleStatistics.java | 5 + .../column/statistics/FloatStatistics.java | 5 + .../column/statistics/IntStatistics.java | 5 + .../column/statistics/LongStatistics.java | 5 + .../parquet/column/statistics/Statistics.java | 8 + .../converter/ParquetMetadataConverter.java | 8 +- .../converter/TestParquetMetadataConverter.java | 159 ++++++++++++++++++- .../parquet/statistics/TestStatistics.java | 9 ++ 10 files changed, 232 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index e8439f0..c319b4a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -68,6 +68,11 @@ public class BinaryStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || ((min.length() + max.length()) < size); + } + + @Override public String toString() { if (this.hasNonNullValue()) return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls()); @@ -77,11 +82,19 @@ public class BinaryStatistics extends Statistics { return "no stats for this column"; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void updateStats(Binary min_value, Binary max_value) { if (min.compareTo(min_value) > 0) { min = min_value.copy(); } if (max.compareTo(max_value) < 0) { max = max_value.copy(); } } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void initializeStats(Binary min_value, Binary max_value) { min = min_value.copy(); max = max_value.copy(); @@ -98,14 +111,26 @@ public class BinaryStatistics extends Statistics { return max; } + /** + * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMax() { return max; } + /** + * @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMin() { return min; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void setMinMax(Binary min, Binary max) { this.max = max; this.min = min; http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 1d02c74..22c2393 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -62,6 +62,11 @@ public class BooleanStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (2 < size); + } + + @Override public String toString() { if (this.hasNonNullValue()) return String.format("min: %b, max: %b, num_nulls: %d", min, max, this.getNumNulls()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 9d94439..d67a550 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -62,6 +62,11 @@ public class DoubleStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + + @Override public String toString() { if(this.hasNonNullValue()) return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index c164cf5..dffc207 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -62,6 +62,11 @@ public class FloatStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + + @Override public String toString() { if (this.hasNonNullValue()) return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 8deb28a..a5d7ba1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -62,6 +62,11 @@ public class IntStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + + @Override public String toString() { if (this.hasNonNullValue()) return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index a8c177e..f7971ef 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -62,6 +62,11 @@ public class LongStatistics extends Statistics { } @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + + @Override public String toString() { if (this.hasNonNullValue()) return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 5424414..30153c0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -191,6 +191,14 @@ public abstract class Statistics> { abstract public byte[] getMinBytes(); /** + * Abstract method to return whether the min and max values fit in the given + * size. + * @param size a size in bytes + * @return true iff the min and max values are less than size bytes + */ + abstract public boolean isSmallerThan(long size); + + /** * toString() to display min, max, num_nulls in a string */ abstract public String toString(); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 75b07fd..9eb471f 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -77,6 +77,7 @@ public class ParquetMetadataConverter { public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); + public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k private static final Log LOG = Log.getLog(ParquetMetadataConverter.class); @@ -284,7 +285,11 @@ public class ParquetMetadataConverter { public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { Statistics stats = new Statistics(); - if (!statistics.isEmpty()) { + // Don't write stats larger than the max size rather than truncating. The + // rationale is that some engines may use the minimum value in the page as + // the true minimum for aggregations and there is no way to mark that a + // value has been truncated and is a lower bound and not in the page. + if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) { stats.setMax(statistics.getMaxBytes()); @@ -293,6 +298,7 @@ public class ParquetMetadataConverter { } return stats; } + /** * @deprecated Replaced by {@link #fromParquetStatistics( * String createdBy, Statistics statistics, PrimitiveTypeName type)} http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index b9cfde7..3c888c3 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -45,12 +45,21 @@ import java.util.Set; import java.util.TreeSet; import com.google.common.collect.Sets; +import org.apache.parquet.Version; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.BooleanStatistics; +import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; +import org.apache.parquet.column.statistics.IntStatistics; +import org.apache.parquet.column.statistics.LongStatistics; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.api.Binary; import org.junit.Assert; import org.junit.Test; @@ -357,5 +366,153 @@ public class TestParquetMetadataConverter { assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName()); - } + } + + @Test + public void testBinaryStats() { + // make fake stats and verify the size check + BinaryStatistics stats = new BinaryStatistics(); + stats.incrementNumNulls(3004); + byte[] min = new byte[904]; + byte[] max = new byte[2388]; + stats.updateStats(Binary.fromConstantByteArray(min)); + stats.updateStats(Binary.fromConstantByteArray(max)); + long totalLen = min.length + max.length; + Assert.assertFalse("Should not be smaller than min + max size", + stats.isSmallerThan(totalLen)); + Assert.assertTrue("Should be smaller than min + max size + 1", + stats.isSmallerThan(totalLen + 1)); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); + Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + + // convert to empty stats because the values are too large + stats.setMinMaxFromBytes(max, max); + + formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertFalse("Min should not be set", formatStats.isSetMin()); + Assert.assertFalse("Max should not be set", formatStats.isSetMax()); + Assert.assertFalse("Num nulls should not be set", + formatStats.isSetNull_count()); + + Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics( + Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY); + + Assert.assertTrue(roundTripStats.isEmpty()); + } + + @Test + public void testIntegerStats() { + // make fake stats and verify the size check + IntStatistics stats = new IntStatistics(); + stats.incrementNumNulls(3004); + int min = Integer.MIN_VALUE; + int max = Integer.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToInt(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToInt(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testLongStats() { + // make fake stats and verify the size check + LongStatistics stats = new LongStatistics(); + stats.incrementNumNulls(3004); + long min = Long.MIN_VALUE; + long max = Long.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToLong(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToLong(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testFloatStats() { + // make fake stats and verify the size check + FloatStatistics stats = new FloatStatistics(); + stats.incrementNumNulls(3004); + float min = Float.MIN_VALUE; + float max = Float.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testDoubleStats() { + // make fake stats and verify the size check + DoubleStatistics stats = new DoubleStatistics(); + stats.incrementNumNulls(3004); + double min = Double.MIN_VALUE; + double max = Double.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testBooleanStats() { + // make fake stats and verify the size check + BooleanStatistics stats = new BooleanStatistics(); + stats.incrementNumNulls(3004); + boolean min = Boolean.FALSE; + boolean max = Boolean.TRUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToBool(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToBool(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/e3ee35ad/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index 5bc060d..d157cc3 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -282,6 +282,15 @@ public class TestStatistics { PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); Statistics stats = getStatisticsFromPageHeader(page); + if (stats.isEmpty()) { + // stats are empty if num nulls = 0 and there are no non-null values + // this happens if stats are not written (e.g., when stats are too big) + System.err.println(String.format( + "No stats written for page=%s col=%s", + page, Arrays.toString(desc.getPath()))); + return; + } + long numNulls = 0; ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null); for (int i = 0; i < reader.getTotalValueCount(); i += 1) {