parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ziva...@apache.org
Subject [parquet-mr] branch master updated: PARQUET-1246: Ignore float/double statistics in case of NaN
Date Mon, 19 Mar 2018 13:43:20 GMT
This is an automated email from the ASF dual-hosted git repository.

zivanfi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a86429  PARQUET-1246: Ignore float/double statistics in case of NaN
0a86429 is described below

commit 0a86429939075984edce5e3b8195dfb7f9e3ab6b
Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>
AuthorDate: Mon Mar 19 14:43:12 2018 +0100

    PARQUET-1246: Ignore float/double statistics in case of NaN
    
    Because of the ambigous sorting order of float/double the following changes made at the
reading path of the related statistics:
    - Ignoring statistics in case of it contains a NaN value.
    - Using -0.0 as min value and +0.0 as max value independently from which 0.0 value was
saved in the statistics.
    
    Author: Gabor Szadovszky <gabor.szadovszky@cloudera.com>
    
    Closes #461 from gszadovszky/PARQUET-1246 and squashes the following commits:
    
    20e9332 [Gabor Szadovszky] PARQUET-1246: Changes according to zi's comments
    3447938 [Gabor Szadovszky] PARQUET-1246: Ignore float/double statistics in case of NaN
---
 .../parquet/column/statistics/Statistics.java      |  81 ++++++++++-
 .../parquet/column/statistics/TestStatistics.java  | 151 ++++++++++++++++++++-
 .../format/converter/ParquetMetadataConverter.java |   2 +-
 .../statisticslevel/TestStatisticsFilter.java      |   4 +-
 .../hadoop/TestColumnChunkPageWriteStore.java      |   2 +-
 .../parquet/hadoop/TestParquetFileWriter.java      |   2 +-
 6 files changed, 232 insertions(+), 10 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index a087c5f..6888ad6 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -73,6 +73,72 @@ public abstract class Statistics<T extends Comparable<T>> {
     }
   }
 
+  // Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and
0.0
+  private static class FloatBuilder extends Builder {
+    public FloatBuilder(PrimitiveType type) {
+      super(type);
+      assert type.getPrimitiveTypeName() == PrimitiveTypeName.FLOAT;
+    }
+
+    @Override
+    public Statistics<?> build() {
+      FloatStatistics stats = (FloatStatistics) super.build();
+      if (stats.hasNonNullValue()) {
+        Float min = stats.genericGetMin();
+        Float max = stats.genericGetMax();
+        // Drop min/max values in case of NaN as the sorting order of values is undefined
for this case
+        if (min.isNaN() || max.isNaN()) {
+          stats.setMinMax(0.0f, 0.0f);
+          ((Statistics<?>) stats).hasNonNullValue = false;
+        } else {
+          // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
+          if (Float.compare(min, 0.0f) == 0) {
+            min = -0.0f;
+            stats.setMinMax(min, max);
+          }
+          if (Float.compare(max, -0.0f) == 0) {
+            max = 0.0f;
+            stats.setMinMax(min, max);
+          }
+        }
+      }
+      return stats;
+    }
+  }
+
+  // Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and
0.0
+  private static class DoubleBuilder extends Builder {
+    public DoubleBuilder(PrimitiveType type) {
+      super(type);
+      assert type.getPrimitiveTypeName() == PrimitiveTypeName.DOUBLE;
+    }
+
+    @Override
+    public Statistics<?> build() {
+      DoubleStatistics stats = (DoubleStatistics) super.build();
+      if (stats.hasNonNullValue()) {
+        Double min = stats.genericGetMin();
+        Double max = stats.genericGetMax();
+        // Drop min/max values in case of NaN as the sorting order of values is undefined
for this case
+        if (min.isNaN() || max.isNaN()) {
+          stats.setMinMax(0.0, 0.0);
+          ((Statistics<?>) stats).hasNonNullValue = false;
+        } else {
+          // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
+          if (Double.compare(min, 0.0) == 0) {
+            min = -0.0;
+            stats.setMinMax(min, max);
+          }
+          if (Double.compare(max, -0.0) == 0) {
+            max = 0.0;
+            stats.setMinMax(min, max);
+          }
+        }
+      }
+      return stats;
+    }
+  }
+
   private final PrimitiveType type;
   private final PrimitiveComparator<T> comparator;
   private boolean hasNonNullValue;
@@ -154,8 +220,15 @@ public abstract class Statistics<T extends Comparable<T>>
{
    *          type of the column
    * @return builder to create new statistics object
    */
-  public static Builder getBuilder(PrimitiveType type) {
-    return new Builder(type);
+  public static Builder getBuilderForReading(PrimitiveType type) {
+    switch (type.getPrimitiveTypeName()) {
+      case FLOAT:
+        return new FloatBuilder(type);
+      case DOUBLE:
+        return new DoubleBuilder(type);
+      default:
+        return new Builder(type);
+    }
   }
 
   /**
@@ -266,7 +339,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
    * Abstract method to set min and max values from byte arrays.
    * @param minBytes byte array to set the min value to
    * @param maxBytes byte array to set the max value to
-   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)}
instead.
    */
   @Deprecated
   abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
@@ -401,7 +474,7 @@ public abstract class Statistics<T extends Comparable<T>>
{
    *
    * @param nulls
    *          null count to set the count to
-   * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
+   * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)}
instead.
    */
   @Deprecated
   public void setNumNulls(long nulls) {
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index 5e5d5fd..4431ea4 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -18,13 +18,24 @@
  */
 package org.apache.parquet.column.statistics;
 
+import static java.lang.Double.doubleToLongBits;
+import static java.lang.Float.floatToIntBits;
+import static org.apache.parquet.bytes.BytesUtils.intToBytes;
+import static org.apache.parquet.bytes.BytesUtils.longToBytes;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
 import static org.junit.Assert.*;
 
 import java.nio.ByteBuffer;
 import java.util.Locale;
 
 import org.junit.Test;
-
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
@@ -637,4 +648,142 @@ public class TestStatistics {
     assertEquals(stats.getMax(), Binary.fromString("zzzz"));
     assertEquals(stats.getMin(), Binary.fromString(""));
   }
+
+  @Test
+  public void testBuilder() {
+    testBuilder(Types.required(BOOLEAN).named("test_boolean"), false, new byte[] { 0 }, true,
new byte[] { 1 });
+    testBuilder(Types.required(INT32).named("test_int32"), -42, intToBytes(-42), 42, intToBytes(42));
+    testBuilder(Types.required(INT64).named("test_int64"), -42l, longToBytes(-42), 42l, longToBytes(42));
+    testBuilder(Types.required(FLOAT).named("test_float"), -42.0f, intToBytes(floatToIntBits(-42.0f)),
42.0f,
+        intToBytes(floatToIntBits(42.0f)));
+    testBuilder(Types.required(DOUBLE).named("test_double"), -42.0, longToBytes(doubleToLongBits(-42.0)),
42.0,
+        longToBytes(Double.doubleToLongBits(42.0f)));
+
+    byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
+    byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 };
+    testBuilder(Types.required(INT96).named("test_int96"), Binary.fromConstantByteArray(min),
min,
+        Binary.fromConstantByteArray(max), max);
+    testBuilder(Types.required(FIXED_LEN_BYTE_ARRAY).length(12).named("test_fixed"), Binary.fromConstantByteArray(min),
+        min,
+        Binary.fromConstantByteArray(max), max);
+    testBuilder(Types.required(BINARY).named("test_binary"), Binary.fromConstantByteArray(min),
min,
+        Binary.fromConstantByteArray(max), max);
+  }
+
+  private void testBuilder(PrimitiveType type, Object min, byte[] minBytes, Object max, byte[]
maxBytes) {
+    Statistics.Builder builder = Statistics.getBuilderForReading(type);
+    Statistics<?> stats = builder.build();
+    assertTrue(stats.isEmpty());
+    assertFalse(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withNumNulls(0).withMin(minBytes).build();
+    assertFalse(stats.isEmpty());
+    assertTrue(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertEquals(0, stats.getNumNulls());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withNumNulls(11).withMax(maxBytes).build();
+    assertFalse(stats.isEmpty());
+    assertTrue(stats.isNumNullsSet());
+    assertFalse(stats.hasNonNullValue());
+    assertEquals(11, stats.getNumNulls());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build();
+    assertFalse(stats.isEmpty());
+    assertTrue(stats.isNumNullsSet());
+    assertTrue(stats.hasNonNullValue());
+    assertEquals(42, stats.getNumNulls());
+    assertEquals(min, stats.genericGetMin());
+    assertEquals(max, stats.genericGetMax());
+  }
+
+  @Test
+  public void testSpecBuilderForFloat() {
+    PrimitiveType type = Types.required(FLOAT).named("test_float");
+    Statistics.Builder builder = Statistics.getBuilderForReading(type);
+    Statistics<?> stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
+        .withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(0, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
+        .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(11, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
+        .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(42, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
+        .withMax(intToBytes(floatToIntBits(42.0f))).build();
+    assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
+    assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax()));
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
+        .withMax(intToBytes(floatToIntBits(-0.0f))).build();
+    assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin()));
+    assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
+        .withMax(intToBytes(floatToIntBits(-0.0f))).build();
+    assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
+    assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
+  }
+
+  @Test
+  public void testSpecBuilderForDouble() {
+    PrimitiveType type = Types.required(DOUBLE).named("test_double");
+    Statistics.Builder builder = Statistics.getBuilderForReading(type);
+    Statistics<?> stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
+        .withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(0, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
+        .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(11, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
+        .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build();
+    assertTrue(stats.isNumNullsSet());
+    assertEquals(42, stats.getNumNulls());
+    assertFalse(stats.hasNonNullValue());
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
+        .withMax(longToBytes(doubleToLongBits(42.0))).build();
+    assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
+    assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax()));
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
+        .withMax(longToBytes(doubleToLongBits(-0.0))).build();
+    assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin()));
+    assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
+
+    builder = Statistics.getBuilderForReading(type);
+    stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
+        .withMax(longToBytes(doubleToLongBits(-0.0))).build();
+    assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
+    assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
+  }
 }
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 0daabb6..bc43516 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -402,7 +402,7 @@ public class ParquetMetadataConverter {
       (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder)
{
     // create stats object based on the column type
     org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
-        org.apache.parquet.column.statistics.Statistics.getBuilder(type);
+        org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
 
     if (formatStats != null) {
       // Use the new V2 min-max statistics over the former one if it is filled
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index 6fdec2a..97dd169 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -90,10 +90,10 @@ public class TestStatisticsFilter {
   private static final IntStatistics intStats = new IntStatistics();
   private static final IntStatistics nullIntStats = new IntStatistics();
   private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats
= org.apache.parquet.column.statistics.Statistics
-      .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
+      .getBuilderForReading(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
   private static final DoubleStatistics doubleStats = new DoubleStatistics();
   private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats
= org.apache.parquet.column.statistics.Statistics
-      .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
+      .getBuilderForReading(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
 
   static {
     intStats.setMinMax(10, 100);
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index 0b7b951..a5381f0 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -93,7 +93,7 @@ public class TestColumnChunkPageWriteStore {
     int v = 3;
     BytesInput definitionLevels = BytesInput.fromInt(d);
     BytesInput repetitionLevels = BytesInput.fromInt(r);
-    Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
+    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
         .build();
     BytesInput data = BytesInput.fromInt(v);
     int rowCount = 5;
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index c73e569..095b575 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -96,7 +96,7 @@ public class TestParquetFileWriter {
   private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;
 
   private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS
= org.apache.parquet.column.statistics.Statistics
-      .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
+      .getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
 
   private String writeSchema;
 

-- 
To stop receiving notification emails like this one, please contact
zivanfi@apache.org.

Mime
View raw message