parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-745: TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType
Date Tue, 01 Nov 2016 02:27:49 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 69db1a835 -> be20e2e03


PARQUET-745: TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType

I'm not sure how this code is supposed to work, but this seems to solve the issue.

The code was added in 176b08c305919551ecfd5fc2f741f7bff4deefdd by @lomereiter I think.

Author: fscheibner <florian.scheibner@snowflake.net>

Closes #176 from flode/stats and squashes the following commits:

7a24906 [fscheibner] fix format
f283edb [fscheibner] Pass whether minmax is set
b50c985 [fscheibner] format
f969a9f [fscheibner] Specialize PlainEncode and PlainDecode
6b5884d [fscheibner] format
f17926d [fscheibner] fix typedrowgroup


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/be20e2e0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/be20e2e0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/be20e2e0

Branch: refs/heads/master
Commit: be20e2e037033ab5f36b77c816565ee5afbe0cdc
Parents: 69db1a8
Author: fscheibner <florian.scheibner@snowflake.net>
Authored: Mon Oct 31 22:27:41 2016 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Oct 31 22:27:41 2016 -0400

----------------------------------------------------------------------
 src/parquet/column/statistics-test.cc | 27 ++++++++++++++++++++++++++-
 src/parquet/column/statistics.cc      | 18 ++++++++++++++++--
 src/parquet/column/statistics.h       | 11 ++++++++++-
 src/parquet/file/metadata.cc          |  3 ++-
 src/parquet/file/reader-internal.cc   |  8 --------
 5 files changed, 54 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/be20e2e0/src/parquet/column/statistics-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics-test.cc b/src/parquet/column/statistics-test.cc
index d1e1eeb..3ed7aab 100644
--- a/src/parquet/column/statistics-test.cc
+++ b/src/parquet/column/statistics-test.cc
@@ -66,7 +66,8 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> {
     std::string encoded_max = statistics1.EncodeMax();
 
     TypedStats statistics2(
-        this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0);
+        this->schema_.Column(0), encoded_min, encoded_max,
+        this->values_.size(), 0, 0, true);
 
     ASSERT_EQ(encoded_min, statistics2.EncodeMin());
     ASSERT_EQ(encoded_max, statistics2.EncodeMax());
@@ -235,6 +236,30 @@ void TestRowGroupStatistics<ByteArrayType>::DeepFree(std::vector<ByteArray>& val
   }
 }
 
+template<>
+void TestRowGroupStatistics<ByteArrayType>::TestMinMaxEncode() {
+  this->GenerateData(1000);
+  // Test that we encode min max strings correctly
+  TypedRowGroupStatistics<ByteArrayType> statistics1(this->schema_.Column(0));
+  statistics1.Update(this->values_ptr_, this->values_.size(), 0);
+  std::string encoded_min = statistics1.EncodeMin();
+  std::string encoded_max = statistics1.EncodeMax();
+
+  // encoded is same as unencoded
+  ASSERT_EQ(encoded_min, std::string((const char*)statistics1.min().ptr,
+      statistics1.min().len));
+  ASSERT_EQ(encoded_max, std::string((const char*)statistics1.max().ptr,
+      statistics1.max().len));
+
+  TypedRowGroupStatistics<ByteArrayType> statistics2(
+     this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0, true);
+
+  ASSERT_EQ(encoded_min, statistics2.EncodeMin());
+  ASSERT_EQ(encoded_max, statistics2.EncodeMax());
+  ASSERT_EQ(statistics1.min(), statistics2.min());
+  ASSERT_EQ(statistics1.max(), statistics2.max());
+}
+
 using TestTypes = ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
     ByteArrayType, FLBAType, BooleanType>;
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/be20e2e0/src/parquet/column/statistics.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics.cc b/src/parquet/column/statistics.cc
index d8f8785..d761571 100644
--- a/src/parquet/column/statistics.cc
+++ b/src/parquet/column/statistics.cc
@@ -54,7 +54,8 @@ TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const typename DType::c_
 template <typename DType>
 TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const ColumnDescriptor* schema,
     const std::string& encoded_min, const std::string& encoded_max, int64_t num_values,
-    int64_t null_count, int64_t distinct_count, MemoryAllocator* allocator)
+    int64_t null_count, int64_t distinct_count, bool has_min_max,
+    MemoryAllocator* allocator)
     : allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) {
   IncrementNumValues(num_values);
   IncrementNullCount(null_count);
@@ -64,7 +65,7 @@ TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const ColumnDescriptor*
 
   if (!encoded_min.empty()) { PlainDecode(encoded_min, &min_); }
   if (!encoded_max.empty()) { PlainDecode(encoded_max, &max_); }
-  has_min_max_ = !encoded_min.empty() && !encoded_max.empty();
+  has_min_max_ = has_min_max;
 }
 
 template <typename DType>
@@ -160,6 +161,19 @@ void TypedRowGroupStatistics<DType>::PlainDecode(const std::string& src, T* dst)
   decoder.Decode(dst, 1);
 }
 
+template <>
+void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(
+     const T& src, std::string* dst) {
+  dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
+}
+
+template <>
+void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(
+     const std::string& src, T* dst) {
+  dst->len = src.size();
+  dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
+}
+
 template class TypedRowGroupStatistics<BooleanType>;
 template class TypedRowGroupStatistics<Int32Type>;
 template class TypedRowGroupStatistics<Int64Type>;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/be20e2e0/src/parquet/column/statistics.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics.h b/src/parquet/column/statistics.h
index dc576c8..a3b2821 100644
--- a/src/parquet/column/statistics.h
+++ b/src/parquet/column/statistics.h
@@ -142,7 +142,8 @@ class TypedRowGroupStatistics : public RowGroupStatistics {
 
   TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min,
       const std::string& encoded_max, int64_t num_values, int64_t null_count,
-      int64_t distinct_count, MemoryAllocator* allocator = default_allocator());
+      int64_t distinct_count, bool has_min_max,
+      MemoryAllocator* allocator = default_allocator());
 
   bool HasMinMax() const override;
   void Reset() override;
@@ -195,6 +196,14 @@ inline void TypedRowGroupStatistics<ByteArrayType>::Copy(
   *dst = ByteArray(src.len, buffer.data());
 }
 
+template <>
+void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(
+    const T& src, std::string* dst);
+
+template <>
+void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(
+    const std::string& src, T* dst);
+
 using BoolStatistics = TypedRowGroupStatistics<BooleanType>;
 using Int32Statistics = TypedRowGroupStatistics<Int32Type>;
 using Int64Statistics = TypedRowGroupStatistics<Int64Type>;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/be20e2e0/src/parquet/file/metadata.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc
index d50c915..2580706 100644
--- a/src/parquet/file/metadata.cc
+++ b/src/parquet/file/metadata.cc
@@ -29,7 +29,8 @@ static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
     const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
   return std::make_shared<TypedRowGroupStatistics<DType>>(descr, metadata.statistics.min,
       metadata.statistics.max, metadata.num_values - metadata.statistics.null_count,
-      metadata.statistics.null_count, metadata.statistics.distinct_count);
+      metadata.statistics.null_count, metadata.statistics.distinct_count,
+      metadata.statistics.__isset.max || metadata.statistics.__isset.min);
 }
 
 std::shared_ptr<RowGroupStatistics> MakeColumnStats(

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/be20e2e0/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index fa19390..5eda79b 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -175,14 +175,6 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) {
       std::move(stream), col->compression(), properties_.allocator()));
 }
 
-template <typename DType>
-static std::shared_ptr<RowGroupStatistics> MakeColumnStats(
-    const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
-  return std::make_shared<TypedRowGroupStatistics<DType>>(descr, metadata.statistics.min,
-      metadata.statistics.max, metadata.num_values, metadata.statistics.null_count,
-      metadata.statistics.distinct_count);
-}
-
 // ----------------------------------------------------------------------
 // SerializedFile: Parquet on-disk layout
 


Mime
View raw message