parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jul...@apache.org
Subject parquet-cpp git commit: PARQUET-454: Fix inconsistencies with boolean PLAIN encoding
Date Tue, 02 Feb 2016 23:00:52 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master ee83fad67 -> c0eec9a59


PARQUET-454: Fix inconsistencies with boolean PLAIN encoding

Requires PARQUET-485 (#32)

The boolean Encoding::PLAIN code path was using RleDecoder, inconsistent with
other implementations of Parquet. This patch adds an implementation of plain
encoding and uses BitReader instead of RleDecoder to decode plain-encoded
boolean data. Unit tests to verify.

Also closes PR #12. Thanks to @edani for reporting.

Author: Wes McKinney <wes@cloudera.com>

Closes #34 from wesm/PARQUET-454 and squashes the following commits:

01cb5a7 [Wes McKinney] Use a seed in the data generation
0bf5d8a [Wes McKinney] Fix inconsistencies with boolean PLAIN encoding.


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c0eec9a5
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c0eec9a5
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c0eec9a5

Branch: refs/heads/master
Commit: c0eec9a593420a2f40786271fa813d44b85e7198
Parents: ee83fad
Author: Wes McKinney <wes@cloudera.com>
Authored: Tue Feb 2 15:00:45 2016 -0800
Committer: Julien Le Dem <julien@dremio.com>
Committed: Tue Feb 2 15:00:45 2016 -0800

----------------------------------------------------------------------
 src/parquet/encodings/CMakeLists.txt         |  2 +
 src/parquet/encodings/encodings.h            |  3 +-
 src/parquet/encodings/plain-encoding-test.cc | 65 +++++++++++++++++++++++
 src/parquet/encodings/plain-encoding.h       | 51 ++++++++++++++----
 src/parquet/util/bit-util.h                  |  8 +++
 src/parquet/util/test-common.h               | 27 ++++++++++
 6 files changed, 144 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/CMakeLists.txt b/src/parquet/encodings/CMakeLists.txt
index 69ef1f3..638fba0 100644
--- a/src/parquet/encodings/CMakeLists.txt
+++ b/src/parquet/encodings/CMakeLists.txt
@@ -24,3 +24,5 @@ install(FILES
   dictionary-encoding.h
   plain-encoding.h
   DESTINATION include/parquet/encodings)
+
+ADD_PARQUET_TEST(plain-encoding-test)

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/encodings.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h
index 0d9202e..c427ff4 100644
--- a/src/parquet/encodings/encodings.h
+++ b/src/parquet/encodings/encodings.h
@@ -20,10 +20,9 @@
 
 #include <cstdint>
 
+#include "parquet/exception.h"
 #include "parquet/types.h"
 
-#include "parquet/thrift/parquet_constants.h"
-#include "parquet/thrift/parquet_types.h"
 #include "parquet/util/rle-encoding.h"
 #include "parquet/util/bit-stream-utils.inline.h"
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/plain-encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/plain-encoding-test.cc b/src/parquet/encodings/plain-encoding-test.cc
new file mode 100644
index 0000000..ca425dd
--- /dev/null
+++ b/src/parquet/encodings/plain-encoding-test.cc
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "parquet/util/test-common.h"
+
+#include "parquet/encodings/encodings.h"
+
+using std::string;
+using std::vector;
+
+namespace parquet_cpp {
+
+namespace test {
+
+TEST(BooleanTest, TestEncodeDecode) {
+  // PARQUET-454
+
+  size_t nvalues = 100;
+  size_t nbytes = BitUtil::RoundUp(nvalues, 8) / 8;
+
+  // seed the prng so failure is deterministic
+  vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
+
+  PlainEncoder<Type::BOOLEAN> encoder(nullptr);
+  PlainDecoder<Type::BOOLEAN> decoder(nullptr);
+
+  std::vector<uint8_t> encode_buffer(nbytes);
+
+  size_t encoded_bytes = encoder.Encode(draws, nvalues, &encode_buffer[0]);
+  ASSERT_EQ(nbytes, encoded_bytes);
+
+  std::vector<uint8_t> decode_buffer(nbytes);
+  const uint8_t* decode_data = &decode_buffer[0];
+
+  decoder.SetData(nvalues, &encode_buffer[0], encoded_bytes);
+  size_t values_decoded = decoder.Decode(&decode_buffer[0], nvalues);
+  ASSERT_EQ(nvalues, values_decoded);
+
+  for (size_t i = 0; i < nvalues; ++i) {
+    ASSERT_EQ(BitUtil::GetArrayBit(decode_data, i), draws[i]) << i;
+  }
+}
+
+} // namespace test
+
+} // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/encodings/plain-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h
index 11e70c7..cc37776 100644
--- a/src/parquet/encodings/plain-encoding.h
+++ b/src/parquet/encodings/plain-encoding.h
@@ -21,6 +21,7 @@
 #include "parquet/encodings/encodings.h"
 
 #include <algorithm>
+#include <vector>
 
 using parquet::Type;
 
@@ -103,19 +104,38 @@ class PlainDecoder<Type::BOOLEAN> : public Decoder<Type::BOOLEAN>
{
 
   virtual void SetData(int num_values, const uint8_t* data, int len) {
     num_values_ = num_values;
-    decoder_ = RleDecoder(data, len, 1);
+    bit_reader_ = BitReader(data, len);
+  }
+
+  // Two flavors of bool decoding
+  int Decode(uint8_t* buffer, int max_values) {
+    max_values = std::min(max_values, num_values_);
+    bool val;
+    for (int i = 0; i < max_values; ++i) {
+      if (!bit_reader_.GetValue(1, &val)) {
+        ParquetException::EofException();
+      }
+      BitUtil::SetArrayBit(buffer, i, val);
+    }
+    num_values_ -= max_values;
+    return max_values;
   }
 
   virtual int Decode(bool* buffer, int max_values) {
     max_values = std::min(max_values, num_values_);
+    bool val;
     for (int i = 0; i < max_values; ++i) {
-      if (!decoder_.Get(&buffer[i])) ParquetException::EofException();
+      if (!bit_reader_.GetValue(1, &val)) {
+        ParquetException::EofException();
+      }
+      buffer[i] = val;
     }
     num_values_ -= max_values;
     return max_values;
   }
+
  private:
-  RleDecoder decoder_;
+  BitReader bit_reader_;
 };
 
 // ----------------------------------------------------------------------
@@ -132,6 +152,24 @@ class PlainEncoder : public Encoder<TYPE> {
   virtual size_t Encode(const T* src, int num_values, uint8_t* dst);
 };
 
+template <>
+class PlainEncoder<Type::BOOLEAN> : public Encoder<Type::BOOLEAN> {
+ public:
+  explicit PlainEncoder(const parquet::SchemaElement* schema) :
+      Encoder<Type::BOOLEAN>(schema, parquet::Encoding::PLAIN) {}
+
+  virtual size_t Encode(const std::vector<bool>& src, int num_values,
+      uint8_t* dst) {
+    size_t bytes_required = BitUtil::RoundUp(num_values, 8) / 8;
+    BitWriter bit_writer(dst, bytes_required);
+    for (size_t i = 0; i < num_values; ++i) {
+      bit_writer.PutValue(src[i], 1);
+    }
+    bit_writer.Flush();
+    return bit_writer.bytes_written();
+  }
+};
+
 template <int TYPE>
 inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values,
     uint8_t* dst) {
@@ -141,13 +179,6 @@ inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values,
 }
 
 template <>
-inline size_t PlainEncoder<Type::BOOLEAN>::Encode(
-    const bool* src, int num_values, uint8_t* dst) {
-  ParquetException::NYI("bool encoding");
-  return 0;
-}
-
-template <>
 inline size_t PlainEncoder<Type::BYTE_ARRAY>::Encode(const ByteArray* src,
     int num_values, uint8_t* dst) {
   ParquetException::NYI("byte array encoding");

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/util/bit-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h
index 7a2e921..eac5346 100644
--- a/src/parquet/util/bit-util.h
+++ b/src/parquet/util/bit-util.h
@@ -270,6 +270,14 @@ class BitUtil {
     return v | (static_cast<T>(0x1) << bitpos);
   }
 
+  static inline bool GetArrayBit(const uint8_t* bits, size_t i) {
+    return bits[i / 8] & (1 << (i % 8));
+  }
+
+  static inline void SetArrayBit(uint8_t* bits, size_t i, bool is_set) {
+    bits[i / 8] |= (1 << (i % 8)) * is_set;
+  }
+
   // Set a specific bit to 0
   // Behavior when bitpos is negative is undefined
   template<typename T>

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0eec9a5/src/parquet/util/test-common.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h
index 38bc32c..3cf82f5 100644
--- a/src/parquet/util/test-common.h
+++ b/src/parquet/util/test-common.h
@@ -19,6 +19,7 @@
 #define PARQUET_UTIL_TEST_COMMON_H
 
 #include <iostream>
+#include <random>
 #include <vector>
 
 using std::vector;
@@ -46,6 +47,32 @@ static inline bool vector_equal(const vector<T>& left, const
vector<T>& right) {
   return true;
 }
 
+static inline vector<bool> flip_coins_seed(size_t n, double p, uint32_t seed) {
+  std::mt19937 gen(seed);
+  std::bernoulli_distribution d(p);
+
+  vector<bool> draws;
+  for (size_t i = 0; i < n; ++i) {
+    draws.push_back(d(gen));
+  }
+  return draws;
+}
+
+
+static inline vector<bool> flip_coins(size_t n, double p) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  std::bernoulli_distribution d(p);
+
+  vector<bool> draws;
+  for (size_t i = 0; i < n; ++i) {
+    draws.push_back(d(gen));
+  }
+  return draws;
+}
+
+
 } // namespace test
 
 } // namespace parquet_cpp


Mime
View raw message