Return-Path: X-Original-To: apmail-parquet-commits-archive@minotaur.apache.org Delivered-To: apmail-parquet-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 3547018D84 for ; Thu, 18 Feb 2016 23:24:49 +0000 (UTC) Received: (qmail 13915 invoked by uid 500); 18 Feb 2016 23:24:49 -0000 Delivered-To: apmail-parquet-commits-archive@parquet.apache.org Received: (qmail 13882 invoked by uid 500); 18 Feb 2016 23:24:49 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 13872 invoked by uid 99); 18 Feb 2016 23:24:49 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 18 Feb 2016 23:24:49 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id DE199DFF67; Thu, 18 Feb 2016 23:24:48 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: julien@apache.org To: commits@parquet.apache.org Message-Id: <8809d98705494ec980b79fd10ef18125@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: parquet-cpp git commit: PARQUET-499: Complete PlainEncoder implementation for all primitive types and test end to end Date: Thu, 18 Feb 2016 23:24:48 +0000 (UTC) Repository: parquet-cpp Updated Branches: refs/heads/master 6df383666 -> 4ae1b921b PARQUET-499: Complete PlainEncoder implementation for all primitive types and test end to end Includes tests for end to end plain encoding and decoding of all data types. Author: Deepak Majeti Closes #52 from majetideepak/PARQUET-499 and squashes the following commits: 897859b [Deepak Majeti] minor edits 2067ef5 [Deepak Majeti] renamed a test dfb19f8 [Deepak Majeti] templated all types 059967a [Deepak Majeti] templated int and real tests da86d4d [Deepak Majeti] minor fix 4976bec [Deepak Majeti] include pruning d0f8ab9 [Deepak Majeti] addressed comments 07257c0 [Deepak Majeti] minor format edits 6ca0b30 [Deepak Majeti] fixed formatting and casting issues 9815062 [Deepak Majeti] PARQUET-499 Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/4ae1b921 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/4ae1b921 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/4ae1b921 Branch: refs/heads/master Commit: 4ae1b921bb86133f3b1be753259f20a89c128366 Parents: 6df3836 Author: Deepak Majeti Authored: Thu Feb 18 15:24:45 2016 -0800 Committer: Julien Le Dem Committed: Thu Feb 18 15:24:45 2016 -0800 ---------------------------------------------------------------------- src/parquet/encodings/plain-encoding-test.cc | 164 ++++++++++++++++++++-- src/parquet/encodings/plain-encoding.h | 38 +++-- src/parquet/util/test-common.h | 82 ++++++++++- 3 files changed, 265 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4ae1b921/src/parquet/encodings/plain-encoding-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/plain-encoding-test.cc b/src/parquet/encodings/plain-encoding-test.cc index 955a415..b8ef13b 100644 --- a/src/parquet/encodings/plain-encoding-test.cc +++ b/src/parquet/encodings/plain-encoding-test.cc @@ -24,6 +24,7 @@ #include "parquet/encodings/plain-encoding.h" #include "parquet/types.h" +#include "parquet/schema/types.h" #include "parquet/util/bit-util.h" #include "parquet/util/output.h" #include "parquet/util/test-common.h" @@ -35,11 +36,10 @@ namespace parquet_cpp { namespace test { -TEST(BooleanTest, TestEncodeDecode) { +TEST(VectorBooleanTest, TestEncodeDecode) { // PARQUET-454 - - size_t nvalues = 100; - size_t nbytes = BitUtil::RoundUp(nvalues, 8) / 8; + size_t nvalues = 10000; + size_t nbytes = BitUtil::Ceil(nvalues, 8); // seed the prng so failure is deterministic vector draws = flip_coins_seed(nvalues, 0.5, 0); @@ -50,12 +50,12 @@ TEST(BooleanTest, TestEncodeDecode) { InMemoryOutputStream dst; encoder.Encode(draws, nvalues, &dst); - std::vector encode_buffer; + vector encode_buffer; dst.Transfer(&encode_buffer); ASSERT_EQ(nbytes, encode_buffer.size()); - std::vector decode_buffer(nbytes); + vector decode_buffer(nbytes); const uint8_t* decode_data = &decode_buffer[0]; decoder.SetData(nvalues, &encode_buffer[0], encode_buffer.size()); @@ -63,10 +63,158 @@ TEST(BooleanTest, TestEncodeDecode) { ASSERT_EQ(nvalues, values_decoded); for (size_t i = 0; i < nvalues; ++i) { - ASSERT_EQ(BitUtil::GetArrayBit(decode_data, i), draws[i]) << i; + ASSERT_EQ(draws[i], BitUtil::GetArrayBit(decode_data, i)) << i; } } -} // namespace test +template +class EncodeDecode{ + public: + void init_data(int nvalues) { + num_values_ = nvalues; + input_bytes_.resize(num_values_ * sizeof(T)); + output_bytes_.resize(num_values_ * sizeof(T)); + draws_ = reinterpret_cast(input_bytes_.data()); + decode_buf_ = reinterpret_cast(output_bytes_.data()); + } + + void generate_data() { + // seed the prng so failure is deterministic + random_numbers(num_values_, 0.5, draws_); + } + + void encode_decode(ColumnDescriptor *d) { + PlainEncoder encoder(d); + PlainDecoder decoder(d); + + InMemoryOutputStream dst; + encoder.Encode(draws_, num_values_, &dst); + + dst.Transfer(&encode_buffer_); + + decoder.SetData(num_values_, &encode_buffer_[0], encode_buffer_.size()); + size_t values_decoded = decoder.Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + } + + void verify_results() { + for (size_t i = 0; i < num_values_; ++i) { + ASSERT_EQ(draws_[i], decode_buf_[i]) << i; + } + } + + void execute(int nvalues, ColumnDescriptor *d) { + init_data(nvalues); + generate_data(); + encode_decode(d); + verify_results(); + } + + private: + int num_values_; + T* draws_; + T* decode_buf_; + vector input_bytes_; + vector output_bytes_; + vector data_buffer_; + vector encode_buffer_; +}; + +template<> +void EncodeDecode::generate_data() { + // seed the prng so failure is deterministic + random_bools(num_values_, 0.5, 0, draws_); +} + +template<> +void EncodeDecode::verify_results() { + for (size_t i = 0; i < num_values_; ++i) { + ASSERT_EQ(draws_[i].value[0], decode_buf_[i].value[0]) << i; + ASSERT_EQ(draws_[i].value[1], decode_buf_[i].value[1]) << i; + ASSERT_EQ(draws_[i].value[2], decode_buf_[i].value[2]) << i; + } +} + +template<> +void EncodeDecode::generate_data() { + // seed the prng so failure is deterministic + int max_byte_array_len = 12 + sizeof(uint32_t); + size_t nbytes = num_values_ * max_byte_array_len; + data_buffer_.resize(nbytes); + random_byte_array(num_values_, 0.5, data_buffer_.data(), draws_, + max_byte_array_len); +} + +template<> +void EncodeDecode::verify_results() { + for (size_t i = 0; i < num_values_; ++i) { + ASSERT_EQ(draws_[i].len, decode_buf_[i].len) << i; + ASSERT_EQ(0, memcmp(draws_[i].ptr, decode_buf_[i].ptr, draws_[i].len)) << i; + } +} + +static int flba_length = 8; +template<> +void EncodeDecode::generate_data() { + // seed the prng so failure is deterministic + size_t nbytes = num_values_ * flba_length; + data_buffer_.resize(nbytes); + ASSERT_EQ(nbytes, data_buffer_.size()); + random_fixed_byte_array(num_values_, 0.5, data_buffer_.data(), flba_length, draws_); +} + +template<> +void EncodeDecode::verify_results() { + for (size_t i = 0; i < 1000; ++i) { + ASSERT_EQ(0, memcmp(draws_[i].ptr, decode_buf_[i].ptr, flba_length)) << i; + } +} +int num_values = 10000; + +TEST(BoolEncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(Int32EncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(Int64EncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(FloatEncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(DoubleEncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(Int96EncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(BAEncodeDecode, TestEncodeDecode) { + EncodeDecode obj; + obj.execute(num_values, nullptr); +} + +TEST(FLBAEncodeDecode, TestEncodeDecode) { + schema::NodePtr node; + node = schema::PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, flba_length, LogicalType::UTF8); + ColumnDescriptor d(node, 0, 0); + EncodeDecode obj; + obj.execute(num_values, &d); +} + +} // namespace test } // namespace parquet_cpp http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4ae1b921/src/parquet/encodings/plain-encoding.h ---------------------------------------------------------------------- diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h index 78560fd..83ee40c 100644 --- a/src/parquet/encodings/plain-encoding.h +++ b/src/parquet/encodings/plain-encoding.h @@ -67,22 +67,26 @@ inline int PlainDecoder::Decode(T* buffer, int max_values) { } // Template specialization for BYTE_ARRAY +// BA does not currently own its data +// the lifetime is tied to the input stream template <> inline int PlainDecoder::Decode(ByteArray* buffer, int max_values) { max_values = std::min(max_values, num_values_); for (int i = 0; i < max_values; ++i) { - buffer[i].len = *reinterpret_cast(data_); - if (len_ < sizeof(uint32_t) + buffer[i].len) ParquetException::EofException(); + uint32_t len = buffer[i].len = *reinterpret_cast(data_); + if (len_ < sizeof(uint32_t) + len) ParquetException::EofException(); buffer[i].ptr = data_ + sizeof(uint32_t); - data_ += sizeof(uint32_t) + buffer[i].len; - len_ -= sizeof(uint32_t) + buffer[i].len; + data_ += sizeof(uint32_t) + len; + len_ -= sizeof(uint32_t) + len; } num_values_ -= max_values; return max_values; } // Template specialization for FIXED_LEN_BYTE_ARRAY +// FLBA does not currently own its data +// the lifetime is tied to the input stream template <> inline int PlainDecoder::Decode( FixedLenByteArray* buffer, int max_values) { @@ -161,11 +165,21 @@ class PlainEncoder : public Encoder { Encoder(descr, Encoding::PLAIN) {} virtual void Encode(const bool* src, int num_values, OutputStream* dst) { - throw ParquetException("this API for encoding bools not implemented"); + size_t bytes_required = BitUtil::Ceil(num_values, 8); + std::vector tmp_buffer(bytes_required); + + BitWriter bit_writer(&tmp_buffer[0], bytes_required); + for (size_t i = 0; i < num_values; ++i) { + bit_writer.PutValue(src[i], 1); + } + bit_writer.Flush(); + + // Write the result to the output stream + dst->Write(bit_writer.buffer(), bit_writer.bytes_written()); } void Encode(const std::vector& src, int num_values, OutputStream* dst) { - size_t bytes_required = BitUtil::RoundUp(num_values, 8) / 8; + size_t bytes_required = BitUtil::Ceil(num_values, 8); // TODO(wesm) // Use a temporary buffer for now and copy, because the BitWriter is not @@ -193,15 +207,21 @@ inline void PlainEncoder::Encode(const T* buffer, int num_values, template <> inline void PlainEncoder::Encode(const ByteArray* src, int num_values, OutputStream* dst) { - ParquetException::NYI("byte array encoding"); + for (size_t i = 0; i < num_values; ++i) { + // Write the result to the output stream + dst->Write(reinterpret_cast(&src[i].len), sizeof(uint32_t)); + dst->Write(reinterpret_cast(src[i].ptr), src[i].len); + } } template <> inline void PlainEncoder::Encode( const FixedLenByteArray* src, int num_values, OutputStream* dst) { - ParquetException::NYI("FLBA encoding"); + for (size_t i = 0; i < num_values; ++i) { + // Write the result to the output stream + dst->Write(reinterpret_cast(src[i].ptr), descr_->type_length()); + } } - } // namespace parquet_cpp #endif http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/4ae1b921/src/parquet/util/test-common.h ---------------------------------------------------------------------- diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h index e75b163..d8961d1 100644 --- a/src/parquet/util/test-common.h +++ b/src/parquet/util/test-common.h @@ -19,9 +19,12 @@ #define PARQUET_UTIL_TEST_COMMON_H #include +#include #include #include +#include "parquet/types.h" + using std::vector; namespace parquet_cpp { @@ -81,7 +84,6 @@ static inline vector flip_coins_seed(size_t n, double p, uint32_t seed) { return draws; } - static inline vector flip_coins(size_t n, double p) { std::random_device rd; std::mt19937 gen(rd()); @@ -104,8 +106,84 @@ void random_bytes(int n, uint32_t seed, std::vector* out) { } } -} // namespace test +template +void random_numbers(int n, uint32_t seed, T* out) { + std::mt19937 gen(seed); + std::uniform_real_distribution d(std::numeric_limits::lowest(), + std::numeric_limits::max()); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +void random_bools(int n, double p, uint32_t seed, bool* out) { + std::mt19937 gen(seed); + std::bernoulli_distribution d(p); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +template <> +void random_numbers(int n, uint32_t seed, int32_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(std::numeric_limits::lowest(), + std::numeric_limits::max()); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +template <> +void random_numbers(int n, uint32_t seed, int64_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(std::numeric_limits::lowest(), + std::numeric_limits::max()); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} +template <> +void random_numbers(int n, uint32_t seed, Int96* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(std::numeric_limits::lowest(), + std::numeric_limits::max()); + for (int i = 0; i < n; ++i) { + out[i].value[0] = d(gen); + out[i].value[1] = d(gen); + out[i].value[2] = d(gen); + } +} + +void random_fixed_byte_array(int n, uint32_t seed, uint8_t *buf, int len, + FLBA* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(0, 255); + for (int i = 0; i < n; ++i) { + out[i].ptr = buf; + for (int j = 0; j < len; ++j) { + buf[j] = d(gen) & 0xFF; + } + buf += len; + } +} + +void random_byte_array(int n, uint32_t seed, uint8_t *buf, + ByteArray* out, int max_size) { + std::mt19937 gen(seed); + std::uniform_int_distribution d1(0, max_size); + std::uniform_int_distribution d2(0, 255); + for (int i = 0; i < n; ++i) { + out[i].len = d1(gen); + out[i].ptr = buf; + for (int j = 0; j < out[i].len; ++j) { + buf[j] = d2(gen) & 0xFF; + } + buf += out[i].len; + } +} +} // namespace test } // namespace parquet_cpp #endif // PARQUET_UTIL_TEST_COMMON_H