Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id AA6CF200B85 for ; Thu, 15 Sep 2016 18:30:56 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id A8E8F160AC6; Thu, 15 Sep 2016 16:30:56 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id CC087160ABA for ; Thu, 15 Sep 2016 18:30:55 +0200 (CEST) Received: (qmail 71850 invoked by uid 500); 15 Sep 2016 16:30:55 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 71841 invoked by uid 99); 15 Sep 2016 16:30:55 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 15 Sep 2016 16:30:55 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id E6010DFE80; Thu, 15 Sep 2016 16:30:54 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@parquet.apache.org Message-Id: <4ec8108a1adb44579391a1450d2bbc8a@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: parquet-cpp git commit: PARQUET-719: Fix WriterBatch API to handle NULL values Date: Thu, 15 Sep 2016 16:30:54 +0000 (UTC) archived-at: Thu, 15 Sep 2016 16:30:56 -0000 Repository: parquet-cpp Updated Branches: refs/heads/master 0bf72a96b -> 942f2aedb PARQUET-719: Fix WriterBatch API to handle NULL values Fixed bug and added a test case Author: Deepak Majeti Closes #160 from majetideepak/PARQUET-719 and squashes the following commits: ff56788 [Deepak Majeti] use using c34ce78 [Deepak Majeti] added comments f38d6ed [Deepak Majeti] Fixed bug and added test case Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/942f2aed Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/942f2aed Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/942f2aed Branch: refs/heads/master Commit: 942f2aedb5262580c7dfcb511641b20e9bbb367d Parents: 0bf72a9 Author: Deepak Majeti Authored: Thu Sep 15 12:30:45 2016 -0400 Committer: Wes McKinney Committed: Thu Sep 15 12:30:45 2016 -0400 ---------------------------------------------------------------------- src/parquet/column/column-writer-test.cc | 23 +++++++++++++++++++++++ src/parquet/column/scanner-test.cc | 12 +++++------- src/parquet/column/writer.h | 14 +++++++++----- 3 files changed, 37 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/column-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc index 230a843..57f1d1b 100644 --- a/src/parquet/column/column-writer-test.cc +++ b/src/parquet/column/column-writer-test.cc @@ -150,6 +150,8 @@ typedef ::testing::Types; + TYPED_TEST(TestPrimitiveWriter, RequiredPlain) { this->TestRequiredWithEncoding(Encoding::PLAIN); } @@ -303,5 +305,26 @@ TYPED_TEST(TestPrimitiveWriter, RequiredVeryLargeChunk) { } } +// PARQUET-719 +// Test case for NULL values +TEST_F(TestNullValuesWriter, OptionalNullValueChunk) { + this->SetUpSchemaOptional(); + + this->GenerateData(LARGE_SIZE); + + std::vector definition_levels(LARGE_SIZE, 0); + std::vector repetition_levels(LARGE_SIZE, 0); + + auto writer = this->BuildWriter(LARGE_SIZE); + // All values being written are NULL + writer->WriteBatch( + this->values_.size(), definition_levels.data(), repetition_levels.data(), NULL); + writer->Close(); + + // Just read the first SMALL_SIZE rows to ensure we could read it back in + this->ReadColumn(); + ASSERT_EQ(0, this->values_read_); +} + } // namespace test } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/scanner-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc index 78cd16c..fb5178a 100644 --- a/src/parquet/column/scanner-test.cc +++ b/src/parquet/column/scanner-test.cc @@ -141,8 +141,6 @@ class TestFlatScanner : public ::testing::Test { vector data_buffer_; // For BA and FLBA }; -typedef TestFlatScanner TestFlatFLBAScanner; - static int num_levels_per_page = 100; static int num_pages = 20; static int batch_size = 32; @@ -150,8 +148,8 @@ static int batch_size = 32; typedef ::testing::Types TestTypes; -typedef TestFlatScanner TestBooleanFlatScanner; -typedef TestFlatScanner TestFLBAFlatScanner; +using TestBooleanFlatScanner = TestFlatScanner; +using TestFLBAFlatScanner = TestFlatScanner; TYPED_TEST_CASE(TestFlatScanner, TestTypes); @@ -183,7 +181,7 @@ TEST_F(TestFLBAFlatScanner, TestPlainDictScanner) { } // PARQUET 502 -TEST_F(TestFlatFLBAScanner, TestSmallBatch) { +TEST_F(TestFLBAFlatScanner, TestSmallBatch) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 0, 0); @@ -194,7 +192,7 @@ TEST_F(TestFlatFLBAScanner, TestSmallBatch) { CheckResults(1, &d); } -TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) { +TEST_F(TestFLBAFlatScanner, TestDescriptorAPI) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 4, 0); @@ -209,7 +207,7 @@ TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) { ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length()); } -TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) { +TEST_F(TestFLBAFlatScanner, TestFLBAPrinterNext) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2); const ColumnDescriptor d(type, 4, 0); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/writer.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h index 6a6ee5f..4b2a021 100644 --- a/src/parquet/column/writer.h +++ b/src/parquet/column/writer.h @@ -156,7 +156,7 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter { void CheckDictionarySizeLimit() override; private: - void WriteMiniBatch(int64_t num_values, const int16_t* def_levels, + int64_t WriteMiniBatch(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const T* values); typedef Encoder EncoderType; @@ -167,7 +167,7 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter { }; template -inline void TypedColumnWriter::WriteMiniBatch(int64_t num_values, +inline int64_t TypedColumnWriter::WriteMiniBatch(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const T* values) { int64_t values_to_write = 0; // If the field is required and non-repeated, there are no definition levels @@ -209,6 +209,8 @@ inline void TypedColumnWriter::WriteMiniBatch(int64_t num_values, AddDataPage(); } if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); } + + return values_to_write; } template @@ -222,15 +224,17 @@ inline void TypedColumnWriter::WriteBatch(int64_t num_values, int64_t write_batch_size = properties_->write_batch_size(); int num_batches = num_values / write_batch_size; int64_t num_remaining = num_values % write_batch_size; + int64_t value_offset = 0; for (int round = 0; round < num_batches; round++) { int64_t offset = round * write_batch_size; - WriteMiniBatch( - write_batch_size, &def_levels[offset], &rep_levels[offset], &values[offset]); + int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], + &rep_levels[offset], &values[value_offset]); + value_offset += num_values; } // Write the remaining values int64_t offset = num_batches * write_batch_size; WriteMiniBatch( - num_remaining, &def_levels[offset], &rep_levels[offset], &values[offset]); + num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]); } template