From commits-return-1355-archive-asf-public=cust-asf.ponee.io@parquet.apache.org Thu Jul 12 08:40:10 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 85FEE180654 for ; Thu, 12 Jul 2018 08:40:09 +0200 (CEST) Received: (qmail 40359 invoked by uid 500); 12 Jul 2018 06:40:08 -0000 Mailing-List: contact commits-help@parquet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@parquet.apache.org Delivered-To: mailing list commits@parquet.apache.org Received: (qmail 40350 invoked by uid 99); 12 Jul 2018 06:40:08 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 12 Jul 2018 06:40:08 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 810378212A; Thu, 12 Jul 2018 06:40:07 +0000 (UTC) Date: Thu, 12 Jul 2018 06:40:07 +0000 To: "commits@parquet.apache.org" Subject: [parquet-cpp] branch master updated: PARQUET-1346: [C++] Protect against empty Arrow arrays with null values MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <153137760740.3427.10776447103922106510@gitbox.apache.org> From: uwe@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: parquet-cpp X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: d9c262a00f512699b64472cf58ecff7642853efc X-Git-Newrev: e6739e95b33c13184e2313df5da902802430a945 X-Git-Rev: e6739e95b33c13184e2313df5da902802430a945 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. 
uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git The following commit(s) were added to refs/heads/master by this push: new e6739e9 PARQUET-1346: [C++] Protect against empty Arrow arrays with null values e6739e9 is described below commit e6739e95b33c13184e2313df5da902802430a945 Author: Antoine Pitrou AuthorDate: Thu Jul 12 08:39:53 2018 +0200 PARQUET-1346: [C++] Protect against empty Arrow arrays with null values Author: Antoine Pitrou Closes #474 from pitrou/PARQUET-1346-null-values and squashes the following commits: 08bad23 [Antoine Pitrou] Do not ignore return value a1c0378 [Antoine Pitrou] Fix uninitialized value dfb42d2 [Antoine Pitrou] Try to fix lint b514d0d [Antoine Pitrou] Try to fix compile failures on Travis-CI (due to old Arrow?) 3951f25 [Antoine Pitrou] PARQUET-1346: [C++] Protect against empty Arrow arrays with null values --- src/parquet/arrow/arrow-reader-writer-test.cc | 15 +++++++++++++-- src/parquet/arrow/test-util.h | 27 +++++++++++++++++++++++++++ src/parquet/arrow/writer.cc | 17 ++++++++++++++--- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 6d7e1eb..e0ff7aa 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -536,6 +536,13 @@ class TestParquetIO : public ::testing::Test { *out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists); } + // Prepare table of empty lists, with null values array (ARROW-2744) + void PrepareEmptyListsTable(int64_t size, std::shared_ptr* out) { + std::shared_ptr lists; + ASSERT_OK(MakeEmptyListsArray(size, &lists)); + *out = MakeSimpleTable(lists, true /* nullable_lists */); + } + void PrepareListOfListTable(int64_t size, bool nullable_parent_lists, bool nullable_lists, bool nullable_elements, int64_t null_count, std::shared_ptr
* out) { @@ -713,6 +720,12 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); } +TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) { + std::shared_ptr<Table>
table; + ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table)); + ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); +} + TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) { std::shared_ptr<Table>
table; ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table)); @@ -1524,8 +1537,6 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks, void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type, std::shared_ptr* out_array) { - ::arrow::Int32Builder offset_builder; - std::vector length_draws; randint(num_rows, 0, 100, &length_draws); diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index 7264324..c70e0ef 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -394,6 +394,33 @@ Status MakeListArray(const std::shared_ptr& values, int64_t size, return Status::OK(); } +// Make an array containing only empty lists, with a null values array +Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { + // Allocate an offsets buffer containing only zeroes + std::shared_ptr offsets_buffer; + const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t); + RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes, + &offsets_buffer)); + memset(offsets_buffer->mutable_data(), 0, offsets_nbytes); + + auto value_field = ::arrow::field("item", ::arrow::float64(), + false /* nullable_values */); + auto list_type = ::arrow::list(value_field); + + std::vector> child_buffers = {nullptr /* null bitmap */, + nullptr /* values */ }; + auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0, + std::move(child_buffers)); + + std::vector> buffers = {nullptr /* bitmap */, + offsets_buffer }; + auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers)); + array_data->child_data.push_back(child_data); + + *out_array = ::arrow::MakeArray(array_data); + return Status::OK(); +} + static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name, const std::shared_ptr& array, bool nullable) { diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index 50b4649..f772738 100644 --- 
a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -411,8 +411,13 @@ Status ArrowColumnWriter::TypedWriteBatch(const Array& array, int64_t num_levels using ArrowCType = typename ArrowType::c_type; const auto& data = static_cast(array); - auto values = - reinterpret_cast(data.values()->data()) + data.offset(); + const ArrowCType* values = nullptr; + // The values buffer may be null if the array is empty (ARROW-2744) + if (data.values() != nullptr) { + values = reinterpret_cast(data.values()->data()) + data.offset(); + } else { + DCHECK_EQ(data.length(), 0); + } if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) { // no nulls, just dump the data @@ -706,7 +711,13 @@ Status ArrowColumnWriter::TypedWriteBatch( RETURN_NOT_OK(ctx_->GetScratchData(array.length(), &buffer)); const auto& data = static_cast(array); - auto values = reinterpret_cast(data.values()->data()); + const uint8_t* values = nullptr; + // The values buffer may be null if the array is empty (ARROW-2744) + if (data.values() != nullptr) { + values = reinterpret_cast(data.values()->data()); + } else { + DCHECK_EQ(data.length(), 0); + } int buffer_idx = 0; int64_t offset = array.offset();