parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [parquet-cpp] branch master updated: PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
Date Thu, 12 Jul 2018 06:40:07 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new e6739e9  PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
e6739e9 is described below

commit e6739e95b33c13184e2313df5da902802430a945
Author: Antoine Pitrou <antoine@python.org>
AuthorDate: Thu Jul 12 08:39:53 2018 +0200

    PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
    
    Author: Antoine Pitrou <antoine@python.org>
    
    Closes #474 from pitrou/PARQUET-1346-null-values and squashes the following commits:
    
    08bad23 [Antoine Pitrou] Do not ignore return value
    a1c0378 [Antoine Pitrou] Fix uninitialized value
    dfb42d2 [Antoine Pitrou] Try to fix lint
    b514d0d [Antoine Pitrou] Try to fix compile failures on Travis-CI (due to old Arrow?)
    3951f25 [Antoine Pitrou] PARQUET-1346: [C++] Protect against empty Arrow arrays with null
values
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 15 +++++++++++++--
 src/parquet/arrow/test-util.h                 | 27 +++++++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   | 17 ++++++++++++++---
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 6d7e1eb..e0ff7aa 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -536,6 +536,13 @@ class TestParquetIO : public ::testing::Test {
     *out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists);
   }
 
+  // Prepare table of empty lists, with null values array (ARROW-2744)
+  void PrepareEmptyListsTable(int64_t size, std::shared_ptr<Table>* out) {
+    std::shared_ptr<Array> lists;
+    ASSERT_OK(MakeEmptyListsArray(size, &lists));
+    *out = MakeSimpleTable(lists, true /* nullable_lists */);
+  }
+
   void PrepareListOfListTable(int64_t size, bool nullable_parent_lists,
                               bool nullable_lists, bool nullable_elements,
                               int64_t null_count, std::shared_ptr<Table>* out) {
@@ -713,6 +720,12 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) {
   ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
 }
 
+TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {
+  std::shared_ptr<Table> table;
+  ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table));
+  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+}
+
 TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table));
@@ -1524,8 +1537,6 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks,
 
 void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type,
                    std::shared_ptr<Array>* out_array) {
-  ::arrow::Int32Builder offset_builder;
-
   std::vector<int32_t> length_draws;
   randint(num_rows, 0, 100, &length_draws);
 
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 7264324..c70e0ef 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -394,6 +394,33 @@ Status MakeListArray(const std::shared_ptr<Array>& values,
int64_t size,
   return Status::OK();
 }
 
+// Make an array containing only empty lists, with a null values array
+Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
+  // Allocate an offsets buffer containing only zeroes
+  std::shared_ptr<Buffer> offsets_buffer;
+  const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
+  RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes,
+                                        &offsets_buffer));
+  memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
+
+  auto value_field = ::arrow::field("item", ::arrow::float64(),
+                                    false /* nullable_values */);
+  auto list_type = ::arrow::list(value_field);
+
+  std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap
*/,
+                                                        nullptr /* values */ };
+  auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
+                                             std::move(child_buffers));
+
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
+                                                  offsets_buffer };
+  auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
+  array_data->child_data.push_back(child_data);
+
+  *out_array = ::arrow::MakeArray(array_data);
+  return Status::OK();
+}
+
 static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name,
                                                    const std::shared_ptr<Array>&
array,
                                                    bool nullable) {
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 50b4649..f772738 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -411,8 +411,13 @@ Status ArrowColumnWriter::TypedWriteBatch(const Array& array, int64_t
num_levels
   using ArrowCType = typename ArrowType::c_type;
 
   const auto& data = static_cast<const PrimitiveArray&>(array);
-  auto values =
-      reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
+  const ArrowCType* values = nullptr;
+  // The values buffer may be null if the array is empty (ARROW-2744)
+  if (data.values() != nullptr) {
+    values = reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
+  } else {
+    DCHECK_EQ(data.length(), 0);
+  }
 
   if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0))
{
     // no nulls, just dump the data
@@ -706,7 +711,13 @@ Status ArrowColumnWriter::TypedWriteBatch<BooleanType, ::arrow::BooleanType>(
   RETURN_NOT_OK(ctx_->GetScratchData<bool>(array.length(), &buffer));
 
   const auto& data = static_cast<const BooleanArray&>(array);
-  auto values = reinterpret_cast<const uint8_t*>(data.values()->data());
+  const uint8_t* values = nullptr;
+  // The values buffer may be null if the array is empty (ARROW-2744)
+  if (data.values() != nullptr) {
+    values = reinterpret_cast<const uint8_t*>(data.values()->data());
+  } else {
+    DCHECK_EQ(data.length(), 0);
+  }
 
   int buffer_idx = 0;
   int64_t offset = array.offset();


Mime
View raw message