parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables
Date Sat, 07 Oct 2017 19:55:49 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 2e0c28e5d -> 2a4fab5a2


PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables

I will fix the underlying issue in Arrow, but this change works around it so that we can
get a 1.3.1 release out soon.

Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>

Closes #407 from xhochy/PARQUET-1121 and squashes the following commits:

85223b9 [Korn, Uwe] PARQUET-1121: Handle Dictionary[Null] arrays on writing Arrow tables


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/2a4fab5a
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/2a4fab5a
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/2a4fab5a

Branch: refs/heads/master
Commit: 2a4fab5a2263b55a631c83aedea0c6b993b1b1c9
Parents: 2e0c28e
Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
Authored: Sat Oct 7 15:55:43 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sat Oct 7 15:55:43 2017 -0400

----------------------------------------------------------------------
 src/parquet/arrow/arrow-reader-writer-test.cc | 28 ++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   |  6 +++++
 2 files changed, 34 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2a4fab5a/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 4fd57ea..fc6410d 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -926,6 +926,34 @@ TEST_F(TestNullParquetIO, NullColumn) {
   internal::AssertArraysEqual(*values, *chunked_array->chunk(0));
 }
 
+TEST_F(TestNullParquetIO, NullDictionaryColumn) {  // Dictionary[Null] write/read round-trip
+  std::shared_ptr<Array> values = std::make_shared<::arrow::NullArray>(0);
+  std::shared_ptr<Array> indices =
+      std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE);
+  std::shared_ptr<::arrow::DictionaryType> dict_type =
+      std::make_shared<::arrow::DictionaryType>(::arrow::int8(), values);
+  std::shared_ptr<Array> dict_values =
+      std::make_shared<::arrow::DictionaryArray>(dict_type, indices);
+  std::shared_ptr<Table> table = MakeSimpleTable(dict_values, true);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                dict_values->length(), default_writer_properties()));
+
+  std::shared_ptr<Table> out;
+  std::unique_ptr<FileReader> reader;
+  this->ReaderFromSink(&reader);
+  this->ReadTableFromFile(std::move(reader), &out);
+  ASSERT_EQ(1, out->num_columns());
+  ASSERT_EQ(SMALL_SIZE, out->num_rows());  // one row per index slot that was written
+
+  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+  ASSERT_EQ(1, chunked_array->num_chunks());
+
+  std::shared_ptr<Array> expected_values =  // column is expected back as a plain NullArray
+      std::make_shared<::arrow::NullArray>(SMALL_SIZE);
+  AssertArraysEqual(*expected_values, *chunked_array->chunk(0));
+}
+
 template <typename T>
 using ParquetCDataType = typename ParquetDataType<T>::c_type;
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2a4fab5a/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index e834042..b53c1ca 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -819,6 +819,12 @@ Status FileWriter::Impl::WriteColumnChunk(const Array& data) {
     const ::arrow::DictionaryType& dict_type =
         static_cast<const ::arrow::DictionaryType&>(*data.type());
 
+    // TODO(ARROW-1648): Remove this special handling once we require an Arrow
+    // version that has this fixed.
+    if (dict_type.dictionary()->type()->id() == ::arrow::Type::NA) {
+      return WriteColumnChunk(::arrow::NullArray(data.length()));
+    }
+
     FunctionContext ctx(pool_);
     std::shared_ptr<Array> plain_array;
     RETURN_NOT_OK(


Mime
View raw message