parquet-commits mailing list archives

From: u..@apache.org
Subject: [parquet-cpp] branch master updated: PARQUET-1273: Properly write dictionary values when writing in chunks
Date: Wed, 18 Apr 2018 08:09:20 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new 494658a  PARQUET-1273: Properly write dictionary values when writing in chunks
494658a is described below

commit 494658a1f3f411c624dc49f191511febccd0d04c
Author: Joshua Storck <joshua.storck@twosigma.com>
AuthorDate: Wed Apr 18 10:09:14 2018 +0200

    PARQUET-1273: Properly write dictionary values when writing in chunks
    
    The error was reported here: https://issues.apache.org/jira/browse/ARROW-1938.
    
    Because dictionary types are not yet supported for writing, the code converts the dictionary
    column to its actual values before writing. However, the existing code was accidentally using
    zero as the offset and the length of the column as the size. This resulted in writing all of
    the column values for each chunk of the column that was supposed to be written.
    
    The fix is to pass the offset and size when recursively calling through to WriteColumnChunk
    with the "flattened" data.
    
    Author: Joshua Storck <joshua.storck@twosigma.com>
    
    Closes #453 from joshuastorck/ARROW_1938 and squashes the following commits:
    
    c2af50f [Joshua Storck] Remove extraneous semicolon in unit test
    23f5722 [Joshua Storck] Ran clang-format on arrow-reader-writer-test.cc
    314b159 [Joshua Storck] Removing print statements from AssertTableEqual
    f0bc71a [Joshua Storck] Fixing bug reported in https://issues.apache.org/jira/browse/ARROW-1938,
    namely preventing all of the values in a dictionary column from being written to parquet for
    each chunk created as a result of specifying row_group_size
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 63 +++++++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   |  2 +-
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index f2402df..93ecd3c 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1759,6 +1759,69 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) {
   CheckSimpleRoundtrip(table, table->num_rows());
 }
 
+TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
+  // This is a regression test for this:
+  //
+  // https://issues.apache.org/jira/browse/ARROW-1938
+  //
+  // As of the writing of this test, columns of type
+  // dictionary are written as their raw/expanded values.
+  // The regression was that the whole column was being
+  // written for each chunk.
+  using ::arrow::ArrayFromVector;
+
+  std::vector<std::string> values = {"first", "second", "third"};
+  auto type = ::arrow::utf8();
+  std::shared_ptr<Array> dict_values;
+  ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values);
+
+  auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values);
+  auto f0 = field("dictionary", dict_type);
+  std::vector<std::shared_ptr<::arrow::Field>> fields;
+  fields.emplace_back(f0);
+  auto schema = ::arrow::schema(fields);
+
+  std::shared_ptr<Array> f0_values, f1_values;
+  ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values);
+  ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values);
+  ::arrow::ArrayVector dict_arrays = {
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values),
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
+
+  std::vector<std::shared_ptr<::arrow::Column>> columns;
+  auto column = MakeColumn("column", dict_arrays, false);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1,
+                    // Just need to make sure that we make
+                    // a chunk size that is smaller than the
+                    // total number of values
+                    2, {}, &result);
+
+  std::vector<std::string> expected_values = {"first",  "second", "first", "third",
+                                              "second", "third",  "first", "second",
+                                              "first",  "third"};
+  columns.clear();
+
+  std::shared_ptr<Array> expected_array;
+  ArrayFromVector<::arrow::StringType, std::string>(expected_values, &expected_array);
+
+  // The column name gets changed on output to the name of the
+  // field, and it also turns into a nullable column
+  columns.emplace_back(MakeColumn("dictionary", expected_array, true));
+
+  fields.clear();
+  fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
+  schema = ::arrow::schema(fields);
+
+  auto expected_table = Table::Make(schema, columns);
+
+  AssertTablesEqual(*expected_table, *result, false);
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a5dae3c..14e9c6e 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -969,7 +969,7 @@ class FileWriter::Impl {
       ::arrow::compute::Datum cast_output;
       RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(), CastOptions(),
                          &cast_output));
-      return WriteColumnChunk(cast_output.chunked_array(), 0, data->length());
+      return WriteColumnChunk(cast_output.chunked_array(), offset, size);
     }
 
     ColumnWriter* column_writer;

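For context beyond the one-line change above, here is a minimal standalone sketch of the
offset/size bookkeeping the fix restores. It is plain C++ with hypothetical names (Column and
WriteChunk stand in for the dense values and for WriteColumnChunk) and does not use the
parquet-cpp API; it only illustrates that forwarding (0, length) from the dictionary-expansion
path rewrites the whole column for every row group, while forwarding the caller's
(offset, size) writes each value exactly once.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the dense values produced by casting a dictionary column.
using Column = std::vector<std::string>;

// Stand-in for WriteColumnChunk: writes one row group's worth of values
// (here it just prints them instead of encoding Parquet pages).
void WriteChunk(const Column& values, int64_t offset, int64_t size) {
  for (int64_t i = offset; i < offset + size; ++i) {
    std::cout << values[i] << ' ';
  }
  std::cout << '\n';
}

int main() {
  // Dense values of the dictionary column used in the new unit test
  // (indices {0, 1, 0, 2, 1} over {"first", "second", "third"}).
  Column values = {"first", "second", "first", "third", "second"};
  const int64_t row_group_size = 2;

  for (int64_t offset = 0; offset < static_cast<int64_t>(values.size());
       offset += row_group_size) {
    const int64_t size = std::min<int64_t>(
        row_group_size, static_cast<int64_t>(values.size()) - offset);
    // Buggy behaviour (before this commit): every chunk re-wrote the whole
    // column, i.e. WriteChunk(values, 0, values.size());
    // Fixed behaviour: forward the chunk's own window.
    WriteChunk(values, offset, size);
  }
  return 0;
}

With the fixed call the sketch prints each value once, split across row groups ("first second",
"first third", "second"); switching to the commented-out buggy call prints all five values for
each of the three chunks, which is the duplication reported in ARROW-1938.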
-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.
