parquet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject parquet-cpp git commit: PARQUET-759: Fix handling of columns of empty strings
Date Tue, 01 Nov 2016 01:18:01 GMT
Repository: parquet-cpp
Updated Branches:
  refs/heads/master 82515fead -> 9a0407e68


PARQUET-759: Fix handling of columns of empty strings

Depends on the changes in https://github.com/apache/arrow/pull/189

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #181 from xhochy/PARQUET-759 and squashes the following commits:

94b7054 [Uwe L. Korn] Increase Arrow hash
accd787 [Uwe L. Korn] PARQUET-759: Fix handling of columns of empty strings


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/9a0407e6
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/9a0407e6
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/9a0407e6

Branch: refs/heads/master
Commit: 9a0407e684c0a6299d0e6ab98c11c1162915c0ee
Parents: 82515fe
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Mon Oct 31 21:17:53 2016 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Oct 31 21:17:53 2016 -0400

----------------------------------------------------------------------
 src/parquet/arrow/arrow-reader-writer-test.cc | 25 ++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   | 10 +++++++--
 thirdparty/versions.sh                        |  2 +-
 3 files changed, 34 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 1f28e5c..5ec70f3 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -428,6 +428,31 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
   this->ReadAndCheckSingleColumnTable(expected_values);
 }
 
+using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
+
+TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
+  std::shared_ptr<Array> values;
+  ::arrow::StringBuilder builder(
+      ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+  for (size_t i = 0; i < SMALL_SIZE; i++) {
+    builder.Append("");
+  }
+  ASSERT_OK(builder.Finish(&values));
+  std::shared_ptr<Table> table = MakeSimpleTable(values, false);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), ::arrow::default_memory_pool(),
+      this->sink_, values->length(), default_writer_properties()));
+
+  std::shared_ptr<Table> out;
+  this->ReadTableFromFile(this->ReaderFromSink(), &out);
+  ASSERT_EQ(1, out->num_columns());
+  ASSERT_EQ(100, out->num_rows());
+
+  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+  ASSERT_EQ(1, chunked_array->num_chunks());
+  ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
+}
+
 template <typename T>
 using ParquetCDataType = typename ParquetDataType<T>::c_type;
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index e75d4b7..e4d3745 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -262,8 +262,14 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
   DCHECK((offset + length) <= data->length());
   RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(ByteArray)));
   auto buffer_ptr = reinterpret_cast<ByteArray*>(data_buffer_.mutable_data());
-  auto data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
-  DCHECK(data_ptr != nullptr);
+  // In the case of an array consisting of only empty strings or all null,
+  // data->data() points already to a nullptr, thus data->data()->data() will
+  // segfault.
+  const uint8_t* data_ptr = nullptr;
+  if (data->data()) {
+    data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
+    DCHECK(data_ptr != nullptr);
+  }
   auto writer = reinterpret_cast<TypedColumnWriter<ByteArrayType>*>(column_writer);
   if (writer->descr()->max_definition_level() > 0) {
     RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t)));

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/9a0407e6/thirdparty/versions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh
index 87fe6b6..855b6f7 100755
--- a/thirdparty/versions.sh
+++ b/thirdparty/versions.sh
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-ARROW_VERSION="676c32ccea6274c75b2750453c1ddbc5f645c037"
+ARROW_VERSION="d946e7917d55cb220becd6469ae93430f2e60764"
 ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz"
 ARROW_BASEDIR="arrow-${ARROW_VERSION}"
 


Mime
View raw message