arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1187: Python: Feather: Serialize a DataFrame with None column
Date Thu, 13 Jul 2017 12:45:00 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 74bc8735b -> 85892a288


ARROW-1187: Python: Feather: Serialize a DataFrame with None column

Change-Id: Id489a4fdc203849b747f754f0b48d64a56b2ff8f

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #833 from xhochy/ARROW-1187 and squashes the following commits:

74a09a1 [Uwe L. Korn] ARROW-1187: Python: Feather: Serialize a DataFrame with None column


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/85892a28
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/85892a28
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/85892a28

Branch: refs/heads/master
Commit: 85892a28835ec61dd589a63874bf01b6d23d4655
Parents: 74bc873
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Thu Jul 13 08:44:54 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Thu Jul 13 08:44:54 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/compare.cc             |  2 ++
 cpp/src/arrow/ipc/feather-test.cc    | 19 +++++++++++
 cpp/src/arrow/ipc/feather.cc         | 54 +++++++++++++++++++++++++------
 cpp/src/arrow/ipc/test-common.h      |  9 ++++++
 python/pyarrow/tests/test_feather.py |  4 +++
 5 files changed, 79 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/85892a28/cpp/src/arrow/compare.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 23f5a19..1465e0b 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -615,6 +615,8 @@ inline Status ArrayEqualsImpl(const Array& left, const Array&
right, bool* are_e
     *are_equal = false;
   } else if (left.length() == 0) {
     *are_equal = true;
+  } else if (left.null_count() == left.length()) {
+    *are_equal = true;
   } else {
     VISITOR visitor(right);
     RETURN_NOT_OK(VisitArrayInline(left, &visitor));

http://git-wip-us.apache.org/repos/asf/arrow/blob/85892a28/cpp/src/arrow/ipc/feather-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc
index a7793f2..029aae3 100644
--- a/cpp/src/arrow/ipc/feather-test.cc
+++ b/cpp/src/arrow/ipc/feather-test.cc
@@ -386,6 +386,25 @@ TEST_F(TestTableWriter, VLenPrimitiveRoundTrip) {
   CheckBatch(*batch);
 }
 
+TEST_F(TestTableWriter, PrimitiveNullRoundTrip) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeNullRecordBatch(&batch));
+
+  for (int i = 0; i < batch->num_columns(); ++i) {
+    ASSERT_OK(writer_->Append(batch->column_name(i), *batch->column(i)));
+  }
+  Finish();
+
+  std::shared_ptr<Column> col;
+  for (int i = 0; i < batch->num_columns(); ++i) {
+    ASSERT_OK(reader_->GetColumn(i, &col));
+    ASSERT_EQ(batch->column_name(i), col->name());
+    StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+        batch->column(i)->null_bitmap(), batch->column(i)->null_count());
+    CheckArrays(str_values, *col->data()->chunk(0));
+  }
+}
+
 }  // namespace feather
 }  // namespace ipc
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/85892a28/cpp/src/arrow/ipc/feather.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 37b01c5..1bcd505 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -70,6 +70,21 @@ static Status WritePadded(io::OutputStream* stream, const uint8_t* data,
int64_t
   return Status::OK();
 }
 
+/// For compability, we need to write any data sometimes just to keep producing
+/// files that can be read with an older reader.
+static Status WritePaddedBlank(
+    io::OutputStream* stream, int64_t length, int64_t* bytes_written) {
+  const uint8_t null = 0;
+  for (int64_t i = 0; i < length; i++) {
+    RETURN_NOT_OK(stream->Write(&null, 1));
+  }
+
+  int64_t remainder = PaddedLength(length) - length;
+  if (remainder != 0) { RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); }
+  *bytes_written = length + remainder;
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // TableBuilder
 
@@ -542,8 +557,13 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
     if (values.null_count() > 0) {
       // We assume there is one bit for each value in values.nulls, aligned on a
       // byte boundary, and we write this much data into the stream
-      RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(),
-          values.null_bitmap()->size(), &bytes_written));
+      if (values.null_bitmap()) {
+        RETURN_NOT_OK(WritePadded(stream_.get(), values.null_bitmap()->data(),
+            values.null_bitmap()->size(), &bytes_written));
+      } else {
+        RETURN_NOT_OK(WritePaddedBlank(
+            stream_.get(), BitUtil::BytesForBits(values.length()), &bytes_written));
+      }
       meta->total_bytes += bytes_written;
     }
 
@@ -556,12 +576,16 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
 
       int64_t offset_bytes = sizeof(int32_t) * (values.length() + 1);
 
-      values_bytes = bin_values.raw_value_offsets()[values.length()];
+      if (bin_values.value_offsets()) {
+        values_bytes = bin_values.raw_value_offsets()[values.length()];
 
-      // Write the variable-length offsets
-      RETURN_NOT_OK(WritePadded(stream_.get(),
-          reinterpret_cast<const uint8_t*>(bin_values.raw_value_offsets()), offset_bytes,
-          &bytes_written));
+        // Write the variable-length offsets
+        RETURN_NOT_OK(WritePadded(stream_.get(),
+            reinterpret_cast<const uint8_t*>(bin_values.raw_value_offsets()),
+            offset_bytes, &bytes_written));
+      } else {
+        RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, &bytes_written));
+      }
       meta->total_bytes += bytes_written;
 
       if (bin_values.value_data()) { values_buffer = bin_values.value_data()->data();
}
@@ -578,8 +602,12 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
 
       if (prim_values.values()) { values_buffer = prim_values.values()->data(); }
     }
-    RETURN_NOT_OK(
-        WritePadded(stream_.get(), values_buffer, values_bytes, &bytes_written));
+    if (values_buffer) {
+      RETURN_NOT_OK(
+          WritePadded(stream_.get(), values_buffer, values_bytes, &bytes_written));
+    } else {
+      RETURN_NOT_OK(WritePaddedBlank(stream_.get(), values_bytes, &bytes_written));
+    }
     meta->total_bytes += bytes_written;
 
     return Status::OK();
@@ -593,6 +621,14 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
     return Status::OK();
   }
 
+  Status Visit(const NullArray& values) override {
+    // As long as R doesn't support NA, we write this as a StringColumn
+    // to ensure stable roundtrips.
+    StringArray str_values(
+        values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count());
+    return WritePrimitiveValues(str_values);
+  }
+
 #define VISIT_PRIMITIVE(TYPE) \
   Status Visit(const TYPE& values) override { return WritePrimitiveValues(values); }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/85892a28/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 6fdf1cc..a542c87 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -247,6 +247,15 @@ Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>*
out) {
   return Status::OK();
 }
 
+Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out) {
+  const int64_t length = 500;
+  auto f0 = field("f0", null());
+  std::shared_ptr<Schema> schema(new Schema({f0}));
+  std::shared_ptr<Array> a0 = std::make_shared<NullArray>(length);
+  out->reset(new RecordBatch(schema, length, {a0}));
+  return Status::OK();
+}
+
 Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
   // Make the schema
   auto f0 = field("f0", kListInt32);

http://git-wip-us.apache.org/repos/asf/arrow/blob/85892a28/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 71e4fee..91bf56b 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -294,6 +294,10 @@ class TestFeatherReader(unittest.TestCase):
         df = pd.DataFrame({'strings': [''] * 10})
         self._check_pandas_roundtrip(df)
 
+    def test_all_none(self):
+        df = pd.DataFrame({'all_none': [None] * 10})
+        self._check_pandas_roundtrip(df, null_counts=[10])
+
     def test_multithreaded_read(self):
         data = {'c{0}'.format(i): [''] * 10
                 for i in range(100)}


Mime
View raw message