arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-185: Make padding and alignment for all buffers be 64 bytes
Date Tue, 17 May 2016 23:46:44 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 6968ec01d -> 9c59158b4


ARROW-185: Make padding and alignment for all buffers be 64 bytes

+ some small cleanup/removal of unnecessary code.  I think there is likely a good opportunity
to factor this code better generally, but this seems to work for now.

Author: Micah Kornfield <emkornfield@gmail.com>

Closes #74 from emkornfield/emk_fix_allocations_PR and squashes the following commits:

e3cca14 [Micah Kornfield] fix cast style
1d006d8 [Micah Kornfield] fix warning
c140e04 [Micah Kornfield] fix lint
7543267 [Micah Kornfield] cleanup
11b3fd7 [Micah Kornfield] replace cython string conversion with string builder
05653cb [Micah Kornfield] add back in memsets because they make valgrind happy
6ff3048 [Micah Kornfield] ARROW-185: Make padding and alignment for all buffers be 64 bytes


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/9c59158b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/9c59158b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/9c59158b

Branch: refs/heads/master
Commit: 9c59158b4dc84e4de8e9271430befb840e523a4c
Parents: 6968ec0
Author: Micah Kornfield <emkornfield@gmail.com>
Authored: Tue May 17 16:46:40 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Tue May 17 16:46:40 2016 -0700

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc               | 11 +++++---
 cpp/src/arrow/ipc/adapter.cc           | 20 +++++++++++++-
 cpp/src/arrow/ipc/ipc-adapter-test.cc  |  6 ++---
 cpp/src/arrow/types/list.cc            |  2 +-
 cpp/src/arrow/types/list.h             |  3 +++
 cpp/src/arrow/types/primitive.cc       |  7 +++--
 cpp/src/arrow/util/bit-util-test.cc    | 10 +++++++
 cpp/src/arrow/util/bit-util.h          |  4 +++
 cpp/src/arrow/util/buffer.cc           | 17 ++++++++++++
 cpp/src/arrow/util/buffer.h            | 34 +++++++++++++++---------
 cpp/src/arrow/util/memory-pool-test.cc |  1 +
 cpp/src/arrow/util/memory-pool.cc      | 31 +++++++++++++++++-----
 python/src/pyarrow/adapters/pandas.cc  | 41 +++++++----------------------
 13 files changed, 124 insertions(+), 63 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 87c1219..1fba961 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -45,12 +45,14 @@ Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t
length)
 }
 
 Status ArrayBuilder::Init(int32_t capacity) {
-  capacity_ = capacity;
   int32_t to_alloc = util::ceil_byte(capacity) / 8;
   null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
   RETURN_NOT_OK(null_bitmap_->Resize(to_alloc));
+  // Buffers might allocate more then necessary to satisfy padding requirements
+  const int byte_capacity = null_bitmap_->capacity();
+  capacity_ = capacity;
   null_bitmap_data_ = null_bitmap_->mutable_data();
-  memset(null_bitmap_data_, 0, to_alloc);
+  memset(null_bitmap_data_, 0, byte_capacity);
   return Status::OK();
 }
 
@@ -60,8 +62,11 @@ Status ArrayBuilder::Resize(int32_t new_bits) {
   int32_t old_bytes = null_bitmap_->size();
   RETURN_NOT_OK(null_bitmap_->Resize(new_bytes));
   null_bitmap_data_ = null_bitmap_->mutable_data();
+  // The buffer might be overpadded to deal with padding according to the spec
+  const int32_t byte_capacity = null_bitmap_->capacity();
+  capacity_ = new_bits;
   if (old_bytes < new_bytes) {
-    memset(null_bitmap_data_ + old_bytes, 0, new_bytes - old_bytes);
+    memset(null_bitmap_data_ + old_bytes, 0, byte_capacity - old_bytes);
   }
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 3470008..45cc288 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -43,6 +43,15 @@ namespace flatbuf = apache::arrow::flatbuf;
 
 namespace ipc {
 
+namespace {
+Status CheckMultipleOf64(int64_t size) {
+  if (util::is_multiple_of_64(size)) { return Status::OK(); }
+  return Status::Invalid(
+      "Attempted to write a buffer that "
+      "wasn't a multiple of 64 bytes");
+}
+}
+
 static bool IsPrimitive(const DataType* type) {
   DCHECK(type != nullptr);
   switch (type->type) {
@@ -115,6 +124,8 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>*
field_nodes
   } else if (arr->type_enum() == Type::STRUCT) {
     // TODO(wesm)
     return Status::NotImplemented("Struct type");
+  } else {
+    return Status::NotImplemented("Unrecognized type");
   }
   return Status::OK();
 }
@@ -142,7 +153,13 @@ class RowBatchWriter {
       int64_t size = 0;
 
       // The buffer might be null if we are handling zero row lengths.
-      if (buffer) { size = buffer->size(); }
+      if (buffer) {
+        // We use capacity here, because size might not reflect the padding
+        // requirements of buffers but capacity always should.
+        size = buffer->capacity();
+        // check that padding is appropriate
+        RETURN_NOT_OK(CheckMultipleOf64(size));
+      }
       // TODO(wesm): We currently have no notion of shared memory page id's,
       // but we've included it in the metadata IDL for when we have it in the
       // future. Use page=0 for now
@@ -305,6 +322,7 @@ class RowBatchReader::Impl {
 
   Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
     BufferMetadata metadata = metadata_->buffer(buffer_index);
+    RETURN_NOT_OK(CheckMultipleOf64(metadata.length));
     return source_->ReadAt(metadata.offset, metadata.length, out);
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/ipc/ipc-adapter-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
index 3b14734..eb47ac6 100644
--- a/cpp/src/arrow/ipc/ipc-adapter-test.cc
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -197,8 +197,8 @@ INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch,
 
 void TestGetRowBatchSize(std::shared_ptr<RowBatch> batch) {
   MockMemorySource mock_source(1 << 16);
-  int64_t mock_header_location;
-  int64_t size;
+  int64_t mock_header_location = -1;
+  int64_t size = -1;
   ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location));
   ASSERT_OK(GetRowBatchSize(batch.get(), &size));
   ASSERT_EQ(mock_source.GetExtentBytesWritten(), size);
@@ -270,7 +270,7 @@ TEST_F(RecursionLimits, WriteLimit) {
 }
 
 TEST_F(RecursionLimits, ReadLimit) {
-  int64_t header_location;
+  int64_t header_location = -1;
   std::shared_ptr<Schema> schema;
   ASSERT_OK(WriteToMmap(64, true, &header_location, &schema));
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/types/list.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc
index fc33311..76e7fe5 100644
--- a/cpp/src/arrow/types/list.cc
+++ b/cpp/src/arrow/types/list.cc
@@ -47,7 +47,7 @@ bool ListArray::Equals(const std::shared_ptr<Array>& arr) const
{
 Status ListArray::Validate() const {
   if (length_ < 0) { return Status::Invalid("Length was negative"); }
   if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); }
-  if (offset_buf_->size() / sizeof(int32_t) < length_) {
+  if (offset_buf_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
     std::stringstream ss;
     ss << "offset buffer size (bytes): " << offset_buf_->size()
        << " isn't large enough for length: " << length_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index e2302d9..a020b8a 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -20,6 +20,7 @@
 
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <memory>
 
 #include "arrow/array.h"
@@ -113,12 +114,14 @@ class ListBuilder : public ArrayBuilder {
         values_(values) {}
 
   Status Init(int32_t elements) override {
+    DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
     RETURN_NOT_OK(ArrayBuilder::Init(elements));
     // one more then requested for offsets
     return offset_builder_.Resize((elements + 1) * sizeof(int32_t));
   }
 
   Status Resize(int32_t capacity) override {
+    DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
     // one more then requested for offsets
     RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t)));
     return ArrayBuilder::Resize(capacity);

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 9102c53..57a3f1e 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -76,6 +76,7 @@ Status PrimitiveBuilder<T>::Init(int32_t capacity) {
 
   int64_t nbytes = type_traits<T>::bytes_required(capacity);
   RETURN_NOT_OK(data_->Resize(nbytes));
+  // TODO(emkornfield) valgrind complains without this
   memset(data_->mutable_data(), 0, nbytes);
 
   raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
@@ -91,15 +92,13 @@ Status PrimitiveBuilder<T>::Resize(int32_t capacity) {
     RETURN_NOT_OK(Init(capacity));
   } else {
     RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
-
-    int64_t old_bytes = data_->size();
-    int64_t new_bytes = type_traits<T>::bytes_required(capacity);
+    const int64_t old_bytes = data_->size();
+    const int64_t new_bytes = type_traits<T>::bytes_required(capacity);
     RETURN_NOT_OK(data_->Resize(new_bytes));
     raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
 
     memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
   }
-  capacity_ = capacity;
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 26554d2..e1d8a08 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -21,6 +21,16 @@
 
 namespace arrow {
 
+TEST(UtilTests, TestIsMultipleOf64) {
+  using util::is_multiple_of_64;
+  EXPECT_TRUE(is_multiple_of_64(64));
+  EXPECT_TRUE(is_multiple_of_64(0));
+  EXPECT_TRUE(is_multiple_of_64(128));
+  EXPECT_TRUE(is_multiple_of_64(192));
+  EXPECT_FALSE(is_multiple_of_64(23));
+  EXPECT_FALSE(is_multiple_of_64(32));
+}
+
 TEST(UtilTests, TestNextPower2) {
   using util::next_power2;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 1f0f08c..a6c8dd9 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -71,6 +71,10 @@ static inline int64_t next_power2(int64_t n) {
   return n;
 }
 
+static inline bool is_multiple_of_64(int64_t n) {
+  return (n & 63) == 0;
+}
+
 void bytes_to_bits(const std::vector<uint8_t>& bytes, uint8_t* bits);
 Status bytes_to_bits(const std::vector<uint8_t>&, std::shared_ptr<Buffer>*);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/buffer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc
index bc9c22c..703ef83 100644
--- a/cpp/src/arrow/util/buffer.cc
+++ b/cpp/src/arrow/util/buffer.cc
@@ -18,16 +18,32 @@
 #include "arrow/util/buffer.h"
 
 #include <cstdint>
+#include <limits>
 
+#include "arrow/util/logging.h"
 #include "arrow/util/memory-pool.h"
 #include "arrow/util/status.h"
 
 namespace arrow {
 
+namespace {
+int64_t RoundUpToMultipleOf64(int64_t num) {
+  DCHECK_GE(num, 0);
+  constexpr int64_t round_to = 64;
+  constexpr int64_t force_carry_addend = round_to - 1;
+  constexpr int64_t truncate_bitmask = ~(round_to - 1);
+  constexpr int64_t max_roundable_num = std::numeric_limits<int64_t>::max() - round_to;
+  if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask;
}
+  // handle overflow case.  This should result in a malloc error upstream
+  return num;
+}
+}  // namespace
+
 Buffer::Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size)
{
   data_ = parent->data() + offset;
   size_ = size;
   parent_ = parent;
+  capacity_ = size;
 }
 
 Buffer::~Buffer() {}
@@ -48,6 +64,7 @@ PoolBuffer::~PoolBuffer() {
 Status PoolBuffer::Reserve(int64_t new_capacity) {
   if (!mutable_data_ || new_capacity > capacity_) {
     uint8_t* new_data;
+    new_capacity = RoundUpToMultipleOf64(new_capacity);
     if (mutable_data_) {
       RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data));
       memcpy(new_data, mutable_data_, size_);

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h
index 5ef0076..f845d67 100644
--- a/cpp/src/arrow/util/buffer.h
+++ b/cpp/src/arrow/util/buffer.h
@@ -36,15 +36,23 @@ class Status;
 // Buffer classes
 
 // Immutable API for a chunk of bytes which may or may not be owned by the
-// class instance
+// class instance.  Buffers have two related notions of length: size and
+// capacity.  Size is the number of bytes that might have valid data.
+// Capacity is the number of bytes that where allocated for the buffer in
+// total.
+// The following invariant is always true: Size < Capacity
 class Buffer : public std::enable_shared_from_this<Buffer> {
  public:
-  Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {}
+  Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size), capacity_(size) {}
   virtual ~Buffer();
 
   // An offset into data that is owned by another buffer, but we want to be
   // able to retain a valid pointer to it even after other shared_ptr's to the
   // parent buffer have been destroyed
+  //
+  // This method makes no assertions about alignment or padding of the buffer but
+  // in general we expected buffers to be aligned and padded to 64 bytes.  In the future
+  // we might add utility methods to help determine if a buffer satisfies this contract.
   Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size);
 
   std::shared_ptr<Buffer> get_shared_ptr() { return shared_from_this(); }
@@ -63,6 +71,7 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
                (data_ == other.data_ || !memcmp(data_, other.data_, size_)));
   }
 
+  int64_t capacity() const { return capacity_; }
   const uint8_t* data() const { return data_; }
 
   int64_t size() const { return size_; }
@@ -76,6 +85,7 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
  protected:
   const uint8_t* data_;
   int64_t size_;
+  int64_t capacity_;
 
   // nullptr by default, but may be set
   std::shared_ptr<Buffer> parent_;
@@ -105,18 +115,17 @@ class MutableBuffer : public Buffer {
 class ResizableBuffer : public MutableBuffer {
  public:
   // Change buffer reported size to indicated size, allocating memory if
-  // necessary
+  // necessary.  This will ensure that the capacity of the buffer is a multiple
+  // of 64 bytes as defined in Layout.md.
   virtual Status Resize(int64_t new_size) = 0;
 
   // Ensure that buffer has enough memory allocated to fit the indicated
-  // capacity. Does not change buffer's reported size
+  // capacity (and meets the 64 byte padding requirement in Layout.md).
+  // It does not change buffer's reported size.
   virtual Status Reserve(int64_t new_capacity) = 0;
 
  protected:
-  ResizableBuffer(uint8_t* data, int64_t size)
-      : MutableBuffer(data, size), capacity_(size) {}
-
-  int64_t capacity_;
+  ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
 };
 
 // A Buffer whose lifetime is tied to a particular MemoryPool
@@ -125,8 +134,8 @@ class PoolBuffer : public ResizableBuffer {
   explicit PoolBuffer(MemoryPool* pool = nullptr);
   virtual ~PoolBuffer();
 
-  virtual Status Resize(int64_t new_size);
-  virtual Status Reserve(int64_t new_capacity);
+  Status Resize(int64_t new_size) override;
+  Status Reserve(int64_t new_capacity) override;
 
  private:
   MemoryPool* pool_;
@@ -138,10 +147,11 @@ class BufferBuilder {
  public:
   explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {}
 
+  // Resizes the buffer to the nearest multiple of 64 bytes per Layout.md
   Status Resize(int32_t elements) {
     if (capacity_ == 0) { buffer_ = std::make_shared<PoolBuffer>(pool_); }
-    capacity_ = elements;
-    RETURN_NOT_OK(buffer_->Resize(capacity_));
+    RETURN_NOT_OK(buffer_->Resize(elements));
+    capacity_ = buffer_->capacity();
     data_ = buffer_->mutable_data();
     return Status::OK();
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/memory-pool-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc
index e4600a9..4ab9736 100644
--- a/cpp/src/arrow/util/memory-pool-test.cc
+++ b/cpp/src/arrow/util/memory-pool-test.cc
@@ -31,6 +31,7 @@ TEST(DefaultMemoryPool, MemoryTracking) {
 
   uint8_t* data;
   ASSERT_OK(pool->Allocate(100, &data));
+  EXPECT_EQ(0, reinterpret_cast<uint64_t>(data) % 64);
   ASSERT_EQ(100, pool->bytes_allocated());
 
   pool->Free(data, 100);

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/cpp/src/arrow/util/memory-pool.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc
index 961554f..0a58e5a 100644
--- a/cpp/src/arrow/util/memory-pool.cc
+++ b/cpp/src/arrow/util/memory-pool.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/util/memory-pool.h"
 
+#include <stdlib.h>
 #include <cstdlib>
 #include <mutex>
 #include <sstream>
@@ -25,6 +26,28 @@
 
 namespace arrow {
 
+namespace {
+// Allocate memory according to the alignment requirements for Arrow
+// (as of May 2016 64 bytes)
+Status AllocateAligned(int64_t size, uint8_t** out) {
+  // TODO(emkornfield) find something compatible with windows
+  constexpr size_t kAlignment = 64;
+  const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment, size);
+  if (result == ENOMEM) {
+    std::stringstream ss;
+    ss << "malloc of size " << size << " failed";
+    return Status::OutOfMemory(ss.str());
+  }
+
+  if (result == EINVAL) {
+    std::stringstream ss;
+    ss << "invalid alignment parameter: " << kAlignment;
+    return Status::Invalid(ss.str());
+  }
+  return Status::OK();
+}
+}  // namespace
+
 MemoryPool::~MemoryPool() {}
 
 class InternalMemoryPool : public MemoryPool {
@@ -45,13 +68,7 @@ class InternalMemoryPool : public MemoryPool {
 
 Status InternalMemoryPool::Allocate(int64_t size, uint8_t** out) {
   std::lock_guard<std::mutex> guard(pool_lock_);
-  *out = static_cast<uint8_t*>(std::malloc(size));
-  if (*out == nullptr) {
-    std::stringstream ss;
-    ss << "malloc of size " << size << " failed";
-    return Status::OutOfMemory(ss.str());
-  }
-
+  RETURN_NOT_OK(AllocateAligned(size, out));
   bytes_allocated_ += size;
 
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/9c59158b/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index b39fde9..5159d86 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -147,17 +147,12 @@ class ArrowSerializer {
 
   Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
     PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+    arrow::TypePtr string_type(new arrow::StringType());
+    arrow::StringBuilder string_builder(pool_, string_type);
+    RETURN_ARROW_NOT_OK(string_builder.Resize(length_));
 
-    auto offsets_buffer = std::make_shared<arrow::PoolBuffer>(pool_);
-    RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1)));
-    int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
-
-    arrow::BufferBuilder data_builder(pool_);
     arrow::Status s;
     PyObject* obj;
-    int length;
-    int offset = 0;
-    int64_t null_count = 0;
     for (int64_t i = 0; i < length_; ++i) {
       obj = objects[i];
       if (PyUnicode_Check(obj)) {
@@ -166,38 +161,20 @@ class ArrowSerializer {
           PyErr_Clear();
           return Status::TypeError("failed converting unicode to UTF8");
         }
-        length = PyBytes_GET_SIZE(obj);
-        s = data_builder.Append(
-            reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length);
+        const int32_t length = PyBytes_GET_SIZE(obj);
+        s = string_builder.Append(PyBytes_AS_STRING(obj), length);
         Py_DECREF(obj);
         if (!s.ok()) {
           return Status::ArrowError(s.ToString());
         }
-        util::set_bit(null_bitmap_data_, i);
       } else if (PyBytes_Check(obj)) {
-        length = PyBytes_GET_SIZE(obj);
-        RETURN_ARROW_NOT_OK(data_builder.Append(
-                reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length));
-        util::set_bit(null_bitmap_data_, i);
+        const int32_t length = PyBytes_GET_SIZE(obj);
+        RETURN_ARROW_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
       } else {
-        // NULL
-        // No change to offset
-        length = 0;
-        ++null_count;
+        string_builder.AppendNull();
       }
-      offsets[i] = offset;
-      offset += length;
     }
-    // End offset
-    offsets[length_] = offset;
-
-    std::shared_ptr<arrow::Buffer> data_buffer = data_builder.Finish();
-
-    auto values = std::make_shared<arrow::UInt8Array>(data_buffer->size(),
-        data_buffer);
-    *out = std::shared_ptr<arrow::Array>(
-        new arrow::StringArray(length_, offsets_buffer, values, null_count,
-            null_bitmap_));
+    *out = std::shared_ptr<arrow::Array>(string_builder.Finish());
 
     return Status::OK();
   }


Mime
View raw message