arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [1/2] arrow git commit: ARROW-1199: [C++] Implement mutable POD struct for Array data
Date Tue, 11 Jul 2017 05:39:26 GMT
Repository: arrow
Updated Branches:
  refs/heads/master ad57ea8ec -> 845207118


http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/ipc/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index ea16bf0..ae46207 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -35,6 +35,7 @@
 #include "arrow/tensor.h"
 #include "arrow/type.h"
 #include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
@@ -45,12 +46,13 @@ namespace ipc {
 // ----------------------------------------------------------------------
 // Record batch read path
 
-class IpcComponentSource : public ArrayComponentSource {
+/// Accessor class for flatbuffers metadata
+class IpcComponentSource {
  public:
   IpcComponentSource(const flatbuf::RecordBatch* metadata, io::RandomAccessFile* file)
       : metadata_(metadata), file_(file) {}
 
-  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) override {
+  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
     const flatbuf::Buffer* buffer = metadata_->buffers()->Get(buffer_index);
 
     if (buffer->length() == 0) {
@@ -61,7 +63,7 @@ class IpcComponentSource : public ArrayComponentSource {
     }
   }
 
-  Status GetFieldMetadata(int field_index, FieldMetadata* field) override {
+  Status GetFieldMetadata(int field_index, internal::ArrayData* out) {
     auto nodes = metadata_->nodes();
     // pop off a field
     if (field_index >= static_cast<int>(nodes->size())) {
@@ -69,9 +71,9 @@ class IpcComponentSource : public ArrayComponentSource {
     }
     const flatbuf::FieldNode* node = nodes->Get(field_index);
 
-    field->length = node->length();
-    field->null_count = node->null_count();
-    field->offset = 0;
+    out->length = node->length();
+    out->null_count = node->null_count();
+    out->offset = 0;
     return Status::OK();
   }
 
@@ -80,26 +82,204 @@ class IpcComponentSource : public ArrayComponentSource {
   io::RandomAccessFile* file_;
 };
 
+/// Bookkeeping struct for loading array objects from their constituent pieces of raw data
+///
+/// The field_index and buffer_index are incremented in the ArrayLoader
+/// based on how much of the batch is "consumed" (through nested data
+/// reconstruction, for example)
+struct ArrayLoaderContext {
+  IpcComponentSource* source;
+  int buffer_index;
+  int field_index;
+  int max_recursion_depth;
+};
+
+static Status LoadArray(const std::shared_ptr<DataType>& type,
+    ArrayLoaderContext* context, internal::ArrayData* out);
+
+class ArrayLoader {
+ public:
+  ArrayLoader(const std::shared_ptr<DataType>& type, internal::ArrayData* out,
+      ArrayLoaderContext* context)
+      : type_(type), context_(context), out_(out) {}
+
+  Status Load() {
+    if (context_->max_recursion_depth <= 0) {
+      return Status::Invalid("Max recursion depth reached");
+    }
+
+    out_->type = type_;
+
+    RETURN_NOT_OK(VisitTypeInline(*type_, this));
+    return Status::OK();
+  }
+
+  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
+    return context_->source->GetBuffer(buffer_index, out);
+  }
+
+  Status LoadCommon() {
+    // This only contains the length and null count, which we need to figure
+    // out what to do with the buffers. For example, if null_count == 0, then
+    // we can skip that buffer without reading from shared memory
+    RETURN_NOT_OK(context_->source->GetFieldMetadata(context_->field_index++, out_));
+
+    // extract null_bitmap which is common to all arrays
+    if (out_->null_count == 0) {
+      out_->buffers[0] = nullptr;
+    } else {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index, &out_->buffers[0]));
+    }
+    context_->buffer_index++;
+    return Status::OK();
+  }
+
+  template <typename TYPE>
+  Status LoadPrimitive() {
+    out_->buffers.resize(2);
+
+    RETURN_NOT_OK(LoadCommon());
+    if (out_->length > 0) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1]));
+    } else {
+      context_->buffer_index++;
+      out_->buffers[1].reset(new Buffer(nullptr, 0));
+    }
+    return Status::OK();
+  }
+
+  template <typename TYPE>
+  Status LoadBinary() {
+    out_->buffers.resize(3);
+
+    RETURN_NOT_OK(LoadCommon());
+    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1]));
+    return GetBuffer(context_->buffer_index++, &out_->buffers[2]);
+  }
+
+  Status LoadChild(const Field& field, internal::ArrayData* out) {
+    ArrayLoader loader(field.type(), out, context_);
+    --context_->max_recursion_depth;
+    RETURN_NOT_OK(loader.Load());
+    ++context_->max_recursion_depth;
+    return Status::OK();
+  }
+
+  Status LoadChildren(std::vector<std::shared_ptr<Field>> child_fields) {
+    out_->child_data.reserve(static_cast<int>(child_fields.size()));
+
+    for (const auto& child_field : child_fields) {
+      auto field_array = std::make_shared<internal::ArrayData>();
+      RETURN_NOT_OK(LoadChild(*child_field.get(), field_array.get()));
+      out_->child_data.emplace_back(field_array);
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const NullType& type) { return Status::NotImplemented("null"); }
+
+  Status Visit(const DecimalType& type) { return Status::NotImplemented("decimal"); }
+
+  template <typename T>
+  typename std::enable_if<std::is_base_of<FixedWidthType, T>::value &&
+                              !std::is_base_of<FixedSizeBinaryType, T>::value &&
+                              !std::is_base_of<DictionaryType, T>::value,
+      Status>::type
+  Visit(const T& type) {
+    return LoadPrimitive<T>();
+  }
+
+  template <typename T>
+  typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit(
+      const T& type) {
+    return LoadBinary<T>();
+  }
+
+  Status Visit(const FixedSizeBinaryType& type) {
+    out_->buffers.resize(2);
+    RETURN_NOT_OK(LoadCommon());
+    return GetBuffer(context_->buffer_index++, &out_->buffers[1]);
+  }
+
+  Status Visit(const ListType& type) {
+    out_->buffers.resize(2);
+
+    RETURN_NOT_OK(LoadCommon());
+    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1]));
+
+    const int num_children = type.num_children();
+    if (num_children != 1) {
+      std::stringstream ss;
+      ss << "Wrong number of children: " << num_children;
+      return Status::Invalid(ss.str());
+    }
+
+    return LoadChildren(type.children());
+  }
+
+  Status Visit(const StructType& type) {
+    out_->buffers.resize(1);
+    RETURN_NOT_OK(LoadCommon());
+    return LoadChildren(type.children());
+  }
+
+  Status Visit(const UnionType& type) {
+    out_->buffers.resize(3);
+
+    RETURN_NOT_OK(LoadCommon());
+    if (out_->length > 0) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index, &out_->buffers[1]));
+      if (type.mode() == UnionMode::DENSE) {
+        RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &out_->buffers[2]));
+      }
+    }
+    context_->buffer_index += type.mode() == UnionMode::DENSE ? 2 : 1;
+    return LoadChildren(type.children());
+  }
+
+  Status Visit(const DictionaryType& type) {
+    RETURN_NOT_OK(LoadArray(type.index_type(), context_, out_));
+    out_->type = type_;
+    return Status::OK();
+  }
+
+ private:
+  const std::shared_ptr<DataType>& type_;
+  ArrayLoaderContext* context_;
+
+  // Used in visitor pattern
+  internal::ArrayData* out_;
+};
+
+static Status LoadArray(const std::shared_ptr<DataType>& type,
+    ArrayLoaderContext* context, internal::ArrayData* out) {
+  ArrayLoader loader(type, out, context);
+  return loader.Load();
+}
+
 Status ReadRecordBatch(const Message& metadata, const std::shared_ptr<Schema>& schema,
     io::RandomAccessFile* file, std::shared_ptr<RecordBatch>* out) {
   return ReadRecordBatch(metadata, schema, kMaxNestingDepth, file, out);
 }
 
+// ----------------------------------------------------------------------
+// Array loading
+
 static Status LoadRecordBatchFromSource(const std::shared_ptr<Schema>& schema,
-    int64_t num_rows, int max_recursion_depth, ArrayComponentSource* source,
+    int64_t num_rows, int max_recursion_depth, IpcComponentSource* source,
     std::shared_ptr<RecordBatch>* out) {
-  std::vector<std::shared_ptr<Array>> arrays(schema->num_fields());
-
   ArrayLoaderContext context;
   context.source = source;
   context.field_index = 0;
   context.buffer_index = 0;
   context.max_recursion_depth = max_recursion_depth;
 
+  std::vector<std::shared_ptr<internal::ArrayData>> arrays(schema->num_fields());
   for (int i = 0; i < schema->num_fields(); ++i) {
-    RETURN_NOT_OK(LoadArray(schema->field(i)->type(), &context, &arrays[i]));
-    DCHECK_EQ(num_rows, arrays[i]->length())
-        << "Array length did not match record batch length";
+    auto arr = std::make_shared<internal::ArrayData>();
+    RETURN_NOT_OK(LoadArray(schema->field(i)->type(), &context, arr.get()));
+    DCHECK_EQ(num_rows, arr->length) << "Array length did not match record batch length";
+    arrays[i] = std::move(arr);
   }
 
   *out = std::make_shared<RecordBatch>(schema, num_rows, std::move(arrays));

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 747aca0..6fdf1cc 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -86,7 +86,9 @@ static inline void CompareArraysDetailed(
 static inline void CompareBatchColumnsDetailed(
     const RecordBatch& result, const RecordBatch& expected) {
   for (int i = 0; i < expected.num_columns(); ++i) {
-    CompareArraysDetailed(i, *result.column(i), *expected.column(i));
+    auto left = result.column(i);
+    auto right = expected.column(i);
+    CompareArraysDetailed(i, *left, *right);
   }
 }
 
@@ -471,7 +473,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
   RETURN_NOT_OK(test::GetBitmapFromBoolVector(is_valid, &null_bitmap));
 
   std::shared_ptr<Array> a3 = std::make_shared<ListArray>(f3_type, length,
-      std::static_pointer_cast<PrimitiveArray>(offsets)->data(),
+      std::static_pointer_cast<PrimitiveArray>(offsets)->values(),
       std::make_shared<DictionaryArray>(f1_type, indices3), null_bitmap, 1);
 
   // Dictionary-encoded list of integer
@@ -487,7 +489,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) {
   ArrayFromVector<Int8Type, int8_t>(std::vector<bool>(3, true), list_values4, &values4);
 
   auto dict3 = std::make_shared<ListArray>(f4_value_type, 3,
-      std::static_pointer_cast<PrimitiveArray>(offsets4)->data(), values4);
+      std::static_pointer_cast<PrimitiveArray>(offsets4)->values(), values4);
 
   std::vector<int8_t> indices4_values = {0, 1, 2, 0, 1, 2};
   ArrayFromVector<Int8Type, int8_t>(is_valid, indices4_values, &indices4);

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/ipc/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 60b1f47..592bca2 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -30,7 +30,6 @@
 #include "arrow/io/memory.h"
 #include "arrow/ipc/metadata.h"
 #include "arrow/ipc/util.h"
-#include "arrow/loader.h"
 #include "arrow/memory_pool.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
@@ -233,7 +232,7 @@ class RecordBatchSerializer : public ArrayVisitor {
  protected:
   template <typename ArrayType>
   Status VisitFixedWidth(const ArrayType& array) {
-    std::shared_ptr<Buffer> data = array.data();
+    std::shared_ptr<Buffer> data = array.values();
 
     const auto& fw_type = static_cast<const FixedWidthType&>(*array.type());
     const int64_t type_width = fw_type.bit_width() / 8;
@@ -287,7 +286,7 @@ class RecordBatchSerializer : public ArrayVisitor {
   Status VisitBinary(const BinaryArray& array) {
     std::shared_ptr<Buffer> value_offsets;
     RETURN_NOT_OK(GetZeroBasedValueOffsets<BinaryArray>(array, &value_offsets));
-    auto data = array.data();
+    auto data = array.value_data();
 
     int64_t total_data_bytes = 0;
     if (value_offsets) {
@@ -309,7 +308,7 @@ class RecordBatchSerializer : public ArrayVisitor {
   Status Visit(const BooleanArray& array) override {
     std::shared_ptr<Buffer> data;
     RETURN_NOT_OK(
-        GetTruncatedBitmap(array.offset(), array.length(), array.data(), pool_, &data));
+        GetTruncatedBitmap(array.offset(), array.length(), array.values(), pool_, &data));
     buffers_.push_back(data);
     return Status::OK();
   }
@@ -367,7 +366,8 @@ class RecordBatchSerializer : public ArrayVisitor {
 
   Status Visit(const StructArray& array) override {
     --max_recursion_depth_;
-    for (std::shared_ptr<Array> field : array.fields()) {
+    for (int i = 0; i < array.num_fields(); ++i) {
+      std::shared_ptr<Array> field = array.field(i);
       if (array.offset() != 0 || array.length() < field->length()) {
         // If offset is non-zero, slice the child array
         field = field->Slice(array.offset(), array.length());
@@ -450,7 +450,9 @@ class RecordBatchSerializer : public ArrayVisitor {
         RETURN_NOT_OK(VisitArray(*child));
       }
     } else {
-      for (std::shared_ptr<Array> child : array.children()) {
+      for (int i = 0; i < array.num_fields(); ++i) {
+        std::shared_ptr<Array> child = array.child(i);
+
         // Sparse union, slicing is simpler
         if (offset != 0 || length < child->length()) {
           // If offset is non-zero, slice the child array

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/loader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/loader.cc b/cpp/src/arrow/loader.cc
deleted file mode 100644
index e4e1ba4..0000000
--- a/cpp/src/arrow/loader.cc
+++ /dev/null
@@ -1,297 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/loader.h"
-
-#include <cstdint>
-#include <memory>
-#include <sstream>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/visibility.h"
-#include "arrow/visitor_inline.h"
-
-namespace arrow {
-
-class ArrayLoader {
- public:
-  ArrayLoader(const std::shared_ptr<DataType>& type, ArrayLoaderContext* context)
-      : type_(type), context_(context) {}
-
-  Status Load(std::shared_ptr<Array>* out) {
-    if (context_->max_recursion_depth <= 0) {
-      return Status::Invalid("Max recursion depth reached");
-    }
-
-    RETURN_NOT_OK(VisitTypeInline(*type_, this));
-
-    *out = std::move(result_);
-    return Status::OK();
-  }
-
-  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
-    return context_->source->GetBuffer(buffer_index, out);
-  }
-
-  Status LoadCommon(FieldMetadata* field_meta, std::shared_ptr<Buffer>* null_bitmap) {
-    // This only contains the length and null count, which we need to figure
-    // out what to do with the buffers. For example, if null_count == 0, then
-    // we can skip that buffer without reading from shared memory
-    RETURN_NOT_OK(
-        context_->source->GetFieldMetadata(context_->field_index++, field_meta));
-
-    // extract null_bitmap which is common to all arrays
-    if (field_meta->null_count == 0) {
-      *null_bitmap = nullptr;
-    } else {
-      RETURN_NOT_OK(GetBuffer(context_->buffer_index, null_bitmap));
-    }
-    context_->buffer_index++;
-    return Status::OK();
-  }
-
-  template <typename TYPE>
-  Status LoadPrimitive() {
-    using ArrayType = typename TypeTraits<TYPE>::ArrayType;
-
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap, data;
-
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-    if (field_meta.length > 0) {
-      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &data));
-    } else {
-      context_->buffer_index++;
-      data.reset(new Buffer(nullptr, 0));
-    }
-    result_ = std::make_shared<ArrayType>(type_, field_meta.length, data, null_bitmap,
-        field_meta.null_count, field_meta.offset);
-    return Status::OK();
-  }
-
-  template <typename TYPE>
-  Status LoadBinary() {
-    using CONTAINER = typename TypeTraits<TYPE>::ArrayType;
-
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap, offsets, values;
-
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values));
-
-    result_ = std::make_shared<CONTAINER>(
-        field_meta.length, offsets, values, null_bitmap, field_meta.null_count);
-    return Status::OK();
-  }
-
-  Status LoadChild(const Field& field, std::shared_ptr<Array>* out) {
-    ArrayLoader loader(field.type(), context_);
-    --context_->max_recursion_depth;
-    RETURN_NOT_OK(loader.Load(out));
-    ++context_->max_recursion_depth;
-    return Status::OK();
-  }
-
-  Status LoadChildren(std::vector<std::shared_ptr<Field>> child_fields,
-      std::vector<std::shared_ptr<Array>>* arrays) {
-    arrays->reserve(static_cast<int>(child_fields.size()));
-
-    for (const auto& child_field : child_fields) {
-      std::shared_ptr<Array> field_array;
-      RETURN_NOT_OK(LoadChild(*child_field.get(), &field_array));
-      arrays->emplace_back(field_array);
-    }
-    return Status::OK();
-  }
-
-  Status Visit(const NullType& type) { return Status::NotImplemented("null"); }
-
-  Status Visit(const DecimalType& type) { return Status::NotImplemented("decimal"); }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<FixedWidthType, T>::value &&
-                              !std::is_base_of<FixedSizeBinaryType, T>::value &&
-                              !std::is_base_of<DictionaryType, T>::value,
-      Status>::type
-  Visit(const T& type) {
-    return LoadPrimitive<T>();
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type Visit(
-      const T& type) {
-    return LoadBinary<T>();
-  }
-
-  Status Visit(const FixedSizeBinaryType& type) {
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap, data;
-
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &data));
-
-    result_ = std::make_shared<FixedSizeBinaryArray>(
-        type_, field_meta.length, data, null_bitmap, field_meta.null_count);
-    return Status::OK();
-  }
-
-  Status Visit(const ListType& type) {
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap, offsets;
-
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
-
-    const int num_children = type.num_children();
-    if (num_children != 1) {
-      std::stringstream ss;
-      ss << "Wrong number of children: " << num_children;
-      return Status::Invalid(ss.str());
-    }
-    std::shared_ptr<Array> values_array;
-
-    RETURN_NOT_OK(LoadChild(*type.child(0).get(), &values_array));
-
-    result_ = std::make_shared<ListArray>(type_, field_meta.length, offsets, values_array,
-        null_bitmap, field_meta.null_count);
-    return Status::OK();
-  }
-
-  Status Visit(const StructType& type) {
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap;
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-
-    std::vector<std::shared_ptr<Array>> fields;
-    RETURN_NOT_OK(LoadChildren(type.children(), &fields));
-
-    result_ = std::make_shared<StructArray>(
-        type_, field_meta.length, fields, null_bitmap, field_meta.null_count);
-    return Status::OK();
-  }
-
-  Status Visit(const UnionType& type) {
-    FieldMetadata field_meta;
-    std::shared_ptr<Buffer> null_bitmap, type_ids, offsets;
-
-    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
-    if (field_meta.length > 0) {
-      RETURN_NOT_OK(GetBuffer(context_->buffer_index, &type_ids));
-      if (type.mode() == UnionMode::DENSE) {
-        RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &offsets));
-      }
-    }
-    context_->buffer_index += type.mode() == UnionMode::DENSE ? 2 : 1;
-
-    std::vector<std::shared_ptr<Array>> fields;
-    RETURN_NOT_OK(LoadChildren(type.children(), &fields));
-
-    result_ = std::make_shared<UnionArray>(type_, field_meta.length, fields, type_ids,
-        offsets, null_bitmap, field_meta.null_count);
-    return Status::OK();
-  }
-
-  Status Visit(const DictionaryType& type) {
-    std::shared_ptr<Array> indices;
-    RETURN_NOT_OK(LoadArray(type.index_type(), context_, &indices));
-    result_ = std::make_shared<DictionaryArray>(type_, indices);
-    return Status::OK();
-  }
-
-  std::shared_ptr<Array> result() const { return result_; }
-
- private:
-  const std::shared_ptr<DataType> type_;
-  ArrayLoaderContext* context_;
-
-  // Used in visitor pattern
-  std::shared_ptr<Array> result_;
-};
-
-Status LoadArray(const std::shared_ptr<DataType>& type, ArrayComponentSource* source,
-    std::shared_ptr<Array>* out) {
-  ArrayLoaderContext context;
-  context.source = source;
-  context.field_index = context.buffer_index = 0;
-  context.max_recursion_depth = kMaxNestingDepth;
-  return LoadArray(type, &context, out);
-}
-
-Status LoadArray(const std::shared_ptr<DataType>& type, ArrayLoaderContext* context,
-    std::shared_ptr<Array>* out) {
-  ArrayLoader loader(type, context);
-  RETURN_NOT_OK(loader.Load(out));
-
-  return Status::OK();
-}
-
-class InMemorySource : public ArrayComponentSource {
- public:
-  InMemorySource(const std::vector<FieldMetadata>& fields,
-      const std::vector<std::shared_ptr<Buffer>>& buffers)
-      : fields_(fields), buffers_(buffers) {}
-
-  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
-    DCHECK(buffer_index < static_cast<int>(buffers_.size()));
-    *out = buffers_[buffer_index];
-    return Status::OK();
-  }
-
-  Status GetFieldMetadata(int field_index, FieldMetadata* metadata) {
-    DCHECK(field_index < static_cast<int>(fields_.size()));
-    *metadata = fields_[field_index];
-    return Status::OK();
-  }
-
- private:
-  const std::vector<FieldMetadata>& fields_;
-  const std::vector<std::shared_ptr<Buffer>>& buffers_;
-};
-
-Status LoadArray(const std::shared_ptr<DataType>& type,
-    const std::vector<FieldMetadata>& fields,
-    const std::vector<std::shared_ptr<Buffer>>& buffers, std::shared_ptr<Array>* out) {
-  InMemorySource source(fields, buffers);
-  return LoadArray(type, &source, out);
-}
-
-Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
-    const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap,
-    int64_t null_count, int64_t offset, std::shared_ptr<Array>* out) {
-  std::vector<std::shared_ptr<Buffer>> buffers = {null_bitmap, data};
-  return MakePrimitiveArray(type, buffers, length, null_count, offset, out);
-}
-
-Status MakePrimitiveArray(const std::shared_ptr<DataType>& type,
-    const std::vector<std::shared_ptr<Buffer>>& buffers, int64_t length,
-    int64_t null_count, int64_t offset, std::shared_ptr<Array>* out) {
-  std::vector<FieldMetadata> fields(1);
-  fields[0].length = length;
-  fields[0].null_count = null_count;
-  fields[0].offset = offset;
-
-  return LoadArray(type, fields, buffers, out);
-}
-
-}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/loader.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/loader.h b/cpp/src/arrow/loader.h
deleted file mode 100644
index f5e3995..0000000
--- a/cpp/src/arrow/loader.h
+++ /dev/null
@@ -1,124 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Function for constructing Array array objects from metadata and raw memory
-// buffers
-
-#ifndef ARROW_LOADER_H
-#define ARROW_LOADER_H
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/status.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class Buffer;
-class DataType;
-
-// ARROW-109: We set this number arbitrarily to help catch user mistakes. For
-// deeply nested schemas, it is expected the user will indicate explicitly the
-// maximum allowed recursion depth
-constexpr int kMaxNestingDepth = 64;
-
-struct ARROW_EXPORT FieldMetadata {
-  FieldMetadata() {}
-  FieldMetadata(int64_t length, int64_t null_count, int64_t offset)
-      : length(length), null_count(null_count), offset(offset) {}
-
-  FieldMetadata(const FieldMetadata& other) {
-    this->length = other.length;
-    this->null_count = other.null_count;
-    this->offset = other.offset;
-  }
-
-  int64_t length;
-  int64_t null_count;
-  int64_t offset;
-};
-
-struct ARROW_EXPORT BufferMetadata {
-  BufferMetadata() {}
-  BufferMetadata(int32_t page, int64_t offset, int64_t length)
-      : page(page), offset(offset), length(length) {}
-
-  /// The shared memory page id where to find this. Set to -1 if unused
-  int32_t page;
-
-  /// The relative offset into the memory page to the starting byte of the buffer
-  int64_t offset;
-
-  /// Absolute length in bytes of the buffer
-  int64_t length;
-};
-
-/// Implement this to create new types of Arrow data loaders
-class ARROW_EXPORT ArrayComponentSource {
- public:
-  virtual ~ArrayComponentSource() = default;
-
-  virtual Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) = 0;
-  virtual Status GetFieldMetadata(int field_index, FieldMetadata* metadata) = 0;
-};
-
-/// Bookkeeping struct for loading array objects from their constituent pieces of raw data
-///
-/// The field_index and buffer_index are incremented in the ArrayLoader
-/// based on how much of the batch is "consumed" (through nested data
-/// reconstruction, for example)
-struct ArrayLoaderContext {
-  ArrayComponentSource* source;
-  int buffer_index;
-  int field_index;
-  int max_recursion_depth;
-};
-
-/// Construct an Array container from type metadata and a collection of memory
-/// buffers
-///
-/// \param[in] field the data type of the array being loaded
-/// \param[in] source an implementation of ArrayComponentSource
-/// \param[out] out the constructed array
-/// \return Status indicating success or failure
-Status ARROW_EXPORT LoadArray(const std::shared_ptr<DataType>& type,
-    ArrayComponentSource* source, std::shared_ptr<Array>* out);
-
-Status ARROW_EXPORT LoadArray(const std::shared_ptr<DataType>& field,
-    ArrayLoaderContext* context, std::shared_ptr<Array>* out);
-
-Status ARROW_EXPORT LoadArray(const std::shared_ptr<DataType>& type,
-    const std::vector<FieldMetadata>& fields,
-    const std::vector<std::shared_ptr<Buffer>>& buffers, std::shared_ptr<Array>* out);
-
-/// Create new arrays for logical types that are backed by primitive arrays.
-Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
-    int64_t length, const std::shared_ptr<Buffer>& data,
-    const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, int64_t offset,
-    std::shared_ptr<Array>* out);
-
-Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
-    const std::vector<std::shared_ptr<Buffer>>& buffers, int64_t length,
-    int64_t null_count, int64_t offset, std::shared_ptr<Array>* out);
-
-}  // namespace arrow
-
-#endif  // ARROW_LOADER_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/pretty_print.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 1f4bfa9..93f6ff0 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -26,6 +26,7 @@
 #include "arrow/table.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/logging.h"
 #include "arrow/util/string.h"
 #include "arrow/visitor_inline.h"
 
@@ -39,7 +40,7 @@ class ArrayPrinter {
   template <typename T>
   inline typename std::enable_if<IsInteger<T>::value, void>::type WriteDataValues(
       const T& array) {
-    const auto data = array.raw_data();
+    const auto data = array.raw_values();
     for (int i = 0; i < array.length(); ++i) {
       if (i > 0) { (*sink_) << ", "; }
       if (array.IsNull(i)) {
@@ -53,7 +54,7 @@ class ArrayPrinter {
   template <typename T>
   inline typename std::enable_if<IsFloatingPoint<T>::value, void>::type WriteDataValues(
       const T& array) {
-    const auto data = array.raw_data();
+    const auto data = array.raw_values();
     for (int i = 0; i < array.length(); ++i) {
       if (i > 0) { (*sink_) << ", "; }
       if (array.IsNull(i)) {
@@ -187,7 +188,12 @@ class ArrayPrinter {
 
   Status Visit(const StructArray& array) {
     RETURN_NOT_OK(WriteValidityBitmap(array));
-    return PrintChildren(array.fields(), array.offset(), array.length());
+    std::vector<std::shared_ptr<Array>> children;
+    children.reserve(array.num_fields());
+    for (int i = 0; i < array.num_fields(); ++i) {
+      children.emplace_back(array.field(i));
+    }
+    return PrintChildren(children, array.offset(), array.length());
   }
 
   Status Visit(const UnionArray& array) {
@@ -207,7 +213,12 @@ class ArrayPrinter {
     }
 
     // Print the children without any offset, because the type ids are absolute
-    return PrintChildren(array.children(), 0, array.length() + array.offset());
+    std::vector<std::shared_ptr<Array>> children;
+    children.reserve(array.num_fields());
+    for (int i = 0; i < array.num_fields(); ++i) {
+      children.emplace_back(array.child(i));
+    }
+    return PrintChildren(children, 0, array.length() + array.offset());
   }
 
   Status Visit(const DictionaryArray& array) {
@@ -286,4 +297,8 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) {
   return Status::OK();
 }
 
+Status ARROW_EXPORT DebugPrint(const Array& arr, int indent) {
+  return PrettyPrint(arr, indent, &std::cout);
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/pretty_print.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h
index f508aa0..a45c8a8 100644
--- a/cpp/src/arrow/pretty_print.h
+++ b/cpp/src/arrow/pretty_print.h
@@ -25,6 +25,7 @@
 
 namespace arrow {
 
+class Array;
 class Status;
 
 struct PrettyPrintOptions {
@@ -34,6 +35,8 @@ struct PrettyPrintOptions {
 Status ARROW_EXPORT PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink);
 Status ARROW_EXPORT PrettyPrint(const Array& arr, int indent, std::ostream* sink);
 
+Status ARROW_EXPORT DebugPrint(const Array& arr, int indent);
+
 }  // namespace arrow
 
 #endif  // ARROW_PRETTY_PRINT_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 2364f13..cdd3f58 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -34,7 +34,6 @@
 #include <vector>
 
 #include "arrow/array.h"
-#include "arrow/loader.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/type_fwd.h"
@@ -340,12 +339,10 @@ class PandasConverter {
       null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
     }
 
-    std::vector<FieldMetadata> fields(1);
-    fields[0].length = length_;
-    fields[0].null_count = null_count;
-    fields[0].offset = 0;
-
-    return LoadArray(type_, fields, {null_bitmap_, data}, &out_);
+    BufferVector buffers = {null_bitmap_, data};
+    auto array_data = std::make_shared<internal::ArrayData>(
+        type_, length_, std::move(buffers), null_count, 0);
+    return internal::MakeArray(array_data, &out_);
   }
 
   template <typename T>
@@ -617,9 +614,9 @@ Status PandasConverter::ConvertObjectStrings() {
   RETURN_NOT_OK(builder.Finish(&out_));
 
   if (have_bytes) {
-    const auto& arr = static_cast<const StringArray&>(*out_);
-    out_ = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
-        arr.null_bitmap(), arr.null_count());
+    auto binary_data = out_->data()->ShallowCopy();
+    binary_data->type = ::arrow::binary();
+    out_ = std::make_shared<BinaryArray>(binary_data);
   }
   return Status::OK();
 }
@@ -1223,7 +1220,7 @@ inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->raw_values());
     // Upcast to double, set NaN as appropriate
 
     for (int i = 0; i < arr->length(); ++i) {
@@ -1237,7 +1234,7 @@ inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_value
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->raw_values());
     memcpy(out_values, in_values, sizeof(T) * arr->length());
     out_values += arr->length();
   }
@@ -1248,7 +1245,7 @@ inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_val
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const InType*>(prim_arr->raw_values());
     for (int64_t i = 0; i < arr->length(); ++i) {
       *out_values = in_values[i];
     }
@@ -1371,14 +1368,14 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) {
   // ChunkedArray has at least one chunk
   auto arr = static_cast<const StructArray*>(data.chunk(0).get());
   // Use it to cache the struct type and number of fields for all chunks
-  auto num_fields = arr->fields().size();
+  int32_t num_fields = arr->num_fields();
   auto array_type = arr->type();
   std::vector<OwnedRef> fields_data(num_fields);
   OwnedRef dict_item;
   for (int c = 0; c < data.num_chunks(); c++) {
     auto arr = static_cast<const StructArray*>(data.chunk(c).get());
     // Convert the struct arrays first
-    for (size_t i = 0; i < num_fields; i++) {
+    for (int32_t i = 0; i < num_fields; i++) {
       PyObject* numpy_array;
       RETURN_NOT_OK(
           ConvertArrayToPandas(arr->field(static_cast<int>(i)), nullptr, &numpy_array));
@@ -1395,7 +1392,7 @@ inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) {
         // Build the new dict object for the row
         dict_item.reset(PyDict_New());
         RETURN_IF_PYERROR();
-        for (size_t field_idx = 0; field_idx < num_fields; ++field_idx) {
+        for (int32_t field_idx = 0; field_idx < num_fields; ++field_idx) {
           OwnedRef field_value;
           auto name = array_type->child(static_cast<int>(field_idx))->name();
           if (!arr->field(static_cast<int>(field_idx))->IsNull(i)) {
@@ -1475,7 +1472,7 @@ inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->raw_values());
 
     const uint8_t* valid_bits = arr->null_bitmap_data();
 
@@ -1496,7 +1493,7 @@ inline void ConvertNumericNullableCast(
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const InType*>(prim_arr->raw_values());
 
     for (int64_t i = 0; i < arr->length(); ++i) {
       *out_values++ = arr->IsNull(i) ? na_value : static_cast<OutType>(in_values[i]);
@@ -1509,7 +1506,7 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values)
   for (int c = 0; c < data.num_chunks(); c++) {
     const std::shared_ptr<Array> arr = data.chunk(c);
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const InType*>(prim_arr->raw_values());
 
     for (int64_t i = 0; i < arr->length(); ++i) {
       *out_values++ = arr->IsNull(i) ? kPandasTimestampNull
@@ -1838,7 +1835,7 @@ class CategoricalBlock : public PandasBlock {
       const std::shared_ptr<Array> arr = data.chunk(c);
       const auto& dict_arr = static_cast<const DictionaryArray&>(*arr);
       const auto& indices = static_cast<const PrimitiveArray&>(*dict_arr.indices());
-      auto in_values = reinterpret_cast<const T*>(indices.data()->data());
+      auto in_values = reinterpret_cast<const T*>(indices.raw_values());
 
       // Null is -1 in CategoricalBlock
       for (int i = 0; i < arr->length(); ++i) {
@@ -2214,7 +2211,7 @@ class ArrowDeserializer {
     typedef typename arrow_traits<TYPE>::T T;
 
     auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->raw_values());
 
     // Zero-Copy. We can pass the data pointer directly to NumPy.
     void* data = const_cast<T*>(in_values);
@@ -2290,7 +2287,7 @@ class ArrowDeserializer {
     for (int c = 0; c < data_.num_chunks(); c++) {
       const std::shared_ptr<Array> arr = data_.chunk(c);
       auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
-      auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+      auto in_values = reinterpret_cast<const T*>(prim_arr->raw_values());
 
       for (int64_t i = 0; i < arr->length(); ++i) {
         *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / kShift;

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index c110ec1..aa04243 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -146,12 +146,30 @@ void AssertBatchValid(const RecordBatch& batch) {
 
 RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
     const std::vector<std::shared_ptr<Array>>& columns)
-    : schema_(schema), num_rows_(num_rows), columns_(columns) {}
+    : schema_(schema), num_rows_(num_rows), columns_(columns.size()) {
+  for (size_t i = 0; i < columns.size(); ++i) {
+    columns_[i] = columns[i]->data();
+  }
+}
 
 RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
     std::vector<std::shared_ptr<Array>>&& columns)
+    : schema_(schema), num_rows_(num_rows), columns_(columns.size()) {
+  for (size_t i = 0; i < columns.size(); ++i) {
+    columns_[i] = columns[i]->data();
+  }
+}
+
+RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
+    std::vector<std::shared_ptr<internal::ArrayData>>&& columns)
     : schema_(schema), num_rows_(num_rows), columns_(std::move(columns)) {}
 
+std::shared_ptr<Array> RecordBatch::column(int i) const {
+  std::shared_ptr<Array> result;
+  DCHECK(MakeArray(columns_[i], &result).ok());
+  return result;
+}
+
 const std::string& RecordBatch::column_name(int i) const {
   return schema_->field(i)->name();
 }
@@ -185,30 +203,36 @@ std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset) const {
 }
 
 std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset, int64_t length) const {
-  std::vector<std::shared_ptr<Array>> arrays;
+  std::vector<std::shared_ptr<internal::ArrayData>> arrays;
   arrays.reserve(num_columns());
   for (const auto& field : columns_) {
-    arrays.emplace_back(field->Slice(offset, length));
+    int64_t col_length = std::min(field->length - offset, length);
+    int64_t col_offset = field->offset + offset;
+
+    auto new_data = std::make_shared<internal::ArrayData>(*field);
+    new_data->length = col_length;
+    new_data->offset = col_offset;
+    new_data->null_count = kUnknownNullCount;
+    arrays.emplace_back(new_data);
   }
-
   int64_t num_rows = std::min(num_rows_ - offset, length);
-  return std::make_shared<RecordBatch>(schema_, num_rows, arrays);
+  return std::make_shared<RecordBatch>(schema_, num_rows, std::move(arrays));
 }
 
 Status RecordBatch::Validate() const {
   for (int i = 0; i < num_columns(); ++i) {
-    const Array& arr = *columns_[i];
-    if (arr.length() != num_rows_) {
+    const internal::ArrayData& arr = *columns_[i];
+    if (arr.length != num_rows_) {
       std::stringstream ss;
-      ss << "Number of rows in column " << i << " did not match batch: " << arr.length()
+      ss << "Number of rows in column " << i << " did not match batch: " << arr.length
          << " vs " << num_rows_;
       return Status::Invalid(ss.str());
     }
     const auto& schema_type = *schema_->field(i)->type();
-    if (!arr.type()->Equals(schema_type)) {
+    if (!arr.type->Equals(schema_type)) {
       std::stringstream ss;
-      ss << "Column " << i << " type not match schema: " << arr.type()->ToString()
-         << " vs " << schema_type.ToString();
+      ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs "
+         << schema_type.ToString();
       return Status::Invalid(ss.str());
     }
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index 67710a8..18315f3 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -28,6 +28,12 @@
 
 namespace arrow {
 
+namespace internal {
+
+struct ArrayData;
+
+}  // namespace internal
+
 class Array;
 class Column;
 class Schema;
@@ -106,15 +112,29 @@ class ARROW_EXPORT Column {
 // corresponding sequence of equal-length Arrow arrays
 class ARROW_EXPORT RecordBatch {
  public:
-  // num_rows is a parameter to allow for record batches of a particular size not
-  // having any materialized columns. Each array should have the same length as
-  // num_rows
+  /// num_rows is a parameter to allow for record batches of a particular size not
+  /// having any materialized columns. Each array should have the same length as
+  /// num_rows
+
   RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
       const std::vector<std::shared_ptr<Array>>& columns);
 
+  /// \brief Deprecated move constructor for a vector of Array instances
   RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
       std::vector<std::shared_ptr<Array>>&& columns);
 
+  /// \brief Construct record batch from vector of internal data structures
+  ///
+  /// This class is only provided with an rvalue-reference for the input data,
+  /// and is intended for internal use, or advanced users.
+  ///
+  /// \param schema the record batch schema
+  /// \param num_rows the number of semantic rows in the record batch. This
+  /// should be equal to the length of each field
+  /// \param columns the data for the batch's columns
+  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
+      std::vector<std::shared_ptr<internal::ArrayData>>&& columns);
+
   bool Equals(const RecordBatch& other) const;
 
   bool ApproxEquals(const RecordBatch& other) const;
@@ -124,9 +144,9 @@ class ARROW_EXPORT RecordBatch {
 
   // @returns: the i-th column
   // Note: Does not boundscheck
-  std::shared_ptr<Array> column(int i) const { return columns_[i]; }
+  std::shared_ptr<Array> column(int i) const;
 
-  const std::vector<std::shared_ptr<Array>>& columns() const { return columns_; }
+  std::shared_ptr<internal::ArrayData> column_data(int i) const { return columns_[i]; }
 
   const std::string& column_name(int i) const;
 
@@ -147,7 +167,7 @@ class ARROW_EXPORT RecordBatch {
  private:
   std::shared_ptr<Schema> schema_;
   int64_t num_rows_;
-  std::vector<std::shared_ptr<Array>> columns_;
+  std::vector<std::shared_ptr<internal::ArrayData>> columns_;
 };
 
 // Immutable container of fixed-length columns conforming to a particular schema

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/python/doc/source/development.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst
index 8a70180..b5aba6c 100644
--- a/python/doc/source/development.rst
+++ b/python/doc/source/development.rst
@@ -267,6 +267,7 @@ Now, we build and install Arrow C++ libraries
          -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
          -DCMAKE_BUILD_TYPE=Release ^
          -DARROW_BUILD_TESTS=off ^
+         -DARROW_ZLIB_VENDORED=off ^
          -DARROW_PYTHON=on ..
    cmake --build . --target INSTALL --config Release
    cd ..\..
@@ -282,7 +283,6 @@ Now, we build parquet-cpp and install the result in the same place:
    cmake -G "Visual Studio 14 2015 Win64" ^
          -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
          -DCMAKE_BUILD_TYPE=Release ^
-         -DPARQUET_ZLIB_VENDORED=off ^
          -DPARQUET_BUILD_TESTS=off ..
    cmake --build . --target INSTALL --config Release
    popd

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index bf87173..ae9ff88 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1084,8 +1084,8 @@ cdef class StructValue(ArrayValue):
             CStructArray* ap
             vector[shared_ptr[CField]] child_fields = self.type.type.children()
         ap = <CStructArray*> self.sp_array.get()
-        child_arrays = ap.fields()
-        wrapped_arrays = (pyarrow_wrap_array(child) for child in child_arrays)
+        wrapped_arrays = (pyarrow_wrap_array(ap.field(i))
+                          for i in range(ap.num_fields()))
         child_names = (child.get().name() for child in child_fields)
         # Return the struct as a dict
         return {
@@ -1214,6 +1214,9 @@ cdef class Array:
         self.ap = sp_array.get()
         self.type = pyarrow_wrap_data_type(self.sp_array.get().type())
 
+    def _debug_print(self):
+        check_status(DebugPrint(deref(self.ap), 0))
+
     @staticmethod
     def from_pandas(obj, mask=None, DataType type=None,
                     timestamps_to_ms=False,

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index cc46c76..2db1dd1 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -91,12 +91,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         int64_t null_count()
         Type type_id()
 
+        int num_fields()
+
         c_bool Equals(const CArray& arr)
         c_bool IsNull(int i)
 
         shared_ptr[CArray] Slice(int64_t offset)
         shared_ptr[CArray] Slice(int64_t offset, int64_t length)
 
+    CStatus DebugPrint(const CArray& arr, int indent)
+
     cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
         int bit_width()
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/84520711/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index ac4ad82..4c51d71 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -477,6 +477,23 @@ class TestPandasConversion(unittest.TestCase):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_column_of_arrays_to_py(self):
+        # Test regression in ARROW-1199 not caught in above test
+        dtype = 'i1'
+        arr = np.array([
+            np.arange(10, dtype=dtype),
+            np.arange(5, dtype=dtype),
+            None,
+            np.arange(1, dtype=dtype)
+        ])
+        type_ = pa.list_(pa.int8())
+        parr = pa.Array.from_pandas(arr, type=type_)
+
+        assert parr[0].as_py() == list(range(10))
+        assert parr[1].as_py() == list(range(5))
+        assert parr[2].as_py() is None
+        assert parr[3].as_py() == [0]
+
     def test_column_of_lists(self):
         df, schema = dataframe_with_lists()
         self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)


Mime
View raw message