arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject arrow git commit: ARROW-108: [C++] Add Union implementation and IPC/JSON serialization tests
Date Tue, 03 Jan 2017 07:27:53 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 806239fdd -> 9f7d4ae6d


ARROW-108: [C++] Add Union implementation and IPC/JSON serialization tests

Closes #206.

Still need to add test cases for JSON read/write and dense union IPC. Integration tests can happen in a subsequent PR (but the Java library does not support dense unions yet, so sparse only -- i.e. no offsets vector)

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #264 from wesm/ARROW-108 and squashes the following commits:

86c4191 [Wes McKinney] Fix valgrind error
cdfc61d [Wes McKinney] Export UnionArray
3edca1e [Wes McKinney] Implement basic JSON roundtrip for unions
30b7188 [Wes McKinney] Add test case for dense union, implement RangeEquals for it
4887fd2 [Wes McKinney] Move Windows stuff into a compatibility header, exclude from clang-format because of include order sensitivity
5ca9c57 [Wes McKinney] Implement IPC/JSON serializationf or unions. Test UnionMode::SPARSE example in IPC


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/9f7d4ae6
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/9f7d4ae6
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/9f7d4ae6

Branch: refs/heads/master
Commit: 9f7d4ae6da04d9339dfa2811d750ccf616568bc8
Parents: 806239f
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Tue Jan 3 08:27:36 2017 +0100
Committer: Uwe L. Korn <uwelk@xhochy.com>
Committed: Tue Jan 3 08:27:36 2017 +0100

----------------------------------------------------------------------
 cpp/CMakeLists.txt                       |   4 +-
 cpp/src/arrow/array-list-test.cc         |   2 +-
 cpp/src/arrow/array-primitive-test.cc    |   2 +-
 cpp/src/arrow/array-struct-test.cc       |   5 +-
 cpp/src/arrow/array-test.cc              |   6 +-
 cpp/src/arrow/array.cc                   | 120 +++++++++++++++++++++++---
 cpp/src/arrow/array.h                    |  90 ++++++++++++-------
 cpp/src/arrow/builder.h                  |   2 +-
 cpp/src/arrow/io/hdfs-internal.h         |  12 +--
 cpp/src/arrow/io/windows_compatibility.h |  36 ++++++++
 cpp/src/arrow/ipc/adapter.cc             |  56 +++++++++---
 cpp/src/arrow/ipc/ipc-adapter-test.cc    |   6 +-
 cpp/src/arrow/ipc/ipc-json-test.cc       |  18 +++-
 cpp/src/arrow/ipc/json-internal.cc       |  90 ++++++++++++++-----
 cpp/src/arrow/ipc/test-common.h          |  83 ++++++++++++++++--
 cpp/src/arrow/pretty_print.cc            |  44 +++++++---
 cpp/src/arrow/test-util.h                |  14 ++-
 cpp/src/arrow/type.cc                    |   2 +-
 cpp/src/arrow/type.h                     |   8 +-
 19 files changed, 476 insertions(+), 124 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index bf30543..13f0354 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -668,7 +668,9 @@ endif (UNIX)
 if (${CLANG_FORMAT_FOUND})
   # runs clang format and updates files in place.
   add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
-    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h |
+    sed -e '/_generated/g' |
+    sed -e '/windows_compatibility.h/g'`
     `find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`)
 
   # runs clang format and exits with a non-zero exit code if any files need to be reformatted

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array-list-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc
index 8baaf06..8e4d319 100644
--- a/cpp/src/arrow/array-list-test.cc
+++ b/cpp/src/arrow/array-list-test.cc
@@ -89,7 +89,7 @@ class TestListBuilder : public TestBuilder {
 TEST_F(TestListBuilder, Equality) {
   Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());
 
-  ArrayPtr array, equal_array, unequal_array;
+  std::shared_ptr<Array> array, equal_array, unequal_array;
   vector<int32_t> equal_offsets = {0, 1, 2, 5};
   vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2};
   vector<int32_t> unequal_offsets = {0, 1, 4};

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array-primitive-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc
index a10e240..443abac 100644
--- a/cpp/src/arrow/array-primitive-test.cc
+++ b/cpp/src/arrow/array-primitive-test.cc
@@ -318,7 +318,7 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) {
   this->RandomData(size);
   vector<T>& draws = this->draws_;
   vector<uint8_t>& valid_bytes = this->valid_bytes_;
-  ArrayPtr array, equal_array, unequal_array;
+  std::shared_ptr<Array> array, equal_array, unequal_array;
   auto builder = this->builder_.get();
   ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array));
   ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array));

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array-struct-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc
index 58386fe..5827c39 100644
--- a/cpp/src/arrow/array-struct-test.cc
+++ b/cpp/src/arrow/array-struct-test.cc
@@ -261,8 +261,9 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) {
 }
 
 TEST_F(TestStructBuilder, TestEquality) {
-  ArrayPtr array, equal_array;
-  ArrayPtr unequal_bitmap_array, unequal_offsets_array, unequal_values_array;
+  std::shared_ptr<Array> array, equal_array;
+  std::shared_ptr<Array> unequal_bitmap_array, unequal_offsets_array,
+      unequal_values_array;
 
   vector<int32_t> int_values = {1, 2, 3, 4};
   vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 783104e..a1d8fdf 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -56,7 +56,8 @@ TEST_F(TestArray, TestLength) {
   ASSERT_EQ(arr->length(), 100);
 }
 
-ArrayPtr MakeArrayFromValidBytes(const std::vector<uint8_t>& v, MemoryPool* pool) {
+std::shared_ptr<Array> MakeArrayFromValidBytes(
+    const std::vector<uint8_t>& v, MemoryPool* pool) {
   int32_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0);
   std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(v);
 
@@ -65,7 +66,8 @@ ArrayPtr MakeArrayFromValidBytes(const std::vector<uint8_t>& v, MemoryPool* pool
     value_builder.Append<int32_t>(0);
   }
 
-  ArrayPtr arr(new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf));
+  std::shared_ptr<Array> arr(
+      new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf));
   return arr;
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index d13fa1e..3d309b8 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -189,14 +189,14 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const {
   }
 }
 
-bool BooleanArray::Equals(const ArrayPtr& arr) const {
+bool BooleanArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) return true;
   if (Type::BOOL != arr->type_enum()) { return false; }
   return EqualsExact(*static_cast<const BooleanArray*>(arr.get()));
 }
 
 bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx,
-    int32_t other_start_idx, const ArrayPtr& arr) const {
+    int32_t other_start_idx, const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) { return true; }
   if (!arr) { return false; }
   if (this->type_enum() != arr->type_enum()) { return false; }
@@ -222,7 +222,7 @@ bool ListArray::EqualsExact(const ListArray& other) const {
   if (null_count_ != other.null_count_) { return false; }
 
   bool equal_offsets =
-      offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t));
+      offsets_buffer_->Equals(*other.offsets_buffer_, (length_ + 1) * sizeof(int32_t));
   if (!equal_offsets) { return false; }
   bool equal_null_bitmap = true;
   if (null_count_ > 0) {
@@ -269,10 +269,10 @@ bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st
 
 Status ListArray::Validate() const {
   if (length_ < 0) { return Status::Invalid("Length was negative"); }
-  if (!offset_buffer_) { return Status::Invalid("offset_buffer_ was null"); }
-  if (offset_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
+  if (!offsets_buffer_) { return Status::Invalid("offsets_buffer_ was null"); }
+  if (offsets_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
     std::stringstream ss;
-    ss << "offset buffer size (bytes): " << offset_buffer_->size()
+    ss << "offset buffer size (bytes): " << offsets_buffer_->size()
        << " isn't large enough for length: " << length_;
     return Status::Invalid(ss.str());
   }
@@ -337,8 +337,8 @@ BinaryArray::BinaryArray(const TypePtr& type, int32_t length,
     const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data,
     int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
     : Array(type, length, null_count, null_bitmap),
-      offset_buffer_(offsets),
-      offsets_(reinterpret_cast<const int32_t*>(offset_buffer_->data())),
+      offsets_buffer_(offsets),
+      offsets_(reinterpret_cast<const int32_t*>(offsets_buffer_->data())),
       data_buffer_(data),
       data_(nullptr) {
   if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); }
@@ -353,7 +353,7 @@ bool BinaryArray::EqualsExact(const BinaryArray& other) const {
   if (!Array::EqualsExact(other)) { return false; }
 
   bool equal_offsets =
-      offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t));
+      offsets_buffer_->Equals(*other.offsets_buffer_, (length_ + 1) * sizeof(int32_t));
   if (!equal_offsets) { return false; }
 
   if (!data_buffer_ && !(other.data_buffer_)) { return true; }
@@ -433,7 +433,7 @@ bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_
   if (this == arr.get()) { return true; }
   if (!arr) { return false; }
   if (Type::STRUCT != arr->type_enum()) { return false; }
-  const auto other = static_cast<StructArray*>(arr.get());
+  const auto& other = static_cast<const StructArray&>(*arr.get());
 
   bool equal_fields = true;
   for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
@@ -442,7 +442,7 @@ bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_
     for (size_t j = 0; j < field_arrays_.size(); ++j) {
       // TODO: really we should be comparing stretches of non-null data rather
       // than looking at one value at a time.
-      equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other->field(j));
+      equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other.field(j));
       if (!equal_fields) { return false; }
     }
   }
@@ -491,6 +491,102 @@ Status StructArray::Accept(ArrayVisitor* visitor) const {
 }
 
 // ----------------------------------------------------------------------
+// UnionArray
+
+UnionArray::UnionArray(const TypePtr& type, int32_t length,
+    const std::vector<std::shared_ptr<Array>>& children,
+    const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& offsets,
+    int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
+    : Array(type, length, null_count, null_bitmap),
+      children_(children),
+      type_ids_buffer_(type_ids),
+      offsets_buffer_(offsets) {
+  type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data());
+  if (offsets) { offsets_ = reinterpret_cast<const int32_t*>(offsets->data()); }
+}
+
+std::shared_ptr<Array> UnionArray::child(int32_t pos) const {
+  DCHECK_GT(children_.size(), 0);
+  return children_[pos];
+}
+
+bool UnionArray::Equals(const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (!this->type_->Equals(arr->type())) { return false; }
+  if (null_count_ != arr->null_count()) { return false; }
+  return RangeEquals(0, length_, 0, arr);
+}
+
+bool UnionArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+    const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (Type::UNION != arr->type_enum()) { return false; }
+  const auto& other = static_cast<const UnionArray&>(*arr.get());
+
+  const UnionMode union_mode = mode();
+  if (union_mode != other.mode()) { return false; }
+
+  // Define a mapping from the type id to child number
+  const auto& type_codes = static_cast<const UnionType&>(*arr->type().get()).type_ids;
+  uint8_t max_code = 0;
+  for (uint8_t code : type_codes) {
+    if (code > max_code) { max_code = code; }
+  }
+
+  // Store mapping in a vector for constant time lookups
+  std::vector<uint8_t> type_id_to_child_num(max_code + 1);
+  for (uint8_t i = 0; i < static_cast<uint8_t>(type_codes.size()); ++i) {
+    type_id_to_child_num[type_codes[i]] = i;
+  }
+
+  const uint8_t* this_ids = raw_type_ids();
+  const uint8_t* other_ids = other.raw_type_ids();
+
+  uint8_t id, child_num;
+  for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+    if (IsNull(i) != other.IsNull(o_i)) { return false; }
+    if (IsNull(i)) continue;
+    if (this_ids[i] != other_ids[o_i]) { return false; }
+
+    id = this_ids[i];
+    child_num = type_id_to_child_num[id];
+
+    // TODO(wesm): really we should be comparing stretches of non-null data
+    // rather than looking at one value at a time.
+    if (union_mode == UnionMode::SPARSE) {
+      if (!child(child_num)->RangeEquals(i, i + 1, o_i, other.child(child_num))) {
+        return false;
+      }
+    } else {
+      const int32_t offset = offsets_[i];
+      const int32_t o_offset = other.offsets_[i];
+      if (!child(child_num)->RangeEquals(
+              offset, offset + 1, o_offset, other.child(child_num))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+Status UnionArray::Validate() const {
+  if (length_ < 0) { return Status::Invalid("Length was negative"); }
+
+  if (null_count() > length_) {
+    return Status::Invalid("Null count exceeds the length of this struct");
+  }
+
+  DCHECK(false) << "Validate not yet implemented";
+  return Status::OK();
+}
+
+Status UnionArray::Accept(ArrayVisitor* visitor) const {
+  return visitor->Visit(*this);
+}
+
+// ----------------------------------------------------------------------
 
 #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType)                          \
   case Type::ENUM:                                                          \
@@ -499,7 +595,7 @@ Status StructArray::Accept(ArrayVisitor* visitor) const {
 
 Status MakePrimitiveArray(const TypePtr& type, int32_t length,
     const std::shared_ptr<Buffer>& data, int32_t null_count,
-    const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) {
+    const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out) {
   switch (type->type) {
     MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray);
     MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array);

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 6239ccc..cd42a28 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -108,8 +108,6 @@ class ARROW_EXPORT NullArray : public Array {
   Status Accept(ArrayVisitor* visitor) const override;
 };
 
-typedef std::shared_ptr<Array> ArrayPtr;
-
 Status ARROW_EXPORT GetEmptyBitmap(
     MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result);
 
@@ -152,7 +150,7 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
   }
 
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-      const ArrayPtr& arr) const override {
+      const std::shared_ptr<Array>& arr) const override {
     if (this == arr.get()) { return true; }
     if (!arr) { return false; }
     if (this->type_enum() != arr->type_enum()) { return false; }
@@ -256,9 +254,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
       int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   bool EqualsExact(const BooleanArray& other) const;
-  bool Equals(const ArrayPtr& arr) const override;
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-      const ArrayPtr& arr) const override;
+      const std::shared_ptr<Array>& arr) const override;
 
   Status Accept(ArrayVisitor* visitor) const override;
 
@@ -274,13 +272,13 @@ class ARROW_EXPORT ListArray : public Array {
  public:
   using TypeClass = ListType;
 
-  ListArray(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets,
-      const ArrayPtr& values, int32_t null_count = 0,
-      std::shared_ptr<Buffer> null_bitmap = nullptr)
+  ListArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets,
+      const std::shared_ptr<Array>& values, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr)
       : Array(type, length, null_count, null_bitmap) {
-    offset_buffer_ = offsets;
+    offsets_buffer_ = offsets;
     offsets_ = offsets == nullptr ? nullptr : reinterpret_cast<const int32_t*>(
-                                                  offset_buffer_->data());
+                                                  offsets_buffer_->data());
     values_ = values;
   }
 
@@ -291,9 +289,7 @@ class ARROW_EXPORT ListArray : public Array {
   // Return a shared pointer in case the requestor desires to share ownership
   // with this array.
   std::shared_ptr<Array> values() const { return values_; }
-  std::shared_ptr<Buffer> offsets() const {
-    return std::static_pointer_cast<Buffer>(offset_buffer_);
-  }
+  std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
 
   std::shared_ptr<DataType> value_type() const { return values_->type(); }
 
@@ -309,14 +305,14 @@ class ARROW_EXPORT ListArray : public Array {
   bool Equals(const std::shared_ptr<Array>& arr) const override;
 
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-      const ArrayPtr& arr) const override;
+      const std::shared_ptr<Array>& arr) const override;
 
   Status Accept(ArrayVisitor* visitor) const override;
 
  protected:
-  std::shared_ptr<Buffer> offset_buffer_;
+  std::shared_ptr<Buffer> offsets_buffer_;
   const int32_t* offsets_;
-  ArrayPtr values_;
+  std::shared_ptr<Array> values_;
 };
 
 // ----------------------------------------------------------------------
@@ -346,7 +342,7 @@ class ARROW_EXPORT BinaryArray : public Array {
   }
 
   std::shared_ptr<Buffer> data() const { return data_buffer_; }
-  std::shared_ptr<Buffer> offsets() const { return offset_buffer_; }
+  std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
 
   const int32_t* raw_offsets() const { return offsets_; }
 
@@ -359,14 +355,14 @@ class ARROW_EXPORT BinaryArray : public Array {
   bool EqualsExact(const BinaryArray& other) const;
   bool Equals(const std::shared_ptr<Array>& arr) const override;
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-      const ArrayPtr& arr) const override;
+      const std::shared_ptr<Array>& arr) const override;
 
   Status Validate() const override;
 
   Status Accept(ArrayVisitor* visitor) const override;
 
  private:
-  std::shared_ptr<Buffer> offset_buffer_;
+  std::shared_ptr<Buffer> offsets_buffer_;
   const int32_t* offsets_;
 
   std::shared_ptr<Buffer> data_buffer_;
@@ -401,8 +397,9 @@ class ARROW_EXPORT StructArray : public Array {
  public:
   using TypeClass = StructType;
 
-  StructArray(const TypePtr& type, int32_t length, std::vector<ArrayPtr>& field_arrays,
-      int32_t null_count = 0, std::shared_ptr<Buffer> null_bitmap = nullptr)
+  StructArray(const TypePtr& type, int32_t length,
+      const std::vector<std::shared_ptr<Array>>& field_arrays, int32_t null_count = 0,
+      std::shared_ptr<Buffer> null_bitmap = nullptr)
       : Array(type, length, null_count, null_bitmap) {
     type_ = type;
     field_arrays_ = field_arrays;
@@ -416,7 +413,7 @@ class ARROW_EXPORT StructArray : public Array {
   // with this array.
   std::shared_ptr<Array> field(int32_t pos) const;
 
-  const std::vector<ArrayPtr>& fields() const { return field_arrays_; }
+  const std::vector<std::shared_ptr<Array>>& fields() const { return field_arrays_; }
 
   bool EqualsExact(const StructArray& other) const;
   bool Equals(const std::shared_ptr<Array>& arr) const override;
@@ -427,25 +424,54 @@ class ARROW_EXPORT StructArray : public Array {
 
  protected:
   // The child arrays corresponding to each field of the struct data type.
-  std::vector<ArrayPtr> field_arrays_;
+  std::vector<std::shared_ptr<Array>> field_arrays_;
 };
 
 // ----------------------------------------------------------------------
 // Union
 
-class UnionArray : public Array {
+class ARROW_EXPORT UnionArray : public Array {
+ public:
+  using TypeClass = UnionType;
+
+  UnionArray(const TypePtr& type, int32_t length,
+      const std::vector<std::shared_ptr<Array>>& children,
+      const std::shared_ptr<Buffer>& type_ids,
+      const std::shared_ptr<Buffer>& offsets = nullptr, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+
+  Status Validate() const override;
+
+  virtual ~UnionArray() {}
+
+  std::shared_ptr<Buffer> type_ids() const { return type_ids_buffer_; }
+  const uint8_t* raw_type_ids() const { return type_ids_; }
+
+  std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
+  const int32_t* raw_offsets() const { return offsets_; }
+
+  UnionMode mode() const { return static_cast<const UnionType&>(*type_.get()).mode; }
+
+  std::shared_ptr<Array> child(int32_t pos) const;
+
+  const std::vector<std::shared_ptr<Array>>& children() const { return children_; }
+
+  bool EqualsExact(const UnionArray& other) const;
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const override;
+
+  Status Accept(ArrayVisitor* visitor) const override;
+
  protected:
-  // The data are types encoded as int16
-  Buffer* types_;
   std::vector<std::shared_ptr<Array>> children_;
-};
 
-class DenseUnionArray : public UnionArray {
- protected:
-  Buffer* offset_buf_;
-};
+  std::shared_ptr<Buffer> type_ids_buffer_;
+  const uint8_t* type_ids_;
 
-class SparseUnionArray : public UnionArray {};
+  std::shared_ptr<Buffer> offsets_buffer_;
+  const int32_t* offsets_;
+};
 
 // ----------------------------------------------------------------------
 // extern templates and other details

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 2051398..1837340 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -228,7 +228,7 @@ using DoubleBuilder = NumericBuilder<DoubleType>;
 
 class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
  public:
-  explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type)
+  explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean())
       : ArrayBuilder(pool, type), data_(nullptr) {}
 
   virtual ~BooleanBuilder() {}

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/io/hdfs-internal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs-internal.h b/cpp/src/arrow/io/hdfs-internal.h
index 8f9a067..01cf149 100644
--- a/cpp/src/arrow/io/hdfs-internal.h
+++ b/cpp/src/arrow/io/hdfs-internal.h
@@ -20,21 +20,11 @@
 
 #ifndef _WIN32
 #include <dlfcn.h>
-#else
-
-// Windows defines min and max macros that mess up std::min/maxa
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <winsock2.h>
-#include <windows.h>
-
-// TODO(wesm): address when/if we add windows support
-// #include <util/syserr_reporting.hpp>
 #endif
 
 #include <hdfs.h>
 
+#include "arrow/io/windows_compatibility.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/io/windows_compatibility.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/windows_compatibility.h b/cpp/src/arrow/io/windows_compatibility.h
new file mode 100644
index 0000000..ac8f6ae
--- /dev/null
+++ b/cpp/src/arrow/io/windows_compatibility.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_IO_WINDOWS_COMPATIBILITY
+#define ARROW_IO_WINDOWS_COMPATIBILITY
+
+#ifdef _WIN32
+
+// Windows defines min and max macros that mess up std::min/max
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+#include <winsock2.h>
+#include <windows.h>
+
+// TODO(wesm): address when/if we add windows support
+// #include <util/syserr_reporting.hpp>
+
+#endif  // _WIN32
+
+#endif  // ARROW_IO_WINDOWS_COMPATIBILITY

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index ac4054b..9bfd11f 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -276,7 +276,16 @@ class RecordBatchWriter : public ArrayVisitor {
   }
 
   Status Visit(const UnionArray& array) override {
-    return Status::NotImplemented("union");
+    buffers_.push_back(array.type_ids());
+
+    if (array.mode() == UnionMode::DENSE) { buffers_.push_back(array.offsets()); }
+
+    --max_recursion_depth_;
+    for (const auto& field : array.children()) {
+      RETURN_NOT_OK(VisitArray(*field.get()));
+    }
+    ++max_recursion_depth_;
+    return Status::OK();
   }
 
   // Do not copy this vector. Ownership must be retained elsewhere
@@ -464,9 +473,10 @@ class ArrayLoader : public TypeVisitor {
   Status Visit(const ListType& type) override {
     FieldMetadata field_meta;
     std::shared_ptr<Buffer> null_bitmap;
-    std::shared_ptr<Buffer> offsets;
 
     RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
+
+    std::shared_ptr<Buffer> offsets;
     RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
 
     const int num_children = type.num_children();
@@ -484,20 +494,25 @@ class ArrayLoader : public TypeVisitor {
     return Status::OK();
   }
 
+  Status LoadChildren(std::vector<std::shared_ptr<Field>> child_fields,
+      std::vector<std::shared_ptr<Array>>* arrays) {
+    arrays->reserve(static_cast<int>(child_fields.size()));
+
+    for (const auto& child_field : child_fields) {
+      std::shared_ptr<Array> field_array;
+      RETURN_NOT_OK(LoadChild(*child_field.get(), &field_array));
+      arrays->emplace_back(field_array);
+    }
+    return Status::OK();
+  }
+
   Status Visit(const StructType& type) override {
     FieldMetadata field_meta;
     std::shared_ptr<Buffer> null_bitmap;
     RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
 
-    const int num_children = type.num_children();
-    std::vector<ArrayPtr> fields;
-    fields.reserve(num_children);
-
-    for (int child_idx = 0; child_idx < num_children; ++child_idx) {
-      std::shared_ptr<Array> field_array;
-      RETURN_NOT_OK(LoadChild(*type.child(child_idx).get(), &field_array));
-      fields.emplace_back(field_array);
-    }
+    std::vector<std::shared_ptr<Array>> fields;
+    RETURN_NOT_OK(LoadChildren(type.children(), &fields));
 
     result_ = std::make_shared<StructArray>(
         field_.type, field_meta.length, fields, field_meta.null_count, null_bitmap);
@@ -505,7 +520,24 @@ class ArrayLoader : public TypeVisitor {
   }
 
   Status Visit(const UnionType& type) override {
-    return Status::NotImplemented(type.ToString());
+    FieldMetadata field_meta;
+    std::shared_ptr<Buffer> null_bitmap;
+    RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
+
+    std::shared_ptr<Buffer> type_ids;
+    std::shared_ptr<Buffer> offsets = nullptr;
+    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &type_ids));
+
+    if (type.mode == UnionMode::DENSE) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
+    }
+
+    std::vector<std::shared_ptr<Array>> fields;
+    RETURN_NOT_OK(LoadChildren(type.children(), &fields));
+
+    result_ = std::make_shared<UnionArray>(field_.type, field_meta.length, fields,
+        type_ids, offsets, field_meta.null_count, null_bitmap);
+    return Status::OK();
   }
 };
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/ipc/ipc-adapter-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
index f309b85..6ba0a6e 100644
--- a/cpp/src/arrow/ipc/ipc-adapter-test.cc
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -95,7 +95,7 @@ TEST_P(TestWriteRecordBatch, RoundTrip) {
 INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRecordBatch,
     ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch,
                             &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList,
-                            &MakeStringTypesRecordBatch, &MakeStruct));
+                            &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion));
 
 void TestGetRecordBatchSize(std::shared_ptr<RecordBatch> batch) {
   ipc::MockOutputStream mock;
@@ -136,7 +136,7 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture {
       int64_t* body_length, std::shared_ptr<Schema>* schema) {
     const int batch_length = 5;
     TypePtr type = int32();
-    ArrayPtr array;
+    std::shared_ptr<Array> array;
     const bool include_nulls = true;
     RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool_, &array));
     for (int i = 0; i < recursion_level; ++i) {
@@ -149,7 +149,7 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture {
 
     *schema = std::shared_ptr<Schema>(new Schema({f0}));
 
-    std::vector<ArrayPtr> arrays = {array};
+    std::vector<std::shared_ptr<Array>> arrays = {array};
     auto batch = std::make_shared<RecordBatch>(*schema, batch_length, arrays);
 
     std::string path = "test-write-past-max-recursion";

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
index f793a26..0750989 100644
--- a/cpp/src/arrow/ipc/ipc-json-test.cc
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -29,6 +29,7 @@
 #include "arrow/builder.h"
 #include "arrow/ipc/json-internal.h"
 #include "arrow/ipc/json.h"
+#include "arrow/ipc/test-common.h"
 #include "arrow/memory_pool.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
@@ -142,11 +143,16 @@ TEST(TestJsonArrayWriter, NestedTypes) {
   auto value_type = int32();
 
   std::vector<bool> values_is_valid = {true, false, true, true, false, true, true};
-  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
 
+  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
   std::shared_ptr<Array> values_array;
   ArrayFromVector<Int32Type, int32_t>(int32(), values_is_valid, values, &values_array);
 
+  std::vector<int16_t> i16_values = {0, 1, 2, 3, 4, 5, 6};
+  std::shared_ptr<Array> i16_values_array;
+  ArrayFromVector<Int16Type, int16_t>(
+      int16(), values_is_valid, i16_values, &i16_values_array);
+
   // List
   std::vector<bool> list_is_valid = {true, false, true, true, true};
   std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
@@ -173,6 +179,16 @@ TEST(TestJsonArrayWriter, NestedTypes) {
   TestArrayRoundTrip(struct_array);
 }
 
+TEST(TestJsonArrayWriter, Unions) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeUnion(&batch));
+
+  for (int i = 0; i < batch->num_columns(); ++i) {
+    std::shared_ptr<Array> col = batch->column(i);
+    TestArrayRoundTrip(*col.get());
+  }
+}
+
 // Data generation for test case below
 void MakeBatchArrays(const std::shared_ptr<Schema>& schema, const int num_rows,
     std::vector<std::shared_ptr<Array>>* arrays) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index db11b7d..4f980d3 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -415,11 +415,11 @@ class JsonArrayWriter : public ArrayVisitor {
   }
 
   template <typename T>
-  void WriteOffsetsField(const T* offsets, int32_t length) {
-    writer_->Key("OFFSET");
+  void WriteIntegerField(const char* name, const T* values, int32_t length) {
+    writer_->Key(name);
     writer_->StartArray();
     for (int i = 0; i < length; ++i) {
-      writer_->Int64(offsets[i]);
+      writer_->Int64(values[i]);
     }
     writer_->EndArray();
   }
@@ -456,7 +456,7 @@ class JsonArrayWriter : public ArrayVisitor {
   template <typename T>
   Status WriteVarBytes(const T& array) {
     WriteValidityField(array);
-    WriteOffsetsField(array.raw_offsets(), array.length() + 1);
+    WriteIntegerField("OFFSET", array.raw_offsets(), array.length() + 1);
     WriteDataField(array);
     SetNoChildren();
     return Status::OK();
@@ -524,7 +524,7 @@ class JsonArrayWriter : public ArrayVisitor {
 
   Status Visit(const ListArray& array) override {
     WriteValidityField(array);
-    WriteOffsetsField(array.raw_offsets(), array.length() + 1);
+    WriteIntegerField("OFFSET", array.raw_offsets(), array.length() + 1);
     auto type = static_cast<const ListType*>(array.type().get());
     return WriteChildren(type->children(), {array.values()});
   }
@@ -536,7 +536,14 @@ class JsonArrayWriter : public ArrayVisitor {
   }
 
   Status Visit(const UnionArray& array) override {
-    return Status::NotImplemented("union");
+    WriteValidityField(array);
+    auto type = static_cast<const UnionType*>(array.type().get());
+
+    WriteIntegerField("TYPE_ID", array.raw_type_ids(), array.length());
+    if (type->mode == UnionMode::DENSE) {
+      WriteIntegerField("OFFSET", array.raw_offsets(), array.length());
+    }
+    return WriteChildren(type->children(), array.children());
   }
 
  private:
@@ -848,26 +855,34 @@ class JsonArrayReader {
   }
 
   template <typename T>
+  Status GetIntArray(
+      const RjArray& json_array, const int32_t length, std::shared_ptr<Buffer>* out) {
+    auto buffer = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(buffer->Resize(length * sizeof(T)));
+    T* values = reinterpret_cast<T*>(buffer->mutable_data());
+    for (int i = 0; i < length; ++i) {
+      const rj::Value& val = json_array[i];
+      DCHECK(val.IsInt());
+      values[i] = static_cast<T>(val.GetInt());
+    }
+
+    *out = buffer;
+    return Status::OK();
+  }
+
+  template <typename T>
   typename std::enable_if<std::is_base_of<ListType, T>::value, Status>::type ReadArray(
       const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
       const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
-    const auto& json_offsets = json_array.FindMember("OFFSET");
-    RETURN_NOT_ARRAY("OFFSET", json_offsets, json_array);
-    const auto& json_offsets_arr = json_offsets->value.GetArray();
-
     int32_t null_count = 0;
     std::shared_ptr<Buffer> validity_buffer;
     RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer));
 
-    auto offsets_buffer = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(offsets_buffer->Resize((length + 1) * sizeof(int32_t)));
-    int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
-
-    for (int i = 0; i < length + 1; ++i) {
-      const rj::Value& val = json_offsets_arr[i];
-      DCHECK(val.IsInt());
-      offsets[i] = val.GetInt();
-    }
+    const auto& json_offsets = json_array.FindMember("OFFSET");
+    RETURN_NOT_ARRAY("OFFSET", json_offsets, json_array);
+    std::shared_ptr<Buffer> offsets_buffer;
+    RETURN_NOT_OK(GetIntArray<int32_t>(
+        json_offsets->value.GetArray(), length + 1, &offsets_buffer));
 
     std::vector<std::shared_ptr<Array>> children;
     RETURN_NOT_OK(GetChildren(json_array, type, &children));
@@ -897,6 +912,41 @@ class JsonArrayReader {
   }
 
   template <typename T>
+  typename std::enable_if<std::is_base_of<UnionType, T>::value, Status>::type ReadArray(
+      const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+      const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+    int32_t null_count = 0;
+
+    const auto& union_type = static_cast<const UnionType&>(*type.get());
+
+    std::shared_ptr<Buffer> validity_buffer;
+    std::shared_ptr<Buffer> type_id_buffer;
+    std::shared_ptr<Buffer> offsets_buffer;
+
+    RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer));
+
+    const auto& json_type_ids = json_array.FindMember("TYPE_ID");
+    RETURN_NOT_ARRAY("TYPE_ID", json_type_ids, json_array);
+    RETURN_NOT_OK(
+        GetIntArray<uint8_t>(json_type_ids->value.GetArray(), length, &type_id_buffer));
+
+    if (union_type.mode == UnionMode::DENSE) {
+      const auto& json_offsets = json_array.FindMember("OFFSET");
+      RETURN_NOT_ARRAY("OFFSET", json_offsets, json_array);
+      RETURN_NOT_OK(
+          GetIntArray<int32_t>(json_offsets->value.GetArray(), length, &offsets_buffer));
+    }
+
+    std::vector<std::shared_ptr<Array>> children;
+    RETURN_NOT_OK(GetChildren(json_array, type, &children));
+
+    *array = std::make_shared<UnionArray>(type, length, children, type_id_buffer,
+        offsets_buffer, null_count, validity_buffer);
+
+    return Status::OK();
+  }
+
+  template <typename T>
   typename std::enable_if<std::is_base_of<NullType, T>::value, Status>::type ReadArray(
       const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
       const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
@@ -992,7 +1042,7 @@ class JsonArrayReader {
       NOT_IMPLEMENTED_CASE(INTERVAL);
       TYPE_CASE(ListType);
       TYPE_CASE(StructType);
-      NOT_IMPLEMENTED_CASE(UNION);
+      TYPE_CASE(UnionType);
       default:
         std::stringstream ss;
         ss << type->ToString();

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 8416f0d..3faeebf 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -110,7 +110,7 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) {
 
 template <class Builder, class RawType>
 Status MakeRandomBinaryArray(
-    const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) {
+    const TypePtr& type, int32_t length, MemoryPool* pool, std::shared_ptr<Array>* out) {
   const std::vector<std::string> values = {
       "", "", "abc", "123", "efg", "456!@#!@#", "12312"};
   Builder builder(pool, type);
@@ -225,7 +225,7 @@ Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out) {
   TypePtr type = int32();
 
   MemoryPool* pool = default_memory_pool();
-  ArrayPtr array;
+  std::shared_ptr<Array> array;
   const bool include_nulls = true;
   RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array));
   for (int i = 0; i < 63; ++i) {
@@ -235,7 +235,7 @@ Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out) {
 
   auto f0 = std::make_shared<Field>("f0", type);
   std::shared_ptr<Schema> schema(new Schema({f0}));
-  std::vector<ArrayPtr> arrays = {array};
+  std::vector<std::shared_ptr<Array>> arrays = {array};
   out->reset(new RecordBatch(schema, batch_length, arrays));
   return Status::OK();
 }
@@ -244,7 +244,7 @@ Status MakeStruct(std::shared_ptr<RecordBatch>* out) {
   // reuse constructed list columns
   std::shared_ptr<RecordBatch> list_batch;
   RETURN_NOT_OK(MakeListRecordBatch(&list_batch));
-  std::vector<ArrayPtr> columns = {
+  std::vector<std::shared_ptr<Array>> columns = {
       list_batch->column(0), list_batch->column(1), list_batch->column(2)};
   auto list_schema = list_batch->schema();
 
@@ -256,20 +256,89 @@ Status MakeStruct(std::shared_ptr<RecordBatch>* out) {
   std::shared_ptr<Schema> schema(new Schema({f0, f1}));
 
   // construct individual nullable/non-nullable struct arrays
-  ArrayPtr no_nulls(new StructArray(type, list_batch->num_rows(), columns));
+  std::shared_ptr<Array> no_nulls(new StructArray(type, list_batch->num_rows(), columns));
   std::vector<uint8_t> null_bytes(list_batch->num_rows(), 1);
   null_bytes[0] = 0;
   std::shared_ptr<Buffer> null_bitmask;
   RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, &null_bitmask));
-  ArrayPtr with_nulls(
+  std::shared_ptr<Array> with_nulls(
       new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask));
 
   // construct batch
-  std::vector<ArrayPtr> arrays = {no_nulls, with_nulls};
+  std::vector<std::shared_ptr<Array>> arrays = {no_nulls, with_nulls};
   out->reset(new RecordBatch(schema, list_batch->num_rows(), arrays));
   return Status::OK();
 }
 
+Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
+  // Define schema
+  std::vector<std::shared_ptr<Field>> union_types(
+      {std::make_shared<Field>("u0", int32()), std::make_shared<Field>("u1", uint8())});
+
+  std::vector<uint8_t> type_codes = {5, 10};
+  auto sparse_type =
+      std::make_shared<UnionType>(union_types, type_codes, UnionMode::SPARSE);
+
+  auto dense_type =
+      std::make_shared<UnionType>(union_types, type_codes, UnionMode::DENSE);
+
+  auto f0 = std::make_shared<Field>("sparse_nonnull", sparse_type, false);
+  auto f1 = std::make_shared<Field>("sparse", sparse_type);
+  auto f2 = std::make_shared<Field>("dense", dense_type);
+
+  std::shared_ptr<Schema> schema(new Schema({f0, f1, f2}));
+
+  // Create data
+  std::vector<std::shared_ptr<Array>> sparse_children(2);
+  std::vector<std::shared_ptr<Array>> dense_children(2);
+
+  const int32_t length = 7;
+
+  std::shared_ptr<Buffer> type_ids_buffer;
+  std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5};
+  RETURN_NOT_OK(test::CopyBufferFromVector(type_ids, &type_ids_buffer));
+
+  std::vector<int32_t> u0_values = {0, 1, 2, 3, 4, 5, 6};
+  ArrayFromVector<Int32Type, int32_t>(
+      sparse_type->child(0)->type, u0_values, &sparse_children[0]);
+
+  std::vector<uint8_t> u1_values = {10, 11, 12, 13, 14, 15, 16};
+  ArrayFromVector<UInt8Type, uint8_t>(
+      sparse_type->child(1)->type, u1_values, &sparse_children[1]);
+
+  // dense children
+  u0_values = {0, 2, 3, 7};
+  ArrayFromVector<Int32Type, int32_t>(
+      dense_type->child(0)->type, u0_values, &dense_children[0]);
+
+  u1_values = {11, 14, 15};
+  ArrayFromVector<UInt8Type, uint8_t>(
+      dense_type->child(1)->type, u1_values, &dense_children[1]);
+
+  std::shared_ptr<Buffer> offsets_buffer;
+  std::vector<int32_t> offsets = {0, 0, 1, 2, 1, 2, 3};
+  RETURN_NOT_OK(test::CopyBufferFromVector(offsets, &offsets_buffer));
+
+  std::vector<uint8_t> null_bytes(length, 1);
+  null_bytes[2] = 0;
+  std::shared_ptr<Buffer> null_bitmask;
+  RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, &null_bitmask));
+
+  // construct individual nullable/non-nullable struct arrays
+  auto sparse_no_nulls =
+      std::make_shared<UnionArray>(sparse_type, length, sparse_children, type_ids_buffer);
+  auto sparse = std::make_shared<UnionArray>(
+      sparse_type, length, sparse_children, type_ids_buffer, nullptr, 1, null_bitmask);
+
+  auto dense = std::make_shared<UnionArray>(dense_type, length, dense_children,
+      type_ids_buffer, offsets_buffer, 1, null_bitmask);
+
+  // construct batch
+  std::vector<std::shared_ptr<Array>> arrays = {sparse_no_nulls, sparse, dense};
+  out->reset(new RecordBatch(schema, length, arrays));
+  return Status::OK();
+}
+
 }  // namespace ipc
 }  // namespace arrow
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/pretty_print.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 9c439c4..324f81b 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -161,44 +161,60 @@ class ArrayPrinter : public ArrayVisitor {
     return Status::NotImplemented("decimal");
   }
 
-  Status Visit(const ListArray& array) override {
+  Status WriteValidityBitmap(const Array& array) {
     Newline();
     Write("-- is_valid: ");
     BooleanArray is_valid(array.length(), array.null_bitmap());
-    PrettyPrint(is_valid, indent_ + 2, sink_);
+    return PrettyPrint(is_valid, indent_ + 2, sink_);
+  }
+
+  Status Visit(const ListArray& array) override {
+    RETURN_NOT_OK(WriteValidityBitmap(array));
 
     Newline();
     Write("-- offsets: ");
     Int32Array offsets(array.length() + 1, array.offsets());
-    PrettyPrint(offsets, indent_ + 2, sink_);
+    RETURN_NOT_OK(PrettyPrint(offsets, indent_ + 2, sink_));
 
     Newline();
     Write("-- values: ");
-    PrettyPrint(*array.values().get(), indent_ + 2, sink_);
+    RETURN_NOT_OK(PrettyPrint(*array.values().get(), indent_ + 2, sink_));
 
     return Status::OK();
   }
 
-  Status Visit(const StructArray& array) override {
-    Newline();
-    Write("-- is_valid: ");
-    BooleanArray is_valid(array.length(), array.null_bitmap());
-    PrettyPrint(is_valid, indent_ + 2, sink_);
-
-    const std::vector<std::shared_ptr<Array>>& fields = array.fields();
+  Status PrintChildren(const std::vector<std::shared_ptr<Array>>& fields) {
     for (size_t i = 0; i < fields.size(); ++i) {
       Newline();
       std::stringstream ss;
       ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << " values: ";
       Write(ss.str());
-      PrettyPrint(*fields[i].get(), indent_ + 2, sink_);
+      RETURN_NOT_OK(PrettyPrint(*fields[i].get(), indent_ + 2, sink_));
     }
-
     return Status::OK();
   }
 
+  Status Visit(const StructArray& array) override {
+    RETURN_NOT_OK(WriteValidityBitmap(array));
+    return PrintChildren(array.fields());
+  }
+
   Status Visit(const UnionArray& array) override {
-    return Status::NotImplemented("union");
+    RETURN_NOT_OK(WriteValidityBitmap(array));
+
+    Newline();
+    Write("-- type_ids: ");
+    UInt8Array type_ids(array.length(), array.type_ids());
+    RETURN_NOT_OK(PrettyPrint(type_ids, indent_ + 2, sink_));
+
+    if (array.mode() == UnionMode::DENSE) {
+      Newline();
+      Write("-- offsets: ");
+      Int32Array offsets(array.length(), array.offsets());
+      RETURN_NOT_OK(PrettyPrint(offsets, indent_ + 2, sink_));
+    }
+
+    return PrintChildren(array.children());
   }
 
   void Write(const char* data) { (*sink_) << data; }

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index aa310b1..ce9327d 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -274,6 +274,18 @@ void ArrayFromVector(const std::shared_ptr<DataType>& type,
       values_buffer, null_count, values_bitmap);
 }
 
+template <typename TYPE, typename C_TYPE>
+void ArrayFromVector(const std::shared_ptr<DataType>& type,
+    const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) {
+  std::shared_ptr<Buffer> values_buffer;
+
+  ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer));
+
+  using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+  *out = std::make_shared<ArrayType>(
+      type, static_cast<int32_t>(values.size()), values_buffer);
+}
+
 class TestBuilder : public ::testing::Test {
  public:
   void SetUp() {
@@ -293,7 +305,7 @@ class TestBuilder : public ::testing::Test {
 
 template <class T, class Builder>
 Status MakeArray(const std::vector<uint8_t>& valid_bytes, const std::vector<T>& values,
-    int size, Builder* builder, ArrayPtr* out) {
+    int size, Builder* builder, std::shared_ptr<Array>* out) {
   // Append the first 1000
   for (int i = 0; i < size; ++i) {
     if (valid_bytes[i] > 0) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 8ff9eea..89faab6 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -103,7 +103,7 @@ std::string UnionType::ToString() const {
 
   for (size_t i = 0; i < children_.size(); ++i) {
     if (i) { s << ", "; }
-    s << children_[i]->ToString();
+    s << children_[i]->ToString() << "=" << static_cast<int>(type_ids[i]);
   }
   s << ">";
   return s.str();

http://git-wip-us.apache.org/repos/asf/arrow/blob/9f7d4ae6/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 7300570..530c323 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -394,10 +394,10 @@ enum class UnionMode : char { SPARSE, DENSE };
 struct ARROW_EXPORT UnionType : public DataType {
   static constexpr Type::type type_id = Type::UNION;
 
-  UnionType(const std::vector<std::shared_ptr<Field>>& child_fields,
+  UnionType(const std::vector<std::shared_ptr<Field>>& fields,
       const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE)
       : DataType(Type::UNION), mode(mode), type_ids(type_ids) {
-    children_ = child_fields;
+    children_ = fields;
   }
 
   std::string ToString() const override;
@@ -407,6 +407,10 @@ struct ARROW_EXPORT UnionType : public DataType {
   std::vector<BufferDescr> GetBufferLayout() const override;
 
   UnionMode mode;
+
+  // The type id used in the data to indicate each data type in the union. For
+  // example, the first type in the union might be denoted by the id 5 (instead
+  // of 0).
   std::vector<uint8_t> type_ids;
 };
 


Mime
View raw message