arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-972: UnionArray in pyarrow
Date Thu, 09 Nov 2017 16:31:44 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 65a9055  ARROW-972: UnionArray in pyarrow
65a9055 is described below

commit 65a9055c705e5f09c949c12365d12839ace063f5
Author: Philipp Moritz <pcmoritz@gmail.com>
AuthorDate: Thu Nov 9 11:31:36 2017 -0500

    ARROW-972: UnionArray in pyarrow
    
    This is taking a stab at exposing UnionArray to pyarrow. Tasks to be done:
    
    - [x] Support UnionType::SPARSE
    - [x] Add doc strings
    
    Author: Philipp Moritz <pcmoritz@gmail.com>
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1216 from pcmoritz/pyarrow-union-array and squashes the following commits:
    
    7f3ca313 [Wes McKinney] Fix flakes
    9f33076b [Wes McKinney] Change UnionMode to scoped enumeration
    9e602a8d [Philipp Moritz] wrap UnionType in pyarrow
    eeef7226 [Philipp Moritz] linting
    502c335a [Philipp Moritz] fixes
    c6c85491 [Philipp Moritz] add doc strings
    9068bbb5 [Philipp Moritz] linting
    d8da0170 [Philipp Moritz] implement dense and sparse UnionArrays
    cbdedc7a [Philipp Moritz] make fields in UnionArray unique to be compatible with Java
    b796ce64 [Philipp Moritz] Implement UnionArray in pyarrow
---
 cpp/src/arrow/array.cc                 | 56 ++++++++++++++++++++++++++++++++++
 cpp/src/arrow/array.h                  | 33 +++++++++++++++++++-
 cpp/src/arrow/compare.cc               |  2 +-
 cpp/src/arrow/ipc/json-internal.cc     |  2 +-
 cpp/src/arrow/ipc/metadata-internal.cc |  5 +--
 cpp/src/arrow/type.cc                  | 18 +++++++++--
 cpp/src/arrow/type.h                   | 18 ++++++++---
 python/pyarrow/__init__.py             |  4 +--
 python/pyarrow/_parquet.pxd            |  2 +-
 python/pyarrow/array.pxi               | 53 ++++++++++++++++++++++++++++++++
 python/pyarrow/includes/libarrow.pxd   | 25 +++++++++++++++
 python/pyarrow/lib.pxd                 | 16 ++++++++++
 python/pyarrow/lib.pyx                 |  2 ++
 python/pyarrow/public-api.pxi          |  2 +-
 python/pyarrow/scalar.pxi              | 19 ++++++++++++
 python/pyarrow/tests/test_array.py     | 22 +++++++++++++
 python/pyarrow/tests/test_schema.py    |  8 +++++
 python/pyarrow/tests/test_types.py     | 13 ++++----
 python/pyarrow/types.pxi               | 49 +++++++++++++++++++++++++++++
 19 files changed, 327 insertions(+), 22 deletions(-)

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index b523876..9c91d61 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -393,6 +393,62 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
   SetData(internal_data);
 }
 
+Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
+                             const std::vector<std::shared_ptr<Array>>& children,
+                             std::shared_ptr<Array>* out) {
+  if (value_offsets.length() == 0) {
+    return Status::Invalid("UnionArray offsets must have non-zero length");
+  }
+
+  if (value_offsets.type_id() != Type::INT32) {
+    return Status::Invalid("UnionArray offsets must be signed int32");
+  }
+
+  if (type_ids.type_id() != Type::INT8) {
+    return Status::Invalid("UnionArray type_ids must be signed int8");
+  }
+
+  if (value_offsets.null_count() != 0) {
+    return Status::Invalid("MakeDense does not allow NAs in value_offsets");
+  }
+
+  BufferVector buffers = {type_ids.null_bitmap(),
+                          static_cast<const UInt8Array&>(type_ids).values(),
+                          static_cast<const Int32Array&>(value_offsets).values()};
+  auto union_type = union_(children, UnionMode::DENSE);
+  auto internal_data =
+      std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
+                                  type_ids.null_count(), type_ids.offset());
+  for (const auto& child : children) {
+    internal_data->child_data.push_back(child->data());
+  }
+  *out = std::make_shared<UnionArray>(internal_data);
+  return Status::OK();
+}
+
+Status UnionArray::MakeSparse(const Array& type_ids,
+                              const std::vector<std::shared_ptr<Array>>& children,
+                              std::shared_ptr<Array>* out) {
+  if (type_ids.type_id() != Type::INT8) {
+    return Status::Invalid("UnionArray type_ids must be signed int8");
+  }
+  BufferVector buffers = {type_ids.null_bitmap(),
+                          static_cast<const UInt8Array&>(type_ids).values(), nullptr};
+  auto union_type = union_(children, UnionMode::SPARSE);
+  auto internal_data =
+      std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
+                                  type_ids.null_count(), type_ids.offset());
+  for (const auto& child : children) {
+    internal_data->child_data.push_back(child->data());
+    if (child->length() != type_ids.length()) {
+      return Status::Invalid(
+          "Sparse UnionArray must have len(child) == len(type_ids) for all children");
+    }
+  }
+  *out = std::make_shared<UnionArray>(internal_data);
+  return Status::OK();
+}
+
 std::shared_ptr<Array> UnionArray::child(int i) const {
   if (!boxed_fields_[i]) {
     boxed_fields_[i] = MakeArray(data_->child_data[i]);
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index afbd780..f7762ce 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -612,16 +612,47 @@ class ARROW_EXPORT UnionArray : public Array {
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
              int64_t offset = 0);
 
+  /// \brief Construct Dense UnionArray from type_ids, value_offsets and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types. The value_offsets are assumed to be well-formed.
+  ///
+  /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+  /// 0 corresponding to each type.
+  /// \param[in] value_offsets An array of signed int32 values indicating the
+  /// relative offset into the respective child array for the type in a given slot.
+  /// The respective offsets for each child value array must be in order / increasing.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[out] out Will have length equal to value_offsets.length()
+  static Status MakeDense(const Array& type_ids, const Array& value_offsets,
+                          const std::vector<std::shared_ptr<Array>>& children,
+                          std::shared_ptr<Array>* out);
+
+  /// \brief Construct Sparse UnionArray from type_ids and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types.
+  ///
+  /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+  /// 0 corresponding to each type.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[out] out Will have length equal to type_ids.length()
+  static Status MakeSparse(const Array& type_ids,
+                           const std::vector<std::shared_ptr<Array>>& children,
+                           std::shared_ptr<Array>* out);
+
   /// Note that this buffer does not account for any slice offset
   std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; }
 
   /// Note that this buffer does not account for any slice offset
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
 
+  int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
+
   const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
   const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
 
-  UnionMode mode() const { return static_cast<const UnionType&>(*type()).mode(); }
+  UnionMode::type mode() const { return static_cast<const UnionType&>(*type()).mode(); }
 
   std::shared_ptr<Array> child(int pos) const;
 
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 2ec86c3..a2d4de7 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -152,7 +152,7 @@ class RangeEqualsVisitor {
   bool CompareUnions(const UnionArray& left) const {
     const auto& right = static_cast<const UnionArray&>(right_);
 
-    const UnionMode union_mode = left.mode();
+    const UnionMode::type union_mode = left.mode();
     if (union_mode != right.mode()) {
       return false;
     }
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index c1c0661..1b9baee 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type,
   RETURN_NOT_STRING("mode", it_mode, json_type);
 
   std::string mode_str = it_mode->value.GetString();
-  UnionMode mode;
+  UnionMode::type mode;
 
   if (mode_str == "SPARSE") {
     mode = UnionMode::SPARSE;
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index f0f0f67..63ef8a5 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type,
 static Status UnionFromFlatbuffer(const flatbuf::Union* union_data,
                                   const std::vector<std::shared_ptr<Field>>& children,
                                   std::shared_ptr<DataType>* out) {
-  UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
-                                                                   : UnionMode::DENSE;
+  UnionMode::type mode =
+      (union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
+                                                       : UnionMode::DENSE);
 
   std::vector<uint8_t> type_codes;
 
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index a9bf591..0d1985f 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -190,7 +190,7 @@ std::string TimestampType::ToString() const {
 // Union type
 
 UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
-                     const std::vector<uint8_t>& type_codes, UnionMode mode)
+                     const std::vector<uint8_t>& type_codes, UnionMode::type mode)
     : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
   children_ = fields;
 }
@@ -440,10 +440,24 @@ std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fie
 }
 
 std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
-                                 const std::vector<uint8_t>& type_codes, UnionMode mode) {
+                                 const std::vector<uint8_t>& type_codes,
+                                 UnionMode::type mode) {
   return std::make_shared<UnionType>(child_fields, type_codes, mode);
 }
 
+std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
+                                 UnionMode::type mode) {
+  std::vector<std::shared_ptr<Field>> types;
+  std::vector<uint8_t> type_codes;
+  uint8_t counter = 0;
+  for (const auto& child : children) {
+    types.push_back(field(std::to_string(counter), child->type()));
+    type_codes.push_back(counter);
+    counter++;
+  }
+  return union_(types, type_codes, mode);
+}
+
 std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
                                      const std::shared_ptr<Array>& dict_values,
                                      bool ordered) {
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 446f4d3..9e11a03 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -517,14 +517,17 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
   int32_t scale_;
 };
 
-enum class UnionMode : char { SPARSE, DENSE };
+struct UnionMode {
+  enum type { SPARSE, DENSE };
+};
 
 class ARROW_EXPORT UnionType : public NestedType {
  public:
   static constexpr Type::type type_id = Type::UNION;
 
   UnionType(const std::vector<std::shared_ptr<Field>>& fields,
-            const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
+            const std::vector<uint8_t>& type_codes,
+            UnionMode::type mode = UnionMode::SPARSE);
 
   std::string ToString() const override;
   std::string name() const override { return "union"; }
@@ -534,10 +537,10 @@ class ARROW_EXPORT UnionType : public NestedType {
 
   const std::vector<uint8_t>& type_codes() const { return type_codes_; }
 
-  UnionMode mode() const { return mode_; }
+  UnionMode::type mode() const { return mode_; }
 
  private:
-  UnionMode mode_;
+  UnionMode::type mode_;
 
   // The type id used in the data to indicate each data type in the union. For
   // example, the first type in the union might be denoted by the id 5 (instead
@@ -842,7 +845,12 @@ struct_(const std::vector<std::shared_ptr<Field>>& fields);
 /// \brief Create an instance of Union type
 std::shared_ptr<DataType> ARROW_EXPORT
 union_(const std::vector<std::shared_ptr<Field>>& child_fields,
-       const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
+       const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);
+
+/// \brief Create an instance of Union type
+std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+       UnionMode::type mode = UnionMode::SPARSE);
 
 /// \brief Create an instance of Dictionary type
 std::shared_ptr<DataType> ARROW_EXPORT
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 1215c82..2d7d728 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -36,7 +36,7 @@ from pyarrow.lib import (null, bool_,
                          time32, time64, timestamp, date32, date64,
                          float16, float32, float64,
                          binary, string, decimal,
-                         list_, struct, dictionary, field,
+                         list_, struct, union, dictionary, field,
                          type_for_alias,
                          DataType, NAType,
                          Field,
@@ -52,7 +52,7 @@ from pyarrow.lib import (null, bool_,
                          Int16Array, UInt16Array,
                          Int32Array, UInt32Array,
                          Int64Array, UInt64Array,
-                         ListArray,
+                         ListArray, UnionArray,
                          BinaryArray, StringArray,
                          FixedSizeBinaryArray,
                          DictionaryArray,
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 04a5b13..7e5e575 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         int64_t num_values() const
         shared_ptr[ColumnPath] path_in_schema() const
         bint is_stats_set() const
-        shared_ptr[CRowGroupStatistics] statistics() const;
+        shared_ptr[CRowGroupStatistics] statistics() const
         ParquetCompression compression() const
         const vector[ParquetEncoding]& encodings() const
 
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 7752d06..9991411 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -631,6 +631,58 @@ cdef class ListArray(Array):
         return pyarrow_wrap_array(out)
 
 
+cdef class UnionArray(Array):
+
+    @staticmethod
+    def from_dense(Array types, Array value_offsets, list children):
+        """
+        Construct dense UnionArray from arrays of int8 types, int32 offsets and
+        children arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        value_offsets : Array (int32 type)
+        children : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+        cdef shared_ptr[CArray] out
+        cdef vector[shared_ptr[CArray]] c
+        cdef Array child
+        for child in children:
+            c.push_back(child.sp_array)
+        with nogil:
+            check_status(CUnionArray.MakeDense(
+                deref(types.ap), deref(value_offsets.ap), c, &out))
+        return pyarrow_wrap_array(out)
+
+    @staticmethod
+    def from_sparse(Array types, list children):
+        """
+        Construct sparse UnionArray from arrays of int8 types and children
+        arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        children : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+        cdef shared_ptr[CArray] out
+        cdef vector[shared_ptr[CArray]] c
+        cdef Array child
+        for child in children:
+            c.push_back(child.sp_array)
+        with nogil:
+            check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out))
+        return pyarrow_wrap_array(out)
+
 cdef class StringArray(Array):
     pass
 
@@ -789,6 +841,7 @@ cdef dict _array_classes = {
     _Type_FLOAT: FloatArray,
     _Type_DOUBLE: DoubleArray,
     _Type_LIST: ListArray,
+    _Type_UNION: UnionArray,
     _Type_BINARY: BinaryArray,
     _Type_STRING: StringArray,
     _Type_DICTIONARY: DictionaryArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 731ef94..dfafd37 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_DICTIONARY" arrow::Type::DICTIONARY"
         _Type_MAP" arrow::Type::MAP"
 
+    enum UnionMode" arrow::UnionMode::type":
+        _UnionMode_SPARSE" arrow::UnionMode::SPARSE"
+        _UnionMode_DENSE" arrow::UnionMode::DENSE"
+
     enum TimeUnit" arrow::TimeUnit::type":
         TimeUnit_SECOND" arrow::TimeUnit::SECOND"
         TimeUnit_MILLI" arrow::TimeUnit::MILLI"
@@ -222,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStructType" arrow::StructType"(CDataType):
         CStructType(const vector[shared_ptr[CField]]& fields)
 
+    cdef cppclass CUnionType" arrow::UnionType"(CDataType):
+        CUnionType(const vector[shared_ptr[CField]]& fields,
+                   const vector[uint8_t]& type_codes, UnionMode mode)
+        UnionMode mode()
+
     cdef cppclass CSchema" arrow::Schema":
         CSchema(const vector[shared_ptr[CField]]& fields)
         CSchema(const vector[shared_ptr[CField]]& fields,
@@ -317,6 +326,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] values()
         shared_ptr[CDataType] value_type()
 
+    cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
+        @staticmethod
+        CStatus MakeSparse(const CArray& type_ids,
+                           const vector[shared_ptr[CArray]]& children,
+                           shared_ptr[CArray]* out)
+
+        @staticmethod
+        CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets,
+                          const vector[shared_ptr[CArray]]& children,
+                          shared_ptr[CArray]* out)
+        uint8_t* raw_type_ids()
+        int32_t value_offset(int i)
+        shared_ptr[CArray] child(int pos)
+        const CArray* UnsafeChild(int pos)
+        UnionMode mode()
+
     cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
         const uint8_t* GetValue(int i, int32_t* length)
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 8fdcf55..5314894 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -56,6 +56,11 @@ cdef class DictionaryType(DataType):
         const CDictionaryType* dict_type
 
 
+cdef class UnionType(DataType):
+    cdef:
+        list child_types
+
+
 cdef class TimestampType(DataType):
     cdef:
         const CTimestampType* ts_type
@@ -139,6 +144,13 @@ cdef class ListValue(ArrayValue):
     cdef getitem(self, int64_t i)
 
 
+cdef class UnionValue(ArrayValue):
+    cdef:
+        CUnionArray* ap
+        list value_types
+
+    cdef getitem(self, int64_t i)
+
 cdef class StringValue(ArrayValue):
     pass
 
@@ -242,6 +254,10 @@ cdef class ListArray(Array):
     pass
 
 
+cdef class UnionArray(Array):
+    pass
+
+
 cdef class StringArray(Array):
     pass
 
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 6f4451e..b4ca49c 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -92,6 +92,8 @@ Type_UNION = _Type_UNION
 Type_DICTIONARY = _Type_DICTIONARY
 Type_MAP = _Type_MAP
 
+UnionMode_SPARSE = _UnionMode_SPARSE
+UnionMode_DENSE = _UnionMode_DENSE
 
 # Exception types
 include "error.pxi"
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 9f10512..90aff9e 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -72,7 +72,7 @@ cdef public api object pyarrow_wrap_data_type(
     elif type.get().id() == _Type_STRUCT:
         out = StructType()
     elif type.get().id() == _Type_UNION:
-        out = StructType()
+        out = UnionType()
     elif type.get().id() == _Type_TIMESTAMP:
         out = TimestampType()
     elif type.get().id() == _Type_FIXED_SIZE_BINARY:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index c37ed3b..a396fa7 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -315,6 +315,24 @@ cdef class ListValue(ArrayValue):
         return result
 
 
+cdef class UnionValue(ArrayValue):
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CUnionArray*> sp_array.get()
+
+    cdef getitem(self, int64_t i):
+        cdef int8_t type_id = self.ap.raw_type_ids()[i]
+        cdef shared_ptr[CArray] child = self.ap.child(type_id)
+        if self.ap.mode() == _UnionMode_SPARSE:
+            return box_scalar(self.type[type_id], child, i)
+        else:
+            return box_scalar(self.type[type_id], child,
+                              self.ap.value_offset(i))
+
+    def as_py(self):
+        return self.getitem(self.index).as_py()
+
 cdef class FixedSizeBinaryValue(ArrayValue):
 
     def as_py(self):
@@ -364,6 +382,7 @@ cdef dict _scalar_classes = {
     _Type_FLOAT: FloatValue,
     _Type_DOUBLE: DoubleValue,
     _Type_LIST: ListValue,
+    _Type_UNION: UnionValue,
     _Type_BINARY: BinaryValue,
     _Type_STRING: StringValue,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index e3a4c97..7dc93c2 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -235,6 +235,28 @@ def test_list_from_arrays():
     assert result.equals(expected)
 
 
+def test_union_from_dense():
+    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
+    int64 = pa.array([1, 2, 3], type='int64')
+    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
+    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')
+
+    result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
+
+    assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd']
+
+
+def test_union_from_sparse():
+    binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'],
+                      type='binary')
+    int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64')
+    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
+
+    result = pa.UnionArray.from_sparse(types, [binary, int64])
+
+    assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd']
+
+
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
 
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index d6b2655..116f397 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -319,6 +319,14 @@ def test_type_schema_pickling():
             pa.field('a', 'int8'),
             pa.field('b', 'string')
         ]),
+        pa.union([
+            pa.field('a', pa.int8()),
+            pa.field('b', pa.int16())
+        ], pa.lib.UnionMode_SPARSE),
+        pa.union([
+            pa.field('a', pa.int8()),
+            pa.field('b', pa.int16())
+        ], pa.lib.UnionMode_DENSE),
         pa.time32('s'),
         pa.time64('us'),
         pa.date32(),
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index e6ff5b1..0e3ea1f 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -85,16 +85,17 @@ def test_is_nested_or_struct():
     assert not types.is_nested(pa.int32())
 
 
-# TODO(wesm): Union types not yet implemented in pyarrow
+def test_is_union():
+    assert types.is_union(pa.union([pa.field('a', pa.int32()),
+                                    pa.field('b', pa.int8()),
+                                    pa.field('c', pa.string())],
+                                   pa.lib.UnionMode_SPARSE))
+    assert not types.is_union(pa.list_(pa.int32()))
 
-# def test_is_union():
-#     assert types.is_union(pa.union([pa.field('a', pa.int32()),
-#                                     pa.field('b', pa.int8()),
-#                                     pa.field('c', pa.string())]))
-#     assert not types.is_union(pa.list_(pa.int32()))
 
 # TODO(wesm): is_map, once implemented
 
+
 def test_is_binary_string():
     assert types.is_binary(pa.binary())
     assert not types.is_binary(pa.string())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index c9a4909..d2e68ff 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -186,7 +186,32 @@ cdef class UnionType(DataType):
 
     cdef void init(self, const shared_ptr[CDataType]& type):
         DataType.init(self, type)
+        self.child_types = [
+            pyarrow_wrap_data_type(type.get().child(i).get().type())
+            for i in range(self.num_children)]
 
+    property num_children:
+
+        def __get__(self):
+            return self.type.num_children()
+
+    property mode:
+
+        def __get__(self):
+            cdef CUnionType* type = <CUnionType*> self.sp_type.get()
+            return type.mode()
+
+    def __getitem__(self, i):
+        return self.child_types[i]
+
+    def __getstate__(self):
+        children = [pyarrow_wrap_field(self.type.child(i))
+                    for i in range(self.num_children)]
+        return children, self.mode
+
+    def __setstate__(self, state):
+        cdef DataType reconstituted = union(*state)
+        self.init(reconstituted.sp_type)
 
 cdef class TimestampType(DataType):
 
@@ -1056,6 +1081,30 @@ def struct(fields):
     return pyarrow_wrap_data_type(struct_type)
 
 
+def union(children_fields, mode):
+    """
+    Create UnionType from children fields.
+    """
+    cdef:
+        Field child_field
+        vector[shared_ptr[CField]] c_fields
+        vector[uint8_t] type_codes
+        shared_ptr[CDataType] union_type
+        int i
+
+    for i, child_field in enumerate(children_fields):
+        type_codes.push_back(i)
+        c_fields.push_back(child_field.sp_field)
+
+        if mode == UnionMode_SPARSE:
+            union_type.reset(new CUnionType(c_fields, type_codes,
+                                            _UnionMode_SPARSE))
+        else:
+            union_type.reset(new CUnionType(c_fields, type_codes,
+                                            _UnionMode_DENSE))
+
+    return pyarrow_wrap_data_type(union_type)
+
 cdef dict _type_aliases = {
     'null': null,
     'i1': int8,

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Mime
View raw message