arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-915: [Python] Struct Array reads limited support
Date Sun, 02 Jul 2017 17:41:07 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 96e7e9979 -> e268ce87e


ARROW-915: [Python] Struct Array reads limited support

Add limited struct array reading support in pyarrow.

This is done to complement parquet-cpp struct reader.

cc @wesm

Author: Itai Incze <itai.in@gmail.com>

Closes #615 from itaiin/ARROW-915 and squashes the following commits:

d8f2636e [Itai Incze] convert struct field names using frombytes
e654abfa [Itai Incze] fix python3 tests & msvc build
3a4edf43 [Itai Incze] fix lint errors
bef46447 [Itai Incze] Refactor due to review
e2a697ff [Itai Incze] Further fixes due to review
eecbb32f [Itai Incze] fix per code review
4c255391 [Itai Incze] Add basic StructArray read support


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e268ce87
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e268ce87
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e268ce87

Branch: refs/heads/master
Commit: e268ce87e5d8f93ff6ecc871d218a58605b9692f
Parents: 96e7e99
Author: Itai Incze <itai.in@gmail.com>
Authored: Sun Jul 2 13:38:46 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sun Jul 2 13:38:46 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/pandas_convert.cc       | 69 ++++++++++++++++++++++-
 python/pyarrow/__init__.py                   |  2 +-
 python/pyarrow/_parquet.pxd                  |  1 +
 python/pyarrow/_parquet.pyx                  | 11 ++++
 python/pyarrow/array.pxi                     | 53 +++++++++++++++++
 python/pyarrow/includes/libarrow.pxd         | 15 +++++
 python/pyarrow/lib.pxd                       |  1 +
 python/pyarrow/tests/test_convert_builtin.py | 17 ++++++
 python/pyarrow/tests/test_convert_pandas.py  | 17 ++++++
 9 files changed, 184 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 9b65570..60ae1b2 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -1354,6 +1354,66 @@ inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_va
   return Status::OK();
 }
 
+inline Status ConvertStruct(const ChunkedArray& data, PyObject** out_values) {
+  PyAcquireGIL lock;
+  if (data.num_chunks() <= 0) { return Status::OK(); }
+  // ChunkedArray has at least one chunk
+  auto arr = static_cast<const StructArray*>(data.chunk(0).get());
+  // Use it to cache the struct type and number of fields for all chunks
+  auto num_fields = arr->fields().size();
+  auto array_type = arr->type();
+  std::vector<OwnedRef> fields_data(num_fields);
+  OwnedRef dict_item;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto arr = static_cast<const StructArray*>(data.chunk(c).get());
+    // Convert the struct arrays first
+    for (size_t i = 0; i < num_fields; i++) {
+      PyObject* numpy_array;
+      RETURN_NOT_OK(
+          ConvertArrayToPandas(arr->field(static_cast<int>(i)), nullptr, &numpy_array));
+      fields_data[i].reset(numpy_array);
+    }
+
+    // Construct a dictionary for each row
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        // Build the new dict object for the row
+        dict_item.reset(PyDict_New());
+        RETURN_IF_PYERROR();
+        for (size_t field_idx = 0; field_idx < num_fields; ++field_idx) {
+          OwnedRef field_value;
+          auto name = array_type->child(static_cast<int>(field_idx))->name();
+          if (!arr->field(static_cast<int>(field_idx))->IsNull(i)) {
+            // Value exists in child array, obtain it
+            auto array = reinterpret_cast<PyArrayObject*>(fields_data[field_idx].obj());
+            auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));
+            field_value.reset(PyArray_GETITEM(array, ptr));
+            RETURN_IF_PYERROR();
+          } else {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            field_value.reset(Py_None);
+          }
+          // PyDict_SetItemString does not steal the value reference
+          auto setitem_result =
+              PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj());
+          RETURN_IF_PYERROR();
+          DCHECK_EQ(setitem_result, 0);
+        }
+        *out_values = dict_item.obj();
+        // Grant ownership to the resulting array
+        Py_INCREF(*out_values);
+      }
+      ++out_values;
+    }
+  }
+  return Status::OK();
+}
+
 template <typename ArrowType>
 inline Status ConvertListsLike(
     const std::shared_ptr<Column>& col, PyObject** out_values) {
@@ -1499,6 +1559,8 @@ class ObjectBlock : public PandasBlock {
           return Status::NotImplemented(ss.str());
         }
       }
+    } else if (type == Type::STRUCT) {
+      RETURN_NOT_OK(ConvertStruct(data, out_buffer));
     } else {
       std::stringstream ss;
       ss << "Unsupported type for object array output: " << col->type()->ToString();
@@ -1960,6 +2022,7 @@ class DataFrameBlockCreator {
           output_type = PandasBlock::DECIMAL;
           break;
         case Type::NA:
+        case Type::STRUCT:
           output_type = PandasBlock::OBJECT;
           break;
         default:
@@ -2355,7 +2418,11 @@ class ArrowDeserializer {
     return ConvertNulls(data_, out_values);
   }
 
-  Status Visit(const StructType& type) { return Status::NotImplemented("struct type"); }
+  Status Visit(const StructType& type) {
+    AllocateOutput(NPY_OBJECT);
+    auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+    return ConvertStruct(data_, out_values);
+  }
 
   Status Visit(const UnionType& type) { return Status::NotImplemented("union type"); }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 237d44f..771a516 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -59,7 +59,7 @@ from pyarrow.lib import (null, bool_,
                          DictionaryArray,
                          Date32Array, Date64Array,
                          TimestampArray, Time32Array, Time64Array,
-                         DecimalArray,
+                         DecimalArray, StructArray,
                          ArrayValue, Scalar, NA, NAType,
                          BooleanValue,
                          Int8Value, Int16Value, Int32Value, Int64Value,

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 2f6b9a9..3d2d0c8 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -214,6 +214,7 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
     cdef cppclass FileReader:
         FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader)
         CStatus ReadColumn(int i, shared_ptr[CArray]* out)
+        CStatus ReadSchemaField(int i, shared_ptr[CArray]* out)
 
         int num_row_groups()
         CStatus ReadRowGroup(int i, shared_ptr[CTable]* out)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 77ef7ad..5d446a8 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -501,6 +501,17 @@ cdef class ParquetReader:
         array.init(carray)
         return array
 
+    def read_schema_field(self, int field_index):
+        cdef:
+            Array array = Array()
+            shared_ptr[CArray] carray
+
+        with nogil:
+            check_status(self.reader.get()
+                         .ReadSchemaField(field_index, &carray));
+
+        array.init(carray)
+        return array
 
 cdef int check_compression_name(name) except -1:
     if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI']:

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index e1be5b1..79e88fc 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -996,6 +996,21 @@ cdef class FixedSizeBinaryValue(ArrayValue):
         return cp.PyBytes_FromStringAndSize(data, length)
 
 
+cdef class StructValue(ArrayValue):
+    def as_py(self):
+        cdef:
+            CStructArray* ap
+            vector[shared_ptr[CField]] child_fields = self.type.type.children()
+        ap = <CStructArray*> self.sp_array.get()
+        child_arrays = ap.fields()
+        wrapped_arrays = (pyarrow_wrap_array(child) for child in child_arrays)
+        child_names = (child.get().name() for child in child_fields)
+        # Return the struct as a dict
+        return {
+            frombytes(name): child_array[self.index].as_py()
+            for name, child_array in
+            zip(child_names, wrapped_arrays)
+        }
 
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanValue,
@@ -1019,6 +1034,7 @@ cdef dict _scalar_classes = {
     _Type_STRING: StringValue,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
     _Type_DECIMAL: DecimalValue,
+    _Type_STRUCT: StructValue,
 }
 
 cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
@@ -1589,6 +1605,42 @@ cdef class DictionaryArray(Array):
         result.init(c_result)
         return result
 
+cdef class StructArray(Array):
+    @staticmethod
+    def from_arrays(field_names, arrays):
+        cdef:
+            Array array
+            shared_ptr[CArray] c_array
+            vector[shared_ptr[CArray]] c_arrays
+            shared_ptr[CArray] c_result
+            ssize_t num_arrays
+            ssize_t length
+            ssize_t i
+
+        num_arrays = len(arrays)
+        if num_arrays == 0:
+            raise ValueError("arrays list is empty")
+
+        length = len(arrays[0])
+
+        c_arrays.resize(num_arrays)
+        for i in range(num_arrays):
+            array = arrays[i]
+            if len(array) != length:
+                raise ValueError("All arrays must have the same length")
+            c_arrays[i] = array.sp_array
+
+        cdef DataType struct_type = struct([
+            field(name, array.type)
+            for name, array in
+            zip(field_names, arrays)
+        ])
+
+        c_result.reset(new CStructArray(struct_type.sp_type, length, c_arrays))
+        result = StructArray()
+        result.init(c_result)
+        return result
+
 
 cdef dict _array_classes = {
     _Type_NA: NullArray,
@@ -1614,6 +1666,7 @@ cdef dict _array_classes = {
     _Type_DICTIONARY: DictionaryArray,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
     _Type_DECIMAL: DecimalArray,
+    _Type_STRUCT: StructArray,
 }
 
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 741d832..fb101fe 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -76,6 +76,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         c_bool Equals(const CDataType& other)
 
+        shared_ptr[CField] child(int i)
+
+        const vector[shared_ptr[CField]] children()
+
+        int num_children()
+
         c_string ToString()
 
     cdef cppclass CArray" arrow::Array":
@@ -283,6 +289,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray):
         c_string GetString(int i)
 
+    cdef cppclass CStructArray" arrow::StructArray"(CArray):
+        CStructArray(shared_ptr[CDataType] type, int64_t length,
+            vector[shared_ptr[CArray]] children,
+            shared_ptr[CBuffer] null_bitmap = nullptr, int64_t null_count = 0,
+            int64_t offset = 0)
+
+        shared_ptr[CArray] field(int pos)
+        const vector[shared_ptr[CArray]] fields()
+
     cdef cppclass CChunkedArray" arrow::ChunkedArray":
         int64_t length()
         int64_t null_count()

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/lib.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 4a2ab86..8fa7fd9 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -18,6 +18,7 @@
 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow cimport *
 from cpython cimport PyObject
+from libcpp cimport nullptr
 
 cdef extern from "Python.h":
     int PySlice_Check(object)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index bf14c4f..62592f9 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -258,3 +258,20 @@ class TestConvertSequence(unittest.TestCase):
         assert arr.null_count == 0
         assert arr.type == pa.null()
         assert arr.to_pylist() == []
+
+    def test_structarray(self):
+        ints = pa.array([None, 2, 3], type=pa.int64())
+        strs = pa.array([u'a', None, u'c'], type=pa.string())
+        bools = pa.array([True, False, None], type=pa.bool_())
+        arr = pa.StructArray.from_arrays(
+            ['ints', 'strs', 'bools'],
+            [ints, strs, bools])
+
+        expected = [
+            {'ints': None, 'strs': u'a', 'bools': True},
+            {'ints': 2, 'strs': None, 'bools': False},
+            {'ints': 3, 'strs': u'c', 'bools': None},
+        ]
+
+        pylist = arr.to_pylist()
+        assert pylist == expected, (pylist, expected)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e268ce87/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 9cce7bb..b952d4a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -674,3 +674,20 @@ class TestPandasConversion(unittest.TestCase):
 
         self._check_pandas_roundtrip(df, schema=partial_schema,
                                      expected_schema=expected_schema)
+
+    def test_structarray(self):
+        ints = pa.array([None, 2, 3], type=pa.int64())
+        strs = pa.array([u'a', None, u'c'], type=pa.string())
+        bools = pa.array([True, False, None], type=pa.bool_())
+        arr = pa.StructArray.from_arrays(
+            ['ints', 'strs', 'bools'],
+            [ints, strs, bools])
+
+        expected = pd.Series([
+            {'ints': None, 'strs': u'a', 'bools': True},
+            {'ints': 2, 'strs': None, 'bools': False},
+            {'ints': 3, 'strs': u'c', 'bools': None},
+        ])
+
+        series = pd.Series(arr.to_pandas())
+        tm.assert_series_equal(series, expected)


Mime
View raw message