arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-468: Python: Conversion of nested data in pd.DataFrames
Date Wed, 18 Jan 2017 23:28:09 GMT
Repository: arrow
Updated Branches:
  refs/heads/master b1472305c -> 353772f84


ARROW-468: Python: Conversion of nested data in pd.DataFrames

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #289 from xhochy/ARROW-468 and squashes the following commits:

6fab6b2 [Uwe L. Korn] clang-format
c30da77 [Uwe L. Korn] Conversion for Lists of String and Timestamp
3ac373e [Uwe L. Korn] Fix string conversion
23fdc97 [Uwe L. Korn] Conversion of nested arrays to Pandas
a8197f7 [Uwe L. Korn] ARROW-468: Python: Conversion of nested data in pd.DataFrames


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/353772f8
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/353772f8
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/353772f8

Branch: refs/heads/master
Commit: 353772f844e227038ea8a3c5328a70e5fe553773
Parents: b147230
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Wed Jan 18 18:28:02 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Wed Jan 18 18:28:02 2017 -0500

----------------------------------------------------------------------
 python/pyarrow/array.pyx                    |  14 +-
 python/pyarrow/includes/libarrow.pxd        |   1 +
 python/pyarrow/includes/pyarrow.pxd         |   9 +-
 python/pyarrow/schema.pxd                   |   1 +
 python/pyarrow/schema.pyx                   |  19 +
 python/pyarrow/table.pyx                    |  26 +-
 python/pyarrow/tests/test_convert_pandas.py |  53 ++-
 python/src/pyarrow/adapters/pandas.cc       | 448 +++++++++++++++++++----
 python/src/pyarrow/adapters/pandas.h        |   7 +-
 9 files changed, 477 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 266768f..4299ba6 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -33,7 +33,7 @@ from pyarrow.error cimport check_status
 cimport pyarrow.scalar as scalar
 from pyarrow.scalar import NA
 
-from pyarrow.schema cimport Schema
+from pyarrow.schema cimport Field, Schema
 import pyarrow.schema as schema
 
 cimport cpython
@@ -322,7 +322,7 @@ def from_pylist(object list_obj, DataType type=None):
     return box_arrow_array(sp_array)
 
 
-def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
+def from_pandas_series(object series, object mask=None, timestamps_to_ms=False, Field field=None):
     """
     Convert pandas.Series to an Arrow Array.
 
@@ -338,26 +338,32 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
         compatibility with other functionality like Parquet I/O which
         only supports milliseconds.
 
+    field: pyarrow.Field
+        Schema indicator to what type this column should render in Arrow
+
     Returns
     -------
     pyarrow.array.Array
     """
     cdef:
         shared_ptr[CArray] out
+        shared_ptr[CField] c_field
 
     series_values = series_as_ndarray(series)
     if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
         series_values = series_values.astype('datetime64[ms]')
+    if field is not None:
+        c_field = field.sp_field
 
     if mask is None:
         with nogil:
             check_status(pyarrow.PandasToArrow(pyarrow.get_memory_pool(),
-                                               series_values, &out))
+                                               series_values, c_field, &out))
     else:
         mask = series_as_ndarray(mask)
         with nogil:
             check_status(pyarrow.PandasMaskedToArrow(
-                pyarrow.get_memory_pool(), series_values, mask, &out))
+                pyarrow.get_memory_pool(), series_values, mask, c_field, &out))
 
     return box_arrow_array(out)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8cfaaf7..8b0e3b6 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -107,6 +107,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         c_bool Equals(const shared_ptr[CSchema]& other)
 
         shared_ptr[CField] field(int i)
+        shared_ptr[CField] GetFieldByName(c_string& name)
         int num_fields()
         c_string ToString()
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index 901e6c9..b7b8d7c 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,9 +18,9 @@
 # distutils: language = c++
 
 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
-                                        CDataType, CStatus, Type, MemoryPool,
-                                        TimeUnit)
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CField,
+                                        CTable, CDataType, CStatus, Type,
+                                        MemoryPool, TimeUnit)
 
 cimport pyarrow.includes.libarrow_io as arrow_io
 
@@ -30,9 +30,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
     shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
     CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out)
 
-    CStatus PandasToArrow(MemoryPool* pool, object ao,
+    CStatus PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CField] field,
                           shared_ptr[CArray]* out)
     CStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo,
+                                shared_ptr[CField] field,
                                 shared_ptr[CArray]* out)
 
     CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr,

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
index f2cb776..42588d4 100644
--- a/python/pyarrow/schema.pxd
+++ b/python/pyarrow/schema.pxd
@@ -44,4 +44,5 @@ cdef class Schema:
     cdef init_schema(self, const shared_ptr[CSchema]& schema)
 
 cdef DataType box_data_type(const shared_ptr[CDataType]& type)
+cdef Field box_field(const shared_ptr[CField]& field)
 cdef Schema box_schema(const shared_ptr[CSchema]& schema)

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index a6aa9d5..85b1617 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -133,6 +133,20 @@ cdef class Schema:
 
         return self.sp_schema.get().Equals(_other.sp_schema)
 
+    def field_by_name(self, name):
+        """
+        Access a field by its name rather than the column index.
+
+        Parameters
+        ----------
+        name: str
+
+        Returns
+        -------
+        field: pyarrow.Field
+        """
+        return box_field(self.schema.GetFieldByName(tobytes(name)))
+
     @classmethod
     def from_fields(cls, fields):
         cdef:
@@ -287,6 +301,11 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type):
     out.init(type)
     return out
 
+cdef Field box_field(const shared_ptr[CField]& field):
+    cdef Field out = Field()
+    out.init(field)
+    return out
+
 cdef Schema box_schema(const shared_ptr[CSchema]& type):
     cdef Schema out = Schema()
     out.init_schema(type)

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index dce125a..b720a47 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -30,7 +30,7 @@ import pyarrow.config
 from pyarrow.array cimport Array, box_arrow_array
 from pyarrow.error import ArrowException
 from pyarrow.error cimport check_status
-from pyarrow.schema cimport box_data_type, box_schema
+from pyarrow.schema cimport box_data_type, box_schema, Field
 
 from pyarrow.compat import frombytes, tobytes
 
@@ -277,16 +277,20 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
 
 
 
-cdef _dataframe_to_arrays(df, name, timestamps_to_ms):
+cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema):
     from pyarrow.array import from_pandas_series
 
     cdef:
         list names = []
         list arrays = []
+        Field field = None
 
     for name in df.columns:
         col = df[name]
-        arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms)
+        if schema is not None:
+            field = schema.field_by_name(name)
+        arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms,
+                                 field=field)
 
         names.append(name)
         arrays.append(arr)
@@ -424,19 +428,22 @@ cdef class RecordBatch:
         return pd.DataFrame(dict(zip(names, data)), columns=names)
 
     @classmethod
-    def from_pandas(cls, df):
+    def from_pandas(cls, df, schema=None):
         """
         Convert pandas.DataFrame to an Arrow RecordBatch
 
         Parameters
         ----------
         df: pandas.DataFrame
+        schema: pyarrow.Schema (optional)
+            The expected schema of the RecordBatch. This can be used to
+            indicate the type of columns if we cannot infer it automatically.
 
         Returns
         -------
         pyarrow.table.RecordBatch
         """
-        names, arrays = _dataframe_to_arrays(df, None, False)
+        names, arrays = _dataframe_to_arrays(df, None, False, schema)
         return cls.from_arrays(names, arrays)
 
     @staticmethod
@@ -552,7 +559,7 @@ cdef class Table:
         return result
 
     @classmethod
-    def from_pandas(cls, df, name=None, timestamps_to_ms=False):
+    def from_pandas(cls, df, name=None, timestamps_to_ms=False, schema=None):
         """
         Convert pandas.DataFrame to an Arrow Table
 
@@ -567,6 +574,10 @@ cdef class Table:
             compatibility with other functionality like Parquet I/O which
             only supports milliseconds.
 
+        schema: pyarrow.Schema (optional)
+            The expected schema of the Arrow Table. This can be used to
+            indicate the type of columns if we cannot infer it automatically.
+
         Returns
         -------
         pyarrow.table.Table
@@ -584,7 +595,8 @@ cdef class Table:
         <pyarrow.table.Table object at 0x7f05d1fb1b40>
         """
         names, arrays = _dataframe_to_arrays(df, name=name,
-                                             timestamps_to_ms=timestamps_to_ms)
+                                             timestamps_to_ms=timestamps_to_ms,
+                                             schema=schema)
         return cls.from_arrays(names, arrays, name=name)
 
     @staticmethod

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 261eaa8..3928a1f 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -16,6 +16,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from collections import OrderedDict
+
 import datetime
 import unittest
 
@@ -60,8 +62,8 @@ class TestPandasConversion(unittest.TestCase):
         pass
 
     def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
-                                timestamps_to_ms=False, expected_schema=None):
-        table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms)
+                                timestamps_to_ms=False, expected_schema=None, schema=None):
+        table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms, schema=schema)
         result = table.to_pandas(nthreads=nthreads)
         if expected_schema:
             assert table.schema.equals(expected_schema)
@@ -284,6 +286,53 @@ class TestPandasConversion(unittest.TestCase):
         expected['date'] = pd.to_datetime(df['date'])
         tm.assert_frame_equal(result, expected)
 
+    def test_column_of_lists(self):
+        dtypes = [('i1', A.int8()), ('i2', A.int16()),
+                  ('i4', A.int32()), ('i8', A.int64()),
+                  ('u1', A.uint8()), ('u2', A.uint16()),
+                  ('u4', A.uint32()), ('u8', A.uint64()),
+                  ('f4', A.float_()), ('f8', A.double())]
+
+        arrays = OrderedDict()
+        fields = []
+        for dtype, arrow_dtype in dtypes:
+            fields.append(A.field(dtype, A.list_(arrow_dtype)))
+            arrays[dtype] = [
+                np.arange(10, dtype=dtype),
+                np.arange(5, dtype=dtype),
+                None,
+                np.arange(1, dtype=dtype)
+            ]
+
+        fields.append(A.field('str', A.list_(A.string())))
+        arrays['str'] = [
+            np.array([u"1", u"ä"], dtype="object"),
+            None,
+            np.array([u"1"], dtype="object"),
+            np.array([u"1", u"2", u"3"], dtype="object")
+        ]
+
+        fields.append(A.field('datetime64', A.list_(A.timestamp('ns'))))
+        arrays['datetime64'] = [
+            np.array(['2007-07-13T01:23:34.123456789',
+                      None,
+                      '2010-08-13T05:46:57.437699912'],
+                      dtype='datetime64[ns]'),
+            None,
+            None,
+            np.array(['2007-07-13T02',
+                      None,
+                      '2010-08-13T05:46:57.437699912'],
+                      dtype='datetime64[ns]'),
+        ]
+
+        df = pd.DataFrame(arrays)
+        schema = A.Schema.from_fields(fields)
+        self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
+        table = A.Table.from_pandas(df, schema=schema)
+        assert table.schema.equals(schema)
+        df_new = table.to_pandas(nthreads=1)
+
     def test_threaded_conversion(self):
         df = _alltypes_example()
         self._check_pandas_roundtrip(df, nthreads=2,

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index ad18eca..8c2d350 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -36,6 +36,7 @@
 #include "arrow/api.h"
 #include "arrow/status.h"
 #include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/macros.h"
 
@@ -50,6 +51,8 @@ using arrow::ChunkedArray;
 using arrow::Column;
 using arrow::Field;
 using arrow::DataType;
+using arrow::ListType;
+using arrow::ListBuilder;
 using arrow::Status;
 using arrow::Table;
 using arrow::Type;
@@ -66,6 +69,7 @@ template <>
 struct npy_traits<NPY_BOOL> {
   typedef uint8_t value_type;
   using TypeClass = arrow::BooleanType;
+  using BuilderClass = arrow::BooleanBuilder;
 
   static constexpr bool supports_nulls = false;
   static inline bool isnull(uint8_t v) { return false; }
@@ -76,6 +80,7 @@ struct npy_traits<NPY_BOOL> {
   struct npy_traits<NPY_##TYPE> {                    \
     typedef T value_type;                            \
     using TypeClass = arrow::CapType##Type;          \
+    using BuilderClass = arrow::CapType##Builder;    \
                                                      \
     static constexpr bool supports_nulls = false;    \
     static inline bool isnull(T v) { return false; } \
@@ -94,6 +99,7 @@ template <>
 struct npy_traits<NPY_FLOAT32> {
   typedef float value_type;
   using TypeClass = arrow::FloatType;
+  using BuilderClass = arrow::FloatBuilder;
 
   static constexpr bool supports_nulls = true;
 
@@ -104,6 +110,7 @@ template <>
 struct npy_traits<NPY_FLOAT64> {
   typedef double value_type;
   using TypeClass = arrow::DoubleType;
+  using BuilderClass = arrow::DoubleBuilder;
 
   static constexpr bool supports_nulls = true;
 
@@ -114,6 +121,7 @@ template <>
 struct npy_traits<NPY_DATETIME> {
   typedef int64_t value_type;
   using TypeClass = arrow::TimestampType;
+  using BuilderClass = arrow::TimestampBuilder;
 
   static constexpr bool supports_nulls = true;
 
@@ -132,6 +140,107 @@ struct npy_traits<NPY_OBJECT> {
   static constexpr bool supports_nulls = true;
 };
 
+static inline bool PyObject_is_null(const PyObject* obj) {
+  return obj == Py_None || obj == numpy_nan;
+}
+
+static inline bool PyObject_is_string(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_Check(obj) || PyBytes_Check(obj);
+#else
+  return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+static inline bool PyObject_is_bool(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  return PyString_Check(obj) || PyBytes_Check(obj);
+#else
+  return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+template <int TYPE>
+static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+  typedef npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+  const T* values = reinterpret_cast<const T*>(data);
+
+  // TODO(wesm): striding
+  for (int i = 0; i < length; ++i) {
+    if (traits::isnull(values[i])) {
+      ++null_count;
+    } else {
+      BitUtil::SetBit(bitmap, i);
+    }
+  }
+
+  return null_count;
+}
+
+template <int TYPE>
+static int64_t ValuesToBytemap(const void* data, int64_t length, uint8_t* valid_bytes) {
+  typedef npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+  const T* values = reinterpret_cast<const T*>(data);
+
+  // TODO(wesm): striding
+  for (int i = 0; i < length; ++i) {
+    valid_bytes[i] = not traits::isnull(values[i]);
+    if (traits::isnull(values[i])) null_count++;
+  }
+
+  return null_count;
+}
+
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+  if (PyArray_NDIM(numpy_array) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  if (PyArray_DESCR(numpy_array)->type_num != np_type) {
+    return Status::Invalid("can only handle exact conversions");
+  }
+
+  npy_intp* astrides = PyArray_STRIDES(numpy_array);
+  if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
+    return Status::Invalid("No support for strided arrays in lists yet");
+  }
+  return Status::OK();
+}
+
+Status AppendObjectStrings(arrow::StringBuilder& string_builder, PyObject** objects,
+    int64_t objects_length, bool* have_bytes) {
+  PyObject* obj;
+
+  for (int64_t i = 0; i < objects_length; ++i) {
+    obj = objects[i];
+    if (PyUnicode_Check(obj)) {
+      obj = PyUnicode_AsUTF8String(obj);
+      if (obj == NULL) {
+        PyErr_Clear();
+        return Status::TypeError("failed converting unicode to UTF8");
+      }
+      const int32_t length = PyBytes_GET_SIZE(obj);
+      Status s = string_builder.Append(PyBytes_AS_STRING(obj), length);
+      Py_DECREF(obj);
+      if (!s.ok()) { return s; }
+    } else if (PyBytes_Check(obj)) {
+      *have_bytes = true;
+      const int32_t length = PyBytes_GET_SIZE(obj);
+      RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
+    } else {
+      string_builder.AppendNull();
+    }
+  }
+
+  return Status::OK();
+}
+
 template <int TYPE>
 class ArrowSerializer {
  public:
@@ -140,6 +249,8 @@ class ArrowSerializer {
     length_ = PyArray_SIZE(arr_);
   }
 
+  void IndicateType(const std::shared_ptr<Field> field) { field_indicator_ = field; }
+
   Status Convert(std::shared_ptr<Array>* out);
 
   int stride() const { return PyArray_STRIDES(arr_)[0]; }
@@ -198,28 +309,8 @@ class ArrowSerializer {
     RETURN_NOT_OK(string_builder.Resize(length_));
 
     Status s;
-    PyObject* obj;
     bool have_bytes = false;
-    for (int64_t i = 0; i < length_; ++i) {
-      obj = objects[i];
-      if (PyUnicode_Check(obj)) {
-        obj = PyUnicode_AsUTF8String(obj);
-        if (obj == NULL) {
-          PyErr_Clear();
-          return Status::TypeError("failed converting unicode to UTF8");
-        }
-        const int32_t length = PyBytes_GET_SIZE(obj);
-        s = string_builder.Append(PyBytes_AS_STRING(obj), length);
-        Py_DECREF(obj);
-        if (!s.ok()) { return s; }
-      } else if (PyBytes_Check(obj)) {
-        have_bytes = true;
-        const int32_t length = PyBytes_GET_SIZE(obj);
-        RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
-      } else {
-        string_builder.AppendNull();
-      }
-    }
+    RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes));
     RETURN_NOT_OK(string_builder.Finish(out));
 
     if (have_bytes) {
@@ -258,6 +349,36 @@ class ArrowSerializer {
     return Status::OK();
   }
 
+  template <int ITEM_TYPE, typename ArrowType>
+  Status ConvertTypedLists(
+      const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out);
+
+#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                            \
+  case Type::TYPE: {                                                      \
+    return ConvertTypedLists<NUMPY_TYPE, ::arrow::ArrowType>(field, out); \
+  }
+
+  Status ConvertLists(const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+    switch (field->type->type) {
+      LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
+      LIST_CASE(INT8, NPY_INT8, Int8Type)
+      LIST_CASE(UINT16, NPY_UINT16, UInt16Type)
+      LIST_CASE(INT16, NPY_INT16, Int16Type)
+      LIST_CASE(UINT32, NPY_UINT32, UInt32Type)
+      LIST_CASE(INT32, NPY_INT32, Int32Type)
+      LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
+      LIST_CASE(INT64, NPY_INT64, Int64Type)
+      LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
+      LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
+      LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
+      LIST_CASE(STRING, NPY_OBJECT, StringType)
+      default:
+        return Status::TypeError("Unknown list item type");
+    }
+
+    return Status::TypeError("Unknown list type");
+  }
+
   Status MakeDataType(std::shared_ptr<DataType>* out);
 
   arrow::MemoryPool* pool_;
@@ -267,6 +388,7 @@ class ArrowSerializer {
 
   int64_t length_;
 
+  std::shared_ptr<Field> field_indicator_;
   std::shared_ptr<arrow::Buffer> data_;
   std::shared_ptr<arrow::ResizableBuffer> null_bitmap_;
   uint8_t* null_bitmap_data_;
@@ -288,26 +410,6 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
 }
 
 template <int TYPE>
-static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
-  typedef npy_traits<TYPE> traits;
-  typedef typename traits::value_type T;
-
-  int64_t null_count = 0;
-  const T* values = reinterpret_cast<const T*>(data);
-
-  // TODO(wesm): striding
-  for (int i = 0; i < length; ++i) {
-    if (traits::isnull(values[i])) {
-      ++null_count;
-    } else {
-      BitUtil::SetBit(bitmap, i);
-    }
-  }
-
-  return null_count;
-}
-
-template <int TYPE>
 inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out) {
   out->reset(new typename npy_traits<TYPE>::TypeClass());
   return Status::OK();
@@ -361,26 +463,6 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   return Status::OK();
 }
 
-static inline bool PyObject_is_null(const PyObject* obj) {
-  return obj == Py_None || obj == numpy_nan;
-}
-
-static inline bool PyObject_is_string(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
-  return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
-  return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
-static inline bool PyObject_is_bool(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
-  return PyString_Check(obj) || PyBytes_Check(obj);
-#else
-  return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
 template <>
 inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out) {
   // Python object arrays are annoying, since we could have one of:
@@ -401,17 +483,34 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
     PyDateTime_IMPORT;
   }
 
-  for (int64_t i = 0; i < length_; ++i) {
-    if (PyObject_is_null(objects[i])) {
-      continue;
-    } else if (PyObject_is_string(objects[i])) {
-      return ConvertObjectStrings(out);
-    } else if (PyBool_Check(objects[i])) {
-      return ConvertBooleans(out);
-    } else if (PyDate_CheckExact(objects[i])) {
-      return ConvertDates(out);
-    } else {
-      return Status::TypeError("unhandled python type");
+  if (field_indicator_) {
+    switch (field_indicator_->type->type) {
+      case Type::STRING:
+        return ConvertObjectStrings(out);
+      case Type::BOOL:
+        return ConvertBooleans(out);
+      case Type::DATE:
+        return ConvertDates(out);
+      case Type::LIST: {
+        auto list_field = static_cast<ListType*>(field_indicator_->type.get());
+        return ConvertLists(list_field->value_field(), out);
+      }
+      default:
+        return Status::TypeError("No known conversion to Arrow type");
+    }
+  } else {
+    for (int64_t i = 0; i < length_; ++i) {
+      if (PyObject_is_null(objects[i])) {
+        continue;
+      } else if (PyObject_is_string(objects[i])) {
+        return ConvertObjectStrings(out);
+      } else if (PyBool_Check(objects[i])) {
+        return ConvertBooleans(out);
+      } else if (PyDate_CheckExact(objects[i])) {
+        return ConvertDates(out);
+      } else {
+        return Status::TypeError("unhandled python type");
+      }
     }
   }
 
@@ -449,6 +548,81 @@ inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
   return Status::OK();
 }
 
+template <int TYPE>
+template <int ITEM_TYPE, typename ArrowType>
+inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
+    const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+  typedef npy_traits<ITEM_TYPE> traits;
+  typedef typename traits::value_type T;
+  typedef typename traits::BuilderClass BuilderT;
+
+  auto value_builder = std::make_shared<BuilderT>(pool_, field->type);
+  ListBuilder list_builder(pool_, value_builder);
+  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  for (int64_t i = 0; i < length_; ++i) {
+    if (PyObject_is_null(objects[i])) {
+      RETURN_NOT_OK(list_builder.AppendNull());
+    } else if (PyArray_Check(objects[i])) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
+      RETURN_NOT_OK(list_builder.Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
+
+      int32_t size = PyArray_DIM(numpy_array, 0);
+      auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
+      if (traits::supports_nulls) {
+        null_bitmap_->Resize(size, false);
+        // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
+        // currently support this.
+        // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
+        ValuesToBytemap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
+        RETURN_NOT_OK(value_builder->Append(data, size, null_bitmap_->data()));
+      } else {
+        RETURN_NOT_OK(value_builder->Append(data, size));
+      }
+    } else if (PyList_Check(objects[i])) {
+      return Status::TypeError("Python lists are not yet supported");
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  }
+  return list_builder.Finish(out);
+}
+
+template <>
+template <>
+inline Status
+ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
+    const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+  // TODO: If there are bytes involved, convert to Binary representation
+  bool have_bytes = false;
+
+  auto value_builder = std::make_shared<arrow::StringBuilder>(pool_, field->type);
+  ListBuilder list_builder(pool_, value_builder);
+  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  for (int64_t i = 0; i < length_; ++i) {
+    if (PyObject_is_null(objects[i])) {
+      RETURN_NOT_OK(list_builder.AppendNull());
+    } else if (PyArray_Check(objects[i])) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
+      RETURN_NOT_OK(list_builder.Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
+
+      int32_t size = PyArray_DIM(numpy_array, 0);
+      auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
+      RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
+    } else if (PyList_Check(objects[i])) {
+      return Status::TypeError("Python lists are not yet supported");
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  }
+  return list_builder.Finish(out);
+}
+
 template <>
 inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
   return Status::TypeError("NYI");
@@ -460,8 +634,8 @@ inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
     RETURN_NOT_OK(converter.Convert(out));                  \
   } break;
 
-Status PandasMaskedToArrow(
-    arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr<Array>* out) {
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+    const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
   PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
   PyArrayObject* mask = nullptr;
 
@@ -484,7 +658,11 @@ Status PandasMaskedToArrow(
     TO_ARROW_CASE(FLOAT32);
     TO_ARROW_CASE(FLOAT64);
     TO_ARROW_CASE(DATETIME);
-    TO_ARROW_CASE(OBJECT);
+    case NPY_OBJECT: {
+      ArrowSerializer<NPY_OBJECT> converter(pool, arr, mask);
+      converter.IndicateType(field);
+      RETURN_NOT_OK(converter.Convert(out));
+    } break;
     default:
       std::stringstream ss;
       ss << "unsupported type " << PyArray_DESCR(arr)->type_num << std::endl;
@@ -493,8 +671,9 @@ Status PandasMaskedToArrow(
   return Status::OK();
 }
 
-Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<Array>* out) {
-  return PandasMaskedToArrow(pool, ao, nullptr, out);
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+    const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+  return PandasMaskedToArrow(pool, ao, nullptr, field, out);
 }
 
 // ----------------------------------------------------------------------
@@ -739,6 +918,56 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values)
   return Status::OK();
 }
 
+template <typename ArrowType>
+inline Status ConvertListsLike(
+    const std::shared_ptr<Column>& col, PyObject** out_values) {
+  typedef arrow_traits<ArrowType::type_id> traits;
+  typedef typename ::arrow::TypeTraits<ArrowType>::ArrayType ArrayType;
+
+  const ChunkedArray& data = *col->data().get();
+  auto list_type = std::static_pointer_cast<ListType>(col->type());
+
+  // Get column of underlying value arrays
+  std::vector<std::shared_ptr<Array>> value_arrays;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto arr = std::static_pointer_cast<arrow::ListArray>(data.chunk(c));
+    value_arrays.emplace_back(arr->values());
+  }
+  auto flat_column = std::make_shared<Column>(list_type->value_field(), value_arrays);
+  // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+  //    Storing a reference to the whole Array would be to expensive.
+  PyObject* numpy_array;
+  RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array));
+
+  PyAcquireGIL lock;
+
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto arr = std::static_pointer_cast<arrow::ListArray>(data.chunk(c));
+
+    const uint8_t* data_ptr;
+    int32_t length;
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        PyObject* start = PyLong_FromLong(arr->value_offset(i));
+        PyObject* end = PyLong_FromLong(arr->value_offset(i + 1));
+        PyObject* slice = PySlice_New(start, end, NULL);
+        *out_values = PyObject_GetItem(numpy_array, slice);
+        Py_DECREF(start);
+        Py_DECREF(end);
+        Py_DECREF(slice);
+      }
+      ++out_values;
+    }
+  }
+
+  Py_XDECREF(numpy_array);
+  return Status::OK();
+}
+
 template <typename T>
 inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
@@ -886,8 +1115,11 @@ class ArrowDeserializer {
       CONVERT_CASE(STRING);
       CONVERT_CASE(DATE);
       CONVERT_CASE(TIMESTAMP);
-      default:
-        return Status::NotImplemented("Arrow type reading not implemented");
+      default: {
+        std::stringstream ss;
+        ss << "Arrow type reading not implemented for " << col_->type()->ToString();
+        return Status::NotImplemented(ss.str());
+      }
     }
 
 #undef CONVERT_CASE
@@ -903,7 +1135,7 @@ class ArrowDeserializer {
     typedef typename arrow_traits<TYPE>::T T;
     int npy_type = arrow_traits<TYPE>::npy_type;
 
-    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+    if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
       return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
     }
 
@@ -933,7 +1165,7 @@ class ArrowDeserializer {
     typedef typename arrow_traits<TYPE>::T T;
     int npy_type = arrow_traits<TYPE>::npy_type;
 
-    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+    if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
       return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
     }
 
@@ -1028,6 +1260,7 @@ class PandasBlock {
 
   PandasBlock(int64_t num_rows, int num_columns)
       : num_rows_(num_rows), num_columns_(num_columns) {}
+  virtual ~PandasBlock() {}
 
   virtual Status Allocate() = 0;
   virtual Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1080,9 +1313,15 @@ class PandasBlock {
   DISALLOW_COPY_AND_ASSIGN(PandasBlock);
 };
 
+#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum)                         \
+  case Type::ArrowEnum:                                                     \
+    RETURN_NOT_OK((ConvertListsLike<::arrow::ArrowType>(col, out_buffer))); \
+    break;
+
 class ObjectBlock : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
+  virtual ~ObjectBlock() {}
 
   Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
 
@@ -1101,6 +1340,27 @@ class ObjectBlock : public PandasBlock {
       RETURN_NOT_OK(ConvertBinaryLike<arrow::BinaryArray>(data, out_buffer));
     } else if (type == Type::STRING) {
       RETURN_NOT_OK(ConvertBinaryLike<arrow::StringArray>(data, out_buffer));
+    } else if (type == Type::LIST) {
+      auto list_type = std::static_pointer_cast<ListType>(col->type());
+      switch (list_type->value_type()->type) {
+        CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
+        CONVERTLISTSLIKE_CASE(Int8Type, INT8)
+        CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
+        CONVERTLISTSLIKE_CASE(Int16Type, INT16)
+        CONVERTLISTSLIKE_CASE(UInt32Type, UINT32)
+        CONVERTLISTSLIKE_CASE(Int32Type, INT32)
+        CONVERTLISTSLIKE_CASE(UInt64Type, UINT64)
+        CONVERTLISTSLIKE_CASE(Int64Type, INT64)
+        CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP)
+        CONVERTLISTSLIKE_CASE(FloatType, FLOAT)
+        CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE)
+        CONVERTLISTSLIKE_CASE(StringType, STRING)
+        default: {
+          std::stringstream ss;
+          ss << "Not implemented type for lists: " << list_type->value_type()->ToString();
+          return Status::NotImplemented(ss.str());
+        }
+      }
     } else {
       std::stringstream ss;
       ss << "Unsupported type for object array output: " << col->type()->ToString();
@@ -1396,6 +1656,32 @@ class DataFrameBlockCreator {
         case Type::TIMESTAMP:
           output_type = PandasBlock::DATETIME;
           break;
+        case Type::LIST: {
+          auto list_type = std::static_pointer_cast<ListType>(col->type());
+          switch (list_type->value_type()->type) {
+            case Type::UINT8:
+            case Type::INT8:
+            case Type::UINT16:
+            case Type::INT16:
+            case Type::UINT32:
+            case Type::INT32:
+            case Type::INT64:
+            case Type::UINT64:
+            case Type::FLOAT:
+            case Type::DOUBLE:
+            case Type::STRING:
+            case Type::TIMESTAMP:
+              // The above types are all supported.
+              break;
+            default: {
+              std::stringstream ss;
+              ss << "Not implemented type for lists: "
+                 << list_type->value_type()->ToString();
+              return Status::NotImplemented(ss.str());
+            }
+          }
+          output_type = PandasBlock::OBJECT;
+        } break;
         default:
           return Status::NotImplemented(col->type()->ToString());
       }

http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index 60dadd4..664365e 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -31,6 +31,7 @@ namespace arrow {
 
 class Array;
 class Column;
+class Field;
 class MemoryPool;
 class Status;
 class Table;
@@ -63,11 +64,11 @@ arrow::Status ConvertTableToPandas(
 
 PYARROW_EXPORT
 arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
-    std::shared_ptr<arrow::Array>* out);
+    const std::shared_ptr<arrow::Field>& field, std::shared_ptr<arrow::Array>* out);
 
 PYARROW_EXPORT
-arrow::Status PandasToArrow(
-    arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<arrow::Array>* out);
+arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+    const std::shared_ptr<arrow::Field>& field, std::shared_ptr<arrow::Array>* out);
 
 }  // namespace pyarrow
 


Mime
View raw message