arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists
Date Wed, 09 Aug 2017 21:47:35 GMT
Repository: arrow
Updated Branches:
  refs/heads/master e44ede87c -> 2972c9d3a


ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists

This does drop the vector append to the builder that was there before. I'm going to do some
local benchmarking to make sure this doesn't degrade performance unacceptably, will report
back here

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #956 from wesm/ARROW-1342 and squashes the following commits:

f2ebeba8 [Wes McKinney] Fix cpplint issue
f403f9dd [Wes McKinney] Fix test case to be platform independent, note ARROW-1345. Improve
quality of error message
f4f44c18 [Wes McKinney] Fix test case where inferred list type is null
ae5c8312 [Wes McKinney] Drop striding check
b4aecd3a [Wes McKinney] Support strided ndarrays in pandas conversion from nested lists


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2972c9d3
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2972c9d3
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2972c9d3

Branch: refs/heads/master
Commit: 2972c9d3a0d371dbdcf69c68a0109b83aa6fd944
Parents: e44ede8
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Wed Aug 9 17:47:30 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Wed Aug 9 17:47:30 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/numpy-internal.h       | 41 +++++++++++++++
 cpp/src/arrow/python/pandas_to_arrow.cc     | 67 ++++++++++--------------
 cpp/src/arrow/table.cc                      |  4 +-
 python/pyarrow/error.pxi                    |  3 +-
 python/pyarrow/includes/common.pxd          |  1 +
 python/pyarrow/tests/pandas_examples.py     | 10 ++--
 python/pyarrow/tests/test_convert_pandas.py |  9 ++++
 7 files changed, 91 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/python/numpy-internal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
index f1ef7da..db34d24 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -25,6 +25,7 @@
 #include "arrow/python/platform.h"
 
 #include <cstdint>
+#include <string>
 
 namespace arrow {
 namespace py {
@@ -51,7 +52,12 @@ class Ndarray1DIndexer {
 
   int64_t size() const { return PyArray_SIZE(arr_); }
 
+  T* data() const { return data_; }
+
+  bool is_strided() const { return stride_ == 1; }
+
   T& operator[](size_type index) { return *(data_ + index * stride_); }
+  T& operator[](size_type index) const { return *(data_ + index * stride_); }
 
  private:
   PyArrayObject* arr_;
@@ -59,6 +65,41 @@ class Ndarray1DIndexer {
   int64_t stride_;
 };
 
+static inline std::string GetNumPyTypeName(int npy_type) {
+#define TYPE_CASE(TYPE, NAME) \
+  case NPY_##TYPE:            \
+    return NAME;
+
+  switch (npy_type) {
+    TYPE_CASE(BOOL, "bool")
+    TYPE_CASE(INT8, "int8")
+    TYPE_CASE(INT16, "int16")
+    TYPE_CASE(INT32, "int32")
+    TYPE_CASE(INT64, "int64")
+#if (NPY_INT64 != NPY_LONGLONG)
+    TYPE_CASE(LONGLONG, "longlong")
+#endif
+    TYPE_CASE(UINT8, "uint8")
+    TYPE_CASE(UINT16, "uint16")
+    TYPE_CASE(UINT32, "uint32")
+    TYPE_CASE(UINT64, "uint64")
+#if (NPY_UINT64 != NPY_ULONGLONG)
+    TYPE_CASE(ULONGLONG, "ulonglong")
+#endif
+    TYPE_CASE(FLOAT16, "float16")
+    TYPE_CASE(FLOAT32, "float32")
+    TYPE_CASE(FLOAT64, "float64")
+    TYPE_CASE(DATETIME, "datetime64")
+    TYPE_CASE(OBJECT, "object")
+    TYPE_CASE(VOID, "void")
+    default:
+      break;
+  }
+
+#undef TYPE_CASE
+  return "unrecognized type in GetNumPyTypeName";
+}
+
 }  // namespace py
 }  // namespace arrow
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/python/pandas_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc
index 060fcb2..b6cc16b 100644
--- a/cpp/src/arrow/python/pandas_to_arrow.cc
+++ b/cpp/src/arrow/python/pandas_to_arrow.cc
@@ -97,8 +97,6 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   int64_t null_count = 0;
 
   Ndarray1DIndexer<T> values(arr);
-
-  // TODO(wesm): striding
   for (int i = 0; i < values.size(); ++i) {
     if (traits::isnull(values[i])) {
       ++null_count;
@@ -125,22 +123,27 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
   return null_count;
 }
 
-template <int TYPE>
-static int64_t ValuesToValidBytes(const void* data, int64_t length,
-                                  uint8_t* valid_bytes) {
+template <int TYPE, typename BuilderType>
+static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
   typedef internal::npy_traits<TYPE> traits;
   typedef typename traits::value_type T;
 
-  int64_t null_count = 0;
-  const T* values = reinterpret_cast<const T*>(data);
-
-  // TODO(wesm): striding
-  for (int i = 0; i < length; ++i) {
-    valid_bytes[i] = !traits::isnull(values[i]);
-    if (traits::isnull(values[i])) null_count++;
+  // TODO(wesm): Vector append when not strided
+  Ndarray1DIndexer<T> values(array);
+  if (traits::supports_nulls) {
+    for (int64_t i = 0; i < values.size(); ++i) {
+      if (traits::isnull(values[i])) {
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(builder->Append(values[i]));
+      }
+    }
+  } else {
+    for (int64_t i = 0; i < values.size(); ++i) {
+      RETURN_NOT_OK(builder->Append(values[i]));
+    }
   }
-
-  return null_count;
+  return Status::OK();
 }
 
 Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
@@ -148,14 +151,14 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
     return Status::Invalid("only handle 1-dimensional arrays");
   }
 
-  if (PyArray_DESCR(numpy_array)->type_num != np_type) {
-    return Status::Invalid("can only handle exact conversions");
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
   }
 
-  npy_intp* astrides = PyArray_STRIDES(numpy_array);
-  if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
-    return Status::Invalid("No support for strided arrays in lists yet");
-  }
   return Status::OK();
 }
 
@@ -577,7 +580,7 @@ Status PandasConverter::ConvertDecimals() {
   RETURN_NOT_OK(ImportModule("decimal", &decimal));
   RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
   PyObject* object = objects[0];
 
   int precision;
@@ -618,7 +621,7 @@ Status PandasConverter::ConvertTimes() {
   PyAcquireGIL lock;
   PyDateTime_IMPORT;
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
 
   // datetime.time stores microsecond resolution
   Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_);
@@ -906,7 +909,7 @@ Status LoopPySequence(PyObject* sequence, T func) {
     Py_ssize_t size = PySequence_Size(sequence);
     if (PyArray_Check(sequence)) {
       auto array = reinterpret_cast<PyArrayObject*>(sequence);
-      PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(array));
+      Ndarray1DIndexer<PyObject*> objects(array);
       for (int64_t i = 0; i < size; ++i) {
         RETURN_NOT_OK(func(objects[i]));
       }
@@ -934,7 +937,6 @@ template <int ITEM_TYPE, typename ArrowType>
 inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
                                                  ListBuilder* builder, PyObject* list) {
   typedef internal::npy_traits<ITEM_TYPE> traits;
-  typedef typename traits::value_type T;
   typedef typename traits::BuilderClass BuilderT;
 
   PyAcquireGIL lock;
@@ -956,24 +958,13 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
       // TODO(uwe): Support more complex numpy array structures
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
 
-      int64_t size = PyArray_DIM(numpy_array, 0);
-      auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
-      if (traits::supports_nulls) {
-        RETURN_NOT_OK(null_bitmap_->Resize(size, false));
-        // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
-        // currently support this.
-        // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
-        ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
-        return value_builder->Append(data, size, null_bitmap_->data());
-      } else {
-        return value_builder->Append(data, size);
-      }
+      return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder);
     } else if (PyList_Check(object)) {
       int64_t size;
       std::shared_ptr<DataType> inferred_type;
       RETURN_NOT_OK(builder->Append(true));
       RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != type->id()) {
+      if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) {
         std::stringstream ss;
         ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
         return Status::TypeError(ss.str());
@@ -1064,7 +1055,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
       std::shared_ptr<DataType> inferred_type;
       RETURN_NOT_OK(builder->Append(true));
       RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != Type::STRING) {
+      if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) {
         std::stringstream ss;
         ss << inferred_type->ToString() << " cannot be converted to STRING.";
         return Status::TypeError(ss.str());

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 1f0c6d7..ae48698 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr<Schema>& schema,
 
   columns_.resize(columns.size());
   for (size_t i = 0; i < columns.size(); ++i) {
-    columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)),
-                                           columns[i]);
+    columns_[i] =
+        std::make_shared<Column>(schema->field(static_cast<int>(i)), columns[i]);
   }
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/error.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi
index 8a3f57d..8793c4e 100644
--- a/python/pyarrow/error.pxi
+++ b/python/pyarrow/error.pxi
@@ -65,7 +65,7 @@ cdef int check_status(const CStatus& status) nogil except -1:
         return 0
 
     with gil:
-        message = frombytes(status.ToString())
+        message = frombytes(status.message())
         if status.IsInvalid():
             raise ArrowInvalid(message)
         elif status.IsIOError():
@@ -85,4 +85,5 @@ cdef int check_status(const CStatus& status) nogil except -1:
         elif status.IsPlasmaStoreFull():
             raise PlasmaStoreFull(message)
         else:
+            message = frombytes(status.ToString())
             raise ArrowException(message)

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/includes/common.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 637a133..6be08b0 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -42,6 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CStatus()
 
         c_string ToString()
+        c_string message()
 
         c_bool ok()
         c_bool IsIOError()

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/tests/pandas_examples.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
index 17ad4b2..c145e96 100644
--- a/python/pyarrow/tests/pandas_examples.py
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -98,21 +98,25 @@ def dataframe_with_lists(include_index=False):
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
         [0, 1, 2, 3, 4],
         None,
-        [0]
+        [0],
+        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
+                 dtype=np.int64)[::2]
     ]
     fields.append(pa.field('double', pa.list_(pa.float64())))
     arrays['double'] = [
         [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
         [0., 1., 2., 3., 4.],
         None,
-        [0.]
+        [0.],
+        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
     ]
     fields.append(pa.field('str_list', pa.list_(pa.string())))
     arrays['str_list'] = [
         [u"1", u"ä"],
         None,
         [u"1"],
-        [u"1", u"2", u"3"]
+        [u"1", u"2", u"3"],
+        [],
     ]
 
     if include_index:

http://git-wip-us.apache.org/repos/asf/arrow/blob/2972c9d3/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8969777..61bd072 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -534,6 +534,15 @@ class TestPandasConversion(unittest.TestCase):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_column_of_lists_strided(self):
+        df, schema = dataframe_with_lists()
+        df = pd.concat([df] * 6, ignore_index=True)
+
+        arr = df['int64'].values[::3]
+        assert arr.strides[0] != 8
+
+        self._check_array_roundtrip(arr)
+
     def test_nested_lists_all_none(self):
         data = np.array([[None, None], None], dtype=object)
 


Mime
View raw message