arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-443: [Python] Support ingest of strided NumPy arrays from pandas
Date Mon, 03 Apr 2017 22:05:44 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 7d1d4e751 -> f05b7c62c


ARROW-443: [Python] Support ingest of strided NumPy arrays from pandas

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #482 from wesm/ARROW-443 and squashes the following commits:

d9b36c0 [Wes McKinney] Run commented out test cases, fix issue
8f0ff38 [Wes McKinney] cpplint
88eef1a [Wes McKinney] Support strided mask argument in some object conversions
22d4489 [Wes McKinney] First cut at strided NumPy import


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f05b7c62
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f05b7c62
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f05b7c62

Branch: refs/heads/master
Commit: f05b7c62cf6151a2a03292508628f8f1a8e7a1aa
Parents: 7d1d4e7
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Apr 3 18:05:39 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Apr 3 18:05:39 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/config.cc              |   1 +
 cpp/src/arrow/python/numpy-internal.h       |  66 ++++++
 cpp/src/arrow/python/pandas_convert.cc      | 252 +++++++++++++++--------
 python/pyarrow/array.pyx                    |   4 +-
 python/pyarrow/tests/test_convert_pandas.py |  59 +++++-
 5 files changed, 290 insertions(+), 92 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/config.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/config.cc b/cpp/src/arrow/python/config.cc
index 2abc4dd..c2a6916 100644
--- a/cpp/src/arrow/python/config.cc
+++ b/cpp/src/arrow/python/config.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <Python.h>
+#include <datetime.h>
 
 #include "arrow/python/config.h"
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/numpy-internal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
new file mode 100644
index 0000000..fcc6a58
--- /dev/null
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal utilities for dealing with NumPy
+
+#ifndef ARROW_PYTHON_NUMPY_INTERNAL_H
+#define ARROW_PYTHON_NUMPY_INTERNAL_H
+
+#include <Python.h>
+
+#include <cstdint>
+
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_interop.h"
+
+namespace arrow {
+namespace py {
+
+/// Indexing convenience for interacting with strided 1-dim ndarray objects
+template <typename T>
+class Ndarray1DIndexer {
+ public:
+  typedef int64_t size_type;
+
+  Ndarray1DIndexer() : arr_(nullptr), data_(nullptr) {}
+
+  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { Init(arr); }
+
+  void Init(PyArrayObject* arr) {
+    arr_ = arr;
+    DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
+    Py_INCREF(arr);
+    data_ = reinterpret_cast<T*>(PyArray_DATA(arr));
+    stride_ = PyArray_STRIDES(arr)[0] / sizeof(T);
+  }
+
+  ~Ndarray1DIndexer() { Py_XDECREF(arr_); }
+
+  int64_t size() const { return PyArray_SIZE(arr_); }
+
+  T& operator[](size_type index) { return *(data_ + index * stride_); }
+
+ private:
+  PyArrayObject* arr_;
+  T* data_;
+  int64_t stride_;
+};
+
+}  // namespace py
+}  // namespace arrow
+
+#endif  // ARROW_PYTHON_NUMPY_INTERNAL_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 01019e5..9577892 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -47,6 +47,7 @@
 #include "arrow/python/builtin_convert.h"
 #include "arrow/python/common.h"
 #include "arrow/python/config.h"
+#include "arrow/python/numpy-internal.h"
 #include "arrow/python/numpy_convert.h"
 #include "arrow/python/type_traits.h"
 #include "arrow/python/util/datetime.h"
@@ -70,15 +71,16 @@ static inline bool PyObject_is_string(const PyObject* obj) {
 }
 
 template <int TYPE>
-static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef npy_traits<TYPE> traits;
   typedef typename traits::value_type T;
 
   int64_t null_count = 0;
-  const T* values = reinterpret_cast<const T*>(data);
+
+  Ndarray1DIndexer<T> values(arr);
 
   // TODO(wesm): striding
-  for (int i = 0; i < length; ++i) {
+  for (int i = 0; i < values.size(); ++i) {
     if (traits::isnull(values[i])) {
       ++null_count;
     } else {
@@ -92,8 +94,8 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap)
 // Returns null count
 static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   int64_t null_count = 0;
-  const uint8_t* mask_values = static_cast<const uint8_t*>(PyArray_DATA(mask));
-  // TODO(wesm): strided null mask
+
+  Ndarray1DIndexer<uint8_t> mask_values(mask);
   for (int i = 0; i < length; ++i) {
     if (mask_values[i]) {
       ++null_count;
@@ -138,13 +140,24 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
   return Status::OK();
 }
 
-Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
-    PyObject** objects, bool* have_bytes) {
+static Status AppendObjectStrings(
+    PyArrayObject* arr, PyArrayObject* mask, StringBuilder* builder, bool* have_bytes) {
   PyObject* obj;
 
-  for (int64_t i = 0; i < objects_length; ++i) {
+  Ndarray1DIndexer<PyObject*> objects(arr);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask != nullptr) {
+    mask_values.Init(mask);
+    have_mask = true;
+  }
+
+  for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if (PyUnicode_Check(obj)) {
+    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+      RETURN_NOT_OK(builder->AppendNull());
+    } else if (PyUnicode_Check(obj)) {
       obj = PyUnicode_AsUTF8String(obj);
       if (obj == NULL) {
         PyErr_Clear();
@@ -158,8 +171,6 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
       *have_bytes = true;
       const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
       RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
-    } else if (PyObject_is_null(obj)) {
-      RETURN_NOT_OK(builder->AppendNull());
     } else {
       return InvalidConversion(obj, "string or bytes");
     }
@@ -168,13 +179,24 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
   return Status::OK();
 }
 
-static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width,
-    FixedSizeBinaryBuilder* builder, PyObject** objects) {
+static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask,
+    int byte_width, FixedSizeBinaryBuilder* builder) {
   PyObject* obj;
 
-  for (int64_t i = 0; i < objects_length; ++i) {
+  Ndarray1DIndexer<PyObject*> objects(arr);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask != nullptr) {
+    mask_values.Init(mask);
+    have_mask = true;
+  }
+
+  for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if (PyUnicode_Check(obj)) {
+    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+      RETURN_NOT_OK(builder->AppendNull());
+    } else if (PyUnicode_Check(obj)) {
       obj = PyUnicode_AsUTF8String(obj);
       if (obj == NULL) {
         PyErr_Clear();
@@ -190,8 +212,6 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width
       RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
       RETURN_NOT_OK(
           builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
-    } else if (PyObject_is_null(obj)) {
-      RETURN_NOT_OK(builder->AppendNull());
     } else {
       return InvalidConversion(obj, "string or bytes");
     }
@@ -299,8 +319,7 @@ class PandasConverter : public TypeVisitor {
     } else if (traits::supports_nulls) {
       // TODO(wesm): this presumes the NumPy C type and arrow C type are the
       // same
-      null_count = ValuesToBitmap<traits::npy_type>(
-          PyArray_DATA(arr_), length_, null_bitmap_data_);
+      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
     }
 
     std::vector<FieldMetadata> fields(1);
@@ -329,36 +348,33 @@ class PandasConverter : public TypeVisitor {
 
 #undef VISIT_NATIVE
 
-  Status Convert(std::shared_ptr<Array>* out) {
+  Status Convert() {
     if (PyArray_NDIM(arr_) != 1) {
       return Status::Invalid("only handle 1-dimensional arrays");
     }
-    // TODO(wesm): strided arrays
-    if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
 
     if (type_ == nullptr) { return Status::Invalid("Must pass data type"); }
 
     // Visit the type to perform conversion
     RETURN_NOT_OK(type_->Accept(this));
 
-    *out = out_;
     return Status::OK();
   }
 
+  std::shared_ptr<Array> result() const { return out_; }
+
   // ----------------------------------------------------------------------
   // Conversion logic for various object dtype arrays
 
   template <int ITEM_TYPE, typename ArrowType>
-  Status ConvertTypedLists(
-      const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
+  Status ConvertTypedLists(const std::shared_ptr<DataType>& type);
 
-  Status ConvertObjectStrings(std::shared_ptr<Array>* out);
-  Status ConvertObjectFixedWidthBytes(
-      const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-  Status ConvertBooleans(std::shared_ptr<Array>* out);
-  Status ConvertDates(std::shared_ptr<Array>* out);
-  Status ConvertLists(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-  Status ConvertObjects(std::shared_ptr<Array>* out);
+  Status ConvertObjectStrings();
+  Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
+  Status ConvertBooleans();
+  Status ConvertDates();
+  Status ConvertLists(const std::shared_ptr<DataType>& type);
+  Status ConvertObjects();
 
  protected:
   MemoryPool* pool_;
@@ -374,9 +390,31 @@ class PandasConverter : public TypeVisitor {
   uint8_t* null_bitmap_data_;
 };
 
+template <typename T>
+void CopyStrided(T* input_data, int64_t length, int64_t stride, T* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  int64_t j = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    output_data[i] = input_data[j];
+    j += stride;
+  }
+}
+
+template <>
+void CopyStrided<PyObject*>(
+    PyObject** input_data, int64_t length, int64_t stride, PyObject** output_data) {
+  int64_t j = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    output_data[i] = input_data[j];
+    if (output_data[i] != nullptr) { Py_INCREF(output_data[i]); }
+    j += stride;
+  }
+}
+
 template <typename ArrowType>
 inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   using traits = arrow_traits<ArrowType::type_id>;
+  using T = typename traits::T;
 
   // Handle LONGLONG->INT64 and other fun things
   int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
@@ -385,7 +423,20 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
     return Status::NotImplemented("NumPy type casts not yet implemented");
   }
 
-  *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+  if (is_strided()) {
+    // Strided, must copy into new contiguous memory
+    const int64_t stride = PyArray_STRIDES(arr_)[0];
+    const int64_t stride_elements = stride / sizeof(T);
+
+    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_));
+    CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_, stride_elements,
+        reinterpret_cast<T*>(new_buffer->mutable_data()));
+    *data = new_buffer;
+  } else {
+    // Can zero-copy
+    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+  }
   return Status::OK();
 }
 
@@ -395,7 +446,7 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
   auto buffer = std::make_shared<PoolBuffer>(pool_);
   RETURN_NOT_OK(buffer->Resize(nbytes));
 
-  const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<uint8_t> values(arr_);
 
   uint8_t* bitmap = buffer->mutable_data();
 
@@ -434,13 +485,22 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
   return Status::TypeError(ss.str());
 }
 
-Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertDates() {
   PyAcquireGIL lock;
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
   Date64Builder date_builder(pool_);
   RETURN_NOT_OK(date_builder.Resize(length_));
 
+  /// We have to run this in this compilation unit, since we cannot use the
+  /// datetime API otherwise
+  PyDateTime_IMPORT;
+
   Status s;
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
@@ -454,50 +514,57 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
       return InvalidConversion(obj, "date");
     }
   }
-  return date_builder.Finish(out);
+  return date_builder.Finish(&out_);
 }
 
-Status PandasConverter::ConvertObjectStrings(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertObjectStrings() {
   PyAcquireGIL lock;
 
   // The output type at this point is inconclusive because there may be bytes
   // and unicode mixed in the object array
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
   StringBuilder builder(pool_);
   RETURN_NOT_OK(builder.Resize(length_));
 
   Status s;
   bool have_bytes = false;
-  RETURN_NOT_OK(AppendObjectStrings(length_, &builder, objects, &have_bytes));
-  RETURN_NOT_OK(builder.Finish(out));
+  RETURN_NOT_OK(AppendObjectStrings(arr_, mask_, &builder, &have_bytes));
+  RETURN_NOT_OK(builder.Finish(&out_));
 
   if (have_bytes) {
-    const auto& arr = static_cast<const StringArray&>(*out->get());
-    *out = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
+    const auto& arr = static_cast<const StringArray&>(*out_);
+    out_ = std::make_shared<BinaryArray>(arr.length(), arr.value_offsets(), arr.data(),
         arr.null_bitmap(), arr.null_count());
   }
   return Status::OK();
 }
 
 Status PandasConverter::ConvertObjectFixedWidthBytes(
-    const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+    const std::shared_ptr<DataType>& type) {
   PyAcquireGIL lock;
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+
+  int32_t value_size = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
+
   FixedSizeBinaryBuilder builder(pool_, type);
   RETURN_NOT_OK(builder.Resize(length_));
-  RETURN_NOT_OK(AppendObjectFixedWidthBytes(length_,
-      std::dynamic_pointer_cast<FixedSizeBinaryType>(builder.type())->byte_width(),
-      &builder, objects));
-  RETURN_NOT_OK(builder.Finish(out));
+  RETURN_NOT_OK(AppendObjectFixedWidthBytes(arr_, mask_, value_size, &builder));
+  RETURN_NOT_OK(builder.Finish(&out_));
   return Status::OK();
 }
 
-Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertBooleans() {
   PyAcquireGIL lock;
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
 
   int64_t nbytes = BitUtil::BytesForBits(length_);
   auto data = std::make_shared<PoolBuffer>(pool_);
@@ -509,24 +576,24 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if (obj == Py_True) {
+    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+      ++null_count;
+    } else if (obj == Py_True) {
       BitUtil::SetBit(bitmap, i);
       BitUtil::SetBit(null_bitmap_data_, i);
     } else if (obj == Py_False) {
       BitUtil::SetBit(null_bitmap_data_, i);
-    } else if (PyObject_is_null(obj)) {
-      ++null_count;
     } else {
       return InvalidConversion(obj, "bool");
     }
   }
 
-  *out = std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count);
+  out_ = std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count);
 
   return Status::OK();
 }
 
-Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertObjects() {
   // Python object arrays are annoying, since we could have one of:
   //
   // * Strings
@@ -538,31 +605,27 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
 
   RETURN_NOT_OK(InitNullBitmap());
 
-  // TODO: mask not supported here
-  if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
-  }
+  Ndarray1DIndexer<PyObject*> objects;
 
-  const PyObject** objects;
   {
     PyAcquireGIL lock;
-    objects = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
+    objects.Init(arr_);
     PyDateTime_IMPORT;
   }
 
   if (type_) {
     switch (type_->type) {
       case Type::STRING:
-        return ConvertObjectStrings(out);
+        return ConvertObjectStrings();
       case Type::FIXED_SIZE_BINARY:
-        return ConvertObjectFixedWidthBytes(type_, out);
+        return ConvertObjectFixedWidthBytes(type_);
       case Type::BOOL:
-        return ConvertBooleans(out);
+        return ConvertBooleans();
       case Type::DATE64:
-        return ConvertDates(out);
+        return ConvertDates();
       case Type::LIST: {
         const auto& list_field = static_cast<const ListType&>(*type_);
-        return ConvertLists(list_field.value_field()->type, out);
+        return ConvertLists(list_field.value_field()->type);
       }
       default:
         return Status::TypeError("No known conversion to Arrow type");
@@ -572,11 +635,11 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
       if (PyObject_is_null(objects[i])) {
         continue;
       } else if (PyObject_is_string(objects[i])) {
-        return ConvertObjectStrings(out);
+        return ConvertObjectStrings();
       } else if (PyBool_Check(objects[i])) {
-        return ConvertBooleans(out);
+        return ConvertBooleans();
       } else if (PyDate_CheckExact(objects[i])) {
-        return ConvertDates(out);
+        return ConvertDates();
       } else {
         return InvalidConversion(
             const_cast<PyObject*>(objects[i]), "string, bool, or date");
@@ -588,14 +651,22 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
 }
 
 template <int ITEM_TYPE, typename ArrowType>
-inline Status PandasConverter::ConvertTypedLists(
-    const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type) {
   typedef npy_traits<ITEM_TYPE> traits;
   typedef typename traits::value_type T;
   typedef typename traits::BuilderClass BuilderT;
 
   PyAcquireGIL lock;
 
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  if (is_strided()) {
+    return Status::NotImplemented("strided arrays not implemented for lists");
+  }
+
   auto value_builder = std::make_shared<BuilderT>(pool_, type);
   ListBuilder list_builder(pool_, value_builder);
   PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
@@ -637,16 +708,25 @@ inline Status PandasConverter::ConvertTypedLists(
       return Status::TypeError("Unsupported Python type for list items");
     }
   }
-  return list_builder.Finish(out);
+  return list_builder.Finish(&out_);
 }
 
 template <>
 inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
-    const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+    const std::shared_ptr<DataType>& type) {
   PyAcquireGIL lock;
   // TODO: If there are bytes involed, convert to Binary representation
   bool have_bytes = false;
 
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  if (is_strided()) {
+    return Status::NotImplemented("strided arrays not implemented for lists");
+  }
+
   auto value_builder = std::make_shared<StringBuilder>(pool_);
   ListBuilder list_builder(pool_, value_builder);
   PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
@@ -660,9 +740,8 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
       // TODO(uwe): Support more complex numpy array structures
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
 
-      int64_t size = static_cast<int64_t>(PyArray_DIM(numpy_array, 0));
-      auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
-      RETURN_NOT_OK(AppendObjectStrings(size, value_builder.get(), data, &have_bytes));
+      RETURN_NOT_OK(
+          AppendObjectStrings(numpy_array, nullptr, value_builder.get(), &have_bytes));
     } else if (PyList_Check(objects[i])) {
       int64_t size;
       std::shared_ptr<DataType> inferred_type;
@@ -678,16 +757,15 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
       return Status::TypeError("Unsupported Python type for list items");
     }
   }
-  return list_builder.Finish(out);
+  return list_builder.Finish(&out_);
 }
 
-#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                  \
-  case Type::TYPE: {                                            \
-    return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, out); \
+#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)             \
+  case Type::TYPE: {                                       \
+    return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type); \
   }
 
-Status PandasConverter::ConvertLists(
-    const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
   switch (type->type) {
     LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
     LIST_CASE(INT8, NPY_INT8, Int8Type)
@@ -711,13 +789,17 @@ Status PandasConverter::ConvertLists(
 Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
   PandasConverter converter(pool, ao, mo, type);
-  return converter.Convert(out);
+  RETURN_NOT_OK(converter.Convert());
+  *out = converter.result();
+  return Status::OK();
 }
 
 Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
   PandasConverter converter(pool, ao, mo, type);
-  return converter.ConvertObjects(out);
+  RETURN_NOT_OK(converter.ConvertObjects());
+  *out = converter.result();
+  return Status::OK();
 }
 
 // ----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index e7c456d..67785e3 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -82,8 +82,8 @@ cdef class Array:
 
     @staticmethod
     def from_numpy(obj, mask=None, DataType type=None,
-                    timestamps_to_ms=False,
-                    MemoryPool memory_pool=None):
+                   timestamps_to_ms=False,
+                   MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array.
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/f05b7c62/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 0b3c02e..56830a8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -75,16 +75,25 @@ class TestPandasConversion(unittest.TestCase):
             expected = df
         tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
 
-    def _check_array_roundtrip(self, values, expected=None,
+    def _check_array_roundtrip(self, values, expected=None, mask=None,
                                timestamps_to_ms=False, type=None):
         arr = A.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
-                                 type=type)
+                                 mask=mask, type=type)
         result = arr.to_pandas()
 
-        assert arr.null_count == pd.isnull(values).sum()
+        values_nulls = pd.isnull(values)
+        if mask is None:
+            assert arr.null_count == values_nulls.sum()
+        else:
+            assert arr.null_count == (mask | values_nulls).sum()
 
-        tm.assert_series_equal(pd.Series(result), pd.Series(values),
-                               check_names=False)
+        if mask is None:
+            tm.assert_series_equal(pd.Series(result), pd.Series(values),
+                                   check_names=False)
+        else:
+            expected = pd.Series(np.ma.masked_array(values, mask=mask))
+            tm.assert_series_equal(pd.Series(result), expected,
+                                   check_names=False)
 
     def test_float_no_nulls(self):
         data = {}
@@ -402,3 +411,43 @@ class TestPandasConversion(unittest.TestCase):
         data = pd.DataFrame({'a': ['a', 1, 2.0]})
         with self.assertRaises(A.error.ArrowException):
             A.Table.from_pandas(data)
+
+    def test_strided_data_import(self):
+        cases = []
+
+        columns = ['a', 'b', 'c']
+        N, K = 100, 3
+        random_numbers = np.random.randn(N, K).copy() * 100
+
+        numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
+                          'f4', 'f8']
+
+        for type_name in numeric_dtypes:
+            cases.append(random_numbers.astype(type_name))
+
+        # strings
+        cases.append(np.array([tm.rands(10) for i in range(N * K)],
+                              dtype=object)
+                     .reshape(N, K).copy())
+
+        # booleans
+        boolean_objects = (np.array([True, False, True] * N, dtype=object)
+                           .reshape(N, K).copy())
+
+        # add some nulls, so dtype comes back as objects
+        boolean_objects[5] = None
+        cases.append(boolean_objects)
+
+        cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
+                               dtype='datetime64[ms]')
+                     .reshape(N, K).copy())
+
+        strided_mask = (random_numbers > 0).astype(bool)[:, 0]
+
+        for case in cases:
+            df = pd.DataFrame(case, columns=columns)
+            col = df['a']
+
+            self._check_pandas_roundtrip(df)
+            self._check_array_roundtrip(col)
+            self._check_array_roundtrip(col, mask=strided_mask)


Mime
View raw message