arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-1689: [Python] Allow user to request no data copies
Date Thu, 26 Oct 2017 13:58:53 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6b16cca  ARROW-1689: [Python] Allow user to request no data copies
6b16cca is described below

commit 6b16cca3e081fc05652d75f2d4f854b958d9f6d9
Author: Nick White <n.j.white@gmail.com>
AuthorDate: Thu Oct 26 09:58:48 2017 -0400

    ARROW-1689: [Python] Allow user to request no data copies
    
    This makes performance debugging much easier, as it allows you to track down what (Arrow)
    data is causing unexpected delays in loading. It also makes testing features like ARROW-1689
    easier as you can prove (via unit tests) that copies are not being made.
    
    Author: Nick White <n.j.white@gmail.com>
    
    Closes #1233 from njwhite/feature/zerocopycategories and squashes the following commits:
    
    b06f50d3 [Nick White] ARROW-1689 Don't Deserialize the Dictionary Twice
    a968b0b8 [Nick White] ARROW-1689 Allow User To Request No Data Copies
---
 cpp/src/arrow/python/arrow_to_pandas.cc     | 43 ++++++++++++++++++++---------
 cpp/src/arrow/python/arrow_to_pandas.h      |  3 ++
 python/pyarrow/array.pxi                    |  9 ++++--
 python/pyarrow/includes/libarrow.pxd        |  1 +
 python/pyarrow/table.pxi                    | 15 +++++++---
 python/pyarrow/tests/test_convert_pandas.py | 40 +++++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 88b594c..0c2e0ad 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -1040,6 +1040,8 @@ class CategoricalBlock : public PandasBlock {
     return Status::OK();
   }
 
+  PyObject* dictionary() const { return dictionary_.obj(); }
+
  protected:
   MemoryPool* pool_;
   OwnedRef dictionary_;
@@ -1399,6 +1401,11 @@ class ArrowDeserializer {
 
     if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
       return ConvertValuesZeroCopy<TYPE>(options_, npy_type, data_.chunk(0));
+    } else if (options_.zero_copy_only) {
+      std::stringstream ss;
+      ss << "Needed to copy " << data_.num_chunks() << " chunks with "
+         << data_.null_count() << " nulls, but zero_copy_only was True";
+      return Status::Invalid(ss.str());
     }
 
     RETURN_NOT_OK(AllocateOutput(npy_type));
@@ -1413,6 +1420,10 @@ class ArrowDeserializer {
                               std::is_base_of<TimestampType, Type>::value,
                           Status>::type
   Visit(const Type& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Copy Needed, but zero_copy_only was True");
+    }
+
     constexpr int TYPE = Type::type_id;
     using traits = internal::arrow_traits<TYPE>;
     using c_type = typename Type::c_type;
@@ -1453,6 +1464,11 @@ class ArrowDeserializer {
 
     if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
       return ConvertValuesZeroCopy<TYPE>(options_, traits::npy_type, data_.chunk(0));
+    } else if (options_.zero_copy_only) {
+      std::stringstream ss;
+      ss << "Needed to copy " << data_.num_chunks() << " chunks with "
+         << data_.null_count() << " nulls, but zero_copy_only was True";
+      return Status::Invalid(ss.str());
     }
 
     if (data_.null_count() > 0) {
@@ -1470,6 +1486,9 @@ class ArrowDeserializer {
 
   template <typename FUNCTOR>
   inline Status VisitObjects(FUNCTOR func) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Object types need copies, but zero_copy_only was True");
+    }
     RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
     auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
     return func(options_, data_, out_values);
@@ -1499,7 +1518,9 @@ class ArrowDeserializer {
 
   // Boolean specialization
   Status Visit(const BooleanType& type) {
-    if (data_.null_count() > 0) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("BooleanType needs copies, but zero_copy_only was True");
+    } else if (data_.null_count() > 0) {
       return VisitObjects(ConvertBooleanWithNulls);
     } else {
       RETURN_NOT_OK(AllocateOutput(internal::arrow_traits<Type::BOOL>::npy_type));
@@ -1510,6 +1531,9 @@ class ArrowDeserializer {
   }
 
   Status Visit(const ListType& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("ListType needs copies, but zero_copy_only was True");
+    }
 #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \
   case Type::ArrowEnum:                                    \
     return ConvertListsLike<ArrowType>(options_, col_, out_values);
@@ -1542,26 +1566,19 @@ class ArrowDeserializer {
   }
 
   Status Visit(const DictionaryType& type) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("DictionaryType needs copies, but zero_copy_only was True");
+    }
+
     auto block = std::make_shared<CategoricalBlock>(options_, nullptr, col_->length());
     RETURN_NOT_OK(block->Write(col_, 0, 0));
 
-    auto dict_type = static_cast<const DictionaryType*>(col_->type().get());
-
     PyAcquireGIL lock;
     result_ = PyDict_New();
     RETURN_IF_PYERROR();
 
-    PyObject* dictionary;
-
-    // Release GIL before calling ConvertArrayToPandas, will be reacquired
-    // there if needed
-    lock.release();
-    RETURN_NOT_OK(
-        ConvertArrayToPandas(options_, dict_type->dictionary(), nullptr, &dictionary));
-    lock.acquire();
-
     PyDict_SetItemString(result_, "indices", block->block_arr());
-    PyDict_SetItemString(result_, "dictionary", dictionary);
+    PyDict_SetItemString(result_, "dictionary", block->dictionary());
 
     return Status::OK();
   }
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 1d716a5..1e48646 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,9 @@ namespace py {
 
 struct PandasOptions {
   bool strings_to_categorical;
+  bool zero_copy_only;
+
+  PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {}
 };
 
 ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 72262f0..c57eda1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -373,7 +373,7 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, c_bool strings_to_categorical=False):
+    def to_pandas(self, c_bool strings_to_categorical=False, zero_copy_only=False):
         """
         Convert to an array object suitable for use in pandas
 
@@ -381,6 +381,9 @@ cdef class Array:
         ----------
         strings_to_categorical : boolean, default False
             Encode string (UTF8) and binary types to pandas.Categorical
+        zero_copy_only : boolean, default False
+            Raise an ArrowException if this function call would require copying
+            the underlying data
 
         See also
         --------
@@ -392,7 +395,9 @@ cdef class Array:
             PyObject* out
             PandasOptions options
 
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
         with nogil:
             check_status(ConvertArrayToPandas(options, self.sp_array,
                                               self, &out))
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 809bb96..731ef94 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -814,6 +814,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
 
     cdef struct PandasOptions:
         c_bool strings_to_categorical
+        c_bool zero_copy_only
 
 cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index eb19115..6165a66 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -163,7 +163,7 @@ cdef class Column:
         sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array))
         return pyarrow_wrap_column(sp_column)
 
-    def to_pandas(self, strings_to_categorical=False):
+    def to_pandas(self, strings_to_categorical=False, zero_copy_only=False):
         """
         Convert the arrow::Column to a pandas.Series
 
@@ -175,7 +175,9 @@ cdef class Column:
             PyObject* out
             PandasOptions options
 
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
 
         with nogil:
             check_status(libarrow.ConvertColumnToPandas(options,
@@ -857,7 +859,7 @@ cdef class Table:
         return pyarrow_wrap_table(c_table)
 
     def to_pandas(self, nthreads=None, strings_to_categorical=False,
-                  memory_pool=None):
+                  memory_pool=None, zero_copy_only=False):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -871,6 +873,9 @@ cdef class Table:
             Encode string (UTF8) and binary types to pandas.Categorical
         memory_pool: MemoryPool, optional
             Specific memory pool to use to allocate casted columns
+        zero_copy_only : boolean, default False
+            Raise an ArrowException if this function call would require copying
+            the underlying data
 
         Returns
         -------
@@ -878,7 +883,9 @@ cdef class Table:
         """
         cdef:
             PandasOptions options
-        options = PandasOptions(strings_to_categorical=strings_to_categorical)
+        options = PandasOptions(
+            strings_to_categorical=strings_to_categorical,
+            zero_copy_only=zero_copy_only)
         self._check_nullptr()
         if nthreads is None:
             nthreads = cpu_count()
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 6d146f9..ddb7eb4 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -213,6 +213,46 @@ class TestPandasConversion(object):
         schema = pa.schema(fields)
         self._check_pandas_roundtrip(df, expected_schema=schema)
 
+    def test_zero_copy_success(self):
+        result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
+        npt.assert_array_equal(result, [0, 1, 2])
+
+    def test_zero_copy_failure_on_object_types(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_with_int_when_nulls(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([0, 1, None]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_with_float_when_nulls(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_bool_types(self):
+        with self.assertRaises(pa.ArrowException):
+            pa.array([True, False]).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_list_types(self):
+        arr = np.array([[1, 2], [8, 9]], dtype=object)
+
+        with self.assertRaises(pa.ArrowException):
+            pa.array(arr).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_failure_on_timestamp_types(self):
+        arr = np.array(['2007-07-13'], dtype='datetime64[ns]')
+
+        with self.assertRaises(pa.ArrowException):
+            pa.array(arr).to_pandas(zero_copy_only=True)
+
+    def test_zero_copy_dictionaries(self):
+        arr = pa.DictionaryArray.from_arrays(
+            np.array([0, 0]),
+            np.array(['A']))
+
+        with self.assertRaises(pa.ArrowException):
+            arr.to_pandas(zero_copy_only=True)
+
     def test_float_nulls(self):
         num_values = 100
 

-- 
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <commits@arrow.apache.org>.

Mime
View raw message