Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 9FDF8200D29 for ; Thu, 26 Oct 2017 15:58:57 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 9E5B2160BF2; Thu, 26 Oct 2017 13:58:57 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 983BC1609E8 for ; Thu, 26 Oct 2017 15:58:56 +0200 (CEST) Received: (qmail 38963 invoked by uid 500); 26 Oct 2017 13:58:55 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 38954 invoked by uid 99); 26 Oct 2017 13:58:55 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 26 Oct 2017 13:58:55 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 0BD5581BDC; Thu, 26 Oct 2017 13:58:53 +0000 (UTC) Date: Thu, 26 Oct 2017 13:58:53 +0000 To: "commits@arrow.apache.org" Subject: [arrow] branch master updated: ARROW-1689: [Python] Allow user to request no data copies MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <150902633342.1994.8530657807665251362@gitbox.apache.org> From: wesm@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: arrow X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: c30a7e30af2469dde1a00f74d8ba9631887825c4 X-Git-Newrev: 6b16cca3e081fc05652d75f2d4f854b958d9f6d9 X-Git-Rev: 6b16cca3e081fc05652d75f2d4f854b958d9f6d9 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: 
auto-generated archived-at: Thu, 26 Oct 2017 13:58:57 -0000 This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 6b16cca ARROW-1689: [Python] Allow user to request no data copies 6b16cca is described below commit 6b16cca3e081fc05652d75f2d4f854b958d9f6d9 Author: Nick White AuthorDate: Thu Oct 26 09:58:48 2017 -0400 ARROW-1689: [Python] Allow user to request no data copies This makes performance debugging much easier, as it allows you to track down what (Arrow) data is causing unexpected delays in loading. It also makes testing features like ARROW-1689 easier as you can prove (via unit tests) that copies are not being made. Author: Nick White Closes #1233 from njwhite/feature/zerocopycategories and squashes the following commits: b06f50d3 [Nick White] ARROW-1689 Don't Deserialize the Dictionary Twice a968b0b8 [Nick White] ARROW-1689 Allow User To Request No Data Copies --- cpp/src/arrow/python/arrow_to_pandas.cc | 43 ++++++++++++++++++++--------- cpp/src/arrow/python/arrow_to_pandas.h | 3 ++ python/pyarrow/array.pxi | 9 ++++-- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/table.pxi | 15 +++++++--- python/pyarrow/tests/test_convert_pandas.py | 40 +++++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 88b594c..0c2e0ad 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1040,6 +1040,8 @@ class CategoricalBlock : public PandasBlock { return Status::OK(); } + PyObject* dictionary() const { return dictionary_.obj(); } + protected: MemoryPool* pool_; OwnedRef dictionary_; @@ -1399,6 +1401,11 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { return 
ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); + } else if (options_.zero_copy_only) { + std::stringstream ss; + ss << "Needed to copy " << data_.num_chunks() << " chunks with " + << data_.null_count() << " nulls, but zero_copy_only was True"; + return Status::Invalid(ss.str()); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1413,6 +1420,10 @@ class ArrowDeserializer { std::is_base_of::value, Status>::type Visit(const Type& type) { + if (options_.zero_copy_only) { + return Status::Invalid("Copy Needed, but zero_copy_only was True"); + } + constexpr int TYPE = Type::type_id; using traits = internal::arrow_traits; using c_type = typename Type::c_type; @@ -1453,6 +1464,11 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) { return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); + } else if (options_.zero_copy_only) { + std::stringstream ss; + ss << "Needed to copy " << data_.num_chunks() << " chunks with " + << data_.null_count() << " nulls, but zero_copy_only was True"; + return Status::Invalid(ss.str()); } if (data_.null_count() > 0) { @@ -1470,6 +1486,9 @@ class ArrowDeserializer { template inline Status VisitObjects(FUNCTOR func) { + if (options_.zero_copy_only) { + return Status::Invalid("Object types need copies, but zero_copy_only was True"); + } RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); return func(options_, data_, out_values); @@ -1499,7 +1518,9 @@ class ArrowDeserializer { // Boolean specialization Status Visit(const BooleanType& type) { - if (data_.null_count() > 0) { + if (options_.zero_copy_only) { + return Status::Invalid("BooleanType needs copies, but zero_copy_only was True"); + } else if (data_.null_count() > 0) { return VisitObjects(ConvertBooleanWithNulls); } else { RETURN_NOT_OK(AllocateOutput(internal::arrow_traits::npy_type)); @@ -1510,6 +1531,9 @@ class ArrowDeserializer { } Status Visit(const ListType& 
type) { + if (options_.zero_copy_only) { + return Status::Invalid("ListType needs copies, but zero_copy_only was True"); + } #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ return ConvertListsLike(options_, col_, out_values); @@ -1542,26 +1566,19 @@ class ArrowDeserializer { } Status Visit(const DictionaryType& type) { + if (options_.zero_copy_only) { + return Status::Invalid("DictionaryType needs copies, but zero_copy_only was True"); + } + auto block = std::make_shared(options_, nullptr, col_->length()); RETURN_NOT_OK(block->Write(col_, 0, 0)); - auto dict_type = static_cast(col_->type().get()); - PyAcquireGIL lock; result_ = PyDict_New(); RETURN_IF_PYERROR(); - PyObject* dictionary; - - // Release GIL before calling ConvertArrayToPandas, will be reacquired - // there if needed - lock.release(); - RETURN_NOT_OK( - ConvertArrayToPandas(options_, dict_type->dictionary(), nullptr, &dictionary)); - lock.acquire(); - PyDict_SetItemString(result_, "indices", block->block_arr()); - PyDict_SetItemString(result_, "dictionary", dictionary); + PyDict_SetItemString(result_, "dictionary", block->dictionary()); return Status::OK(); } diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 1d716a5..1e48646 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -41,6 +41,9 @@ namespace py { struct PandasOptions { bool strings_to_categorical; + bool zero_copy_only; + + PandasOptions() : strings_to_categorical(false), zero_copy_only(false) {} }; ARROW_EXPORT diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 72262f0..c57eda1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -373,7 +373,7 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self, c_bool strings_to_categorical=False): + def to_pandas(self, c_bool strings_to_categorical=False, zero_copy_only=False): """ Convert to an array object 
suitable for use in pandas @@ -381,6 +381,9 @@ cdef class Array: ---------- strings_to_categorical : boolean, default False Encode string (UTF8) and binary types to pandas.Categorical + zero_copy_only : boolean, default False + Raise an ArrowException if this function call would require copying + the underlying data See also -------- @@ -392,7 +395,9 @@ cdef class Array: PyObject* out PandasOptions options - options = PandasOptions(strings_to_categorical=strings_to_categorical) + options = PandasOptions( + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only) with nogil: check_status(ConvertArrayToPandas(options, self.sp_array, self, &out)) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 809bb96..731ef94 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -814,6 +814,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef struct PandasOptions: c_bool strings_to_categorical + c_bool zero_copy_only cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index eb19115..6165a66 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -163,7 +163,7 @@ cdef class Column: sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) return pyarrow_wrap_column(sp_column) - def to_pandas(self, strings_to_categorical=False): + def to_pandas(self, strings_to_categorical=False, zero_copy_only=False): """ Convert the arrow::Column to a pandas.Series @@ -175,7 +175,9 @@ cdef class Column: PyObject* out PandasOptions options - options = PandasOptions(strings_to_categorical=strings_to_categorical) + options = PandasOptions( + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only) with nogil: check_status(libarrow.ConvertColumnToPandas(options, @@ -857,7 +859,7 @@ cdef class Table: return pyarrow_wrap_table(c_table) def 
to_pandas(self, nthreads=None, strings_to_categorical=False, - memory_pool=None): + memory_pool=None, zero_copy_only=False): """ Convert the arrow::Table to a pandas DataFrame @@ -871,6 +873,9 @@ cdef class Table: Encode string (UTF8) and binary types to pandas.Categorical memory_pool: MemoryPool, optional Specific memory pool to use to allocate casted columns + zero_copy_only : boolean, default False + Raise an ArrowException if this function call would require copying + the underlying data Returns ------- @@ -878,7 +883,9 @@ cdef class Table: """ cdef: PandasOptions options - options = PandasOptions(strings_to_categorical=strings_to_categorical) + options = PandasOptions( + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only) self._check_nullptr() if nthreads is None: nthreads = cpu_count() diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 6d146f9..ddb7eb4 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -213,6 +213,46 @@ class TestPandasConversion(object): schema = pa.schema(fields) self._check_pandas_roundtrip(df, expected_schema=schema) + def test_zero_copy_success(self): + result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) + npt.assert_array_equal(result, [0, 1, 2]) + + def test_zero_copy_failure_on_object_types(self): + with self.assertRaises(pa.ArrowException): + pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_with_int_when_nulls(self): + with self.assertRaises(pa.ArrowException): + pa.array([0, 1, None]).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_with_float_when_nulls(self): + with self.assertRaises(pa.ArrowException): + pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_on_bool_types(self): + with self.assertRaises(pa.ArrowException): + pa.array([True, False]).to_pandas(zero_copy_only=True) + + def 
test_zero_copy_failure_on_list_types(self): + arr = np.array([[1, 2], [8, 9]], dtype=object) + + with self.assertRaises(pa.ArrowException): + pa.array(arr).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_on_timestamp_types(self): + arr = np.array(['2007-07-13'], dtype='datetime64[ns]') + + with self.assertRaises(pa.ArrowException): + pa.array(arr).to_pandas(zero_copy_only=True) + + def test_zero_copy_dictionaries(self): + arr = pa.DictionaryArray.from_arrays( + np.array([0, 0]), + np.array(['A'])) + + with self.assertRaises(pa.ArrowException): + arr.to_pandas(zero_copy_only=True) + def test_float_nulls(self): num_values = 100 -- To stop receiving notification emails like this one, please contact ['"commits@arrow.apache.org" <commits@arrow.apache.org>'].