Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 5FABF200D1D for ; Sat, 14 Oct 2017 17:46:45 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 5E3D11609EA; Sat, 14 Oct 2017 15:46:45 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 7D0761609D5 for ; Sat, 14 Oct 2017 17:46:44 +0200 (CEST) Received: (qmail 79988 invoked by uid 500); 14 Oct 2017 15:46:43 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 79978 invoked by uid 99); 14 Oct 2017 15:46:43 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 14 Oct 2017 15:46:43 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 8DD71DFAE1; Sat, 14 Oct 2017 15:46:43 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@arrow.apache.org Message-Id: <0fb4f647e4994f69a494880c72ae6c05@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: arrow git commit: ARROW-1670: [Serialization] Speed up deserialization by getting rid of smart pointer overhead Date: Sat, 14 Oct 2017 15:46:43 +0000 (UTC) archived-at: Sat, 14 Oct 2017 15:46:45 -0000 Repository: arrow Updated Branches: refs/heads/master dc533211a -> 894f74009 ARROW-1670: [Serialization] Speed up deserialization by getting rid of smart pointer overhead This optimization makes our deserialization codepath faster than pickle pretty much across 
the board. Author: Philipp Moritz Author: Wes McKinney Closes #1197 from pcmoritz/deserialization-speedup and squashes the following commits: 88cf3ea1 [Wes McKinney] clang-format c1ddbcd5 [Philipp Moritz] introduce unsafe child function for UnionArray ae0a7202 [Philipp Moritz] speed up deserialization by getting rid of smart pointer overhead Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/894f7400 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/894f7400 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/894f7400 Branch: refs/heads/master Commit: 894f7400977693b4e0e8f4b9845fd89481f6bf29 Parents: dc53321 Author: Philipp Moritz Authored: Sat Oct 14 11:46:36 2017 -0400 Committer: Wes McKinney Committed: Sat Oct 14 11:46:36 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/array.cc | 8 +++++ cpp/src/arrow/array.h | 3 ++ cpp/src/arrow/python/arrow_to_python.cc | 52 +++++++++++++--------------- 3 files changed, 36 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index cd1721f..a7930a1 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -375,6 +375,14 @@ std::shared_ptr<Array> UnionArray::child(int i) const { return boxed_fields_[i]; } +const Array* UnionArray::UnsafeChild(int i) const { + if (!boxed_fields_[i]) { + DCHECK(MakeArray(data_->child_data[i], &boxed_fields_[i]).ok()); + } + DCHECK(boxed_fields_[i]); + return boxed_fields_[i].get(); +} + // ---------------------------------------------------------------------- // DictionaryArray http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/array.h
---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 36bceeb..0805cad 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -610,6 +610,9 @@ class ARROW_EXPORT UnionArray : public Array { std::shared_ptr<Array> child(int pos) const; + /// Only use this while the UnionArray is in scope + const Array* UnsafeChild(int pos) const; + protected: void SetData(const std::shared_ptr<ArrayData>& data); http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/python/arrow_to_python.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index de05a23..ac459d4 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -163,26 +163,25 @@ Status GetValue(PyObject* context, const Array& arr, int64_t index, int32_t type return Status::OK(); } -#define DESERIALIZE_SEQUENCE(CREATE_FN, SET_ITEM_FN) \ - const auto& data = static_cast<const UnionArray&>(array); \ - int64_t size = array.length(); \ - ScopedRef result(CREATE_FN(stop_idx - start_idx)); \ - auto types = std::make_shared<Int8Array>(size, data.type_ids()); \ - auto offsets = std::make_shared<Int32Array>(size, data.value_offsets()); \ - for (int64_t i = start_idx; i < stop_idx; ++i) { \ - if (data.IsNull(i)) { \ - Py_INCREF(Py_None); \ - SET_ITEM_FN(result.get(), i - start_idx, Py_None); \ - } else { \ - int64_t offset = offsets->Value(i); \ - int8_t type = types->Value(i); \ - PyObject* value; \ - RETURN_NOT_OK( \ - GetValue(context, *data.child(type), offset, type, base, tensors, &value)); \ - SET_ITEM_FN(result.get(), i - start_idx, value); \ - } \ - } \ - *out = result.release(); \ +#define DESERIALIZE_SEQUENCE(CREATE_FN, SET_ITEM_FN) \ + const auto& data = static_cast<const UnionArray&>(array); \ + ScopedRef result(CREATE_FN(stop_idx - start_idx)); \ + const uint8_t* type_ids = data.raw_type_ids(); \ + const int32_t* value_offsets =
data.raw_value_offsets(); \ + for (int64_t i = start_idx; i < stop_idx; ++i) { \ + if (data.IsNull(i)) { \ + Py_INCREF(Py_None); \ + SET_ITEM_FN(result.get(), i - start_idx, Py_None); \ + } else { \ + int64_t offset = value_offsets[i]; \ + uint8_t type = type_ids[i]; \ + PyObject* value; \ + RETURN_NOT_OK(GetValue(context, *data.UnsafeChild(type), offset, type, base, \ + tensors, &value)); \ + SET_ITEM_FN(result.get(), i - start_idx, value); \ + } \ + } \ + *out = result.release(); \ return Status::OK() Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx, @@ -204,10 +203,9 @@ Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx, const std::vector<std::shared_ptr<Tensor>>& tensors, PyObject** out) { const auto& data = static_cast<const UnionArray&>(array); - int64_t size = array.length(); ScopedRef result(PySet_New(nullptr)); - auto types = std::make_shared<Int8Array>(size, data.type_ids()); - auto offsets = std::make_shared<Int32Array>(size, data.value_offsets()); + const uint8_t* type_ids = data.raw_type_ids(); + const int32_t* value_offsets = data.raw_value_offsets(); for (int64_t i = start_idx; i < stop_idx; ++i) { if (data.IsNull(i)) { Py_INCREF(Py_None); @@ -215,11 +213,11 @@ Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx, RETURN_IF_PYERROR(); } } else { - int64_t offset = offsets->Value(i); - int8_t type = types->Value(i); + int32_t offset = value_offsets[i]; + int8_t type = type_ids[i]; PyObject* value; - RETURN_NOT_OK( - GetValue(context, *data.child(type), offset, type, base, tensors, &value)); + RETURN_NOT_OK(GetValue(context, *data.UnsafeChild(type), offset, type, base, + tensors, &value)); if (PySet_Add(result.get(), value) < 0) { RETURN_IF_PYERROR(); }