arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1670: [Serialization] Speed up deserialization by getting rid of smart pointer overhead
Date Sat, 14 Oct 2017 15:46:43 GMT
Repository: arrow
Updated Branches:
  refs/heads/master dc533211a -> 894f74009


ARROW-1670: [Serialization] Speed up deserialization by getting rid of smart pointer overhead

This optimization makes our deserialization codepath faster than pickle pretty much across
the board.

Author: Philipp Moritz <pcmoritz@gmail.com>
Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1197 from pcmoritz/deserialization-speedup and squashes the following commits:

88cf3ea1 [Wes McKinney] clang-format
c1ddbcd5 [Philipp Moritz] introduce unsafe child function for UnionArray
ae0a7202 [Philipp Moritz] speed up deserialization by getting rid of smart pointer overhead


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/894f7400
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/894f7400
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/894f7400

Branch: refs/heads/master
Commit: 894f7400977693b4e0e8f4b9845fd89481f6bf29
Parents: dc53321
Author: Philipp Moritz <pcmoritz@gmail.com>
Authored: Sat Oct 14 11:46:36 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sat Oct 14 11:46:36 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array.cc                  |  8 +++++
 cpp/src/arrow/array.h                   |  3 ++
 cpp/src/arrow/python/arrow_to_python.cc | 52 +++++++++++++---------------
 3 files changed, 36 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index cd1721f..a7930a1 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -375,6 +375,14 @@ std::shared_ptr<Array> UnionArray::child(int i) const {
   return boxed_fields_[i];
 }
 
+const Array* UnionArray::UnsafeChild(int i) const {
+  if (!boxed_fields_[i]) {
+    DCHECK(MakeArray(data_->child_data[i], &boxed_fields_[i]).ok());
+  }
+  DCHECK(boxed_fields_[i]);
+  return boxed_fields_[i].get();
+}
+
 // ----------------------------------------------------------------------
 // DictionaryArray
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 36bceeb..0805cad 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -610,6 +610,9 @@ class ARROW_EXPORT UnionArray : public Array {
 
   std::shared_ptr<Array> child(int pos) const;
 
+  /// Only use this while the UnionArray is in scope
+  const Array* UnsafeChild(int pos) const;
+
  protected:
   void SetData(const std::shared_ptr<ArrayData>& data);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/894f7400/cpp/src/arrow/python/arrow_to_python.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc
index de05a23..ac459d4 100644
--- a/cpp/src/arrow/python/arrow_to_python.cc
+++ b/cpp/src/arrow/python/arrow_to_python.cc
@@ -163,26 +163,25 @@ Status GetValue(PyObject* context, const Array& arr, int64_t index, int32_t type
   return Status::OK();
 }
 
-#define DESERIALIZE_SEQUENCE(CREATE_FN, SET_ITEM_FN)                                  \
-  const auto& data = static_cast<const UnionArray&>(array);                           \
-  int64_t size = array.length();                                                      \
-  ScopedRef result(CREATE_FN(stop_idx - start_idx));                                  \
-  auto types = std::make_shared<Int8Array>(size, data.type_ids());                    \
-  auto offsets = std::make_shared<Int32Array>(size, data.value_offsets());            \
-  for (int64_t i = start_idx; i < stop_idx; ++i) {                                    \
-    if (data.IsNull(i)) {                                                             \
-      Py_INCREF(Py_None);                                                             \
-      SET_ITEM_FN(result.get(), i - start_idx, Py_None);                              \
-    } else {                                                                          \
-      int64_t offset = offsets->Value(i);                                             \
-      int8_t type = types->Value(i);                                                  \
-      PyObject* value;                                                                \
-      RETURN_NOT_OK(                                                                  \
-          GetValue(context, *data.child(type), offset, type, base, tensors, &value)); \
-      SET_ITEM_FN(result.get(), i - start_idx, value);                                \
-    }                                                                                 \
-  }                                                                                   \
-  *out = result.release();                                                            \
+#define DESERIALIZE_SEQUENCE(CREATE_FN, SET_ITEM_FN)                               \
+  const auto& data = static_cast<const UnionArray&>(array);                        \
+  ScopedRef result(CREATE_FN(stop_idx - start_idx));                               \
+  const uint8_t* type_ids = data.raw_type_ids();                                   \
+  const int32_t* value_offsets = data.raw_value_offsets();                         \
+  for (int64_t i = start_idx; i < stop_idx; ++i) {                                 \
+    if (data.IsNull(i)) {                                                          \
+      Py_INCREF(Py_None);                                                          \
+      SET_ITEM_FN(result.get(), i - start_idx, Py_None);                           \
+    } else {                                                                       \
+      int64_t offset = value_offsets[i];                                           \
+      uint8_t type = type_ids[i];                                                  \
+      PyObject* value;                                                             \
+      RETURN_NOT_OK(GetValue(context, *data.UnsafeChild(type), offset, type, base, \
+                             tensors, &value));                                    \
+      SET_ITEM_FN(result.get(), i - start_idx, value);                             \
+    }                                                                              \
+  }                                                                                \
+  *out = result.release();                                                         \
   return Status::OK()
 
 Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx,
@@ -204,10 +203,9 @@ Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx,
                       const std::vector<std::shared_ptr<Tensor>>& tensors,
                       PyObject** out) {
   const auto& data = static_cast<const UnionArray&>(array);
-  int64_t size = array.length();
   ScopedRef result(PySet_New(nullptr));
-  auto types = std::make_shared<Int8Array>(size, data.type_ids());
-  auto offsets = std::make_shared<Int32Array>(size, data.value_offsets());
+  const uint8_t* type_ids = data.raw_type_ids();
+  const int32_t* value_offsets = data.raw_value_offsets();
   for (int64_t i = start_idx; i < stop_idx; ++i) {
     if (data.IsNull(i)) {
       Py_INCREF(Py_None);
@@ -215,11 +213,11 @@ Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx,
         RETURN_IF_PYERROR();
       }
     } else {
-      int64_t offset = offsets->Value(i);
-      int8_t type = types->Value(i);
+      int32_t offset = value_offsets[i];
+      int8_t type = type_ids[i];
       PyObject* value;
-      RETURN_NOT_OK(
-          GetValue(context, *data.child(type), offset, type, base, tensors, &value));
+      RETURN_NOT_OK(GetValue(context, *data.UnsafeChild(type), offset, type, base,
+                             tensors, &value));
       if (PySet_Add(result.get(), value) < 0) {
         RETURN_IF_PYERROR();
       }


Mime
View raw message