arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-547: [Python] Add zero-copy slice methods to Array, RecordBatch
Date Mon, 13 Feb 2017 14:04:44 GMT
Repository: arrow
Updated Branches:
  refs/heads/master ad0157547 -> 66f650cd3


ARROW-547: [Python] Add zero-copy slice methods to Array, RecordBatch

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #336 from wesm/ARROW-547 and squashes the following commits:

42037c2 [Wes McKinney] cpplint
2b91b5b [Wes McKinney] Tweak docstring
5f80d80 [Wes McKinney] Add slice methods to pyarrow.Array and RecordBatch. Fix bug in RecordBatch::Slice
20dc23f [Wes McKinney] Draft Array.slice implementation


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/66f650cd
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/66f650cd
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/66f650cd

Branch: refs/heads/master
Commit: 66f650cd359e13f3d5c3d4ef78d89f389d6bcecc
Parents: ad01575
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Feb 13 09:04:37 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Feb 13 09:04:37 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/ipc/adapter.cc         |  2 +-
 cpp/src/arrow/table-test.cc          |  2 ++
 cpp/src/arrow/table.cc               |  5 +++-
 python/pyarrow/array.pxd             |  2 +-
 python/pyarrow/array.pyx             | 42 +++++++++++++++++++++++++------
 python/pyarrow/includes/libarrow.pxd |  6 +++++
 python/pyarrow/scalar.pxd            |  6 ++---
 python/pyarrow/scalar.pyx            |  7 +++---
 python/pyarrow/table.pyx             | 37 ++++++++++++++++++++++-----
 python/pyarrow/tests/test_array.py   | 36 ++++++++++++++++++++++++++
 python/pyarrow/tests/test_table.py   | 32 +++++++++++++++++++++++
 11 files changed, 153 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index f36ff37..a24c007 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -727,7 +727,7 @@ class ArrayLoader : public TypeVisitor {
         RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &offsets));
       }
     }
-    context_->buffer_index += type.mode == UnionMode::DENSE? 2 : 1;
+    context_->buffer_index += type.mode == UnionMode::DENSE ? 2 : 1;
 
     std::vector<std::shared_ptr<Array>> fields;
     RETURN_NOT_OK(LoadChildren(type.children(), &fields));

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index e7c5d66..25f12c4 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -259,6 +259,8 @@ TEST_F(TestRecordBatch, Slice) {
   auto batch_slice = batch.Slice(2);
   auto batch_slice2 = batch.Slice(1, 5);
 
+  ASSERT_EQ(batch_slice->num_rows(), batch.num_rows() - 2);
+
   for (int i = 0; i < batch.num_columns(); ++i) {
     ASSERT_EQ(2, batch_slice->column(i)->offset());
     ASSERT_EQ(length - 2, batch_slice->column(i)->length());

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index a9e0909..8ac06b8 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/table.h"
 
+#include <algorithm>
 #include <cstdlib>
 #include <memory>
 #include <sstream>
@@ -70,7 +71,9 @@ std::shared_ptr<RecordBatch> RecordBatch::Slice(int32_t offset, int32_t length)
   for (const auto& field : columns_) {
     arrays.emplace_back(field->Slice(offset, length));
   }
-  return std::make_shared<RecordBatch>(schema_, num_rows_, arrays);
+
+  int32_t num_rows = std::min(num_rows_ - offset, length);
+  return std::make_shared<RecordBatch>(schema_, num_rows, arrays);
 }
 
 // ----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd
index af10535..9e4d469 100644
--- a/python/pyarrow/array.pxd
+++ b/python/pyarrow/array.pxd
@@ -38,7 +38,7 @@ cdef class Array:
     cdef init(self, const shared_ptr[CArray]& sp_array)
     cdef getitem(self, int i)
 
-cdef object box_arrow_array(const shared_ptr[CArray]& sp_array)
+cdef object box_array(const shared_ptr[CArray]& sp_array)
 
 
 cdef class BooleanArray(Array):

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 9b34f56..11abf03 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -131,7 +131,7 @@ cdef class Array:
                 check_status(pyarrow.PandasToArrow(
                     pool, series_values, mask, c_field, &out))
 
-            return box_arrow_array(out)
+            return box_array(out)
 
     @staticmethod
     def from_list(object list_obj, DataType type=None, MemoryPool memory_pool=None):
@@ -156,7 +156,7 @@ cdef class Array:
         else:
             raise NotImplementedError()
 
-        return box_arrow_array(sp_array)
+        return box_array(sp_array)
 
     property null_count:
 
@@ -201,9 +201,9 @@ cdef class Array:
 
             step = key.step or 1
             if step != 1:
-                raise NotImplementedError
+                raise IndexError('only slices with step 1 supported')
             else:
-                return self.slice(start, stop)
+                return self.slice(start, stop - start)
 
         while key < 0:
             key += len(self)
@@ -211,10 +211,36 @@ cdef class Array:
         return self.getitem(key)
 
     cdef getitem(self, int i):
-        return scalar.box_arrow_scalar(self.type, self.sp_array, i)
+        return scalar.box_scalar(self.type, self.sp_array, i)
 
-    def slice(self, start, end):
-        pass
+    def slice(self, offset=0, length=None):
+        """
+        Compute zero-copy slice of this array
+
+        Parameters
+        ----------
+        offset : int, default 0
+            Offset from start of array to slice
+        length : int, default None
+            Length of slice (default is until end of Array starting from
+            offset)
+
+        Returns
+        -------
+        sliced : Array
+        """
+        cdef:
+            shared_ptr[CArray] result
+
+        if offset < 0:
+            raise IndexError('Offset must be non-negative')
+
+        if length is None:
+            result = self.ap.Slice(offset)
+        else:
+            result = self.ap.Slice(offset, length)
+
+        return box_array(result)
 
     def to_pandas(self):
         """
@@ -390,7 +416,7 @@ cdef dict _array_classes = {
     Type_DICTIONARY: DictionaryArray
 }
 
-cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
+cdef object box_array(const shared_ptr[CArray]& sp_array):
     if sp_array.get() == NULL:
         raise ValueError('Array was NULL')
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index ebfdc41..702acfb 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -71,6 +71,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         c_bool Equals(const shared_ptr[CArray]& arr)
         c_bool IsNull(int i)
 
+        shared_ptr[CArray] Slice(int32_t offset)
+        shared_ptr[CArray] Slice(int32_t offset, int32_t length)
+
     cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
         int bit_width()
 
@@ -228,6 +231,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         int num_columns()
         int32_t num_rows()
 
+        shared_ptr[CRecordBatch] Slice(int32_t offset)
+        shared_ptr[CRecordBatch] Slice(int32_t offset, int32_t length)
+
     cdef cppclass CTable" arrow::Table":
         CTable(const c_string& name, const shared_ptr[CSchema]& schema,
                const vector[shared_ptr[CColumn]]& columns)

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/scalar.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd
index b068457..2d55757 100644
--- a/python/pyarrow/scalar.pxd
+++ b/python/pyarrow/scalar.pxd
@@ -61,6 +61,6 @@ cdef class ListValue(ArrayValue):
 cdef class StringValue(ArrayValue):
     pass
 
-cdef object box_arrow_scalar(DataType type,
-                             const shared_ptr[CArray]& sp_array,
-                             int index)
+cdef object box_scalar(DataType type,
+                       const shared_ptr[CArray]& sp_array,
+                       int index)

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 9d2b2b1..57a15ad 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -203,7 +203,7 @@ cdef class ListValue(ArrayValue):
 
     cdef getitem(self, int i):
         cdef int j = self.ap.value_offset(self.index) + i
-        return box_arrow_scalar(self.value_type, self.ap.values(), j)
+        return box_scalar(self.value_type, self.ap.values(), j)
 
     def as_py(self):
         cdef:
@@ -235,9 +235,8 @@ cdef dict _scalar_classes = {
     Type_STRING: StringValue,
 }
 
-cdef object box_arrow_scalar(DataType type,
-                             const shared_ptr[CArray]& sp_array,
-                             int index):
+cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
+                       int index):
     cdef ArrayValue val
     if type.type.type == Type_NA:
         return NA

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 1707210..7d73362 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -27,7 +27,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
 
 import pyarrow.config
 
-from pyarrow.array cimport Array, box_arrow_array, wrap_array_output
+from pyarrow.array cimport Array, box_array, wrap_array_output
 from pyarrow.error import ArrowException
 from pyarrow.error cimport check_status
 from pyarrow.schema cimport box_data_type, box_schema, Field
@@ -109,8 +109,7 @@ cdef class ChunkedArray:
         pyarrow.array.Array
         """
         self._check_nullptr()
-        return box_arrow_array(self.chunked_array.chunk(i))
-
+        return box_array(self.chunked_array.chunk(i))
 
     def iterchunks(self):
         for i in range(self.num_chunks):
@@ -387,9 +386,35 @@ cdef class RecordBatch:
         return self._schema
 
     def __getitem__(self, i):
-        cdef Array arr = Array()
-        arr.init(self.batch.column(i))
-        return arr
+        return box_array(self.batch.column(i))
+
+    def slice(self, offset=0, length=None):
+        """
+        Compute zero-copy slice of this RecordBatch
+
+        Parameters
+        ----------
+        offset : int, default 0
+            Offset from start of array to slice
+        length : int, default None
+            Length of slice (default is until end of batch starting from
+            offset)
+
+        Returns
+        -------
+        sliced : RecordBatch
+        """
+        cdef shared_ptr[CRecordBatch] result
+
+        if offset < 0:
+            raise IndexError('Offset must be non-negative')
+
+        if length is None:
+            result = self.batch.Slice(offset)
+        else:
+            result = self.batch.Slice(offset, length)
+
+        return batch_from_cbatch(result)
 
     def equals(self, RecordBatch other):
         cdef:

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ead17db..d8b2e2f 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -17,6 +17,8 @@
 
 import sys
 
+import pytest
+
 import pyarrow
 import pyarrow.formatting as fmt
 
@@ -100,3 +102,37 @@ def test_to_pandas_zero_copy():
         base_refcount = sys.getrefcount(np_arr.base)
         assert base_refcount == 2
         np_arr.sum()
+
+
+def test_array_slice():
+    arr = pyarrow.from_pylist(range(10))
+
+    sliced = arr.slice(2)
+    expected = pyarrow.from_pylist(range(2, 10))
+    assert sliced.equals(expected)
+
+    sliced2 = arr.slice(2, 4)
+    expected2 = pyarrow.from_pylist(range(2, 6))
+    assert sliced2.equals(expected2)
+
+    # 0 offset
+    assert arr.slice(0).equals(arr)
+
+    # Slice past end of array
+    assert len(arr.slice(len(arr))) == 0
+
+    with pytest.raises(IndexError):
+        arr.slice(-1)
+
+    # Test slice notation
+    assert arr[2:].equals(arr.slice(2))
+
+    assert arr[2:5].equals(arr.slice(2, 3))
+
+    assert arr[-5:].equals(arr.slice(len(arr) - 5))
+
+    with pytest.raises(IndexError):
+        arr[::-1]
+
+    with pytest.raises(IndexError):
+        arr[::2]

http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/tests/test_table.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index d49b33c..67f1892 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -68,6 +68,38 @@ def test_recordbatch_basics():
     ])
 
 
+def test_recordbatch_slice():
+    data = [
+        pa.from_pylist(range(5)),
+        pa.from_pylist([-10, -5, 0, 5, 10])
+    ]
+    names = ['c0', 'c1']
+
+    batch = pa.RecordBatch.from_arrays(data, names)
+
+    sliced = batch.slice(2)
+
+    assert sliced.num_rows == 3
+
+    expected = pa.RecordBatch.from_arrays(
+        [x.slice(2) for x in data], names)
+    assert sliced.equals(expected)
+
+    sliced2 = batch.slice(2, 2)
+    expected2 = pa.RecordBatch.from_arrays(
+        [x.slice(2, 2) for x in data], names)
+    assert sliced2.equals(expected2)
+
+    # 0 offset
+    assert batch.slice(0).equals(batch)
+
+    # Slice past end of array
+    assert len(batch.slice(len(batch))) == 0
+
+    with pytest.raises(IndexError):
+        batch.slice(-1)
+
+
 def test_recordbatch_from_to_pandas():
     data = pd.DataFrame({
         'c1': np.array([1, 2, 3, 4, 5], dtype='int64'),


Mime
View raw message