arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-44: Python: prototype object model for array slot values ("scalars")
Date Tue, 08 Mar 2016 06:39:14 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 9afb66778 -> ae95dbd18


ARROW-44: Python: prototype object model for array slot values ("scalars")

Non-exhaustive, but this will facilitate inspecting Arrow data while the library is in development.

```python
In [2]: arr = arrow.from_pylist([['foo', None], None, [], ['qux']])

In [3]: arr
Out[3]: <arrow.array.ListArray at 0x7f1970030f98>

In [4]: arr[0]
Out[4]: ['foo', None]

In [5]: type(arr[0])
Out[5]: arrow.scalar.ListValue

In [6]: arr[0][0]
Out[6]: 'foo'

In [7]: arr[0][1]
Out[7]: NA

In [8]: arr[1]
Out[8]: NA

In [9]: arr[2]
Out[9]: []

In [10]: len(arr[2])
Out[10]: 0

In [11]: arr.type
Out[11]: DataType(list<string>)
```

Author: Wes McKinney <wesm@apache.org>

Closes #20 from wesm/ARROW-44 and squashes the following commits:

df06ba1 [Wes McKinney] Add tests for scalars proxying implemented Python list type conversions, fix associated bugs
20fbdc1 [Wes McKinney] Draft scalar box types, no tests yet


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ae95dbd1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ae95dbd1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ae95dbd1

Branch: refs/heads/master
Commit: ae95dbd189477442d39e55fb0a1aede206906cd9
Parents: 9afb667
Author: Wes McKinney <wesm@apache.org>
Authored: Mon Mar 7 22:39:07 2016 -0800
Committer: Wes McKinney <wesm@apache.org>
Committed: Mon Mar 7 22:39:07 2016 -0800

----------------------------------------------------------------------
 cpp/src/arrow/types/list.h             |   6 +-
 python/arrow/__init__.py               |   6 +-
 python/arrow/array.pxd                 |   1 -
 python/arrow/array.pyx                 |  17 ++-
 python/arrow/compat.py                 |   6 +
 python/arrow/includes/arrow.pxd        |  36 +++++-
 python/arrow/scalar.pxd                |  25 ++++-
 python/arrow/scalar.pyx                | 165 ++++++++++++++++++++++++++++
 python/arrow/schema.pxd                |   2 +
 python/arrow/schema.pyx                |  14 +++
 python/arrow/tests/test_scalars.py     |  82 ++++++++++++++
 python/src/pyarrow/adapters/builtin.cc |   2 +-
 12 files changed, 342 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index f40a824..210c76a 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -63,7 +63,11 @@ class ListArray : public Array {
 
   // Return a shared pointer in case the requestor desires to share ownership
   // with this array.
-  const ArrayPtr& values() const {return values_;}
+  const std::shared_ptr<Array>& values() const {return values_;}
+
+  const std::shared_ptr<DataType>& value_type() const {
+    return values_->type();
+  }
 
   const int32_t* offsets() const { return offsets_;}
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py
index 3c049b8..3507ea0 100644
--- a/python/arrow/__init__.py
+++ b/python/arrow/__init__.py
@@ -24,7 +24,11 @@ from arrow.array import (Array, from_pylist, total_allocated_bytes,
 
 from arrow.error import ArrowException
 
-from arrow.scalar import ArrayValue, NA, Scalar
+from arrow.scalar import (ArrayValue, Scalar, NA, NAType,
+                          BooleanValue,
+                          Int8Value, Int16Value, Int32Value, Int64Value,
+                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
+                          FloatValue, DoubleValue, ListValue, StringValue)
 
 from arrow.schema import (null, bool_,
                           int8, int16, int32, int64,

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd
index e32d277..04dd8d1 100644
--- a/python/arrow/array.pxd
+++ b/python/arrow/array.pxd
@@ -34,7 +34,6 @@ cdef class Array:
         DataType type
 
     cdef init(self, const shared_ptr[CArray]& sp_array)
-    cdef _getitem(self, int i)
 
 
 cdef class BooleanArray(Array):

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx
index 3a3210d..8ebd01d 100644
--- a/python/arrow/array.pyx
+++ b/python/arrow/array.pyx
@@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow
 from arrow.compat import frombytes, tobytes
 from arrow.error cimport check_status
 
+cimport arrow.scalar as scalar
 from arrow.scalar import NA
 
 def total_allocated_bytes():
@@ -73,13 +74,7 @@ cdef class Array:
         while key < 0:
             key += len(self)
 
-        if self.ap.IsNull(key):
-            return NA
-        else:
-            return self._getitem(key)
-
-    cdef _getitem(self, int i):
-        raise NotImplementedError
+        return scalar.box_arrow_scalar(self.type, self.sp_array, key)
 
     def slice(self, start, end):
         pass
@@ -168,12 +163,16 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
     return arr
 
 
-def from_pylist(object list_obj, type=None):
+def from_pylist(object list_obj, DataType type=None):
     """
     Convert Python list to Arrow array
     """
     cdef:
         shared_ptr[CArray] sp_array
 
-    check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
+    if type is None:
+        check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
+    else:
+        raise NotImplementedError
+
     return box_arrow_array(sp_array)

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/compat.py
----------------------------------------------------------------------
diff --git a/python/arrow/compat.py b/python/arrow/compat.py
index 2ac41ac..08f0f23 100644
--- a/python/arrow/compat.py
+++ b/python/arrow/compat.py
@@ -54,6 +54,9 @@ if PY2:
     range = xrange
     long = long
 
+    def u(s):
+        return unicode(s, "unicode_escape")
+
     def tobytes(o):
         if isinstance(o, unicode):
             return o.encode('utf8')
@@ -73,6 +76,9 @@ else:
     from decimal import Decimal
     range = range
 
+    def u(s):
+        return s
+
     def tobytes(o):
         if isinstance(o, str):
             return o.encode('utf8')

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/includes/arrow.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd
index fde5de9..0cc44c0 100644
--- a/python/arrow/includes/arrow.pxd
+++ b/python/arrow/includes/arrow.pxd
@@ -84,13 +84,41 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         c_bool IsNull(int i)
 
     cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray):
-        pass
+        uint8_t Value(int i)
 
     cdef cppclass CInt8Array" arrow::Int8Array"(CArray):
-        pass
+        int8_t Value(int i)
+
+    cdef cppclass CUInt16Array" arrow::UInt16Array"(CArray):
+        uint16_t Value(int i)
+
+    cdef cppclass CInt16Array" arrow::Int16Array"(CArray):
+        int16_t Value(int i)
+
+    cdef cppclass CUInt32Array" arrow::UInt32Array"(CArray):
+        uint32_t Value(int i)
+
+    cdef cppclass CInt32Array" arrow::Int32Array"(CArray):
+        int32_t Value(int i)
+
+    cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray):
+        uint64_t Value(int i)
+
+    cdef cppclass CInt64Array" arrow::Int64Array"(CArray):
+        int64_t Value(int i)
+
+    cdef cppclass CFloatArray" arrow::FloatArray"(CArray):
+        float Value(int i)
+
+    cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray):
+        double Value(int i)
 
     cdef cppclass CListArray" arrow::ListArray"(CArray):
-        pass
+        const int32_t* offsets()
+        int32_t offset(int i)
+        int32_t value_length(int i)
+        const shared_ptr[CArray]& values()
+        const shared_ptr[CDataType]& value_type()
 
     cdef cppclass CStringArray" arrow::StringArray"(CListArray):
-        pass
+        c_string GetString(int i)

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd
index e193c09..15cdc95 100644
--- a/python/arrow/scalar.pxd
+++ b/python/arrow/scalar.pxd
@@ -16,7 +16,7 @@
 # under the License.
 
 from arrow.includes.common cimport *
-from arrow.includes.arrow cimport CArray, CListArray
+from arrow.includes.arrow cimport *
 
 from arrow.schema cimport DataType
 
@@ -31,17 +31,36 @@ cdef class NAType(Scalar):
 
 cdef class ArrayValue(Scalar):
     cdef:
-        shared_ptr[CArray] array
+        shared_ptr[CArray] sp_array
         int index
 
+    cdef void init(self, DataType type,
+                   const shared_ptr[CArray]& sp_array, int index)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array)
+
 
 cdef class Int8Value(ArrayValue):
     pass
 
 
-cdef class ListValue(ArrayValue):
+cdef class Int64Value(ArrayValue):
     pass
 
 
+cdef class ListValue(ArrayValue):
+    cdef readonly:
+        DataType value_type
+
+    cdef:
+        CListArray* ap
+
+    cdef _getitem(self, int i)
+
+
 cdef class StringValue(ArrayValue):
     pass
+
+cdef object box_arrow_scalar(DataType type,
+                             const shared_ptr[CArray]& sp_array,
+                             int index)

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx
index 78dadec..951ede2 100644
--- a/python/arrow/scalar.pyx
+++ b/python/arrow/scalar.pyx
@@ -15,14 +15,179 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from arrow.schema cimport DataType, box_data_type
+
+from arrow.compat import frombytes
 import arrow.schema as schema
 
+NA = None
+
 cdef class NAType(Scalar):
 
     def __cinit__(self):
+        global NA
+        if NA is not None:
+            raise Exception('Cannot create multiple NAType instances')
+
         self.type = schema.null()
 
     def __repr__(self):
         return 'NA'
 
+    def as_py(self):
+        return None
+
 NA = NAType()
+
+cdef class ArrayValue(Scalar):
+
+    cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
+                   int index):
+        self.type = type
+        self.index = index
+        self._set_array(sp_array)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+
+    def __repr__(self):
+        if hasattr(self, 'as_py'):
+            return repr(self.as_py())
+        else:
+            return Scalar.__repr__(self)
+
+
+cdef class BooleanValue(ArrayValue):
+    pass
+
+
+cdef class Int8Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt8Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int16Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt16Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int32Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt32Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int64Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt64Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class FloatValue(ArrayValue):
+
+    def as_py(self):
+        cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class DoubleValue(ArrayValue):
+
+    def as_py(self):
+        cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class StringValue(ArrayValue):
+
+    def as_py(self):
+        cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
+        return frombytes(ap.GetString(self.index))
+
+
+cdef class ListValue(ArrayValue):
+
+    def __len__(self):
+        return self.ap.value_length(self.index)
+
+    def __getitem__(self, i):
+        return self._getitem(i)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CListArray*> sp_array.get()
+        self.value_type = box_data_type(self.ap.value_type())
+
+    cdef _getitem(self, int i):
+        cdef int j = self.ap.offset(self.index) + i
+        return box_arrow_scalar(self.value_type, self.ap.values(), j)
+
+    def as_py(self):
+        cdef:
+            int j
+            list result = []
+
+        for j in range(len(self)):
+            result.append(self._getitem(j).as_py())
+
+        return result
+
+
+cdef dict _scalar_classes = {
+    LogicalType_UINT8: Int8Value,
+    LogicalType_UINT16: Int16Value,
+    LogicalType_UINT32: Int32Value,
+    LogicalType_UINT64: Int64Value,
+    LogicalType_INT8: Int8Value,
+    LogicalType_INT16: Int16Value,
+    LogicalType_INT32: Int32Value,
+    LogicalType_INT64: Int64Value,
+    LogicalType_FLOAT: FloatValue,
+    LogicalType_DOUBLE: DoubleValue,
+    LogicalType_LIST: ListValue,
+    LogicalType_STRING: StringValue
+}
+
+cdef object box_arrow_scalar(DataType type,
+                             const shared_ptr[CArray]& sp_array,
+                             int index):
+    cdef ArrayValue val
+    if sp_array.get().IsNull(index):
+        return NA
+    else:
+        val = _scalar_classes[type.type.type]()
+        val.init(type, sp_array, index)
+        return val

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd
index 487c246..8cc244a 100644
--- a/python/arrow/schema.pxd
+++ b/python/arrow/schema.pxd
@@ -37,3 +37,5 @@ cdef class Schema:
     cdef:
         shared_ptr[CSchema] sp_schema
         CSchema* schema
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type)

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx
index 63cd6e8..3001531 100644
--- a/python/arrow/schema.pyx
+++ b/python/arrow/schema.pyx
@@ -85,6 +85,14 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True):
 def field(name, type):
     return Field(name, type)
 
+cdef set PRIMITIVE_TYPES = set([
+    LogicalType_NA, LogicalType_BOOL,
+    LogicalType_UINT8, LogicalType_INT8,
+    LogicalType_UINT16, LogicalType_INT16,
+    LogicalType_UINT32, LogicalType_INT32,
+    LogicalType_UINT64, LogicalType_INT64,
+    LogicalType_FLOAT, LogicalType_DOUBLE])
+
 def null():
     return primitive_type(LogicalType_NA)
 
@@ -148,3 +156,9 @@ def struct(fields, c_bool nullable=True):
     out.init(shared_ptr[CDataType](
         new CStructType(c_fields, nullable)))
     return out
+
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type):
+    cdef DataType out = DataType()
+    out.init(type)
+    return out

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/arrow/tests/test_scalars.py b/python/arrow/tests/test_scalars.py
new file mode 100644
index 0000000..951380b
--- /dev/null
+++ b/python/arrow/tests/test_scalars.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from arrow.compat import unittest, u
+import arrow
+
+
+class TestScalars(unittest.TestCase):
+
+    def test_null_singleton(self):
+        with self.assertRaises(Exception):
+            arrow.NAType()
+
+    def test_bool(self):
+        pass
+
+    def test_int64(self):
+        arr = arrow.from_pylist([1, 2, None])
+
+        v = arr[0]
+        assert isinstance(v, arrow.Int64Value)
+        assert repr(v) == "1"
+        assert v.as_py() == 1
+
+        assert arr[2] is arrow.NA
+
+    def test_double(self):
+        arr = arrow.from_pylist([1.5, None, 3])
+
+        v = arr[0]
+        assert isinstance(v, arrow.DoubleValue)
+        assert repr(v) == "1.5"
+        assert v.as_py() == 1.5
+
+        assert arr[1] is arrow.NA
+
+        v = arr[2]
+        assert v.as_py() == 3.0
+
+    def test_string(self):
+        arr = arrow.from_pylist(['foo', None, u('bar')])
+
+        v = arr[0]
+        assert isinstance(v, arrow.StringValue)
+        assert repr(v) == "'foo'"
+        assert v.as_py() == 'foo'
+
+        assert arr[1] is arrow.NA
+
+        v = arr[2].as_py()
+        assert v == 'bar'
+        assert isinstance(v, str)
+
+    def test_list(self):
+        arr = arrow.from_pylist([['foo', None], None, ['bar'], []])
+
+        v = arr[0]
+        assert len(v) == 2
+        assert isinstance(v, arrow.ListValue)
+        assert repr(v) == "['foo', None]"
+        assert v.as_py() == ['foo', None]
+        assert v[0].as_py() == 'foo'
+        assert v[1] is arrow.NA
+
+        assert arr[1] is arrow.NA
+
+        v = arr[3]
+        assert len(v) == 0

http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index ae84fa1..60d6248 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -276,7 +276,7 @@ class Int64Converter : public TypedConverter<arrow::Int64Builder> {
 class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
  public:
   Status AppendData(PyObject* seq) override {
-    int64_t val;
+    double val;
     Py_ssize_t size = PySequence_Size(seq);
     for (int64_t i = 0; i < size; ++i) {
       OwnedRef item(PySequence_GetItem(seq, i));


Mime
View raw message