arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-49: [Python] Add Column and Table wrapper interface
Date Fri, 01 Apr 2016 00:47:50 GMT
Repository: arrow
Updated Branches:
  refs/heads/master b3ebce1b3 -> 6d31d5928


ARROW-49: [Python] Add Column and Table wrapper interface

After https://github.com/apache/arrow/pull/52 is merged, I'd like to split Column and Table
into separate .pyx files, array.pyx seems a bit overcrowded.

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #53 from xhochy/arrow-49 and squashes the following commits:

b01b201 [Uwe L. Korn] Use correct number of chunks
e422faf [Uwe L. Korn] Incorporate PR feedback, Add ChunkedArray interface
e8f84a9 [Uwe L. Korn] ARROW-49: [Python] Add Column and Table wrapper interface


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/6d31d592
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/6d31d592
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/6d31d592

Branch: refs/heads/master
Commit: 6d31d5928f4ec5ced14a105b5b05d46a7dab5264
Parents: b3ebce1
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Thu Mar 31 17:47:42 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Thu Mar 31 17:47:42 2016 -0700

----------------------------------------------------------------------
 python/CMakeLists.txt                |   1 +
 python/pyarrow/__init__.py           |   4 +-
 python/pyarrow/array.pxd             |   2 +
 python/pyarrow/array.pyx             |  75 +--------
 python/pyarrow/includes/libarrow.pxd |   5 +-
 python/pyarrow/schema.pxd            |   2 +
 python/pyarrow/schema.pyx            |   9 +
 python/pyarrow/table.pxd             |  46 ++++++
 python/pyarrow/table.pyx             | 264 ++++++++++++++++++++++++++++++
 python/pyarrow/tests/test_column.py  |  49 ++++++
 python/pyarrow/tests/test_table.py   |  39 +++++
 python/setup.py                      |   2 +-
 12 files changed, 422 insertions(+), 76 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index ebe825f..2173232 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -444,6 +444,7 @@ set(CYTHON_EXTENSIONS
   error
   scalar
   schema
+  table
 )
 
 foreach(module ${CYTHON_EXTENSIONS})

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index c343f5b..40a09c2 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -41,4 +41,6 @@ from pyarrow.schema import (null, bool_,
                             list_, struct, field,
                             DataType, Field, Schema, schema)
 
-from pyarrow.array import RowBatch, Table, from_pandas_dataframe
+from pyarrow.array import RowBatch, from_pandas_dataframe
+
+from pyarrow.table import Column, Table

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd
index de3c774..8cd15cd 100644
--- a/python/pyarrow/array.pxd
+++ b/python/pyarrow/array.pxd
@@ -36,6 +36,8 @@ cdef class Array:
     cdef init(self, const shared_ptr[CArray]& sp_array)
     cdef getitem(self, int i)
 
+cdef object box_arrow_array(const shared_ptr[CArray]& sp_array)
+
 
 cdef class BooleanArray(Array):
     pass

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 255efc2..456bf6d 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -33,6 +33,8 @@ from pyarrow.scalar import NA
 from pyarrow.schema cimport Schema
 import pyarrow.schema as schema
 
+from pyarrow.table cimport Table
+
 def total_allocated_bytes():
     cdef MemoryPool* pool = pyarrow.GetMemoryPool()
     return pool.bytes_allocated()
@@ -287,76 +289,3 @@ cdef class RowBatch:
         return self.arrays[i]
 
 
-cdef class Table:
-    '''
-    Do not call this class's constructor directly.
-    '''
-    cdef:
-        shared_ptr[CTable] sp_table
-        CTable* table
-
-    def __cinit__(self):
-        pass
-
-    cdef init(self, const shared_ptr[CTable]& table):
-        self.sp_table = table
-        self.table = table.get()
-
-    @staticmethod
-    def from_pandas(df, name=None):
-        pass
-
-    @staticmethod
-    def from_arrays(names, arrays, name=None):
-        cdef:
-            Array arr
-            Table result
-            c_string c_name
-            vector[shared_ptr[CField]] fields
-            vector[shared_ptr[CColumn]] columns
-            shared_ptr[CSchema] schema
-            shared_ptr[CTable] table
-
-        cdef int K = len(arrays)
-
-        fields.resize(K)
-        columns.resize(K)
-        for i in range(K):
-            arr = arrays[i]
-            c_name = tobytes(names[i])
-
-            fields[i].reset(new CField(c_name, arr.type.sp_type, True))
-            columns[i].reset(new CColumn(fields[i], arr.sp_array))
-
-        if name is None:
-            c_name = ''
-        else:
-            c_name = tobytes(name)
-
-        schema.reset(new CSchema(fields))
-        table.reset(new CTable(c_name, schema, columns))
-
-        result = Table()
-        result.init(table)
-
-        return result
-
-    def to_pandas(self):
-        """
-        Convert the arrow::Table to a pandas DataFrame
-        """
-        cdef:
-            PyObject* arr
-            shared_ptr[CColumn] col
-
-        import pandas as pd
-
-        names = []
-        data = []
-        for i in range(self.table.num_columns()):
-            col = self.table.column(i)
-            check_status(pyarrow.ArrowToPandas(col, &arr))
-            names.append(frombytes(col.get().name()))
-            data.append(<object> arr)
-
-        return pd.DataFrame(dict(zip(names, data)), columns=names)

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 42f1f25..b2ef45a 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -149,7 +149,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         c_string GetString(int i)
 
     cdef cppclass CChunkedArray" arrow::ChunkedArray":
-        pass
+        int64_t length()
+        int64_t null_count()
+        int num_chunks()
+        const shared_ptr[CArray]& chunk(int i)
 
     cdef cppclass CColumn" arrow::Column":
         CColumn(const shared_ptr[CField]& field,

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
index 61458b7..f2cb776 100644
--- a/python/pyarrow/schema.pxd
+++ b/python/pyarrow/schema.pxd
@@ -41,5 +41,7 @@ cdef class Schema:
         CSchema* schema
 
     cdef init(self, const vector[shared_ptr[CField]]& fields)
+    cdef init_schema(self, const shared_ptr[CSchema]& schema)
 
 cdef DataType box_data_type(const shared_ptr[CDataType]& type)
+cdef Schema box_schema(const shared_ptr[CSchema]& schema)

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index b3bf02a..22ddf0c 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -106,6 +106,10 @@ cdef class Schema:
         self.schema = new CSchema(fields)
         self.sp_schema.reset(self.schema)
 
+    cdef init_schema(self, const shared_ptr[CSchema]& schema):
+        self.schema = schema.get()
+        self.sp_schema = schema
+
     @classmethod
     def from_fields(cls, fields):
         cdef:
@@ -223,3 +227,8 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type):
     cdef DataType out = DataType()
     out.init(type)
     return out
+
+cdef Schema box_schema(const shared_ptr[CSchema]& type):
+    cdef Schema out = Schema()
+    out.init_schema(type)
+    return out

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/table.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxd b/python/pyarrow/table.pxd
new file mode 100644
index 0000000..0a5c122
--- /dev/null
+++ b/python/pyarrow/table.pxd
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow.includes.common cimport shared_ptr
+from pyarrow.includes.libarrow cimport CChunkedArray, CColumn, CTable
+
+
+cdef class ChunkedArray:
+    cdef:
+        shared_ptr[CChunkedArray] sp_chunked_array
+        CChunkedArray* chunked_array
+
+    cdef init(self, const shared_ptr[CChunkedArray]& chunked_array)
+    cdef _check_nullptr(self)
+
+
+cdef class Column:
+    cdef:
+        shared_ptr[CColumn] sp_column
+        CColumn* column
+
+    cdef init(self, const shared_ptr[CColumn]& column)
+    cdef _check_nullptr(self)
+
+
+cdef class Table:
+    cdef:
+        shared_ptr[CTable] sp_table
+        CTable* table
+    
+    cdef init(self, const shared_ptr[CTable]& table)
+    cdef _check_nullptr(self)

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
new file mode 100644
index 0000000..4c4816f
--- /dev/null
+++ b/python/pyarrow/table.pyx
@@ -0,0 +1,264 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from pyarrow.includes.libarrow cimport *
+cimport pyarrow.includes.pyarrow as pyarrow
+
+import pyarrow.config
+
+from pyarrow.array cimport Array, box_arrow_array
+from pyarrow.compat import frombytes, tobytes
+from pyarrow.error cimport check_status
+from pyarrow.schema cimport box_data_type, box_schema
+
+cdef class ChunkedArray:
+    '''
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.chunked_array = NULL
+
+    cdef init(self, const shared_ptr[CChunkedArray]& chunked_array):
+        self.sp_chunked_array = chunked_array
+        self.chunked_array = chunked_array.get()
+
+    cdef _check_nullptr(self):  # raises ReferenceError while chunked_array is NULL
+        if self.chunked_array == NULL:
+            raise ReferenceError("ChunkedArray object references a NULL pointer. "
+                    "Not initialized.")
+
+    def length(self):
+        self._check_nullptr()
+        return self.chunked_array.length()
+
+    def __len__(self):
+        return self.length()
+
+    property null_count:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.chunked_array.null_count()
+
+    property num_chunks:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.chunked_array.num_chunks()
+
+    def chunk(self, i):
+        self._check_nullptr()
+        return box_arrow_array(self.chunked_array.chunk(i))
+
+
+    def iterchunks(self):
+        for i in range(self.num_chunks):
+            yield self.chunk(i)
+
+
+cdef class Column:
+    '''
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.column = NULL
+
+    cdef init(self, const shared_ptr[CColumn]& column):
+        self.sp_column = column
+        self.column = column.get()
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Column to a pandas Series
+        """
+        cdef:
+            PyObject* arr
+
+        import pandas as pd
+
+        check_status(pyarrow.ArrowToPandas(self.sp_column, &arr))
+        return pd.Series(<object>arr, name=self.name)
+
+    cdef _check_nullptr(self):  # raises ReferenceError while column is NULL
+        if self.column == NULL:
+            raise ReferenceError("Column object references a NULL pointer. "
+                    "Not initialized.")
+
+    def __len__(self):
+        self._check_nullptr()
+        return self.column.length()
+
+    def length(self):
+        self._check_nullptr()
+        return self.column.length()
+
+    property shape:
+
+        def __get__(self):
+            self._check_nullptr()
+            return (self.length(),)
+
+    property null_count:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.column.null_count()
+
+    property name:
+        def __get__(self):
+            self._check_nullptr()  # fail with ReferenceError rather than dereference NULL
+            return frombytes(self.column.name())
+
+    property type:
+        def __get__(self):
+            self._check_nullptr()  # fail with ReferenceError rather than dereference NULL
+            return box_data_type(self.column.type())
+
+    property data:
+        def __get__(self):
+            self._check_nullptr()  # fail with ReferenceError rather than dereference NULL
+            cdef ChunkedArray chunked_array = ChunkedArray()
+            chunked_array.init(self.column.data())
+            return chunked_array
+
+
+cdef class Table:
+    '''
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.table = NULL
+
+    cdef init(self, const shared_ptr[CTable]& table):
+        self.sp_table = table
+        self.table = table.get()
+
+    cdef _check_nullptr(self):  # raises ReferenceError while table is NULL
+        if self.table == NULL:
+            raise ReferenceError("Table object references a NULL pointer. "
+                    "Not initialized.")
+
+    @staticmethod
+    def from_pandas(df, name=None):
+        pass
+
+    @staticmethod
+    def from_arrays(names, arrays, name=None):
+        cdef:
+            Array arr
+            Table result
+            c_string c_name
+            vector[shared_ptr[CField]] fields
+            vector[shared_ptr[CColumn]] columns
+            shared_ptr[CSchema] schema
+            shared_ptr[CTable] table
+
+        cdef int K = len(arrays)
+
+        fields.resize(K)
+        columns.resize(K)
+        for i in range(K):
+            arr = arrays[i]
+            c_name = tobytes(names[i])
+
+            fields[i].reset(new CField(c_name, arr.type.sp_type, True))
+            columns[i].reset(new CColumn(fields[i], arr.sp_array))
+
+        if name is None:
+            c_name = ''
+        else:
+            c_name = tobytes(name)
+
+        schema.reset(new CSchema(fields))
+        table.reset(new CTable(c_name, schema, columns))
+
+        result = Table()
+        result.init(table)
+
+        return result
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Table to a pandas DataFrame
+        """
+        cdef:
+            PyObject* arr
+            shared_ptr[CColumn] col
+
+        import pandas as pd
+
+        names = []
+        data = []
+        for i in range(self.table.num_columns()):
+            col = self.table.column(i)
+            check_status(pyarrow.ArrowToPandas(col, &arr))
+            names.append(frombytes(col.get().name()))
+            data.append(<object> arr)
+
+        return pd.DataFrame(dict(zip(names, data)), columns=names)
+
+    property name:
+
+        def __get__(self):
+            self._check_nullptr()
+            return frombytes(self.table.name())
+
+    property schema:
+        def __get__(self):
+            self._check_nullptr()  # fail with ReferenceError rather than dereference NULL
+            return box_schema(self.table.schema())
+
+    def column(self, index):
+        self._check_nullptr()
+        cdef Column column = Column()
+        column.init(self.table.column(index))
+        return column
+
+    def __getitem__(self, i):
+        return self.column(i)
+
+    def itercolumns(self):
+        for i in range(self.num_columns):
+            yield self.column(i)
+
+    property num_columns:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.table.num_columns()
+
+    property num_rows:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.table.num_rows()
+
+    def __len__(self):
+        return self.num_rows
+
+    property shape:
+
+        def __get__(self):
+            return (self.num_rows, self.num_columns)
+

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/tests/test_column.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_column.py b/python/pyarrow/tests/test_column.py
new file mode 100644
index 0000000..b62f582
--- /dev/null
+++ b/python/pyarrow/tests/test_column.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow.compat import unittest
+import pyarrow as arrow
+
+A = arrow
+
+import pandas as pd
+
+
+class TestColumn(unittest.TestCase):
+
+    def test_basics(self):
+        data = [
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a',), data, 'table_name')  # names as a 1-tuple
+        column = table.column(0)
+        assert column.name == 'a'
+        assert column.length() == 5
+        assert len(column) == 5
+        assert column.shape == (5,)
+
+    def test_pandas(self):
+        data = [
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a',), data, 'table_name')  # names as a 1-tuple
+        column = table.column(0)
+        series = column.to_pandas()
+        assert series.name == 'a'
+        assert series.shape == (5,)
+        assert series.iloc[0] == -10
+

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/pyarrow/tests/test_table.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 2e24445..83fcbb8 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -20,6 +20,8 @@ import pyarrow as arrow
 
 A = arrow
 
+import pandas as pd
+
 
 class TestRowBatch(unittest.TestCase):
 
@@ -38,3 +40,40 @@ class TestRowBatch(unittest.TestCase):
         assert len(batch) == num_rows
         assert batch.num_rows == num_rows
         assert batch.num_columns == len(data)
+
+
+class TestTable(unittest.TestCase):
+
+    def test_basics(self):
+        data = [
+            A.from_pylist(range(5)),
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
+        assert table.name == 'table_name'
+        assert len(table) == 5
+        assert table.num_rows == 5
+        assert table.num_columns == 2
+        assert table.shape == (5, 2)
+
+        for col in table.itercolumns():
+            for chunk in col.data.iterchunks():
+                assert chunk is not None
+
+    def test_pandas(self):
+        data = [
+            A.from_pylist(range(5)),
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
+
+        # TODO: Use this part once from_pandas is implemented
+        # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
+        # df = pd.DataFrame(data)
+        # A.Table.from_pandas(df)
+
+        df = table.to_pandas()
+        assert set(df.columns) == set(('a', 'b'))
+        assert df.shape == (5, 2)
+        assert df.loc[0, 'b'] == -10  # label lookup on the default integer index
+

http://git-wip-us.apache.org/repos/asf/arrow/blob/6d31d592/python/setup.py
----------------------------------------------------------------------
diff --git a/python/setup.py b/python/setup.py
index 5cc871a..ebd80de 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -214,7 +214,7 @@ class build_ext(_build_ext):
             return name + suffix
 
     def get_cmake_cython_names(self):
-        return ['array', 'config', 'error', 'scalar', 'schema']
+        return ['array', 'config', 'error', 'scalar', 'schema', 'table']
 
     def get_names(self):
         return self._found_names


Mime
View raw message