arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1357: [Python] Account for chunked arrays when converting lists back to pandas form
Date Sun, 20 Aug 2017 17:49:51 GMT
Repository: arrow
Updated Branches:
  refs/heads/master b50f2351e -> de7c6715b


ARROW-1357: [Python] Account for chunked arrays when converting lists back to pandas form

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #979 from wesm/ARROW-1357 and squashes the following commits:

8318a121 [Wes McKinney] Use PyLong_FromLongLong so Windows is happy
18acdd91 [Wes McKinney] Account for chunked arrays when converting lists back to pandas form


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/de7c6715
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/de7c6715
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/de7c6715

Branch: refs/heads/master
Commit: de7c6715ba244e119913bfa31b8de46dbbd450bf
Parents: b50f235
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Sun Aug 20 13:49:46 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sun Aug 20 13:49:46 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/arrow_to_pandas.cc     |  7 +++++--
 python/pyarrow/tests/test_convert_pandas.py | 25 ++++++++++++++++++++++++
 python/pyarrow/tests/test_serialization.py  |  1 -
 3 files changed, 30 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/de7c6715/cpp/src/arrow/python/arrow_to_pandas.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 23bef7b..d1fca70 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -498,6 +498,7 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu
 
   PyAcquireGIL lock;
 
+  int64_t chunk_offset = 0;
   for (int c = 0; c < data.num_chunks(); c++) {
     auto arr = std::static_pointer_cast<ListArray>(data.chunk(c));
 
@@ -507,8 +508,8 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        PyObject* start = PyLong_FromLong(arr->value_offset(i));
-        PyObject* end = PyLong_FromLong(arr->value_offset(i + 1));
+        PyObject* start = PyLong_FromLongLong(arr->value_offset(i) + chunk_offset);
+        PyObject* end = PyLong_FromLongLong(arr->value_offset(i + 1) + chunk_offset);
         PyObject* slice = PySlice_New(start, end, NULL);
         *out_values = PyObject_GetItem(numpy_array, slice);
         Py_DECREF(start);
@@ -517,6 +518,8 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr<Colu
       }
       ++out_values;
     }
+
+    chunk_offset += arr->length();
   }
 
   Py_XDECREF(numpy_array);

http://git-wip-us.apache.org/repos/asf/arrow/blob/de7c6715/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 61bd072..c263184 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -534,6 +534,31 @@ class TestPandasConversion(unittest.TestCase):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_column_of_lists_chunked(self):
+        # ARROW-1357
+        df = pd.DataFrame({
+            'lists': np.array([
+                [1, 2],
+                None,
+                [2, 3],
+                [4, 5],
+                [6, 7],
+                [8, 9]
+            ], dtype=object)
+        })
+
+        schema = pa.schema([
+            pa.field('lists', pa.list_(pa.int64()))
+        ])
+
+        t1 = pa.Table.from_pandas(df[:2], schema=schema)
+        t2 = pa.Table.from_pandas(df[2:], schema=schema)
+
+        table = pa.concat_tables([t1, t2])
+        result = table.to_pandas()
+
+        tm.assert_frame_equal(result, df)
+
     def test_column_of_lists_strided(self):
         df, schema = dataframe_with_lists()
         df = pd.concat([df] * 6, ignore_index=True)

http://git-wip-us.apache.org/repos/asf/arrow/blob/de7c6715/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index f6f9840..013d86e 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -20,7 +20,6 @@ from __future__ import division
 import pytest
 
 from collections import namedtuple
-import os
 import string
 import sys
 


Mime
View raw message