arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-1675: [Python] Use RecordBatch.from_pandas in Feather write path
Date Thu, 26 Oct 2017 02:24:25 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 238881f  ARROW-1675: [Python] Use RecordBatch.from_pandas in Feather write path
238881f is described below

commit 238881fae8530a1ae994eb0e283e4783d3dd2855
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Wed Oct 25 22:24:21 2017 -0400

    ARROW-1675: [Python] Use RecordBatch.from_pandas in Feather write path
    
    This also makes Feather writes more robust to columns having a mix of unicode and bytes
(these get coerced to binary)
    
    Also resolves ARROW-1672
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1250 from wesm/ARROW-1675 and squashes the following commits:
    
    9d55886b [Wes McKinney] Use RecordBatch.from_pandas in Feather write path. Fix test cases
---
 python/pyarrow/feather.py            | 32 +++++++-------------------------
 python/pyarrow/tests/test_feather.py | 12 +++++++++---
 python/pyarrow/types.pxi             |  1 -
 3 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 2091c91..3ba9d65 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -23,7 +23,7 @@ import pandas as pd
 
 from pyarrow.compat import pdapi
 from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import Table
+from pyarrow.lib import RecordBatch, Table
 import pyarrow.lib as ext
 
 try:
@@ -75,30 +75,12 @@ class FeatherWriter(object):
         if not df.columns.is_unique:
             raise ValueError("cannot serialize duplicate column names")
 
-        # TODO(wesm): pipeline conversion to Arrow memory layout
-        for i, name in enumerate(df.columns):
-            col = df.iloc[:, i]
-
-            if pdapi.is_object_dtype(col):
-                inferred_type = infer_dtype(col)
-                msg = ("cannot serialize column {n} "
-                       "named {name} with dtype {dtype}".format(
-                           n=i, name=name, dtype=inferred_type))
-
-                if inferred_type in ['mixed']:
-
-                    # allow columns with nulls + an inferable type
-                    inferred_type = infer_dtype(col[col.notnull()])
-                    if inferred_type in ['mixed']:
-                        raise ValueError(msg)
-
-                elif inferred_type not in ['unicode', 'string']:
-                    raise ValueError(msg)
-
-            if not isinstance(name, six.string_types):
-                name = str(name)
-
-            self.writer.write_array(name, col)
+        # TODO(wesm): Remove this length check, see ARROW-1732
+        if len(df.columns) > 0:
+            batch = RecordBatch.from_pandas(df, preserve_index=False)
+            for i, name in enumerate(batch.schema.names):
+                col = batch[i]
+                self.writer.write_array(name, col)
 
         self.writer.close()
 
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 810ee3c..9e7fc88 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -279,11 +279,14 @@ class TestFeatherReader(unittest.TestCase):
         if sys.platform == 'win32':
             pytest.skip('Windows hangs on to file handle for some reason')
 
+        class CustomClass(object):
+            pass
+
         # strings will fail
         df = pd.DataFrame(
             {
                 'numbers': range(5),
-                'strings': [b'foo', None, u'bar', 'qux', np.nan]},
+                'strings': [b'foo', None, u'bar', CustomClass(), np.nan]},
             columns=['numbers', 'strings'])
 
         path = random_path()
@@ -297,10 +300,13 @@ class TestFeatherReader(unittest.TestCase):
     def test_strings(self):
         repeats = 1000
 
-        # we hvae mixed bytes, unicode, strings
+        # Mixed bytes, unicode, strings coerced to binary
         values = [b'foo', None, u'bar', 'qux', np.nan]
         df = pd.DataFrame({'strings': values * repeats})
-        self._assert_error_on_write(df, ValueError)
+
+        ex_values = [b'foo', None, b'bar', b'qux', np.nan]
+        expected = pd.DataFrame({'strings': ex_values * repeats})
+        self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])
 
         # embedded nulls are ok
         values = ['foo', None, 'bar', 'qux', None]
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 686e56e..c9a4909 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -662,7 +662,6 @@ cdef _as_type(type):
     return type_for_alias(type)
 
 
-
 cdef set PRIMITIVE_TYPES = set([
     _Type_NA, _Type_BOOL,
     _Type_UINT8, _Type_INT8,

-- 
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <commits@arrow.apache.org>.

Mime
View raw message