arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject [arrow] branch master updated: ARROW-1732: [Python] Permit creating record batches with no columns, test pandas roundtrips
Date Thu, 26 Oct 2017 12:13:56 GMT
This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c30a7e3  ARROW-1732: [Python] Permit creating record batches with no columns, test pandas roundtrips
c30a7e3 is described below

commit c30a7e30af2469dde1a00f74d8ba9631887825c4
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Thu Oct 26 14:13:50 2017 +0200

    ARROW-1732: [Python] Permit creating record batches with no columns, test pandas roundtrips
    
    I ran into this rough edge today. Invariably, serialization code paths will need to send
    across a DataFrame with no columns, and this will need to work even if `preserve_index=False`.
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1252 from wesm/ARROW-1732 and squashes the following commits:
    
    a240c05 [Wes McKinney] Permit creating record batches with no columns, test pandas roundtrips
---
 python/pyarrow/table.pxi                    | 20 ++++++++++----------
 python/pyarrow/tests/test_convert_pandas.py | 25 ++++++++++++++++++-------
 python/pyarrow/tests/test_table.py          |  8 ++++++++
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 694fe91..eb19115 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -308,8 +308,8 @@ cdef shared_ptr[const CKeyValueMetadata] unbox_metadata(dict metadata):
             make_shared[CKeyValueMetadata](unordered_metadata))
 
 
-cdef int _schema_from_arrays(
-        arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1:
+cdef _schema_from_arrays(arrays, names, dict metadata,
+                         shared_ptr[CSchema]* schema):
     cdef:
         Column col
         c_string c_name
@@ -317,10 +317,11 @@ cdef int _schema_from_arrays(
         shared_ptr[CDataType] type_
         Py_ssize_t K = len(arrays)
 
-    fields.resize(K)
+    if K == 0:
+        schema.reset(new CSchema(fields, unbox_metadata(metadata)))
+        return
 
-    if not K:
-        raise ValueError('Must pass at least one array')
+    fields.resize(K)
 
     if isinstance(arrays[0], Column):
         for i in range(K):
@@ -346,7 +347,6 @@ cdef int _schema_from_arrays(
             fields[i].reset(new CField(c_name, type_, True))
 
     schema.reset(new CSchema(fields, unbox_metadata(metadata)))
-    return 0
 
 
 cdef class RecordBatch:
@@ -613,10 +613,10 @@ cdef class RecordBatch:
             int64_t i
             int64_t number_of_arrays = len(arrays)
 
-        if not number_of_arrays:
-            raise ValueError('Record batch cannot contain no arrays (for now)')
-
-        num_rows = len(arrays[0])
+        if len(arrays) > 0:
+            num_rows = len(arrays[0])
+        else:
+            num_rows = 0
         _schema_from_arrays(arrays, names, metadata, &schema)
 
         c_arrays.reserve(len(arrays))
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 527466e..6d146f9 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -71,11 +71,11 @@ class TestPandasConversion(object):
     def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
                                 expected_schema=None,
                                 check_dtype=True, schema=None,
-                                check_index=False,
+                                preserve_index=False,
                                 as_batch=False):
         klass = pa.RecordBatch if as_batch else pa.Table
         table = klass.from_pandas(df, schema=schema,
-                                  preserve_index=check_index,
+                                  preserve_index=preserve_index,
                                   nthreads=nthreads)
 
         result = table.to_pandas(nthreads=nthreads)
@@ -83,7 +83,9 @@ class TestPandasConversion(object):
             assert table.schema.equals(expected_schema)
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
+        tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
+                              check_index_type=('equiv' if preserve_index
+                                                else False))
 
     def _check_series_roundtrip(self, s, type_=None):
         arr = pa.array(s, from_pandas=True, type=type_)
@@ -131,14 +133,14 @@ class TestPandasConversion(object):
     def test_column_index_names_are_preserved(self):
         df = pd.DataFrame({'data': [1, 2, 3]})
         df.columns.names = ['a']
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_multiindex_columns(self):
         columns = pd.MultiIndex.from_arrays([
             ['one', 'two'], ['X', 'Y']
         ])
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_multiindex_columns_with_dtypes(self):
         columns = pd.MultiIndex.from_arrays(
@@ -149,11 +151,11 @@ class TestPandasConversion(object):
             names=['level_1', 'level_2'],
         )
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_categorical_column_index(self):
         # I *really* hope no one uses category dtypes for single level column
@@ -1095,6 +1097,15 @@ class TestPandasConversion(object):
         expected = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result, expected, check_dtype=True)
 
+    def test_table_batch_empty_dataframe(self):
+        df = pd.DataFrame({})
+        self._check_pandas_roundtrip(df)
+        self._check_pandas_roundtrip(df, as_batch=True)
+
+        df2 = pd.DataFrame({}, index=[0, 1, 2])
+        self._check_pandas_roundtrip(df2, preserve_index=True)
+        self._check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
+
     def test_array_from_pandas_date_with_mask(self):
         m = np.array([True, False, True])
         data = pd.Series([
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 50190f5..4282224 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -82,6 +82,14 @@ def test_recordbatch_basics():
         batch[2]
 
 
+def test_recordbatch_no_fields():
+    batch = pa.RecordBatch.from_arrays([], [])
+
+    assert len(batch) == 0
+    assert batch.num_rows == 0
+    assert batch.num_columns == 0
+
+
 def test_recordbatch_from_arrays_invalid_names():
     data = [
         pa.array(range(5)),

-- 
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <commits@arrow.apache.org>.

Mime
View raw message