arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-523: Python: Account for changes in PARQUET-834
Date Thu, 02 Feb 2017 21:13:47 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 0ae4d86e5 -> c05292faf


ARROW-523: Python: Account for changes in PARQUET-834

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #313 from xhochy/ARROW-523 and squashes the following commits:

ff699ea [Uwe L. Korn] Use relative import
e36dcc8 [Uwe L. Korn] ARROW-523: Python: Account for changes in PARQUET-834


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c05292fa
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c05292fa
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c05292fa

Branch: refs/heads/master
Commit: c05292faf74111e826dbbafe9a1e108346eb10dc
Parents: 0ae4d86
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Thu Feb 2 16:13:40 2017 -0500
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Thu Feb 2 16:13:40 2017 -0500

----------------------------------------------------------------------
 python/pyarrow/_parquet.pxd                 |  8 +--
 python/pyarrow/_parquet.pyx                 |  8 +--
 python/pyarrow/tests/pandas_examples.py     | 78 ++++++++++++++++++++++++
 python/pyarrow/tests/test_convert_pandas.py | 47 +-------------
 python/pyarrow/tests/test_parquet.py        | 11 ++++
 5 files changed, 100 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index fabee5d..6b9350a 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -211,9 +211,9 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
 
     cdef cppclass FileReader:
         FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader)
-        CStatus ReadFlatColumn(int i, shared_ptr[CArray]* out);
-        CStatus ReadFlatTable(shared_ptr[CTable]* out);
-        CStatus ReadFlatTable(const vector[int]& column_indices,
+        CStatus ReadColumn(int i, shared_ptr[CArray]* out);
+        CStatus ReadTable(shared_ptr[CTable]* out);
+        CStatus ReadTable(const vector[int]& column_indices,
                               shared_ptr[CTable]* out);
         const ParquetFileReader* parquet_reader();
 
@@ -228,7 +228,7 @@ cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
 
 
 cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
-    cdef CStatus WriteFlatTable(
+    cdef CStatus WriteTable(
         const CTable* table, MemoryPool* pool,
         const shared_ptr[OutputStream]& sink,
         int64_t chunk_size,

http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 3f847e9..fd4670a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -397,12 +397,12 @@ cdef class ParquetReader:
 
             with nogil:
                 check_status(self.reader.get()
-                             .ReadFlatTable(c_column_indices, &ctable))
+                             .ReadTable(c_column_indices, &ctable))
         else:
             # Read all columns
             with nogil:
                 check_status(self.reader.get()
-                             .ReadFlatTable(&ctable))
+                             .ReadTable(&ctable))
 
         table.init(ctable)
         return table
@@ -442,7 +442,7 @@ cdef class ParquetReader:
 
         with nogil:
             check_status(self.reader.get()
-                         .ReadFlatColumn(column_index, &carray))
+                         .ReadColumn(column_index, &carray))
 
         array.init(carray)
         return array
@@ -540,6 +540,6 @@ cdef class ParquetWriter:
 
         cdef int c_row_group_size = row_group_size
         with nogil:
-            check_status(WriteFlatTable(ctable, default_memory_pool(),
+            check_status(WriteTable(ctable, default_memory_pool(),
                                         self.sink, c_row_group_size,
                                         self.properties))

http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/pandas_examples.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
new file mode 100644
index 0000000..63af423
--- /dev/null
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+
+
+def dataframe_with_arrays():
+    """
+    Dataframe with numpy arrays columns of every possible primitive type.
+
+    Returns
+    -------
+    df: pandas.DataFrame
+    schema: pyarrow.Schema
+        Arrow schema definition that is in line with the constructed df.
+    """
+    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
+              ('i4', pa.int32()), ('i8', pa.int64()),
+              ('u1', pa.uint8()), ('u2', pa.uint16()),
+              ('u4', pa.uint32()), ('u8', pa.uint64()),
+              ('f4', pa.float_()), ('f8', pa.double())]
+
+    arrays = OrderedDict()
+    fields = []
+    for dtype, arrow_dtype in dtypes:
+        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
+        arrays[dtype] = [
+            np.arange(10, dtype=dtype),
+            np.arange(5, dtype=dtype),
+            None,
+            np.arange(1, dtype=dtype)
+        ]
+
+    fields.append(pa.field('str', pa.list_(pa.string())))
+    arrays['str'] = [
+        np.array([u"1", u"ä"], dtype="object"),
+        None,
+        np.array([u"1"], dtype="object"),
+        np.array([u"1", u"2", u"3"], dtype="object")
+    ]
+
+    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
+    arrays['datetime64'] = [
+        np.array(['2007-07-13T01:23:34.123456789',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+        None,
+        None,
+        np.array(['2007-07-13T02',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+    ]
+
+    df = pd.DataFrame(arrays)
+    schema = pa.Schema.from_fields(fields)
+
+    return df, schema

http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 674a436..ddbb02a 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -29,6 +29,8 @@ import pandas.util.testing as tm
 from pyarrow.compat import u
 import pyarrow as A
 
+from .pandas_examples import dataframe_with_arrays
+
 
 def _alltypes_example(size=100):
     return pd.DataFrame({
@@ -325,54 +327,11 @@ class TestPandasConversion(unittest.TestCase):
         tm.assert_frame_equal(result, expected)
 
     def test_column_of_lists(self):
-        dtypes = [('i1', A.int8()), ('i2', A.int16()),
-                  ('i4', A.int32()), ('i8', A.int64()),
-                  ('u1', A.uint8()), ('u2', A.uint16()),
-                  ('u4', A.uint32()), ('u8', A.uint64()),
-                  ('f4', A.float_()), ('f8', A.double())]
-
-        arrays = OrderedDict()
-        fields = []
-        for dtype, arrow_dtype in dtypes:
-            fields.append(A.field(dtype, A.list_(arrow_dtype)))
-            arrays[dtype] = [
-                np.arange(10, dtype=dtype),
-                np.arange(5, dtype=dtype),
-                None,
-                np.arange(1, dtype=dtype)
-            ]
-
-        fields.append(A.field('str', A.list_(A.string())))
-        arrays['str'] = [
-            np.array([u"1", u"ä"], dtype="object"),
-            None,
-            np.array([u"1"], dtype="object"),
-            np.array([u"1", u"2", u"3"], dtype="object")
-        ]
-
-        fields.append(A.field('datetime64', A.list_(A.timestamp('ns'))))
-        arrays['datetime64'] = [
-            np.array(['2007-07-13T01:23:34.123456789',
-                      None,
-                      '2010-08-13T05:46:57.437699912'],
-                     dtype='datetime64[ns]'),
-            None,
-            None,
-            np.array(['2007-07-13T02',
-                      None,
-                      '2010-08-13T05:46:57.437699912'],
-                     dtype='datetime64[ns]'),
-        ]
-
-        df = pd.DataFrame(arrays)
-        schema = A.Schema.from_fields(fields)
+        df, schema = dataframe_with_arrays()
         self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
         table = A.Table.from_pandas(df, schema=schema)
         assert table.schema.equals(schema)
 
-        # it works!
-        table.to_pandas(nthreads=1)
-
     def test_threaded_conversion(self):
         df = _alltypes_example()
         self._check_pandas_roundtrip(df, nthreads=2,

http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index d85f0e5..80a995f 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -23,6 +23,7 @@ import pytest
 from pyarrow.compat import guid
 import pyarrow as pa
 import pyarrow.io as paio
+from .pandas_examples import dataframe_with_arrays
 
 import numpy as np
 import pandas as pd
@@ -319,6 +320,16 @@ def test_compare_schemas():
     assert fileh.schema[0].equals(fileh.schema[0])
     assert not fileh.schema[0].equals(fileh.schema[1])
 
+@parquet
+def test_column_of_lists(tmpdir):
+    df, schema = dataframe_with_arrays()
+
+    filename = tmpdir.join('pandas_rountrip.parquet')
+    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema)
+    pq.write_table(arrow_table, filename.strpath, version="2.0")
+    table_read = pq.read_table(filename.strpath)
+    df_read = table_read.to_pandas()
+    pdt.assert_frame_equal(df, df_read)
 
 @parquet
 def test_multithreaded_read():


Mime
View raw message