arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1328: [Python] Set correct Arrow type when coercing to milliseconds and passing explicit type
Date Fri, 04 Aug 2017 18:25:57 GMT
Repository: arrow
Updated Branches:
  refs/heads/master aa5d417bb -> 717bed0d0


ARROW-1328: [Python] Set correct Arrow type when coercing to milliseconds and passing explicit
type

cc @fjetter

I'm planning to deprecate the `timestamps_to_ms` argument, but it needs a patch in parquet-cpp
first (so that nanoseconds/microseconds can be cast on the fly to milliseconds), so let's
handle this fix first.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #941 from wesm/ARROW-1328 and squashes the following commits:

4632dc36 [Wes McKinney] Set correct Arrow type when coercing to milliseconds and passing explicit
type (which may not be milliseconds)


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/717bed0d
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/717bed0d
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/717bed0d

Branch: refs/heads/master
Commit: 717bed0d0a6a038cca6d44b46f60833aed6b09fc
Parents: aa5d417
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Fri Aug 4 14:25:53 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Fri Aug 4 14:25:53 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/array.pxi                    | 26 +---------------
 python/pyarrow/pandas_compat.py             | 24 +++++++++++++++
 python/pyarrow/table.pxi                    | 19 +++++++-----
 python/pyarrow/tests/test_convert_pandas.py | 39 +++++++++++++++---------
 4 files changed, 62 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/717bed0d/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index cbd036c..4e0c21c 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -16,30 +16,6 @@
 # under the License.
 
 
-cdef maybe_coerce_datetime64(values, dtype, DataType type,
-                             timestamps_to_ms=False):
-
-    from pyarrow.compat import DatetimeTZDtype
-
-    if values.dtype.type != np.datetime64:
-        return values, type
-
-    coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]'
-
-    if coerce_ms:
-        values = values.astype('datetime64[ms]')
-
-    if isinstance(dtype, DatetimeTZDtype):
-        tz = dtype.tz
-        unit = 'ms' if coerce_ms else dtype.unit
-        type = timestamp(unit, tz)
-    elif type is None:
-        # Trust the NumPy dtype
-        type = from_numpy_dtype(values.dtype)
-
-    return values, type
-
-
 def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
           size=None):
     """
@@ -205,7 +181,7 @@ cdef class Array:
             else:
                 out = chunked_out.get().chunk(0)
         else:
-            values, type = maybe_coerce_datetime64(
+            values, type = pdcompat.maybe_coerce_datetime64(
                 values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms)
 
             if type is None:

http://git-wip-us.apache.org/repos/asf/arrow/blob/717bed0d/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 62547a4..2881588 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -17,6 +17,7 @@
 
 import re
 import json
+import numpy as np
 import pandas as pd
 
 import six
@@ -241,6 +242,29 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
     return names, arrays, metadata
 
 
+def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False):
+    from pyarrow.compat import DatetimeTZDtype
+
+    if values.dtype.type != np.datetime64:
+        return values, type_
+
+    coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]'
+
+    if coerce_ms:
+        values = values.astype('datetime64[ms]')
+        type_ = pa.timestamp('ms')
+
+    if isinstance(dtype, DatetimeTZDtype):
+        tz = dtype.tz
+        unit = 'ms' if coerce_ms else dtype.unit
+        type_ = pa.timestamp(unit, tz)
+    elif type_ is None:
+        # Trust the NumPy dtype
+        type_ = pa.from_numpy_dtype(values.dtype)
+
+    return values, type_
+
+
 def table_to_blockmanager(table, nthreads=1):
     import pandas.core.internals as _int
     from pyarrow.compat import DatetimeTZDtype

http://git-wip-us.apache.org/repos/asf/arrow/blob/717bed0d/python/pyarrow/table.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 997b285..e33c9ba 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -134,6 +134,16 @@ cdef class Column:
         self.sp_column = column
         self.column = column.get()
 
+    def __repr__(self):
+        from pyarrow.compat import StringIO
+        result = StringIO()
+        result.write(object.__repr__(self))
+        data = self.data
+        for i in range(len(data)):
+            result.write('\nchunk {0}: {1}'.format(i, repr(data.chunk(0))))
+
+        return result.getvalue()
+
     @staticmethod
     def from_array(object field_or_name, Array arr):
         cdef Field boxed_field
@@ -661,13 +671,8 @@ cdef class Table:
         return result
 
     @classmethod
-    def from_pandas(
-        cls,
-        df,
-        bint timestamps_to_ms=False,
-        Schema schema=None,
-        bint preserve_index=True
-    ):
+    def from_pandas(cls, df, bint timestamps_to_ms=False,
+                    Schema schema=None, bint preserve_index=True):
         """
         Convert pandas.DataFrame to an Arrow Table
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/717bed0d/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index f6ea163..2a51d32 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -18,7 +18,7 @@
 
 from collections import OrderedDict
 
-import datetime
+from datetime import datetime, date, time
 import unittest
 import decimal
 import json
@@ -351,6 +351,17 @@ class TestPandasConversion(unittest.TestCase):
             expected_schema=schema,
         )
 
+    def test_timestamps_to_ms_explicit_schema(self):
+        # ARROW-1328
+        df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]})
+        pa_type = pa.from_numpy_dtype(df['datetime'].dtype)
+
+        arr = pa.Array.from_pandas(df['datetime'], type=pa_type,
+                                   timestamps_to_ms=True)
+
+        tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'),
+                               arr.to_pandas())
+
     def test_timestamps_notimezone_nulls(self):
         df = pd.DataFrame({
             'datetime64': np.array([
@@ -409,10 +420,10 @@ class TestPandasConversion(unittest.TestCase):
 
     def test_date_infer(self):
         df = pd.DataFrame({
-            'date': [datetime.date(2000, 1, 1),
+            'date': [date(2000, 1, 1),
                      None,
-                     datetime.date(1970, 1, 1),
-                     datetime.date(2040, 2, 26)]})
+                     date(1970, 1, 1),
+                     date(2040, 2, 26)]})
         table = pa.Table.from_pandas(df, preserve_index=False)
         field = pa.field('date', pa.date32())
         schema = pa.schema([field])
@@ -424,10 +435,10 @@ class TestPandasConversion(unittest.TestCase):
 
     def test_date_objects_typed(self):
         arr = np.array([
-            datetime.date(2017, 4, 3),
+            date(2017, 4, 3),
             None,
-            datetime.date(2017, 4, 4),
-            datetime.date(2017, 4, 5)], dtype=object)
+            date(2017, 4, 4),
+            date(2017, 4, 5)], dtype=object)
 
         arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
         arr_i8 = arr_i4.astype('int64') * 86400000
@@ -470,7 +481,7 @@ class TestPandasConversion(unittest.TestCase):
         a1 = pa.Array.from_pandas(arr, type=t1)
         a2 = pa.Array.from_pandas(arr2, type=t2)
 
-        expected = datetime.date(2017, 4, 3)
+        expected = date(2017, 4, 3)
         assert a1[0].as_py() == expected
         assert a2[0].as_py() == expected
 
@@ -669,8 +680,8 @@ class TestPandasConversion(unittest.TestCase):
         tm.assert_frame_equal(df, expected)
 
     def test_pytime_from_pandas(self):
-        pytimes = [datetime.time(1, 2, 3, 1356),
-                   datetime.time(4, 5, 6, 1356)]
+        pytimes = [time(1, 2, 3, 1356),
+                   time(4, 5, 6, 1356)]
 
         # microseconds
         t1 = pa.time64('us')
@@ -706,9 +717,9 @@ class TestPandasConversion(unittest.TestCase):
         assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
 
     def test_arrow_time_to_pandas(self):
-        pytimes = [datetime.time(1, 2, 3, 1356),
-                   datetime.time(4, 5, 6, 1356),
-                   datetime.time(0, 0, 0)]
+        pytimes = [time(1, 2, 3, 1356),
+                   time(4, 5, 6, 1356),
+                   time(0, 0, 0)]
 
         expected = np.array(pytimes[:2] + [None])
         expected_ms = np.array([x.replace(microsecond=1000)
@@ -902,7 +913,7 @@ def _pytime_from_micros(val):
     val //= 60
     minutes = val % 60
     hours = val // 60
-    return datetime.time(hours, minutes, seconds, microseconds)
+    return time(hours, minutes, seconds, microseconds)
 
 
 def _pytime_to_micros(pytime):


Mime
View raw message