arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1434: [Python] Support datetime64[D] numpy arrays
Date Fri, 01 Sep 2017 17:56:35 GMT
Repository: arrow
Updated Branches:
  refs/heads/master c6295f3b7 -> 8344f28f1


ARROW-1434: [Python] Support datetime64[D] numpy arrays

Author: Uwe L. Korn <uwelk@xhochy.com>
Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1020 from xhochy/ARROW-1434 and squashes the following commits:

e8ff8355 [Wes McKinney] Explicitly static cast in loop to avoid MSVC warning in std::copy
03d95c6f [Uwe L. Korn] Ignore MSVC warning
83168f1f [Uwe L. Korn] ninja lint
cb36ca88 [Uwe L. Korn] ninja format
c55e92e2 [Uwe L. Korn] ARROW-1434: [Python] Support datetime64[D] numpy arrays


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8344f28f
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8344f28f
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8344f28f

Branch: refs/heads/master
Commit: 8344f28f1db2648c89c688b030ed7f616b086d88
Parents: c6295f3
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Fri Sep 1 13:56:30 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Fri Sep 1 13:56:30 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/arrow_to_pandas.cc     |  5 +-
 cpp/src/arrow/python/numpy_convert.cc       | 13 +++---
 cpp/src/arrow/python/pandas_to_arrow.cc     | 58 +++++++++++++++++++++---
 python/pyarrow/tests/test_convert_pandas.py | 47 +++++++++++++++++++
 4 files changed, 108 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/8344f28f/cpp/src/arrow/python/arrow_to_pandas.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 1f62ef8..fe4d63b 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -1454,6 +1454,7 @@ class ArrowDeserializer {
   Visit(const Type& type) {
     constexpr int TYPE = Type::type_id;
     using traits = internal::arrow_traits<TYPE>;
+    using c_type = typename Type::c_type;
 
     typedef typename traits::T T;
 
@@ -1465,10 +1466,10 @@ class ArrowDeserializer {
 
     for (int c = 0; c < data_.num_chunks(); c++) {
       const auto& arr = static_cast<const PrimitiveArray&>(*data_.chunk(c));
-      auto in_values = reinterpret_cast<const T*>(arr.raw_values());
+      auto in_values = reinterpret_cast<const c_type*>(arr.raw_values());
 
       for (int64_t i = 0; i < arr.length(); ++i) {
-        *out_values++ = arr.IsNull(i) ? na_value : in_values[i] / kShift;
+        *out_values++ = arr.IsNull(i) ? na_value : static_cast<T>(in_values[i]) / kShift;
       }
     }
     return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/8344f28f/cpp/src/arrow/python/numpy_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 61192f3..4c64cc4 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -170,24 +170,25 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
     case NPY_DATETIME: {
       auto date_dtype =
           reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
-      TimeUnit::type unit;
       switch (date_dtype->meta.base) {
         case NPY_FR_s:
-          unit = TimeUnit::SECOND;
+          *out = timestamp(TimeUnit::SECOND);
           break;
         case NPY_FR_ms:
-          unit = TimeUnit::MILLI;
+          *out = timestamp(TimeUnit::MILLI);
           break;
         case NPY_FR_us:
-          unit = TimeUnit::MICRO;
+          *out = timestamp(TimeUnit::MICRO);
           break;
         case NPY_FR_ns:
-          unit = TimeUnit::NANO;
+          *out = timestamp(TimeUnit::NANO);
+          break;
+        case NPY_FR_D:
+          *out = date32();
           break;
         default:
           return Status::NotImplemented("Unsupported datetime64 time unit");
       }
-      *out = timestamp(unit);
     } break;
     default: {
       std::stringstream ss;

http://git-wip-us.apache.org/repos/asf/arrow/blob/8344f28f/cpp/src/arrow/python/pandas_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc
index 408aef3..8f6746a 100644
--- a/cpp/src/arrow/python/pandas_to_arrow.cc
+++ b/cpp/src/arrow/python/pandas_to_arrow.cc
@@ -23,6 +23,7 @@
 
 #include "arrow/python/pandas_to_arrow.h"
 
+#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <limits>
@@ -357,7 +358,7 @@ class PandasConverter {
     return VisitNative<T>();
   }
 
-  Status Visit(const Date32Type& type) { return VisitNative<Int32Type>(); }
+  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
   Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); }
   Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
   Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
@@ -434,19 +435,19 @@ class PandasConverter {
   uint8_t* null_bitmap_data_;
 };
 
-template <typename T>
-void CopyStrided(T* input_data, int64_t length, int64_t stride, T* output_data) {
+template <typename T, typename T2>
+void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) {
   // Passing input_data as non-const is a concession to PyObject*
   int64_t j = 0;
   for (int64_t i = 0; i < length; ++i) {
-    output_data[i] = input_data[j];
+    output_data[i] = static_cast<T2>(input_data[j]);
     j += stride;
   }
 }
 
 template <>
-void CopyStrided<PyObject*>(PyObject** input_data, int64_t length, int64_t stride,
-                            PyObject** output_data) {
+void CopyStrided<PyObject*, PyObject*>(PyObject** input_data, int64_t length,
+                                       int64_t stride, PyObject** output_data) {
   int64_t j = 0;
   for (int64_t i = 0; i < length; ++i) {
     output_data[i] = input_data[j];
@@ -466,7 +467,11 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
 
   if (NumPyTypeSize(traits::npy_type) != NumPyTypeSize(type_num_compat)) {
-    return Status::NotImplemented("NumPy type casts not yet implemented");
+    std::stringstream ss;
+    ss << "NumPy type casts not yet implemented, type sizes differ: ";
+    ss << NumPyTypeSize(traits::npy_type) << " compared to "
+       << NumPyTypeSize(type_num_compat);
+    return Status::NotImplemented(ss.str());
   }
 
   if (is_strided()) {
@@ -487,6 +492,45 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
 }
 
 template <>
+inline Status PandasConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
+  // Handle LONGLONG->INT64 and other fun things
+  int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
+  int type_size = NumPyTypeSize(type_num_compat);
+
+  if (type_size == 4) {
+    // Source and target are INT32, so can refer to the main implementation.
+    return ConvertData<Int32Type>(data);
+  } else if (type_size == 8) {
+    // We need to scale down from int64 to int32
+    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_));
+
+    auto input = reinterpret_cast<const int64_t*>(PyArray_DATA(arr_));
+    auto output = reinterpret_cast<int32_t*>(new_buffer->mutable_data());
+
+    if (is_strided()) {
+      // Strided, must copy into new contiguous memory
+      const int64_t stride = PyArray_STRIDES(arr_)[0];
+      const int64_t stride_elements = stride / sizeof(int64_t);
+      CopyStrided(input, length_, stride_elements, output);
+    } else {
+      // TODO(wesm): int32 overflow checks
+      for (int64_t i= 0; i < length_; ++i) {
+        *output++ = static_cast<int32_t>(*input++);
+      }
+    }
+    *data = new_buffer;
+  } else {
+    std::stringstream ss;
+    ss << "Cannot convert NumPy array of element size ";
+    ss << type_size << " to a Date32 array";
+    return Status::NotImplemented(ss.str());
+  }
+
+  return Status::OK();
+}
+
+template <>
 inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
   int64_t nbytes = BitUtil::BytesForBits(length_);
   auto buffer = std::make_shared<PoolBuffer>(pool_);

http://git-wip-us.apache.org/repos/asf/arrow/blob/8344f28f/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 6e5e8e8..f5107c2 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -26,6 +26,7 @@ import json
 import pytest
 
 import numpy as np
+import numpy.testing as npt
 
 import pandas as pd
 import pandas.util.testing as tm
@@ -821,6 +822,52 @@ class TestPandasConversion(unittest.TestCase):
 
         tm.assert_frame_equal(df, expected_df)
 
+    def _check_numpy_array_roundtrip(self, np_array):
+        arr = pa.Array.from_pandas(np_array)
+        result = arr.to_pandas()
+        npt.assert_array_equal(result, np_array)
+
+    def test_numpy_datetime64_columns(self):
+        datetime64_ns = np.array([
+                '2007-07-13T01:23:34.123456789',
+                None,
+                '2006-01-13T12:34:56.432539784',
+                '2010-08-13T05:46:57.437699912'],
+                dtype='datetime64[ns]')
+        self._check_numpy_array_roundtrip(datetime64_ns)
+
+        datetime64_us = np.array([
+                '2007-07-13T01:23:34.123456',
+                None,
+                '2006-01-13T12:34:56.432539',
+                '2010-08-13T05:46:57.437699'],
+                dtype='datetime64[us]')
+        self._check_numpy_array_roundtrip(datetime64_us)
+
+        datetime64_ms = np.array([
+                '2007-07-13T01:23:34.123',
+                None,
+                '2006-01-13T12:34:56.432',
+                '2010-08-13T05:46:57.437'],
+                dtype='datetime64[ms]')
+        self._check_numpy_array_roundtrip(datetime64_ms)
+
+        datetime64_s = np.array([
+                '2007-07-13T01:23:34',
+                None,
+                '2006-01-13T12:34:56',
+                '2010-08-13T05:46:57'],
+                dtype='datetime64[s]')
+        self._check_numpy_array_roundtrip(datetime64_s)
+
+        datetime64_d = np.array([
+                '2007-07-13',
+                None,
+                '2006-01-15',
+                '2010-08-19'],
+                dtype='datetime64[D]')
+        self._check_numpy_array_roundtrip(datetime64_d)
+
     def test_all_nones(self):
         def _check_series(s):
             converted = pa.Array.from_pandas(s)


Mime
View raw message