arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-707: [Python] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas
Date Mon, 17 Apr 2017 13:56:58 GMT
Repository: arrow
Updated Branches:
  refs/heads/master f51259068 -> 312a66535


ARROW-707: [Python] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas

Per ARROW-838, I reverted the `Array.from_numpy` name to `Array.from_pandas` to reflect that
the import is specific to pandas 0.x's memory representation.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #554 from wesm/ARROW-707 and squashes the following commits:

a875257 [Wes McKinney] Rename PyObject_is_null to reflect domain-specific nature
093b057 [Wes McKinney] Check more cases of all nulls. Fix segfault for NaN that resulted from computations
7d97f28 [Wes McKinney] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/312a6653
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/312a6653
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/312a6653

Branch: refs/heads/master
Commit: 312a665353c420452e98b6b266a5a7cb214c936f
Parents: f512590
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Apr 17 09:56:53 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Apr 17 09:56:53 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/pandas_convert.cc      | 31 +++++++++++++--------
 python/doc/source/api.rst                   |  1 +
 python/pyarrow/__init__.py                  |  1 +
 python/pyarrow/_array.pxd                   |  4 +++
 python/pyarrow/_array.pyx                   | 18 ++++++-------
 python/pyarrow/_io.pyx                      |  2 +-
 python/pyarrow/_table.pyx                   |  2 +-
 python/pyarrow/tests/test_array.py          |  4 +--
 python/pyarrow/tests/test_convert_pandas.py | 34 ++++++++++++++++--------
 python/pyarrow/tests/test_scalars.py        |  6 ++---
 10 files changed, 65 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b33aea4..5cdcb6f 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -61,8 +61,16 @@ namespace py {
 // ----------------------------------------------------------------------
 // Utility code
 
-static inline bool PyObject_is_null(const PyObject* obj) {
-  return obj == Py_None || obj == numpy_nan;
+static inline bool PyFloat_isnan(const PyObject* obj) {
+  if (PyFloat_Check(obj)) {
+    double val = PyFloat_AS_DOUBLE(obj);
+    return val != val;
+  } else {
+    return false;
+  }
+}
+static inline bool PandasObjectIsNull(const PyObject* obj) {
+  return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj);
 }
 
 static inline bool PyObject_is_string(const PyObject* obj) {
@@ -158,7 +166,7 @@ static Status AppendObjectStrings(
 
   for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
     } else if (PyUnicode_Check(obj)) {
       obj = PyUnicode_AsUTF8String(obj);
@@ -197,7 +205,7 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas
 
   for (int64_t i = 0; i < objects.size(); ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
     } else if (PyUnicode_Check(obj)) {
       obj = PyUnicode_AsUTF8String(obj);
@@ -519,7 +527,7 @@ Status PandasConverter::ConvertDates() {
     obj = objects[i];
     if (PyDate_CheckExact(obj)) {
       date_builder.Append(UnboxDate<ArrowType>::Unbox(obj));
-    } else if (PyObject_is_null(obj)) {
+    } else if (PandasObjectIsNull(obj)) {
       date_builder.AppendNull();
     } else {
       return InvalidConversion(obj, "date");
@@ -570,7 +578,7 @@ Status PandasConverter::ConvertDecimals() {
         default:
           break;
       }
-    } else if (PyObject_is_null(object)) {
+    } else if (PandasObjectIsNull(object)) {
       decimal_builder.AppendNull();
     } else {
       return InvalidConversion(object, "decimal.Decimal");
@@ -724,7 +732,7 @@ Status PandasConverter::ConvertBooleans() {
   PyObject* obj;
   for (int64_t i = 0; i < length_; ++i) {
     obj = objects[i];
-    if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
       ++null_count;
     } else if (obj == Py_True) {
       BitUtil::SetBit(bitmap, i);
@@ -791,7 +799,7 @@ Status PandasConverter::ConvertObjects() {
     RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
 
     for (int64_t i = 0; i < length_; ++i) {
-      if (PyObject_is_null(objects[i])) {
+      if (PandasObjectIsNull(objects[i])) {
         continue;
       } else if (PyObject_is_string(objects[i])) {
         return ConvertObjectStrings();
@@ -809,7 +817,8 @@ Status PandasConverter::ConvertObjects() {
     }
   }
 
-  return Status::TypeError("Unable to infer type of object array, were all null");
+  out_ = std::make_shared<NullArray>(length_);
+  return Status::OK();
 }
 
 template <int ITEM_TYPE, typename ArrowType>
@@ -833,7 +842,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
   ListBuilder list_builder(pool_, value_builder);
   PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
   for (int64_t i = 0; i < length_; ++i) {
-    if (PyObject_is_null(objects[i])) {
+    if (PandasObjectIsNull(objects[i])) {
       RETURN_NOT_OK(list_builder.AppendNull());
     } else if (PyArray_Check(objects[i])) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
@@ -893,7 +902,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
   ListBuilder list_builder(pool_, value_builder);
   PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
   for (int64_t i = 0; i < length_; ++i) {
-    if (PyObject_is_null(objects[i])) {
+    if (PandasObjectIsNull(objects[i])) {
       RETURN_NOT_OK(list_builder.AppendNull());
     } else if (PyArray_Check(objects[i])) {
       auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 801ab34..1b7b9bd 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -90,6 +90,7 @@ Array Types
    :toctree: generated/
 
    Array
+   NullArray
    NumericArray
    IntegerArray
    FloatingPointArray

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 506d567..3db2a4f 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -40,6 +40,7 @@ from pyarrow._array import (null, bool_,
                             Array, Tensor,
                             from_pylist,
                             from_numpy_dtype,
+                            NullArray,
                             NumericArray, IntegerArray, FloatingPointArray,
                             BooleanArray,
                             Int8Array, UInt8Array,

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
index 4041374..afb0c27 100644
--- a/python/pyarrow/_array.pxd
+++ b/python/pyarrow/_array.pxd
@@ -141,6 +141,10 @@ cdef class Tensor:
     cdef init(self, const shared_ptr[CTensor]& sp_tensor)
 
 
+cdef class NullArray(Array):
+    pass
+
+
 cdef class BooleanArray(Array):
     pass
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
index c5a595c..99ff6f2 100644
--- a/python/pyarrow/_array.pyx
+++ b/python/pyarrow/_array.pyx
@@ -843,9 +843,9 @@ cdef class Array:
         self.type = box_data_type(self.sp_array.get().type())
 
     @staticmethod
-    def from_numpy(obj, mask=None, DataType type=None,
-                   timestamps_to_ms=False,
-                   MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, DataType type=None,
+                    timestamps_to_ms=False,
+                    MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array.
 
@@ -878,7 +878,7 @@ cdef class Array:
 
         >>> import pandas as pd
         >>> import pyarrow as pa
-        >>> pa.Array.from_numpy(pd.Series([1, 2]))
+        >>> pa.Array.from_pandas(pd.Series([1, 2]))
         <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
         [
           1,
@@ -886,7 +886,7 @@ cdef class Array:
         ]
 
         >>> import numpy as np
-        >>> pa.Array.from_numpy(pd.Series([1, 2]), np.array([0, 1],
+        >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
         ... dtype=bool))
         <pyarrow.array.Int64Array object at 0x7f9019e11208>
         [
@@ -1329,14 +1329,14 @@ cdef class DictionaryArray(Array):
                 mask = indices == -1
             else:
                 mask = mask | (indices == -1)
-            arrow_indices = Array.from_numpy(indices, mask=mask,
-                                             memory_pool=memory_pool)
+            arrow_indices = Array.from_pandas(indices, mask=mask,
+                                              memory_pool=memory_pool)
 
         if isinstance(dictionary, Array):
             arrow_dictionary = dictionary
         else:
-            arrow_dictionary = Array.from_numpy(dictionary,
-                                                memory_pool=memory_pool)
+            arrow_dictionary = Array.from_pandas(dictionary,
+                                                 memory_pool=memory_pool)
 
         if not isinstance(arrow_indices, IntegerArray):
             raise ValueError('Indices must be integer type')

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/_io.pyx
index 9f067fb..ec37de0 100644
--- a/python/pyarrow/_io.pyx
+++ b/python/pyarrow/_io.pyx
@@ -1148,7 +1148,7 @@ cdef class FeatherWriter:
         if isinstance(col, Array):
             arr = col
         else:
-            arr = Array.from_numpy(col, mask=mask)
+            arr = Array.from_pandas(col, mask=mask)
 
         cdef c_string c_name = tobytes(name)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx
index 6558b2e..78fec75 100644
--- a/python/pyarrow/_table.pyx
+++ b/python/pyarrow/_table.pyx
@@ -321,7 +321,7 @@ cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema):
         if schema is not None:
             type = schema.field_by_name(name).type
 
-        arr = Array.from_numpy(col, type=type,
+        arr = Array.from_pandas(col, type=type,
                                 timestamps_to_ms=timestamps_to_ms)
         names.append(name)
         arrays.append(arr)

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 57b17f6..a1fe842 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -162,8 +162,8 @@ def test_dictionary_from_boxed_arrays():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
 
-    iarr = pa.Array.from_numpy(indices)
-    darr = pa.Array.from_numpy(dictionary)
+    iarr = pa.Array.from_pandas(indices)
+    darr = pa.Array.from_pandas(dictionary)
 
     d1 = pa.DictionaryArray.from_arrays(iarr, darr)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 2394d63..f360234 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -79,8 +79,8 @@ class TestPandasConversion(unittest.TestCase):
 
     def _check_array_roundtrip(self, values, expected=None, mask=None,
                                timestamps_to_ms=False, type=None):
-        arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
-                                  mask=mask, type=type)
+        arr = pa.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms,
+                                   mask=mask, type=type)
         result = arr.to_pandas()
 
         values_nulls = pd.isnull(values)
@@ -125,7 +125,7 @@ class TestPandasConversion(unittest.TestCase):
         for name, arrow_dtype in dtypes:
             values = np.random.randn(num_values).astype(name)
 
-            arr = pa.Array.from_numpy(values, null_mask)
+            arr = pa.Array.from_pandas(values, null_mask)
             arrays.append(arr)
             fields.append(pa.Field.from_py(name, arrow_dtype))
             values[null_mask] = np.nan
@@ -178,7 +178,7 @@ class TestPandasConversion(unittest.TestCase):
         for name in int_dtypes:
             values = np.random.randint(0, 100, size=num_values)
 
-            arr = pa.Array.from_numpy(values, null_mask)
+            arr = pa.Array.from_pandas(values, null_mask)
             arrays.append(arr)
 
             expected = values.astype('f8')
@@ -212,7 +212,7 @@ class TestPandasConversion(unittest.TestCase):
         mask = np.random.randint(0, 10, size=num_values) < 3
         values = np.random.randint(0, 10, size=num_values) < 5
 
-        arr = pa.Array.from_numpy(values, mask)
+        arr = pa.Array.from_pandas(values, mask)
 
         expected = values.astype(object)
         expected[mask] = None
@@ -375,11 +375,11 @@ class TestPandasConversion(unittest.TestCase):
         t32 = pa.date32()
         t64 = pa.date64()
 
-        a32 = pa.Array.from_numpy(arr, type=t32)
-        a64 = pa.Array.from_numpy(arr, type=t64)
+        a32 = pa.Array.from_pandas(arr, type=t32)
+        a64 = pa.Array.from_pandas(arr, type=t64)
 
-        a32_expected = pa.Array.from_numpy(arr_i4, mask=mask, type=t32)
-        a64_expected = pa.Array.from_numpy(arr_i8, mask=mask, type=t64)
+        a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
+        a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)
 
         assert a32.equals(a32_expected)
         assert a64.equals(a64_expected)
@@ -406,8 +406,8 @@ class TestPandasConversion(unittest.TestCase):
         arr = np.array([17259, 17260, 17261], dtype='int32')
         arr2 = arr.astype('int64') * 86400000
 
-        a1 = pa.Array.from_numpy(arr, type=t1)
-        a2 = pa.Array.from_numpy(arr2, type=t2)
+        a1 = pa.Array.from_pandas(arr, type=t1)
+        a2 = pa.Array.from_pandas(arr2, type=t2)
 
         expected = datetime.date(2017, 4, 3)
         assert a1[0].as_py() == expected
@@ -586,3 +586,15 @@ class TestPandasConversion(unittest.TestCase):
         converted = pa.Table.from_pandas(expected)
         df = converted.to_pandas()
         tm.assert_frame_equal(df, expected)
+
+    def test_all_nones(self):
+        def _check_series(s):
+            converted = pa.Array.from_pandas(s)
+            assert isinstance(converted, pa.NullArray)
+            assert len(converted) == 3
+            assert converted.null_count == 3
+            assert converted[0] is pa.NA
+
+        _check_series(pd.Series([None] * 3, dtype=object))
+        _check_series(pd.Series([np.nan] * 3, dtype=object))
+        _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))

http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index f4f275b..df2a898 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -124,7 +124,7 @@ class TestScalars(unittest.TestCase):
 
         for unit in units:
             dtype = 'datetime64[{0}]'.format(unit)
-            arrow_arr = pa.Array.from_numpy(arr.astype(dtype))
+            arrow_arr = pa.Array.from_pandas(arr.astype(dtype))
             expected = pd.Timestamp('2000-01-01 12:34:56')
 
             assert arrow_arr[0].as_py() == expected
@@ -133,8 +133,8 @@ class TestScalars(unittest.TestCase):
             arrow_type = pa.timestamp(unit, tz=tz)
 
             dtype = 'datetime64[{0}]'.format(unit)
-            arrow_arr = pa.Array.from_numpy(arr.astype(dtype),
-                                            type=arrow_type)
+            arrow_arr = pa.Array.from_pandas(arr.astype(dtype),
+                                             type=arrow_type)
             expected = (pd.Timestamp('2000-01-01 12:34:56')
                         .tz_localize('utc')
                         .tz_convert(tz))


Mime
View raw message