arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1309: [Python] Handle nested lists with all None values in Array.from_pandas
Date Tue, 08 Aug 2017 02:08:17 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 02ab74841 -> 66ab6b261


ARROW-1309: [Python] Handle nested lists with all None values in Array.from_pandas

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #947 from wesm/ARROW-1309 and squashes the following commits:

dc464922 [Wes McKinney] Expand test case to include ndarray
86039ec2 [Wes McKinney] Bugfix, add multiple nulls at start of array
08687cac [Wes McKinney] NullBuilder, scaffolding


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/66ab6b26
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/66ab6b26
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/66ab6b26

Branch: refs/heads/master
Commit: 66ab6b2616260977ab9a29bdd59872fb98133d13
Parents: 02ab748
Author: Wes McKinney <wes.mckinney@twosigma.com>
Authored: Mon Aug 7 22:08:12 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Mon Aug 7 22:08:12 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/builder.cc                    | 62 +++++++++++++++---------
 cpp/src/arrow/builder.h                     | 13 +++++
 cpp/src/arrow/python/builtin_convert.cc     | 42 +++++++++-------
 cpp/src/arrow/python/pandas_to_arrow.cc     | 50 ++++++++++++++++---
 cpp/src/arrow/type_fwd.h                    |  1 +
 cpp/src/arrow/type_traits.h                 |  1 +
 python/pyarrow/tests/test_convert_pandas.py | 15 ++++++
 7 files changed, 136 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index e3eda24..889c64d 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -177,6 +177,17 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
   length_ = new_length;
 }
 
+// ----------------------------------------------------------------------
+// Null builder
+
+Status NullBuilder::Finish(std::shared_ptr<Array>* out) {
+  *out = std::make_shared<NullArray>(length_);
+  length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+
 template <typename T>
 Status PrimitiveBuilder<T>::Init(int64_t capacity) {
   RETURN_NOT_OK(ArrayBuilder::Init(capacity));
@@ -1306,26 +1317,30 @@ Status StructBuilder::Finish(std::shared_ptr<Array>* out) {
 Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                    std::unique_ptr<ArrayBuilder>* out) {
   switch (type->id()) {
-    BUILDER_CASE(UINT8, UInt8Builder);
-    BUILDER_CASE(INT8, Int8Builder);
-    BUILDER_CASE(UINT16, UInt16Builder);
-    BUILDER_CASE(INT16, Int16Builder);
-    BUILDER_CASE(UINT32, UInt32Builder);
-    BUILDER_CASE(INT32, Int32Builder);
-    BUILDER_CASE(UINT64, UInt64Builder);
-    BUILDER_CASE(INT64, Int64Builder);
-    BUILDER_CASE(DATE32, Date32Builder);
-    BUILDER_CASE(DATE64, Date64Builder);
-    BUILDER_CASE(TIME32, Time32Builder);
-    BUILDER_CASE(TIME64, Time64Builder);
-    BUILDER_CASE(TIMESTAMP, TimestampBuilder);
-    BUILDER_CASE(BOOL, BooleanBuilder);
-    BUILDER_CASE(FLOAT, FloatBuilder);
-    BUILDER_CASE(DOUBLE, DoubleBuilder);
-    BUILDER_CASE(STRING, StringBuilder);
-    BUILDER_CASE(BINARY, BinaryBuilder);
-    BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
-    BUILDER_CASE(DECIMAL, DecimalBuilder);
+    case Type::NA: {
+      out->reset(new NullBuilder(pool));
+      return Status::OK();
+    }
+      BUILDER_CASE(UINT8, UInt8Builder);
+      BUILDER_CASE(INT8, Int8Builder);
+      BUILDER_CASE(UINT16, UInt16Builder);
+      BUILDER_CASE(INT16, Int16Builder);
+      BUILDER_CASE(UINT32, UInt32Builder);
+      BUILDER_CASE(INT32, Int32Builder);
+      BUILDER_CASE(UINT64, UInt64Builder);
+      BUILDER_CASE(INT64, Int64Builder);
+      BUILDER_CASE(DATE32, Date32Builder);
+      BUILDER_CASE(DATE64, Date64Builder);
+      BUILDER_CASE(TIME32, Time32Builder);
+      BUILDER_CASE(TIME64, Time64Builder);
+      BUILDER_CASE(TIMESTAMP, TimestampBuilder);
+      BUILDER_CASE(BOOL, BooleanBuilder);
+      BUILDER_CASE(FLOAT, FloatBuilder);
+      BUILDER_CASE(DOUBLE, DoubleBuilder);
+      BUILDER_CASE(STRING, StringBuilder);
+      BUILDER_CASE(BINARY, BinaryBuilder);
+      BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
+      BUILDER_CASE(DECIMAL, DecimalBuilder);
     case Type::LIST: {
       std::unique_ptr<ArrayBuilder> value_builder;
       std::shared_ptr<DataType> value_type =
@@ -1348,8 +1363,11 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
type,
       return Status::OK();
     }
 
-    default:
-      return Status::NotImplemented(type->ToString());
+    default: {
+      std::stringstream ss;
+      ss << "MakeBuilder: cannot construct builder for type " << type->ToString();
+      return Status::NotImplemented(ss.str());
+    }
   }
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index e441179..b15005f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -162,6 +162,19 @@ class ARROW_EXPORT ArrayBuilder {
   DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
 };
 
+class ARROW_EXPORT NullBuilder : public ArrayBuilder {
+ public:
+  explicit NullBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG) : ArrayBuilder(null(), pool) {}
+
+  Status AppendNull() {
+    ++null_count_;
+    ++length_;
+    return Status::OK();
+  }
+
+  Status Finish(std::shared_ptr<Array>* out) override;
+};
+
 template <typename Type>
 class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
  public:

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index b693b3e..ccaf280 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -155,7 +155,7 @@ static constexpr int MAX_NESTING_LEVELS = 32;
 // SeqVisitor is used to infer the type.
 class SeqVisitor {
  public:
-  SeqVisitor() : max_nesting_level_(0) {
+  SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) {
     memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
   }
 
@@ -217,24 +217,13 @@ class SeqVisitor {
       if (num_nesting_levels() > 1) {
         return Status::Invalid("Mixed nesting levels not supported");
         // If the nesting goes deeper than the deepest scalar
-      } else if (max_observed_level() < max_nesting_level_) {
+      } else if (max_observed_level_ < max_nesting_level_) {
         return Status::Invalid("Mixed nesting levels not supported");
       }
     }
     return Status::OK();
   }
 
-  // Returns the deepest level which has scalar elements.
-  int max_observed_level() const {
-    int result = 0;
-    for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
-      if (nesting_histogram_[i] > 0) {
-        result = i;
-      }
-    }
-    return result;
-  }
-
   // Returns the number of nesting levels which have scalar elements.
   int num_nesting_levels() const {
     int result = 0;
@@ -252,6 +241,8 @@ class SeqVisitor {
   // Track observed
   // Deapest nesting level (irregardless of scalars)
   int max_nesting_level_;
+  int max_observed_level_;
+
   // Number of scalar elements at each nesting level.
   // (TOOD: We really only need to know if a scalar is present, not the count).
   int nesting_histogram_[MAX_NESTING_LEVELS];
@@ -263,13 +254,15 @@ class SeqVisitor {
     } else if (PyDict_Check(item_ref.obj())) {
       return Status::NotImplemented("No type inference for dicts");
     } else {
-      // We permit nulls at any level of nesting
-      if (item_ref.obj() == Py_None) {
-        // TODO
-      } else {
+      // We permit nulls at any level of nesting, but they aren't treated like
+      // other scalar values as far as the checking for mixed nesting structure
+      if (item_ref.obj() != Py_None) {
         ++nesting_histogram_[level];
-        return scalars_.Visit(item_ref.obj());
       }
+      if (level > max_observed_level_) {
+        max_observed_level_ = level;
+      }
+      return scalars_.Visit(item_ref.obj());
     }
     return Status::OK();
   }
@@ -392,6 +385,17 @@ class TypedConverterVisitor : public TypedConverter<BuilderType>
{
   virtual Status AppendItem(const OwnedRef& item) = 0;
 };
 
+class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    if (item.obj() == Py_None) {
+      return typed_builder_->AppendNull();
+    } else {
+      return Status::Invalid("NullConverter: passed non-None value");
+    }
+  }
+};
+
 class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
@@ -616,6 +620,8 @@ class DecimalConverter
 // Dynamic constructor for sequence converters
 std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>&
type) {
   switch (type->id()) {
+    case Type::NA:
+      return std::make_shared<NullConverter>();
     case Type::BOOL:
       return std::make_shared<BoolConverter>();
     case Type::INT64:

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/python/pandas_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc
index 590be22..060fcb2 100644
--- a/cpp/src/arrow/python/pandas_to_arrow.cc
+++ b/cpp/src/arrow/python/pandas_to_arrow.cc
@@ -944,10 +944,6 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
     return Status::NotImplemented("mask not supported in object conversions yet");
   }
 
-  if (is_strided()) {
-    return Status::NotImplemented("strided arrays not implemented for lists");
-  }
-
   BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object) {
@@ -992,6 +988,47 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
 }
 
 template <>
+inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
+    const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list)
{
+  PyAcquireGIL lock;
+
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
+
+  auto foreach_item = [&](PyObject* object) {
+    if (PandasObjectIsNull(object)) {
+      return builder->AppendNull();
+    } else if (PyArray_Check(object)) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
+      RETURN_NOT_OK(builder->Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
+
+      for (int64_t i = 0; i < static_cast<int64_t>(PyArray_SIZE(numpy_array)); ++i)
{
+        RETURN_NOT_OK(value_builder->AppendNull());
+      }
+      return Status::OK();
+    } else if (PyList_Check(object)) {
+      RETURN_NOT_OK(builder->Append(true));
+      const Py_ssize_t size = PySequence_Size(object);
+      for (Py_ssize_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(value_builder->AppendNull());
+      }
+      return Status::OK();
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  };
+
+  return LoopPySequence(list, foreach_item);
+}
+
+template <>
 inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
     const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list)
{
   PyAcquireGIL lock;
@@ -1003,10 +1040,6 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
     return Status::NotImplemented("mask not supported in object conversions yet");
   }
 
-  if (is_strided()) {
-    return Status::NotImplemented("strided arrays not implemented for lists");
-  }
-
   auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
 
   auto foreach_item = [&](PyObject* object) {
@@ -1053,6 +1086,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
 Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type,
                                      ListBuilder* builder, PyObject* list) {
   switch (type->id()) {
+    LIST_CASE(NA, NPY_OBJECT, NullType)
     LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
     LIST_CASE(INT8, NPY_INT8, Int8Type)
     LIST_CASE(UINT16, NPY_UINT16, UInt16Type)

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/type_fwd.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 99c09bd..0d06b6f 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -42,6 +42,7 @@ class DictionaryArray;
 
 class NullType;
 class NullArray;
+class NullBuilder;
 
 class BooleanType;
 class BooleanArray;

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 973b0e1..f05eb56 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -31,6 +31,7 @@ struct TypeTraits {};
 template <>
 struct TypeTraits<NullType> {
   using ArrayType = NullArray;
+  using BuilderType = NullBuilder;
   constexpr static bool is_parameter_free = false;
 };
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 2a51d32..93058fb 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -534,6 +534,21 @@ class TestPandasConversion(unittest.TestCase):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_nested_lists_all_none(self):
+        data = np.array([[None, None], None], dtype=object)
+
+        arr = pa.Array.from_pandas(data)
+        expected = pa.array(list(data))
+        assert arr.equals(expected)
+        assert arr.type == pa.list_(pa.null())
+
+        data2 = np.array([None, None, [None, None],
+                          np.array([None, None], dtype=object)],
+                         dtype=object)
+        arr = pa.Array.from_pandas(data2)
+        expected = pa.array([None, None, [None, None], [None, None]])
+        assert arr.equals(expected)
+
     def test_threaded_conversion(self):
         df = _alltypes_example()
         self._check_pandas_roundtrip(df, nthreads=2,


Mime
View raw message