Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id D8FDD200CEE for ; Tue, 8 Aug 2017 04:08:20 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id D752D166960; Tue, 8 Aug 2017 02:08:20 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id CEBA816695D for ; Tue, 8 Aug 2017 04:08:19 +0200 (CEST) Received: (qmail 45121 invoked by uid 500); 8 Aug 2017 02:08:19 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 45112 invoked by uid 99); 8 Aug 2017 02:08:19 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Aug 2017 02:08:19 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 44B5DF3276; Tue, 8 Aug 2017 02:08:17 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@arrow.apache.org Message-Id: <9afae436d32f4a9b9bd2975c5d920b33@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: arrow git commit: ARROW-1309: [Python] Handle nested lists with all None values in Array.from_pandas Date: Tue, 8 Aug 2017 02:08:17 +0000 (UTC) archived-at: Tue, 08 Aug 2017 02:08:21 -0000 Repository: arrow Updated Branches: refs/heads/master 02ab74841 -> 66ab6b261 ARROW-1309: [Python] Handle nested lists with all None values in Array.from_pandas Author: Wes McKinney Closes #947 from wesm/ARROW-1309 and squashes the following commits: dc464922 [Wes McKinney] Expand test case to include ndarray 86039ec2 [Wes McKinney] Bugfix, add multiple nulls at start of array 08687cac [Wes McKinney] NullBuilder, scaffolding Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/66ab6b26 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/66ab6b26 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/66ab6b26 Branch: refs/heads/master Commit: 66ab6b2616260977ab9a29bdd59872fb98133d13 Parents: 02ab748 Author: Wes McKinney Authored: Mon Aug 7 22:08:12 2017 -0400 Committer: Wes McKinney Committed: Mon Aug 7 22:08:12 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/builder.cc | 62 +++++++++++++++--------- cpp/src/arrow/builder.h | 13 +++++ cpp/src/arrow/python/builtin_convert.cc | 42 +++++++++------- cpp/src/arrow/python/pandas_to_arrow.cc | 50 ++++++++++++++++--- cpp/src/arrow/type_fwd.h | 1 + cpp/src/arrow/type_traits.h | 1 + python/pyarrow/tests/test_convert_pandas.py | 15 ++++++ 7 files changed, 136 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index e3eda24..889c64d 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -177,6 +177,17 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { length_ = new_length; } +// ---------------------------------------------------------------------- +// Null builder + +Status NullBuilder::Finish(std::shared_ptr* out) { + *out = std::make_shared(length_); + length_ = null_count_ = 0; + return Status::OK(); +} + +// ---------------------------------------------------------------------- + template Status PrimitiveBuilder::Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -1306,26 +1317,30 @@ Status StructBuilder::Finish(std::shared_ptr* out) { Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { - BUILDER_CASE(UINT8, UInt8Builder); - BUILDER_CASE(INT8, Int8Builder); - BUILDER_CASE(UINT16, UInt16Builder); - BUILDER_CASE(INT16, Int16Builder); - BUILDER_CASE(UINT32, UInt32Builder); - BUILDER_CASE(INT32, Int32Builder); - BUILDER_CASE(UINT64, UInt64Builder); - BUILDER_CASE(INT64, Int64Builder); - BUILDER_CASE(DATE32, Date32Builder); - BUILDER_CASE(DATE64, Date64Builder); - BUILDER_CASE(TIME32, Time32Builder); - BUILDER_CASE(TIME64, Time64Builder); - BUILDER_CASE(TIMESTAMP, TimestampBuilder); - BUILDER_CASE(BOOL, BooleanBuilder); - BUILDER_CASE(FLOAT, FloatBuilder); - BUILDER_CASE(DOUBLE, DoubleBuilder); - BUILDER_CASE(STRING, StringBuilder); - BUILDER_CASE(BINARY, BinaryBuilder); - BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - BUILDER_CASE(DECIMAL, DecimalBuilder); + case Type::NA: { + out->reset(new NullBuilder(pool)); + return Status::OK(); + } + BUILDER_CASE(UINT8, UInt8Builder); + BUILDER_CASE(INT8, Int8Builder); + BUILDER_CASE(UINT16, UInt16Builder); + BUILDER_CASE(INT16, Int16Builder); + BUILDER_CASE(UINT32, UInt32Builder); + BUILDER_CASE(INT32, Int32Builder); + BUILDER_CASE(UINT64, UInt64Builder); + BUILDER_CASE(INT64, Int64Builder); + BUILDER_CASE(DATE32, Date32Builder); + BUILDER_CASE(DATE64, Date64Builder); + BUILDER_CASE(TIME32, Time32Builder); + BUILDER_CASE(TIME64, Time64Builder); + BUILDER_CASE(TIMESTAMP, TimestampBuilder); + BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(FLOAT, FloatBuilder); + BUILDER_CASE(DOUBLE, DoubleBuilder); + BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); + BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + BUILDER_CASE(DECIMAL, DecimalBuilder); case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -1348,8 +1363,11 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, return Status::OK(); } - default: - return Status::NotImplemented(type->ToString()); + default: { + std::stringstream ss; + ss << "MakeBuilder: cannot construct builder for type " << type->ToString(); + return Status::NotImplemented(ss.str()); + } } } http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index e441179..b15005f 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -162,6 +162,19 @@ class ARROW_EXPORT ArrayBuilder { DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* ARROW_MEMORY_POOL_ARG) : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Finish(std::shared_ptr* out) override; +}; + template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/python/builtin_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index b693b3e..ccaf280 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -155,7 +155,7 @@ static constexpr int MAX_NESTING_LEVELS = 32; // SeqVisitor is used to infer the type. class SeqVisitor { public: - SeqVisitor() : max_nesting_level_(0) { + SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) { memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); } @@ -217,24 +217,13 @@ class SeqVisitor { if (num_nesting_levels() > 1) { return Status::Invalid("Mixed nesting levels not supported"); // If the nesting goes deeper than the deepest scalar - } else if (max_observed_level() < max_nesting_level_) { + } else if (max_observed_level_ < max_nesting_level_) { return Status::Invalid("Mixed nesting levels not supported"); } } return Status::OK(); } - // Returns the deepest level which has scalar elements. - int max_observed_level() const { - int result = 0; - for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { - if (nesting_histogram_[i] > 0) { - result = i; - } - } - return result; - } - // Returns the number of nesting levels which have scalar elements. int num_nesting_levels() const { int result = 0; @@ -252,6 +241,8 @@ class SeqVisitor { // Track observed // Deapest nesting level (irregardless of scalars) int max_nesting_level_; + int max_observed_level_; + // Number of scalar elements at each nesting level. // (TOOD: We really only need to know if a scalar is present, not the count). int nesting_histogram_[MAX_NESTING_LEVELS]; @@ -263,13 +254,15 @@ class SeqVisitor { } else if (PyDict_Check(item_ref.obj())) { return Status::NotImplemented("No type inference for dicts"); } else { - // We permit nulls at any level of nesting - if (item_ref.obj() == Py_None) { - // TODO - } else { + // We permit nulls at any level of nesting, but they aren't treated like + // other scalar values as far as the checking for mixed nesting structure + if (item_ref.obj() != Py_None) { ++nesting_histogram_[level]; - return scalars_.Visit(item_ref.obj()); } + if (level > max_observed_level_) { + max_observed_level_ = level; + } + return scalars_.Visit(item_ref.obj()); } return Status::OK(); } @@ -392,6 +385,17 @@ class TypedConverterVisitor : public TypedConverter { virtual Status AppendItem(const OwnedRef& item) = 0; }; +class NullConverter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + if (item.obj() == Py_None) { + return typed_builder_->AppendNull(); + } else { + return Status::Invalid("NullConverter: passed non-None value"); + } + } +}; + class BoolConverter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { @@ -616,6 +620,8 @@ class DecimalConverter // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->id()) { + case Type::NA: + return std::make_shared(); case Type::BOOL: return std::make_shared(); case Type::INT64: http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/python/pandas_to_arrow.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc index 590be22..060fcb2 100644 --- a/cpp/src/arrow/python/pandas_to_arrow.cc +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -944,10 +944,6 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - BuilderT* value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -992,6 +988,47 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr } template <> +inline Status PandasConverter::ConvertTypedLists( + const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { + PyAcquireGIL lock; + + // TODO: mask not supported here + if (mask_ != nullptr) { + return Status::NotImplemented("mask not supported in object conversions yet"); + } + + auto value_builder = static_cast(builder->value_builder()); + + auto foreach_item = [&](PyObject* object) { + if (PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + for (int64_t i = 0; i < static_cast(PyArray_SIZE(numpy_array)); ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else if (PyList_Check(object)) { + RETURN_NOT_OK(builder->Append(true)); + const Py_ssize_t size = PySequence_Size(object); + for (Py_ssize_t i = 0; i < size; ++i) { + RETURN_NOT_OK(value_builder->AppendNull()); + } + return Status::OK(); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequence(list, foreach_item); +} + +template <> inline Status PandasConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { PyAcquireGIL lock; @@ -1003,10 +1040,6 @@ inline Status PandasConverter::ConvertTypedLists( return Status::NotImplemented("mask not supported in object conversions yet"); } - if (is_strided()) { - return Status::NotImplemented("strided arrays not implemented for lists"); - } - auto value_builder = static_cast(builder->value_builder()); auto foreach_item = [&](PyObject* object) { @@ -1053,6 +1086,7 @@ inline Status PandasConverter::ConvertTypedLists( Status PandasConverter::ConvertLists(const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { switch (type->id()) { + LIST_CASE(NA, NPY_OBJECT, NullType) LIST_CASE(UINT8, NPY_UINT8, UInt8Type) LIST_CASE(INT8, NPY_INT8, Int8Type) LIST_CASE(UINT16, NPY_UINT16, UInt16Type) http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/type_fwd.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 99c09bd..0d06b6f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -42,6 +42,7 @@ class DictionaryArray; class NullType; class NullArray; +class NullBuilder; class BooleanType; class BooleanArray; http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/cpp/src/arrow/type_traits.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 973b0e1..f05eb56 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -31,6 +31,7 @@ struct TypeTraits {}; template <> struct TypeTraits { using ArrayType = NullArray; + using BuilderType = NullBuilder; constexpr static bool is_parameter_free = false; }; http://git-wip-us.apache.org/repos/asf/arrow/blob/66ab6b26/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 2a51d32..93058fb 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -534,6 +534,21 @@ class TestPandasConversion(unittest.TestCase): field = schema.field_by_name(column) self._check_array_roundtrip(df[column], type=field.type) + def test_nested_lists_all_none(self): + data = np.array([[None, None], None], dtype=object) + + arr = pa.Array.from_pandas(data) + expected = pa.array(list(data)) + assert arr.equals(expected) + assert arr.type == pa.list_(pa.null()) + + data2 = np.array([None, None, [None, None], + np.array([None, None], dtype=object)], + dtype=object) + arr = pa.Array.from_pandas(data2) + expected = pa.array([None, None, [None, None], [None, None]]) + assert arr.equals(expected) + def test_threaded_conversion(self): df = _alltypes_example() self._check_pandas_roundtrip(df, nthreads=2,