Return-Path: X-Original-To: apmail-arrow-commits-archive@minotaur.apache.org Delivered-To: apmail-arrow-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id BA2EB184ED for ; Fri, 25 Mar 2016 23:52:21 +0000 (UTC) Received: (qmail 97806 invoked by uid 500); 25 Mar 2016 23:52:21 -0000 Delivered-To: apmail-arrow-commits-archive@arrow.apache.org Received: (qmail 97790 invoked by uid 500); 25 Mar 2016 23:52:21 -0000 Mailing-List: contact commits-help@arrow.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@arrow.apache.org Delivered-To: mailing list commits@arrow.apache.org Received: (qmail 97780 invoked by uid 99); 25 Mar 2016 23:52:21 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 25 Mar 2016 23:52:21 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 07786DFBA3; Fri, 25 Mar 2016 23:52:21 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: wesm@apache.org To: commits@arrow.apache.org Message-Id: <212efaf0084846e58c2194bae02b0976@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: arrow git commit: ARROW-37: [C++ / Python] Implement BooleanArray and BooleanBuilder. Handle Python built-in bool Date: Fri, 25 Mar 2016 23:52:21 +0000 (UTC) Repository: arrow Updated Branches: refs/heads/master c06b7654b -> 0a8979d3a ARROW-37: [C++ / Python] Implement BooleanArray and BooleanBuilder. Handle Python built-in bool This patch implements `arrow::Type::BOOL` in bit-packed form. ARROW-59 (Support for boolean in Python lists) is also addressed here. Author: Wes McKinney Closes #40 from wesm/ARROW-37 and squashes the following commits: f6b60f1 [Wes McKinney] Clean up template instantiations to fix clang 2d3b855 [Wes McKinney] Use gcc-4.9 to build thirdparty, too 74f704b [Wes McKinney] Try to fix clang issues, make some PrimitiveBuilder methods protected dc8d0a4 [Wes McKinney] Fix up pyarrow per C++ API changes. Add boolean builtin conversion / API support 2299a6c [Wes McKinney] Install logging.h 2dae07e [Wes McKinney] cpplint 0f55344 [Wes McKinney] Initialize memory to 0 after PoolBuffer::Resize to avoid boolean bit setting issues 83527a9 [Wes McKinney] Draft BooleanBuilder and arrange to share common code between normal numeric builders and boolean builder Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/0a8979d3 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/0a8979d3 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/0a8979d3 Branch: refs/heads/master Commit: 0a8979d3ab80374eb4f84d08e060bcec70174c73 Parents: c06b765 Author: Wes McKinney Authored: Fri Mar 25 16:52:11 2016 -0700 Committer: Wes McKinney Committed: Fri Mar 25 16:52:11 2016 -0700 ---------------------------------------------------------------------- .travis.yml | 4 +- cpp/src/arrow/api.h | 1 - cpp/src/arrow/array-test.cc | 5 +- cpp/src/arrow/builder.cc | 7 + cpp/src/arrow/builder.h | 2 + cpp/src/arrow/test-util.h | 31 ++- cpp/src/arrow/type.h | 15 +- cpp/src/arrow/types/CMakeLists.txt | 1 - cpp/src/arrow/types/boolean.h | 32 --- cpp/src/arrow/types/construct.cc | 3 +- cpp/src/arrow/types/list-test.cc | 5 +- cpp/src/arrow/types/list.h | 11 +- cpp/src/arrow/types/primitive-test.cc | 214 ++++++++++----- cpp/src/arrow/types/primitive.cc | 179 ++++++++++++- cpp/src/arrow/types/primitive.h | 302 ++++++++++++++-------- cpp/src/arrow/types/string-test.cc | 2 +- cpp/src/arrow/util/CMakeLists.txt | 3 +- cpp/src/arrow/util/bit-util.cc | 14 +- cpp/src/arrow/util/bit-util.h | 13 +- python/pyarrow/includes/libarrow.pxd | 3 + python/pyarrow/scalar.pyx | 6 +- python/pyarrow/tests/test_convert_builtin.py | 5 +- python/pyarrow/tests/test_scalars.py | 39 +-- python/src/pyarrow/adapters/builtin.cc | 26 +- 24 files changed, 643 insertions(+), 280 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index 49a956e..d89a200 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,10 +20,10 @@ matrix: language: cpp os: linux before_script: - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh - script: - export CC="gcc-4.9" - export CXX="g++-4.9" + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh - compiler: clang http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/api.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 7be7f88..2ae80f6 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -27,7 +27,6 @@ #include "arrow/table.h" #include "arrow/type.h" -#include "arrow/types/boolean.h" #include "arrow/types/construct.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 7c6eaf5..121b802 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -71,8 +71,7 @@ TEST_F(TestArray, TestIsNull) { if (x == 0) ++null_count; } - std::shared_ptr null_buf = test::bytes_to_null_buffer(null_bitmap.data(), - null_bitmap.size()); + std::shared_ptr null_buf = test::bytes_to_null_buffer(null_bitmap); std::unique_ptr arr; arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf)); @@ -82,7 +81,7 @@ TEST_F(TestArray, TestIsNull) { ASSERT_TRUE(arr->null_bitmap()->Equals(*null_buf.get())); for (size_t i = 0; i < null_bitmap.size(); ++i) { - EXPECT_EQ(static_cast(null_bitmap[i]), !arr->IsNull(i)) << i; + EXPECT_EQ(null_bitmap[i], !arr->IsNull(i)) << i; } } http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 6a62dc3..4061f35 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -54,5 +54,12 @@ Status ArrayBuilder::Advance(int32_t elements) { return Status::OK(); } +Status ArrayBuilder::Reserve(int32_t elements) { + if (length_ + elements > capacity_) { + int32_t new_capacity = util::next_power2(length_ + elements); + return Resize(new_capacity); + } + return Status::OK(); +} } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 308e54c..d1a49dc 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -69,6 +69,8 @@ class ArrayBuilder { // Resizes the null_bitmap array Status Resize(int32_t new_bits); + Status Reserve(int32_t extra_bits); + // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. It is your responsibility to use // this function responsibly. http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index ea3ce5f..b2bce26 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -99,27 +99,26 @@ void randint(int64_t N, T lower, T upper, std::vector* out) { template +void random_real(int n, uint32_t seed, T min_value, T max_value, + std::vector* out) { + std::mt19937 gen(seed); + std::uniform_real_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out->push_back(d(gen)); + } +} + + +template std::shared_ptr to_buffer(const std::vector& values) { return std::make_shared(reinterpret_cast(values.data()), values.size() * sizeof(T)); } -void random_null_bitmap(int64_t n, double pct_null, std::vector* null_bitmap) { - Random rng(random_seed()); - for (int i = 0; i < n; ++i) { - if (rng.NextDoubleFraction() > pct_null) { - null_bitmap->push_back(1); - } else { - // null - null_bitmap->push_back(0); - } - } -} - -void random_null_bitmap(int64_t n, double pct_null, std::vector* null_bitmap) { +void random_null_bitmap(int64_t n, double pct_null, uint8_t* null_bitmap) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - null_bitmap->push_back(rng.NextDoubleFraction() > pct_null); + null_bitmap[i] = rng.NextDoubleFraction() > pct_null; } } @@ -160,11 +159,11 @@ static inline int null_count(const std::vector& valid_bytes) { return result; } -std::shared_ptr bytes_to_null_buffer(uint8_t* bytes, int length) { +std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) { std::shared_ptr out; // TODO(wesm): error checking - util::bytes_to_bits(bytes, length, &out); + util::bytes_to_bits(bytes, &out); return out; } http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 5984b67..86e4779 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -132,6 +132,10 @@ struct DataType { return children_.size(); } + virtual int value_size() const { + return -1; + } + virtual std::string ToString() const = 0; }; @@ -191,11 +195,14 @@ inline std::string PrimitiveType::ToString() const { #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ typedef C_TYPE c_type; \ static constexpr Type::type type_enum = Type::ENUM; \ - static constexpr int size = SIZE; \ \ TYPENAME() \ : PrimitiveType() {} \ \ + virtual int value_size() const { \ + return SIZE; \ + } \ + \ static const char* name() { \ return NAME; \ } @@ -295,6 +302,12 @@ struct StructType : public DataType { std::string ToString() const override; }; +// These will be defined elsewhere +template +struct type_traits { +}; + + } // namespace arrow #endif // ARROW_TYPE_H http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 595b3be..f3e4128 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -21,7 +21,6 @@ # Headers: top level install(FILES - boolean.h collection.h construct.h datetime.h http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/boolean.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h deleted file mode 100644 index 1cb91f9..0000000 --- a/cpp/src/arrow/types/boolean.h +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_BOOLEAN_H -#define ARROW_TYPES_BOOLEAN_H - -#include "arrow/types/primitive.h" - -namespace arrow { - -// typedef PrimitiveArrayImpl BooleanArray; - -class BooleanBuilder : public ArrayBuilder { -}; - -} // namespace arrow - -#endif // ARROW_TYPES_BOOLEAN_H http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/construct.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index df2317c..34647a5 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -51,7 +51,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(UINT64, UInt64Builder); BUILDER_CASE(INT64, Int64Builder); - // BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(BOOL, BooleanBuilder); BUILDER_CASE(FLOAT, FloatBuilder); BUILDER_CASE(DOUBLE, DoubleBuilder); @@ -83,6 +83,7 @@ Status MakePrimitiveArray(const std::shared_ptr& type, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out) { switch (type->type) { + MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array); http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/list-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index eb55ca8..4eb560e 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -116,11 +116,14 @@ TEST_F(TestListBuilder, TestBasics) { Int32Builder* vb = static_cast(builder_->value_builder().get()); + EXPECT_OK(builder_->Reserve(lengths.size())); + EXPECT_OK(vb->Reserve(values.size())); + int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { ASSERT_OK(builder_->Append(is_null[i] > 0)); for (int j = 0; j < lengths[i]; ++j) { - ASSERT_OK(vb->Append(values[pos++])); + vb->Append(values[pos++]); } } http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/list.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 72e20e9..8073b51 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -116,7 +116,8 @@ class ListBuilder : public Int32Builder { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } - memcpy(raw_buffer() + length_, values, length * elsize_); + memcpy(raw_data_ + length_, values, + type_traits::bytes_required(length)); if (valid_bytes != nullptr) { AppendNulls(valid_bytes, length); @@ -132,13 +133,13 @@ class ListBuilder : public Int32Builder { // Add final offset if the length is non-zero if (length_) { - raw_buffer()[length_] = items->length(); + raw_data_[length_] = items->length(); } - auto result = std::make_shared(type_, length_, values_, items, + auto result = std::make_shared(type_, length_, data_, items, null_count_, null_bitmap_); - values_ = null_bitmap_ = nullptr; + data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; @@ -162,7 +163,7 @@ class ListBuilder : public Int32Builder { } else { util::set_bit(null_bitmap_data_, length_); } - raw_buffer()[length_++] = value_builder_->length(); + raw_data_[length_++] = value_builder_->length(); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 10ba113..761845d 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -69,11 +69,11 @@ PRIMITIVE_TEST(BooleanType, BOOL, "bool"); // ---------------------------------------------------------------------- // Primitive type tests -TEST_F(TestBuilder, TestResize) { +TEST_F(TestBuilder, TestReserve) { builder_->Init(10); ASSERT_EQ(2, builder_->null_bitmap()->size()); - builder_->Resize(30); + builder_->Reserve(30); ASSERT_EQ(4, builder_->null_bitmap()->size()); } @@ -83,8 +83,9 @@ class TestPrimitiveBuilder : public TestBuilder { typedef typename Attrs::ArrayType ArrayType; typedef typename Attrs::BuilderType BuilderType; typedef typename Attrs::T T; + typedef typename Attrs::Type Type; - void SetUp() { + virtual void SetUp() { TestBuilder::SetUp(); type_ = Attrs::type(); @@ -99,58 +100,44 @@ class TestPrimitiveBuilder : public TestBuilder { void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); - test::random_null_bitmap(N, pct_null, &valid_bytes_); + + valid_bytes_.resize(N); + test::random_null_bitmap(N, pct_null, valid_bytes_.data()); } - void CheckNullable() { - int size = builder_->length(); + void Check(const std::shared_ptr& builder, bool nullable) { + int size = builder->length(); - auto ex_data = std::make_shared( - reinterpret_cast(draws_.data()), + auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), size * sizeof(T)); - auto ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_.data(), size); - int32_t ex_null_count = test::null_count(valid_bytes_); + std::shared_ptr ex_null_bitmap; + int32_t ex_null_count = 0; + + if (nullable) { + ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_); + ex_null_count = test::null_count(valid_bytes_); + } else { + ex_null_bitmap = nullptr; + } auto expected = std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr result = std::dynamic_pointer_cast( - builder_->Finish()); + builder->Finish()); // Builder is now reset - ASSERT_EQ(0, builder_->length()); - ASSERT_EQ(0, builder_->capacity()); - ASSERT_EQ(0, builder_->null_count()); - ASSERT_EQ(nullptr, builder_->buffer()); + ASSERT_EQ(0, builder->length()); + ASSERT_EQ(0, builder->capacity()); + ASSERT_EQ(0, builder->null_count()); + ASSERT_EQ(nullptr, builder->data()); ASSERT_EQ(ex_null_count, result->null_count()); ASSERT_TRUE(result->EqualsExact(*expected.get())); } - void CheckNonNullable() { - int size = builder_nn_->length(); - - auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), - size * sizeof(T)); - - auto expected = std::make_shared(size, ex_data); - - std::shared_ptr result = std::dynamic_pointer_cast( - builder_nn_->Finish()); - - // Builder is now reset - ASSERT_EQ(0, builder_nn_->length()); - ASSERT_EQ(0, builder_nn_->capacity()); - ASSERT_EQ(nullptr, builder_nn_->buffer()); - - ASSERT_TRUE(result->EqualsExact(*expected.get())); - ASSERT_EQ(0, result->null_count()); - } - protected: - TypePtr type_; - TypePtr type_nn_; + std::shared_ptr type_; shared_ptr builder_; shared_ptr builder_nn_; @@ -158,14 +145,14 @@ class TestPrimitiveBuilder : public TestBuilder { vector valid_bytes_; }; -#define PTYPE_DECL(CapType, c_type) \ - typedef CapType##Array ArrayType; \ - typedef CapType##Builder BuilderType; \ - typedef CapType##Type Type; \ - typedef c_type T; \ - \ - static TypePtr type() { \ - return TypePtr(new Type()); \ +#define PTYPE_DECL(CapType, c_type) \ + typedef CapType##Array ArrayType; \ + typedef CapType##Builder BuilderType; \ + typedef CapType##Type Type; \ + typedef c_type T; \ + \ + static std::shared_ptr type() { \ + return std::shared_ptr(new Type()); \ } #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ @@ -176,6 +163,14 @@ class TestPrimitiveBuilder : public TestBuilder { } \ } +#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int N, vector* draws) { \ + test::random_real(N, 0, LOWER, UPPER, draws); \ + } \ + } + PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX); PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX); PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX); @@ -186,25 +181,89 @@ PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX); PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX); PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX); -typedef ::testing::Types Primitives; +PFLOAT_DECL(Float, float, -1000, 1000); +PFLOAT_DECL(Double, double, -1000, 1000); + +struct PBoolean { + PTYPE_DECL(Boolean, uint8_t); +}; + +template <> +void TestPrimitiveBuilder::RandomData(int N, double pct_null) { + draws_.resize(N); + valid_bytes_.resize(N); + + test::random_null_bitmap(N, 0.5, draws_.data()); + test::random_null_bitmap(N, pct_null, valid_bytes_.data()); +} + +template <> +void TestPrimitiveBuilder::Check( + const std::shared_ptr& builder, bool nullable) { + int size = builder->length(); + + auto ex_data = test::bytes_to_null_buffer(draws_); + + std::shared_ptr ex_null_bitmap; + int32_t ex_null_count = 0; + + if (nullable) { + ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_); + ex_null_count = test::null_count(valid_bytes_); + } else { + ex_null_bitmap = nullptr; + } + + auto expected = std::make_shared(size, ex_data, ex_null_count, + ex_null_bitmap); + std::shared_ptr result = std::dynamic_pointer_cast( + builder->Finish()); + + // Builder is now reset + ASSERT_EQ(0, builder->length()); + ASSERT_EQ(0, builder->capacity()); + ASSERT_EQ(0, builder->null_count()); + ASSERT_EQ(nullptr, builder->data()); + + ASSERT_EQ(ex_null_count, result->null_count()); + + ASSERT_EQ(expected->length(), result->length()); + + for (int i = 0; i < result->length(); ++i) { + if (nullable) { + ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; + } + bool actual = util::get_bit(result->raw_data(), i); + ASSERT_EQ(static_cast(draws_[i]), actual) << i; + } + ASSERT_TRUE(result->EqualsExact(*expected.get())); +} + +typedef ::testing::Types Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); #define DECL_T() \ typedef typename TestFixture::T T; +#define DECL_TYPE() \ + typedef typename TestFixture::Type Type; + #define DECL_ARRAYTYPE() \ typedef typename TestFixture::ArrayType ArrayType; TYPED_TEST(TestPrimitiveBuilder, TestInit) { - DECL_T(); + DECL_TYPE(); int n = 1000; - ASSERT_OK(this->builder_->Init(n)); - ASSERT_EQ(n, this->builder_->capacity()); - ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size()); + ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_EQ(util::next_power2(n), this->builder_->capacity()); + ASSERT_EQ(util::next_power2(type_traits::bytes_required(n)), + this->builder_->data()->size()); // unsure if this should go in all builder classes ASSERT_EQ(0, this->builder_->num_children()); @@ -235,12 +294,14 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { this->RandomData(size); + this->builder_->Reserve(size); + int i; for (i = 0; i < size; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); } } @@ -261,31 +322,41 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { this->RandomData(size); + this->builder_->Reserve(1000); + this->builder_nn_->Reserve(1000); + int i; + int null_count = 0; // Append the first 1000 for (i = 0; i < 1000; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); + ++null_count; } - ASSERT_OK(this->builder_nn_->Append(draws[i])); + this->builder_nn_->Append(draws[i]); } + ASSERT_EQ(null_count, this->builder_->null_count()); + ASSERT_EQ(1000, this->builder_->length()); ASSERT_EQ(1024, this->builder_->capacity()); ASSERT_EQ(1000, this->builder_nn_->length()); ASSERT_EQ(1024, this->builder_nn_->capacity()); + this->builder_->Reserve(size - 1000); + this->builder_nn_->Reserve(size - 1000); + // Append the next 9000 for (i = 1000; i < size; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); } - ASSERT_OK(this->builder_nn_->Append(draws[i])); + this->builder_nn_->Append(draws[i]); } ASSERT_EQ(size, this->builder_->length()); @@ -294,8 +365,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { ASSERT_EQ(size, this->builder_nn_->length()); ASSERT_EQ(util::next_power2(size), this->builder_nn_->capacity()); - this->CheckNullable(); - this->CheckNonNullable(); + this->Check(this->builder_, true); + this->Check(this->builder_nn_, false); } @@ -327,31 +398,34 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { ASSERT_EQ(size, this->builder_->length()); ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); - this->CheckNullable(); - this->CheckNonNullable(); + this->Check(this->builder_, true); + this->Check(this->builder_nn_, false); } TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { int n = 1000; - ASSERT_OK(this->builder_->Init(n)); + ASSERT_OK(this->builder_->Reserve(n)); ASSERT_OK(this->builder_->Advance(100)); ASSERT_EQ(100, this->builder_->length()); ASSERT_OK(this->builder_->Advance(900)); - ASSERT_RAISES(Invalid, this->builder_->Advance(1)); + + int too_many = this->builder_->capacity() - 1000 + 1; + ASSERT_RAISES(Invalid, this->builder_->Advance(too_many)); } TYPED_TEST(TestPrimitiveBuilder, TestResize) { - DECL_T(); + DECL_TYPE(); int cap = MIN_BUILDER_CAPACITY * 2; - ASSERT_OK(this->builder_->Resize(cap)); + ASSERT_OK(this->builder_->Reserve(cap)); ASSERT_EQ(cap, this->builder_->capacity()); - ASSERT_EQ(cap * sizeof(T), this->builder_->buffer()->size()); - ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->null_bitmap()->size()); + ASSERT_EQ(type_traits::bytes_required(cap), this->builder_->data()->size()); + ASSERT_EQ(util::bytes_for_bits(cap), + this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/primitive.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index ecd5d68..c54d075 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -20,20 +20,20 @@ #include #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, int value_size, +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) : Array(type, length, null_count, null_bitmap) { data_ = data; raw_data_ = data == nullptr? nullptr : data_->data(); - value_size_ = value_size; } bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { @@ -52,12 +52,15 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; + int value_size = type_->value_size(); + DCHECK_GT(value_size, 0); + for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_size_)) { + if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; } - this_data += value_size_; - other_data += value_size_; + this_data += value_size; + other_data += value_size; } return true; } else { @@ -73,4 +76,170 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +template +Status PrimitiveBuilder::Init(int32_t capacity) { + RETURN_NOT_OK(ArrayBuilder::Init(capacity)); + data_ = std::make_shared(pool_); + + int64_t nbytes = type_traits::bytes_required(capacity); + RETURN_NOT_OK(data_->Resize(nbytes)); + memset(data_->mutable_data(), 0, nbytes); + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return Status::OK(); +} + +template +Status PrimitiveBuilder::Resize(int32_t capacity) { + // XXX: Set floor size for now + if (capacity < MIN_BUILDER_CAPACITY) { + capacity = MIN_BUILDER_CAPACITY; + } + + if (capacity_ == 0) { + RETURN_NOT_OK(Init(capacity)); + } else { + RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); + + int64_t old_bytes = data_->size(); + int64_t new_bytes = type_traits::bytes_required(capacity); + RETURN_NOT_OK(data_->Resize(new_bytes)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } + capacity_ = capacity; + return Status::OK(); +} + +template +Status PrimitiveBuilder::Reserve(int32_t elements) { + if (length_ + elements > capacity_) { + int32_t new_capacity = util::next_power2(length_ + elements); + return Resize(new_capacity); + } + return Status::OK(); +} + +template +Status PrimitiveBuilder::Append(const value_type* values, int32_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(PrimitiveBuilder::Reserve(length)); + + if (length > 0) { + memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); + } + + if (valid_bytes != nullptr) { + PrimitiveBuilder::AppendNulls(valid_bytes, length); + } else { + for (int i = 0; i < length; ++i) { + util::set_bit(null_bitmap_data_, length_ + i); + } + } + + length_ += length; + return Status::OK(); +} + +template +void PrimitiveBuilder::AppendNulls(const uint8_t* valid_bytes, int32_t length) { + // If valid_bytes is all not null, then none of the values are null + for (int i = 0; i < length; ++i) { + if (valid_bytes[i] == 0) { + ++null_count_; + } else { + util::set_bit(null_bitmap_data_, length_ + i); + } + } +} + +template +std::shared_ptr PrimitiveBuilder::Finish() { + std::shared_ptr result = std::make_shared< + typename type_traits::ArrayType>( + type_, length_, data_, null_count_, null_bitmap_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return result; +} + +template <> +Status PrimitiveBuilder::Append(const uint8_t* values, int32_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + for (int i = 0; i < length; ++i) { + if (values[i] > 0) { + util::set_bit(raw_data_, length_ + i); + } else { + util::clear_bit(raw_data_, length_ + i); + } + } + + if (valid_bytes != nullptr) { + PrimitiveBuilder::AppendNulls(valid_bytes, length); + } else { + for (int i = 0; i < length; ++i) { + util::set_bit(null_bitmap_data_, length_ + i); + } + } + length_ += length; + return Status::OK(); +} + +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + +BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, + int32_t null_count, + const std::shared_ptr& null_bitmap) : + PrimitiveArray(std::make_shared(), length, + data, null_count, null_bitmap) {} + +bool BooleanArray::EqualsExact(const BooleanArray& other) const { + if (this == &other) return true; + if (null_count_ != other.null_count_) { + return false; + } + + if (null_count_ > 0) { + bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_, + util::bytes_for_bits(length_)); + if (!equal_bitmap) { + return false; + } + + const uint8_t* this_data = raw_data_; + const uint8_t* other_data = other.raw_data_; + + for (int i = 0; i < length_; ++i) { + if (!IsNull(i) && util::get_bit(this_data, i) != util::get_bit(other_data, i)) { + return false; + } + } + return true; + } else { + return data_->Equals(*other.data_, util::bytes_for_bits(length_)); + } +} + +bool BooleanArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (Type::BOOL != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/primitive.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 4eaff43..ec6fee3 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/builder.h" @@ -37,7 +38,7 @@ class MemoryPool; // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray(const TypePtr& type, int32_t length, int value_size, + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -51,25 +52,19 @@ class PrimitiveArray : public Array { protected: std::shared_ptr data_; const uint8_t* raw_data_; - int value_size_; }; #define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ class NAME : public PrimitiveArray { \ public: \ using value_type = T; \ - NAME(const TypePtr& type, int32_t length, \ - const std::shared_ptr& data, \ - int32_t null_count = 0, \ - const std::shared_ptr& null_bitmap = nullptr) : \ - PrimitiveArray(std::make_shared(), length, \ - sizeof(T), data, null_count, null_bitmap) {} \ + using PrimitiveArray::PrimitiveArray; \ \ NAME(int32_t length, const std::shared_ptr& data, \ int32_t null_count = 0, \ const std::shared_ptr& null_bitmap = nullptr) : \ PrimitiveArray(std::make_shared(), length, \ - sizeof(T), data, null_count, null_bitmap) {} \ + data, null_count, null_bitmap) {} \ \ bool EqualsExact(const NAME& other) const { \ return PrimitiveArray::EqualsExact( \ @@ -96,148 +91,241 @@ NUMERIC_ARRAY_DECL(Int64Array, Int64Type, int64_t); NUMERIC_ARRAY_DECL(FloatArray, FloatType, float); NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double); -template +template class PrimitiveBuilder : public ArrayBuilder { public: typedef typename Type::c_type value_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), - values_(nullptr) { - elsize_ = sizeof(value_type); - } + data_(nullptr) {} virtual ~PrimitiveBuilder() {} - Status Resize(int32_t capacity) { - // XXX: Set floor size for now - if (capacity < MIN_BUILDER_CAPACITY) { - capacity = MIN_BUILDER_CAPACITY; - } - - if (capacity_ == 0) { - RETURN_NOT_OK(Init(capacity)); - } else { - RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); - RETURN_NOT_OK(values_->Resize(capacity * elsize_)); - } - capacity_ = capacity; - return Status::OK(); - } - - Status Init(int32_t capacity) { - RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared(pool_); - return values_->Resize(capacity * elsize_); - } - - Status Reserve(int32_t elements) { - if (length_ + elements > capacity_) { - int32_t new_capacity = util::next_power2(length_ + elements); - return Resize(new_capacity); - } - return Status::OK(); - } + using ArrayBuilder::Advance; - Status Advance(int32_t elements) { - return ArrayBuilder::Advance(elements); - } + // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + void AppendNulls(const uint8_t* valid_bytes, int32_t length); - // Scalar append - Status Append(value_type val) { + Status AppendNull() { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - util::set_bit(null_bitmap_data_, length_); - raw_buffer()[length_++] = val; + ++null_count_; + ++length_; return Status::OK(); } + std::shared_ptr data() const { + return data_; + } + // Vector append // // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot Status Append(const value_type* values, int32_t length, - const uint8_t* valid_bytes = nullptr) { - if (length_ + length > capacity_) { - int32_t new_capacity = util::next_power2(length_ + length); - RETURN_NOT_OK(Resize(new_capacity)); - } - if (length > 0) { - memcpy(raw_buffer() + length_, values, length * elsize_); - } + const uint8_t* valid_bytes = nullptr); - if (valid_bytes != nullptr) { - AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + // Ensure that builder can accommodate an additional number of + // elements. Resizes if the current capacity is not sufficient + Status Reserve(int32_t elements); - length_ += length; - return Status::OK(); + std::shared_ptr Finish() override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; + + Status Init(int32_t capacity); + + // Increase the capacity of the builder to accommodate at least the indicated + // number of elements + Status Resize(int32_t capacity); +}; + +template +class NumericBuilder : public PrimitiveBuilder { + public: + using typename PrimitiveBuilder::value_type; + using PrimitiveBuilder::PrimitiveBuilder; + + using PrimitiveBuilder::Append; + + // Scalar append. Does not capacity-check; make sure to call Reserve beforehand + void Append(value_type val) { + util::set_bit(null_bitmap_data_, length_); + raw_data_[length_++] = val; } - // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length) { - // If valid_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - if (valid_bytes[i] == 0) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + protected: + using PrimitiveBuilder::length_; + using PrimitiveBuilder::null_bitmap_data_; + using PrimitiveBuilder::raw_data_; + + using PrimitiveBuilder::Init; + using PrimitiveBuilder::Resize; +}; + +template <> +struct type_traits { + typedef UInt8Array ArrayType; + + static inline int bytes_required(int elements) { + return elements; } +}; - Status AppendNull() { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - ++null_count_; - ++length_; - return Status::OK(); +template <> +struct type_traits { + typedef Int8Array ArrayType; + + static inline int bytes_required(int elements) { + return elements; } +}; - std::shared_ptr Finish() override { - std::shared_ptr result = std::make_shared( - type_, length_, values_, null_count_, null_bitmap_); +template <> +struct type_traits { + typedef UInt16Array ArrayType; - values_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return result; + static inline int bytes_required(int elements) { + return elements * sizeof(uint16_t); } +}; + +template <> +struct type_traits { + typedef Int16Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int16_t); + } +}; - value_type* raw_buffer() { - return reinterpret_cast(values_->mutable_data()); +template <> +struct type_traits { + typedef UInt32Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(uint32_t); } +}; - std::shared_ptr buffer() const { - return values_; +template <> +struct type_traits { + typedef Int32Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int32_t); } +}; - protected: - std::shared_ptr values_; - int elsize_; +template <> +struct type_traits { + typedef UInt64Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(uint64_t); + } +}; + +template <> +struct type_traits { + typedef Int64Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int64_t); + } +}; +template <> +struct type_traits { + typedef FloatArray ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(float); + } +}; + +template <> +struct type_traits { + typedef DoubleArray ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(double); + } }; // Builders -typedef PrimitiveBuilder UInt8Builder; -typedef PrimitiveBuilder UInt16Builder; -typedef PrimitiveBuilder UInt32Builder; -typedef PrimitiveBuilder UInt64Builder; +typedef NumericBuilder UInt8Builder; +typedef NumericBuilder UInt16Builder; +typedef NumericBuilder UInt32Builder; +typedef NumericBuilder UInt64Builder; + +typedef NumericBuilder Int8Builder; +typedef NumericBuilder Int16Builder; +typedef NumericBuilder Int32Builder; +typedef NumericBuilder Int64Builder; + +typedef NumericBuilder FloatBuilder; +typedef NumericBuilder DoubleBuilder; + -typedef PrimitiveBuilder Int8Builder; -typedef PrimitiveBuilder Int16Builder; -typedef PrimitiveBuilder Int32Builder; -typedef PrimitiveBuilder Int64Builder; +class BooleanArray : public PrimitiveArray { + public: + using PrimitiveArray::PrimitiveArray; + + BooleanArray(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr); + + bool EqualsExact(const BooleanArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + + const uint8_t* raw_data() const { + return reinterpret_cast(raw_data_); + } + + bool Value(int i) const { + return util::get_bit(raw_data(), i); + } +}; + +template <> +struct type_traits { + typedef BooleanArray ArrayType; + + static inline int bytes_required(int elements) { + return util::bytes_for_bits(elements); + } +}; + +class BooleanBuilder : public PrimitiveBuilder { + public: + explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) : + PrimitiveBuilder(pool, type) {} + + virtual ~BooleanBuilder() {} + + using PrimitiveBuilder::Append; + + // Scalar append + void Append(bool val) { + util::set_bit(null_bitmap_data_, length_); + if (val) { + util::set_bit(raw_data_, length_); + } else { + util::clear_bit(raw_data_, length_); + } + ++length_; + } -typedef PrimitiveBuilder FloatBuilder; -typedef PrimitiveBuilder DoubleBuilder; + void Append(uint8_t val) { + Append(static_cast(val)); + } +}; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/types/string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index b329b4f..d3a4cc3 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -92,7 +92,7 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = test::to_buffer(offsets_); - null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_.data(), valid_bytes_.size()); + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared(length_, offsets_buf_, values_, http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index fed05e3..d2a4b09 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -23,6 +23,7 @@ install(FILES bit-util.h buffer.h + logging.h macros.h memory-pool.h status.h @@ -59,7 +60,7 @@ if (ARROW_BUILD_BENCHMARKS) ) else() target_link_libraries(arrow_benchmark_main - benchmark + benchmark pthread ) endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/util/bit-util.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 292cb33..6c6d533 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -23,25 +24,24 @@ namespace arrow { -void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { - for (int i = 0; i < length; ++i) { - if (static_cast(bytes[i])) { +void util::bytes_to_bits(const std::vector& bytes, uint8_t* bits) { + for (size_t i = 0; i < bytes.size(); ++i) { + if (bytes[i] > 0) { set_bit(bits, i); } } } -Status util::bytes_to_bits(uint8_t* bytes, int length, +Status util::bytes_to_bits(const std::vector& bytes, std::shared_ptr* out) { - int bit_length = ceil_byte(length) / 8; + int bit_length = util::bytes_for_bits(bytes.size()); auto buffer = std::make_shared(); RETURN_NOT_OK(buffer->Resize(bit_length)); memset(buffer->mutable_data(), 0, bit_length); - bytes_to_bits(bytes, length, buffer->mutable_data()); + bytes_to_bits(bytes, buffer->mutable_data()); *out = buffer; - return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/cpp/src/arrow/util/bit-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 08222d5..8d62871 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -20,6 +20,7 @@ #include #include +#include namespace arrow { @@ -43,15 +44,19 @@ static inline int64_t ceil_2bytes(int64_t size) { static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128}; static inline bool get_bit(const uint8_t* bits, int i) { - return bits[i / 8] & BITMASK[i % 8]; + return static_cast(bits[i / 8] & BITMASK[i % 8]); } static inline bool bit_not_set(const uint8_t* bits, int i) { return (bits[i / 8] & BITMASK[i % 8]) == 0; } +static inline void clear_bit(uint8_t* bits, int i) { + bits[i / 8] &= ~BITMASK[i % 8]; +} + static inline void set_bit(uint8_t* bits, int i) { - bits[i / 8] |= 1 << (i % 8); + bits[i / 8] |= BITMASK[i % 8]; } static inline int64_t next_power2(int64_t n) { @@ -66,8 +71,8 @@ static inline int64_t next_power2(int64_t n) { return n; } -void bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits); -Status bytes_to_bits(uint8_t*, int, std::shared_ptr*); +void bytes_to_bits(const std::vector& bytes, uint8_t* bits); +Status bytes_to_bits(const std::vector&, std::shared_ptr*); } // namespace util http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e6afcbd..943a08f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -86,6 +86,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsNull(int i) + cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): + c_bool Value(int i) + cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): uint8_t Value(int i) http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 04f013d..0d391e5 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -58,7 +58,10 @@ cdef class ArrayValue(Scalar): cdef class BooleanValue(ArrayValue): - pass + + def as_py(self): + cdef CBooleanArray* ap = self.sp_array.get() + return ap.Value(self.index) cdef class Int8Value(ArrayValue): @@ -172,6 +175,7 @@ cdef class ListValue(ArrayValue): cdef dict _scalar_classes = { + Type_BOOL: BooleanValue, Type_UINT8: Int8Value, Type_UINT16: Int16Value, Type_UINT32: Int32Value, http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/python/pyarrow/tests/test_convert_builtin.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 25f6969..2beb6b3 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -22,7 +22,10 @@ import pyarrow class TestConvertList(unittest.TestCase): def test_boolean(self): - pass + arr = pyarrow.from_pylist([True, None, False, None]) + assert len(arr) == 4 + assert arr.null_count == 2 + assert arr.type == pyarrow.bool_() def test_empty_list(self): arr = pyarrow.from_pylist([]) http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/python/pyarrow/tests/test_scalars.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 021737d..4fb850a 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -16,67 +16,74 @@ # under the License. from pyarrow.compat import unittest, u -import pyarrow as arrow +import pyarrow as A class TestScalars(unittest.TestCase): def test_null_singleton(self): with self.assertRaises(Exception): - arrow.NAType() + A.NAType() def test_bool(self): - pass + arr = A.from_pylist([True, None, False, None]) + + v = arr[0] + assert isinstance(v, A.BooleanValue) + assert repr(v) == "True" + assert v.as_py() == True + + assert arr[1] is A.NA def test_int64(self): - arr = arrow.from_pylist([1, 2, None]) + arr = A.from_pylist([1, 2, None]) v = arr[0] - assert isinstance(v, arrow.Int64Value) + assert isinstance(v, A.Int64Value) assert repr(v) == "1" assert v.as_py() == 1 - assert arr[2] is arrow.NA + assert arr[2] is A.NA def test_double(self): - arr = arrow.from_pylist([1.5, None, 3]) + arr = A.from_pylist([1.5, None, 3]) v = arr[0] - assert isinstance(v, arrow.DoubleValue) + assert isinstance(v, A.DoubleValue) assert repr(v) == "1.5" assert v.as_py() == 1.5 - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[2] assert v.as_py() == 3.0 def test_string(self): - arr = arrow.from_pylist(['foo', None, u('bar')]) + arr = A.from_pylist(['foo', None, u('bar')]) v = arr[0] - assert isinstance(v, arrow.StringValue) + assert isinstance(v, A.StringValue) assert repr(v) == "'foo'" assert v.as_py() == 'foo' - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[2].as_py() assert v == 'bar' assert isinstance(v, str) def test_list(self): - arr = arrow.from_pylist([['foo', None], None, ['bar'], []]) + arr = A.from_pylist([['foo', None], None, ['bar'], []]) v = arr[0] assert len(v) == 2 - assert isinstance(v, arrow.ListValue) + assert isinstance(v, A.ListValue) assert repr(v) == "['foo', None]" assert v.as_py() == ['foo', None] assert v[0].as_py() == 'foo' - assert v[1] is arrow.NA + assert v[1] is A.NA - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[3] assert len(v) == 0 http://git-wip-us.apache.org/repos/asf/arrow/blob/0a8979d3/python/src/pyarrow/adapters/builtin.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index acb13ac..78ef1b3 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -61,6 +61,8 @@ class ScalarVisitor { ++total_count_; if (obj == Py_None) { ++none_count_; + } else if (PyBool_Check(obj)) { + ++bool_count_; } else if (PyFloat_Check(obj)) { ++float_count_; } else if (IsPyInteger(obj)) { @@ -256,6 +258,20 @@ class TypedConverter : public SeqConverter { class BoolConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + if (item.obj() == Py_True) { + typed_builder_->Append(true); + } else { + typed_builder_->Append(false); + } + } + } return Status::OK(); } }; @@ -265,14 +281,15 @@ class Int64Converter : public TypedConverter { Status AppendData(PyObject* seq) override { int64_t val; Py_ssize_t size = PySequence_Size(seq); + RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + typed_builder_->AppendNull(); } else { val = PyLong_AsLongLong(item.obj()); RETURN_IF_PYERROR(); - RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + typed_builder_->Append(val); } } return Status::OK(); @@ -284,14 +301,15 @@ class DoubleConverter : public TypedConverter { Status AppendData(PyObject* seq) override { double val; Py_ssize_t size = PySequence_Size(seq); + RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + typed_builder_->AppendNull(); } else { val = PyFloat_AsDouble(item.obj()); RETURN_IF_PYERROR(); - RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + typed_builder_->Append(val); } } return Status::OK();