arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-1828: [C++] Hash kernel specialization for BooleanType
Date Thu, 23 Nov 2017 14:41:51 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new dda2d34  ARROW-1828: [C++] Hash kernel specialization for BooleanType
dda2d34 is described below

commit dda2d34c712a6e64bcfeafe2b1e764ba8cd017e3
Author: Wes McKinney <wes.mckinney@twosigma.com>
AuthorDate: Thu Nov 23 09:41:46 2017 -0500

    ARROW-1828: [C++] Hash kernel specialization for BooleanType
    
    This is a bit tedious because we want to preserve the order in which the unique values were observed.
    
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1350 from wesm/ARROW-1828 and squashes the following commits:
    
    576ab330 [Wes McKinney] Fix typo
    498a9092 [Wes McKinney] clang-format, fix Python flakes
    c6a2b8fe [Wes McKinney] Add tests without nulls
    b6cd4db6 [Wes McKinney] Finish boolean hash kernel implementation, tests passing
    e4d4db6f [Wes McKinney] Scaffolding
---
 cpp/src/arrow/compute/compute-test.cc         | 34 ++++++++++++
 cpp/src/arrow/compute/kernels/hash.cc         | 77 ++++++++++++++++++++++++++-
 cpp/src/arrow/compute/kernels/util-internal.h |  4 ++
 python/pyarrow/includes/libarrow.pxd          |  3 +-
 4 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index fa408ae..96edd8f 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -843,6 +843,40 @@ TEST_F(TestHashKernel, UniqueTimeTimestamp) {
                                       {});
 }
 
+TEST_F(TestHashKernel, UniqueBoolean) {
+  CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {true, true, false, true},
+                                 {true, false, true, true}, {true, false}, {});
+
+  CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {false, true, false, true},
+                                 {true, false, true, true}, {false, true}, {});
+
+  // No nulls
+  CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {true, true, false, true}, {},
+                                 {true, false}, {});
+
+  CheckUnique<BooleanType, bool>(&this->ctx_, boolean(), {false, true, false, true}, {},
+                                 {false, true}, {});
+}
+
+TEST_F(TestHashKernel, DictEncodeBoolean) {
+  CheckDictEncode<BooleanType, bool>(
+      &this->ctx_, boolean(), {true, true, false, true, false},
+      {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1});
+
+  CheckDictEncode<BooleanType, bool>(
+      &this->ctx_, boolean(), {false, true, false, true, false},
+      {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0});
+
+  // No nulls
+  CheckDictEncode<BooleanType, bool>(&this->ctx_, boolean(),
+                                     {true, true, false, true, false}, {}, {true, false},
+                                     {}, {0, 0, 1, 0, 1});
+
+  CheckDictEncode<BooleanType, bool>(&this->ctx_, boolean(),
+                                     {false, true, false, true, false}, {}, {false, true},
+                                     {}, {0, 1, 0, 1, 0});
+}
+
 TEST_F(TestHashKernel, UniqueBinary) {
   CheckUnique<BinaryType, std::string>(&this->ctx_, binary(),
                                        {"test", "", "test2", "test"},
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 95f0399..e47759d 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -369,6 +369,79 @@ class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public HashTab
 };
 
 // ----------------------------------------------------------------------
+// Hash table for boolean types
+
+template <typename Type, typename Action>
+class HashTableKernel<Type, Action, enable_if_boolean<Type>> : public HashTable {
+ public:
+  HashTableKernel(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : HashTable(type, pool) {
+    std::fill(table_, table_ + 2, kHashSlotEmpty);
+  }
+
+  Status Append(const ArrayData& arr) override {
+    auto action = static_cast<Action*>(this);
+
+    RETURN_NOT_OK(action->Reserve(arr.length));
+
+    internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length);
+
+#define HASH_INNER_LOOP()                                      \
+  if (slot == kHashSlotEmpty) {                                \
+    if (!Action::allow_expand) {                               \
+      throw HashException("Encountered new dictionary value"); \
+    }                                                          \
+    table_[j] = slot = static_cast<hash_slot_t>(dict_.size()); \
+    dict_.push_back(value);                                    \
+    action->ObserveNotFound(slot);                             \
+  } else {                                                     \
+    action->ObserveFound(slot);                                \
+  }
+
+    if (arr.null_count != 0) {
+      internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool is_null = valid_reader.IsNotSet();
+        const bool value = value_reader.IsSet();
+        const int j = value ? 1 : 0;
+        hash_slot_t slot = table_[j];
+        valid_reader.Next();
+        value_reader.Next();
+        if (is_null) {
+          action->ObserveNull();
+          continue;
+        }
+        HASH_INNER_LOOP();
+      }
+    } else {
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool value = value_reader.IsSet();
+        const int j = value ? 1 : 0;
+        hash_slot_t slot = table_[j];
+        value_reader.Next();
+        HASH_INNER_LOOP();
+      }
+    }
+
+#undef HASH_INNER_LOOP
+
+    return Status::OK();
+  }
+
+  Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
+    BooleanBuilder builder(pool_);
+    for (const bool value : dict_) {
+      RETURN_NOT_OK(builder.Append(value));
+    }
+    return builder.FinishInternal(out);
+  }
+
+ private:
+  hash_slot_t table_[2];
+  std::vector<bool> dict_;
+};
+
+// ----------------------------------------------------------------------
 // Hash table pass for variable-length binary types
 
 template <typename Type, typename Action>
@@ -698,7 +771,7 @@ Status GetUniqueKernel(FunctionContext* ctx, const std::shared_ptr<DataType>& ty
 
   switch (type->id()) {
     UNIQUE_CASE(NullType);
-    // UNIQUE_CASE(BooleanType);
+    UNIQUE_CASE(BooleanType);
     UNIQUE_CASE(UInt8Type);
     UNIQUE_CASE(Int8Type);
     UNIQUE_CASE(UInt16Type);
@@ -741,7 +814,7 @@ Status GetDictionaryEncodeKernel(FunctionContext* ctx,
 
   switch (type->id()) {
     DICTIONARY_ENCODE_CASE(NullType);
-    // DICTIONARY_ENCODE_CASE(BooleanType);
+    DICTIONARY_ENCODE_CASE(BooleanType);
     DICTIONARY_ENCODE_CASE(UInt8Type);
     DICTIONARY_ENCODE_CASE(Int8Type);
     DICTIONARY_ENCODE_CASE(UInt16Type);
diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h
index 70c5062..7633fed 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.h
+++ b/cpp/src/arrow/compute/kernels/util-internal.h
@@ -60,6 +60,10 @@ using enable_if_binary =
     typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type;
 
 template <typename T>
+using enable_if_boolean =
+    typename std::enable_if<std::is_same<BooleanType, T>::value>::type;
+
+template <typename T>
 using enable_if_fixed_size_binary =
     typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type;
 
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f1f5938..3246481 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -209,7 +209,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         int byte_width()
         int bit_width()
 
-    cdef cppclass CDecimal128Type" arrow::Decimal128Type"(CFixedSizeBinaryType):
+    cdef cppclass CDecimal128Type \
+            " arrow::Decimal128Type"(CFixedSizeBinaryType):
         CDecimal128Type(int precision, int scale)
         int precision()
         int scale()

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Mime
View raw message