arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject [arrow] branch master updated: ARROW-1882: [C++] Reintroduce DictionaryBuilder
Date Wed, 06 Dec 2017 03:02:53 GMT
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 33b628a  ARROW-1882: [C++] Reintroduce DictionaryBuilder
33b628a is described below

commit 33b628acf535b4ea4c21504483929eb399980405
Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
AuthorDate: Tue Dec 5 22:02:49 2017 -0500

    ARROW-1882: [C++] Reintroduce DictionaryBuilder
    
    Readded the previous code and moved some small parts to a new common place to share logic
between kernels and builder.
    
    Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
    Author: Wes McKinney <wes.mckinney@twosigma.com>
    
    Closes #1388 from xhochy/ARROW-1882 and squashes the following commits:
    
    db9d67a8 [Wes McKinney] Consolidate some hash table constants in util/hash.h
    b387b73d [Korn, Uwe] Ensure 64but integers
    4743a8bb [Korn, Uwe] Fix precision loss
    4c507fed [Korn, Uwe] ninja format
    a1da9355 [Korn, Uwe] Reuse common hash table code
    ba307794 [Korn, Uwe] ARROW-1882: [C++] Reintroduce DictionaryBuilder
---
 cpp/src/arrow/CMakeLists.txt          |   1 +
 cpp/src/arrow/array-test.cc           | 347 ++++++++++++++++++++++++++++++++++
 cpp/src/arrow/builder.cc              | 300 +++++++++++++++++++++++++++++
 cpp/src/arrow/builder.h               | 153 +++++++++++++++
 cpp/src/arrow/compute/kernels/hash.cc |  61 +-----
 cpp/src/arrow/util/CMakeLists.txt     |   1 +
 cpp/src/arrow/util/hash.cc            |  38 ++++
 cpp/src/arrow/util/hash.h             |  85 +++++++++
 8 files changed, 927 insertions(+), 59 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 9470578..d645cca 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -38,6 +38,7 @@ set(ARROW_SRCS
   util/compression.cc
   util/cpu-info.cc
   util/decimal.cc
+  util/hash.cc
   util/key_value_metadata.cc
 )
 
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index d894df1..7ff3261 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1620,6 +1620,353 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendVector) {
 }
 
 // ----------------------------------------------------------------------
+// Dictionary tests
+
+template <typename Type>
+class TestDictionaryBuilder : public TestBuilder {};
+
+typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
+                         UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
+    PrimitiveDictionaries;
+
+TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries);
+
+TYPED_TEST(TestDictionaryBuilder, Basic) {
+  DictionaryBuilder<TypeParam> builder(default_memory_pool());
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder;
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
+  NumericBuilder<TypeParam> builder;
+  // DictionaryBuilder<TypeParam> builder;
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> intermediate_result;
+  ASSERT_OK(builder.Finish(&intermediate_result));
+  DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool());
+  ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
+  std::shared_ptr<Array> result;
+  ASSERT_OK(dictionary_builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder;
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
+  using Scalar = typename TypeParam::c_type;
+  // Skip this test for (u)int8
+  if (sizeof(Scalar) > 1) {
+    // Build the dictionary Array
+    DictionaryBuilder<TypeParam> builder(default_memory_pool());
+    // Build expected data
+    NumericBuilder<TypeParam> dict_builder;
+    Int16Builder int_builder;
+
+    // Fill with 1024 different values
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+    }
+    // Fill with an already existing value
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(1)));
+      ASSERT_OK(int_builder.Append(1));
+    }
+
+    // Finalize result
+    std::shared_ptr<Array> result;
+    ASSERT_OK(builder.Finish(&result));
+
+    // Finalize expected data
+    std::shared_ptr<Array> dict_array;
+    ASSERT_OK(dict_builder.Finish(&dict_array));
+    auto dtype = std::make_shared<DictionaryType>(int16(), dict_array);
+    std::shared_ptr<Array> int_array;
+    ASSERT_OK(int_builder.Finish(&int_array));
+
+    DictionaryArray expected(dtype, int_array);
+    ASSERT_TRUE(expected.Equals(result));
+  }
+}
+
+TEST(TestStringDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  StringBuilder str_builder;
+  ASSERT_OK(str_builder.Append("test"));
+  ASSERT_OK(str_builder.Append("test2"));
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), str_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestStringDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  // Build expected data
+  StringBuilder str_builder;
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::stringstream ss;
+    ss << "test" << i;
+    ASSERT_OK(builder.Append(ss.str()));
+    ASSERT_OK(str_builder.Append(ss.str()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append("test1"));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(int16(), str_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  std::vector<uint8_t> test{12, 12, 11, 12};
+  std::vector<uint8_t> test2{12, 12, 11, 11};
+  ASSERT_OK(builder.Append(test.data()));
+  ASSERT_OK(builder.Append(test2.data()));
+  ASSERT_OK(builder.Append(test.data()));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  ASSERT_OK(fsb_builder.Append(test.data()));
+  ASSERT_OK(fsb_builder.Append(test2.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
+                               static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(value.data()));
+    ASSERT_OK(fsb_builder.Append(value.data()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  std::vector<uint8_t> known_value{12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value.data()));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build an array with different byte width
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
+  std::vector<uint8_t> value{100, 1, 1, 1, 1};
+  ASSERT_OK(fsb_builder.Append(value.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
+}
+
+TEST(TestDecimalDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  const auto& decimal_type = arrow::decimal(2, 0);
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Test data
+  std::vector<Decimal128> test{12, 12, 11, 12};
+  for (const auto& value : test) {
+    ASSERT_OK(builder.Append(value.ToBytes().data()));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder decimal_builder(decimal_type);
+  ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
+  ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));
+
+  std::shared_ptr<Array> decimal_array;
+  ASSERT_OK(decimal_builder.Finish(&decimal_array));
+  auto dtype = arrow::dictionary(int8(), decimal_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
+  const auto& decimal_type = arrow::decimal(21, 0);
+
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(decimal_type);
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    const uint8_t bytes[] = {0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             12,
+                             12,
+                             static_cast<uint8_t>(i / 128),
+                             static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(bytes));
+    ASSERT_OK(fsb_builder.Append(bytes));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+// ----------------------------------------------------------------------
 // List tests
 
 class TestListArray : public TestBuilder {
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 4d7fd5f..de132b5 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -34,6 +34,7 @@
 #include "arrow/util/cpu-info.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/hash-util.h"
+#include "arrow/util/hash.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
@@ -806,6 +807,305 @@ Status BooleanBuilder::Append(const std::vector<bool>& values)
{
 }
 
 // ----------------------------------------------------------------------
+// DictionaryBuilder
+
+using internal::WrappedBinary;
+
+template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>&
type,
+                                        MemoryPool* pool)
+    : ArrayBuilder(type, pool),
+      hash_slots_(nullptr),
+      dict_builder_(type, pool),
+      values_builder_(pool),
+      byte_width_(-1) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>&
type,
+                                               MemoryPool* pool)
+    : ArrayBuilder(type, pool), values_builder_(pool) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+DictionaryBuilder<NullType>::~DictionaryBuilder() {}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : ArrayBuilder(type, pool),
+      hash_slots_(nullptr),
+      dict_builder_(type, pool),
+      values_builder_(pool),
+      byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width())
{
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+
+  // Fill the initial hash table
+  RETURN_NOT_OK(internal::NewHashTable(kInitialHashTableSize, pool_, &hash_table_));
+  hash_slots_ = reinterpret_cast<int32_t*>(hash_table_->mutable_data());
+  hash_table_size_ = kInitialHashTableSize;
+  mod_bitmask_ = kInitialHashTableSize - 1;
+  hash_table_load_threshold_ =
+      static_cast<int64_t>(static_cast<double>(elements) * kMaxHashTableLoad);
+
+  return values_builder_.Init(elements);
+}
+
+Status DictionaryBuilder<NullType>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  return values_builder_.Init(elements);
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out)
{
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>*
out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Append(const Scalar& value) {
+  RETURN_NOT_OK(Reserve(1));
+  // Based on DictEncoder<DType>::Put
+  int64_t j = HashValue(value) & mod_bitmask_;
+  hash_slot_t index = hash_slots_[j];
+
+  // Find an empty slot
+  while (kHashSlotEmpty != index && SlotDifferent(index, value)) {
+    // Linear probing
+    ++j;
+    if (j == hash_table_size_) {
+      j = 0;
+    }
+    index = hash_slots_[j];
+  }
+
+  if (index == kHashSlotEmpty) {
+    // Not in the hash table, so we insert it now
+    index = static_cast<hash_slot_t>(dict_builder_.length());
+    hash_slots_[j] = index;
+    RETURN_NOT_OK(AppendDictionary(value));
+
+    if (ARROW_PREDICT_FALSE(static_cast<int32_t>(dict_builder_.length()) >
+                            hash_table_load_threshold_)) {
+      RETURN_NOT_OK(DoubleTableSize());
+    }
+  }
+
+  RETURN_NOT_OK(values_builder_.Append(index));
+
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendArray(const Array& array) {
+  const auto& numeric_array = static_cast<const NumericArray<T>&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::AppendArray(const Array& array) {
+  for (int64_t i = 0; i < array.length(); i++) {
+    RETURN_NOT_OK(AppendNull());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array)
{
+  if (!type_->Equals(*array.type())) {
+    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+  }
+
+  const auto& numeric_array = static_cast<const FixedSizeBinaryArray&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendNull() {
+  return values_builder_.AppendNull();
+}
+
+Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.AppendNull();
}
+
+template <typename T>
+Status DictionaryBuilder<T>::DoubleTableSize() {
+#define INNER_LOOP                                                \
+  Scalar value = GetDictionaryValue(static_cast<int64_t>(index)); \
+  int64_t j = HashValue(value) & new_mod_bitmask;
+
+  DOUBLE_TABLE_SIZE(, INNER_LOOP);
+
+  return Status::OK();
+}
+
+template <typename T>
+typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
+    int64_t index) {
+  const Scalar* data = reinterpret_cast<const Scalar*>(dict_builder_.data()->data());
+  return data[index];
+}
+
+template <>
+const uint8_t* DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index)
{
+  return dict_builder_.GetValue(index);
+}
+
+template <typename T>
+int64_t DictionaryBuilder<T>::HashValue(const Scalar& value) {
+  return HashUtil::Hash(&value, sizeof(Scalar), 0);
+}
+
+template <>
+int64_t DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value)
{
+  return HashUtil::Hash(value, byte_width_, 0);
+}
+
+template <typename T>
+bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& value)
{
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return other != value;
+}
+
+template <>
+bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index,
+                                                           const Scalar& value) {
+  int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width();
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return memcmp(other, value, width) != 0;
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) {
+  return dict_builder_.Append(value);
+}
+
+#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                     \
+  template <>                                                                     
 \
+  WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(int64_t index) {      
 \
+    int32_t v_len;                                                                  \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), &v_len);
\
+    return WrappedBinary(v, v_len);                                                 \
+  }                                                                                 \
+                                                                                    \
+  template <>                                                                     
 \
+  Status DictionaryBuilder<Type>::AppendDictionary(const WrappedBinary& value)
{    \
+    return dict_builder_.Append(value.ptr_, value.length_);                         \
+  }                                                                                 \
+                                                                                    \
+  template <>                                                                     
 \
+  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {           
     \
+    const BinaryArray& binary_array = static_cast<const BinaryArray&>(array);
      \
+    WrappedBinary value(nullptr, 0);                                                \
+    for (int64_t i = 0; i < array.length(); i++) {                                  \
+      if (array.IsNull(i)) {                                                        \
+        RETURN_NOT_OK(AppendNull());                                                \
+      } else {                                                                      \
+        value.ptr_ = binary_array.GetValue(i, &value.length_);                      \
+        RETURN_NOT_OK(Append(value));                                               \
+      }                                                                             \
+    }                                                                               \
+    return Status::OK();                                                            \
+  }                                                                                 \
+                                                                                    \
+  template <>                                                                     
 \
+  int64_t DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {    
     \
+    return HashUtil::Hash(value.ptr_, value.length_, 0);                            \
+  }                                                                                 \
+                                                                                    \
+  template <>                                                                     
 \
+  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,                  
 \
+                                              const WrappedBinary& value) {         \
+    int32_t other_length;                                                           \
+    const uint8_t* other_value =                                                    \
+        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);   
     \
+    return !(other_length == value.length_ &&                                   
   \
+             0 == memcmp(other_value, value.ptr_, value.length_));                  \
+  }
+
+BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
+BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
+
+template class DictionaryBuilder<UInt8Type>;
+template class DictionaryBuilder<UInt16Type>;
+template class DictionaryBuilder<UInt32Type>;
+template class DictionaryBuilder<UInt64Type>;
+template class DictionaryBuilder<Int8Type>;
+template class DictionaryBuilder<Int16Type>;
+template class DictionaryBuilder<Int32Type>;
+template class DictionaryBuilder<Int64Type>;
+template class DictionaryBuilder<Date32Type>;
+template class DictionaryBuilder<Date64Type>;
+template class DictionaryBuilder<Time32Type>;
+template class DictionaryBuilder<Time64Type>;
+template class DictionaryBuilder<TimestampType>;
+template class DictionaryBuilder<FloatType>;
+template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<FixedSizeBinaryType>;
+template class DictionaryBuilder<BinaryType>;
+template class DictionaryBuilder<StringType>;
+
+// ----------------------------------------------------------------------
 // Decimal128Builder
 
 Decimal128Builder::Decimal128Builder(const std::shared_ptr<DataType>& type,
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index e59e166..ce7b8cd 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -32,6 +32,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
+#include "arrow/util/hash.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
@@ -810,6 +811,158 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
 };
 
 // ----------------------------------------------------------------------
+// Dictionary builder
+
+namespace internal {
+
+// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
+struct WrappedBinary {
+  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {}
+
+  const uint8_t* ptr_;
+  int32_t length_;
+};
+
+template <typename T>
+struct DictionaryScalar {
+  using type = typename T::c_type;
+};
+
+template <>
+struct DictionaryScalar<BinaryType> {
+  using type = WrappedBinary;
+};
+
+template <>
+struct DictionaryScalar<StringType> {
+  using type = WrappedBinary;
+};
+
+template <>
+struct DictionaryScalar<FixedSizeBinaryType> {
+  using type = uint8_t const*;
+};
+
+}  // namespace internal
+
+/// \brief Array builder for created encoded DictionaryArray from dense array
+/// data
+template <typename T>
+class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
+ public:
+  using Scalar = typename internal::DictionaryScalar<T>::type;
+
+  ~DictionaryBuilder() {}
+
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+
+  template <typename T1 = T>
+  explicit DictionaryBuilder(
+      typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type
pool)
+      : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+  /// \brief Append a scalar value
+  Status Append(const Scalar& value);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  Status DoubleTableSize();
+  Scalar GetDictionaryValue(int64_t index);
+  int64_t HashValue(const Scalar& value);
+  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
+  Status AppendDictionary(const Scalar& value);
+
+  std::shared_ptr<Buffer> hash_table_;
+  int32_t* hash_slots_;
+
+  /// Size of the table. Must be a power of 2.
+  int64_t hash_table_size_;
+
+  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
+  // hash_table_size_, but uses far fewer CPU cycles
+  int64_t mod_bitmask_;
+
+  typename TypeTraits<T>::BuilderType dict_builder_;
+  AdaptiveIntBuilder values_builder_;
+  int32_t byte_width_;
+
+  /// Size at which we decide to resize
+  int64_t hash_table_load_threshold_;
+};
+
+template <>
+class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
+ public:
+  ~DictionaryBuilder();
+
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+  explicit DictionaryBuilder(MemoryPool* pool);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  AdaptiveIntBuilder values_builder_;
+};
+
+class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(internal::WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
+                                          static_cast<int32_t>(value.size())));
+  }
+};
+
+/// \brief Dictionary array builder with convenience methods for strings
+class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(internal::WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
+                                          static_cast<int32_t>(value.size())));
+  }
+};
+
+// ----------------------------------------------------------------------
 // Helper functions
 
 Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
type,
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 750f1d3..1face78 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -30,21 +30,13 @@
 #include "arrow/compute/kernel.h"
 #include "arrow/compute/kernels/util-internal.h"
 #include "arrow/util/hash-util.h"
+#include "arrow/util/hash.h"
 
 namespace arrow {
 namespace compute {
 
 namespace {
 
-// Initially 1024 elements
-static constexpr int64_t kInitialHashTableSize = 1 << 10;
-
-typedef int32_t hash_slot_t;
-static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
-
-// The maximum load factor for the hash table before resizing.
-static constexpr double kMaxHashTableLoad = 0.5;
-
 enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
 
 #define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE)                  \
@@ -54,17 +46,6 @@ enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
     return Status::NotImplemented(ss.str());                       \
   }
 
-Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* out) {
-  auto hash_table = std::make_shared<PoolBuffer>(pool);
-
-  RETURN_NOT_OK(hash_table->Resize(sizeof(hash_slot_t) * size));
-  int32_t* slots = reinterpret_cast<hash_slot_t*>(hash_table->mutable_data());
-  std::fill(slots, slots + size, kHashSlotEmpty);
-
-  *out = hash_table;
-  return Status::OK();
-}
-
 // This is a slight design concession -- some hash actions have the possibility
 // of failure. Rather than introduce extra error checking into all actions, we
 // will raise an internal exception so that only the actions where errors can
@@ -129,7 +110,7 @@ class HashTable {
 
 Status HashTable::Init(int64_t elements) {
   DCHECK_EQ(elements, BitUtil::NextPower2(elements));
-  RETURN_NOT_OK(NewHashTable(elements, pool_, &hash_table_));
+  RETURN_NOT_OK(internal::NewHashTable(elements, pool_, &hash_table_));
   hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data());
   hash_table_size_ = elements;
   hash_table_load_threshold_ =
@@ -238,44 +219,6 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
     }                                                                                   
\
   }
 
-#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH)                              \
-  do {                                                                           \
-    int64_t new_size = hash_table_size_ * 2;                                     \
-                                                                                 \
-    std::shared_ptr<Buffer> new_hash_table;                                      \
-    RETURN_NOT_OK(NewHashTable(new_size, pool_, &new_hash_table));               \
-    int32_t* new_hash_slots =                                                    \
-        reinterpret_cast<hash_slot_t*>(new_hash_table->mutable_data());        
 \
-    int64_t new_mod_bitmask = new_size - 1;                                      \
-                                                                                 \
-    SETUP_CODE;                                                                  \
-                                                                                 \
-    for (int i = 0; i < hash_table_size_; ++i) {                                 \
-      hash_slot_t index = hash_slots_[i];                                        \
-                                                                                 \
-      if (index == kHashSlotEmpty) {                                             \
-        continue;                                                                \
-      }                                                                          \
-                                                                                 \
-      COMPUTE_HASH;                                                              \
-      while (kHashSlotEmpty != new_hash_slots[j]) {                              \
-        ++j;                                                                     \
-        if (ARROW_PREDICT_FALSE(j == new_size)) {                                \
-          j = 0;                                                                 \
-        }                                                                        \
-      }                                                                          \
-                                                                                 \
-      new_hash_slots[j] = index;                                                 \
-    }                                                                            \
-                                                                                 \
-    hash_table_ = new_hash_table;                                                \
-    hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data()); 
 \
-    hash_table_size_ = new_size;                                                 \
-    hash_table_load_threshold_ =                                                 \
-        static_cast<int64_t>(static_cast<double>(new_size) * kMaxHashTableLoad);
\
-    mod_bitmask_ = new_size - 1;                                                 \
-  } while (false)
-
 template <typename Type, typename Action>
 class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public HashTable
{
  public:
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 29b18a9..42613d6 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -34,6 +34,7 @@ install(FILES
   cpu-info.h
   decimal.h
   hash-util.h
+  hash.h
   key_value_metadata.h
   logging.h
   macros.h
diff --git a/cpp/src/arrow/util/hash.cc b/cpp/src/arrow/util/hash.cc
new file mode 100644
index 0000000..94ba524
--- /dev/null
+++ b/cpp/src/arrow/util/hash.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/hash.h"
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace internal {
+
+Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* out) {
+  auto hash_table = std::make_shared<PoolBuffer>(pool);
+
+  RETURN_NOT_OK(hash_table->Resize(sizeof(hash_slot_t) * size));
+  int32_t* slots = reinterpret_cast<hash_slot_t*>(hash_table->mutable_data());
+  std::fill(slots, slots + size, kHashSlotEmpty);
+
+  *out = hash_table;
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/hash.h b/cpp/src/arrow/util/hash.h
new file mode 100644
index 0000000..3597342
--- /dev/null
+++ b/cpp/src/arrow/util/hash.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_HASH_H
+#define ARROW_UTIL_HASH_H
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+typedef int32_t hash_slot_t;
+static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
+
+// Initially 1024 elements
+static constexpr int kInitialHashTableSize = 1 << 10;
+
+// The maximum load factor for the hash table before resizing.
+static constexpr double kMaxHashTableLoad = 0.5;
+
+namespace internal {
+
+#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH)                              \
+  do {                                                                           \
+    int64_t new_size = hash_table_size_ * 2;                                     \
+                                                                                 \
+    std::shared_ptr<Buffer> new_hash_table;                                      \
+    RETURN_NOT_OK(internal::NewHashTable(new_size, pool_, &new_hash_table));     \
+    int32_t* new_hash_slots =                                                    \
+        reinterpret_cast<hash_slot_t*>(new_hash_table->mutable_data());        
 \
+    int64_t new_mod_bitmask = new_size - 1;                                      \
+                                                                                 \
+    SETUP_CODE;                                                                  \
+                                                                                 \
+    for (int i = 0; i < hash_table_size_; ++i) {                                 \
+      hash_slot_t index = hash_slots_[i];                                        \
+                                                                                 \
+      if (index == kHashSlotEmpty) {                                             \
+        continue;                                                                \
+      }                                                                          \
+                                                                                 \
+      COMPUTE_HASH;                                                              \
+      while (kHashSlotEmpty != new_hash_slots[j]) {                              \
+        ++j;                                                                     \
+        if (ARROW_PREDICT_FALSE(j == new_size)) {                                \
+          j = 0;                                                                 \
+        }                                                                        \
+      }                                                                          \
+                                                                                 \
+      new_hash_slots[j] = index;                                                 \
+    }                                                                            \
+                                                                                 \
+    hash_table_ = new_hash_table;                                                \
+    hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data()); 
 \
+    hash_table_size_ = new_size;                                                 \
+    hash_table_load_threshold_ =                                                 \
+        static_cast<int64_t>(static_cast<double>(new_size) * kMaxHashTableLoad);
\
+    mod_bitmask_ = new_size - 1;                                                 \
+  } while (false)
+
+Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* out);
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_HASH_H

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Mime
View raw message