arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-1160: C++: Implement DictionaryBuilder
Date Sun, 02 Jul 2017 21:41:41 GMT
Repository: arrow
Updated Branches:
  refs/heads/master e268ce87e -> 9e4906f7f


ARROW-1160: C++: Implement DictionaryBuilder

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #792 from xhochy/ARROW-1160 and squashes the following commits:

cf59f3a1 [Uwe L. Korn] Add benchmarks
809cdc71 [Uwe L. Korn] Incorporate review comments
69cc6a92 [Uwe L. Korn] ninja lint
957a1a9b [Uwe L. Korn] Generalize DictionaryBuilder
c97b423e [Uwe L. Korn] Fix conversion warning.
ea0d41ae [Uwe L. Korn] ARROW-1160: C++: Implement DictionaryBuilder


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/9e4906f7
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/9e4906f7
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/9e4906f7

Branch: refs/heads/master
Commit: 9e4906f7ff911c3a869976b2add30d3066cbe9cf
Parents: e268ce8
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Sun Jul 2 17:41:34 2017 -0400
Committer: Wes McKinney <wes.mckinney@twosigma.com>
Committed: Sun Jul 2 17:41:34 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-decimal-test.cc           |  12 +-
 cpp/src/arrow/array-test.cc                   | 145 ++++++++++++++
 cpp/src/arrow/buffer.h                        |   5 +-
 cpp/src/arrow/builder-benchmark.cc            |  43 +++++
 cpp/src/arrow/builder.cc                      | 211 +++++++++++++++++++++
 cpp/src/arrow/builder.h                       |  85 +++++++++
 cpp/src/arrow/io/io-hdfs-test.cc              |   7 +-
 cpp/src/arrow/ipc/ipc-read-write-benchmark.cc |   4 +-
 cpp/src/arrow/python/builtin_convert.cc       |   5 +-
 cpp/src/arrow/type.h                          |   2 +-
 cpp/src/arrow/util/hash-util.h                |   8 +-
 11 files changed, 507 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/array-decimal-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc
index fb4b8d9..2a3d0a9 100644
--- a/cpp/src/arrow/array-decimal-test.cc
+++ b/cpp/src/arrow/array-decimal-test.cc
@@ -170,14 +170,14 @@ TEST_P(Decimal128BuilderTest, WithNulls) {
 }
 
 INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest,
-    ::testing::Range(
-        DecimalPrecision<int32_t>::minimum, DecimalPrecision<int32_t>::maximum));
+    ::testing::Range(DecimalPrecision<int32_t>::minimum,
+                            DecimalPrecision<int32_t>::maximum));
 INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest,
-    ::testing::Range(
-        DecimalPrecision<int64_t>::minimum, DecimalPrecision<int64_t>::maximum));
+    ::testing::Range(DecimalPrecision<int64_t>::minimum,
+                            DecimalPrecision<int64_t>::maximum));
 INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest,
-    ::testing::Range(
-        DecimalPrecision<int128_t>::minimum, DecimalPrecision<int128_t>::maximum));
+    ::testing::Range(DecimalPrecision<int128_t>::minimum,
+                            DecimalPrecision<int128_t>::maximum));
 
 }  // namespace decimal
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 8f6323b..b28a3a3 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1488,6 +1488,151 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendVector) {
 }
 
 // ----------------------------------------------------------------------
+// Dictionary tests
+
+template <typename Type>
+class TestDictionaryBuilder : public TestBuilder {};
+
+typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
+    UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
+    PrimitiveDictionaries;
+
+TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries);
+
+TYPED_TEST(TestDictionaryBuilder, Basic) {
+  DictionaryBuilder<TypeParam> builder(default_memory_pool());
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder(default_memory_pool());
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype =
+      std::make_shared<DictionaryType>(std::make_shared<TypeParam>(), dict_array);
+
+  UInt8Builder int_builder(default_memory_pool());
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
+  using Scalar = typename TypeParam::c_type;
+  // Skip this test for (u)int8
+  if (sizeof(Scalar) > 1) {
+    // Build the dictionary Array
+    DictionaryBuilder<TypeParam> builder(default_memory_pool());
+    // Build expected data
+    NumericBuilder<TypeParam> dict_builder(default_memory_pool());
+    UInt16Builder int_builder(default_memory_pool());
+
+    // Fill with 1024 different values
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+    }
+    // Fill with an already existing value
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(1)));
+      ASSERT_OK(int_builder.Append(1));
+    }
+
+    // Finalize result
+    std::shared_ptr<Array> result;
+    ASSERT_OK(builder.Finish(&result));
+
+    // Finalize expected data
+    std::shared_ptr<Array> dict_array;
+    ASSERT_OK(dict_builder.Finish(&dict_array));
+    auto dtype =
+        std::make_shared<DictionaryType>(std::make_shared<TypeParam>(), dict_array);
+    std::shared_ptr<Array> int_array;
+    ASSERT_OK(int_builder.Finish(&int_array));
+
+    DictionaryArray expected(dtype, int_array);
+    ASSERT_TRUE(expected.Equals(result));
+  }
+}
+
+TEST(TestStringDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  StringBuilder str_builder(default_memory_pool());
+  ASSERT_OK(str_builder.Append("test"));
+  ASSERT_OK(str_builder.Append("test2"));
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(utf8(), str_array);
+
+  UInt8Builder int_builder(default_memory_pool());
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestStringDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  // Build expected data
+  StringBuilder str_builder(default_memory_pool());
+  UInt16Builder int_builder(default_memory_pool());
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::stringstream ss;
+    ss << "test" << i;
+    ASSERT_OK(builder.Append(ss.str()));
+    ASSERT_OK(str_builder.Append(ss.str()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append("test1"));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(utf8(), str_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+// ----------------------------------------------------------------------
 // List tests
 
 class TestListBuilder : public TestBuilder {

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index a02ce3c..bfbea77 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -241,8 +241,9 @@ class ARROW_EXPORT BufferBuilder {
     capacity_ = size_ = 0;
     return result;
   }
-  int64_t capacity() { return capacity_; }
-  int64_t length() { return size_; }
+  int64_t capacity() const { return capacity_; }
+  int64_t length() const { return size_; }
+  const uint8_t* data() const { return data_; }
 
  private:
   std::shared_ptr<PoolBuffer> buffer_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/builder-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index bdfba8b..5eae9ab 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -121,6 +121,47 @@ static void BM_BuildAdaptiveUIntNoNulls(
   state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t));
 }
 
+static void BM_BuildDictionary(benchmark::State& state) {  // NOLINT non-const reference
+  const int64_t iterations = 1024;
+  while (state.KeepRunning()) {
+    DictionaryBuilder<Int64Type> builder(default_memory_pool());
+    for (int64_t i = 0; i < iterations; i++) {
+      for (int64_t j = 0; j < i; j++) {
+        ABORT_NOT_OK(builder.Append(j));
+      }
+    }
+    std::shared_ptr<Array> out;
+    ABORT_NOT_OK(builder.Finish(&out));
+  }
+  state.SetBytesProcessed(
+      state.iterations() * iterations * (iterations + 1) / 2 * sizeof(int64_t));
+}
+
+static void BM_BuildStringDictionary(
+    benchmark::State& state) {  // NOLINT non-const reference
+  const int64_t iterations = 1024;
+  // Pre-render strings
+  std::vector<std::string> data;
+  for (int64_t i = 0; i < iterations; i++) {
+    std::stringstream ss;
+    ss << i;
+    data.push_back(ss.str());
+  }
+  while (state.KeepRunning()) {
+    StringDictionaryBuilder builder(default_memory_pool());
+    for (int64_t i = 0; i < iterations; i++) {
+      for (int64_t j = 0; j < i; j++) {
+        ABORT_NOT_OK(builder.Append(data[j]));
+      }
+    }
+    std::shared_ptr<Array> out;
+    ABORT_NOT_OK(builder.Finish(&out));
+  }
+  // Assuming a string here needs on average 2 bytes
+  state.SetBytesProcessed(
+      state.iterations() * iterations * (iterations + 1) / 2 * sizeof(int32_t));
+}
+
 BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildVectorNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildAdaptiveIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
@@ -128,5 +169,7 @@ BENCHMARK(BM_BuildAdaptiveIntNoNullsScalarAppend)
     ->Repetitions(3)
     ->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_BuildAdaptiveUIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
 
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 6762e17..e61258c 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -28,7 +28,9 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
+#include "arrow/util/cpu-info.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/hash-util.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
@@ -656,6 +658,204 @@ Status BooleanBuilder::Append(
 }
 
 // ----------------------------------------------------------------------
+// DictionaryBuilder
+
+template <typename T, typename Scalar>
+DictionaryBuilder<T, Scalar>::DictionaryBuilder(
+    MemoryPool* pool, const std::shared_ptr<DataType>& type)
+    : ArrayBuilder(pool, type),
+      hash_table_(new PoolBuffer(pool)),
+      hash_slots_(nullptr),
+      dict_builder_(pool, type),
+      values_builder_(pool) {
+  if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); }
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+
+  // Fill the initial hash table
+  RETURN_NOT_OK(hash_table_->Resize(sizeof(hash_slot_t) * kInitialHashTableSize));
+  hash_slots_ = reinterpret_cast<int32_t*>(hash_table_->mutable_data());
+  std::fill(hash_slots_, hash_slots_ + kInitialHashTableSize, kHashSlotEmpty);
+  hash_table_size_ = kInitialHashTableSize;
+  mod_bitmask_ = kInitialHashTableSize - 1;
+
+  return values_builder_.Init(elements);
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::Finish(std::shared_ptr<Array>* out) {
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
+  auto type = std::make_shared<DictionaryType>(type_, dictionary);
+
+  std::shared_ptr<Array> values;
+  RETURN_NOT_OK(values_builder_.Finish(&values));
+
+  *out = std::make_shared<DictionaryArray>(type, values);
+  return Status::OK();
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::Append(const Scalar& value) {
+  RETURN_NOT_OK(Reserve(1));
+  // Based on DictEncoder<DType>::Put
+  int j = HashValue(value) & mod_bitmask_;
+  hash_slot_t index = hash_slots_[j];
+
+  // Find an empty slot
+  while (kHashSlotEmpty != index && SlotDifferent(index, value)) {
+    // Linear probing
+    ++j;
+    if (j == hash_table_size_) { j = 0; }
+    index = hash_slots_[j];
+  }
+
+  if (index == kHashSlotEmpty) {
+    // Not in the hash table, so we insert it now
+    index = static_cast<hash_slot_t>(dict_builder_.length());
+    hash_slots_[j] = index;
+    RETURN_NOT_OK(AppendDictionary(value));
+
+    if (UNLIKELY(static_cast<int32_t>(dict_builder_.length()) >
+                 hash_table_size_ * kMaxHashTableLoad)) {
+      RETURN_NOT_OK(DoubleTableSize());
+    }
+  }
+
+  RETURN_NOT_OK(values_builder_.Append(index));
+
+  return Status::OK();
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::DoubleTableSize() {
+  int new_size = hash_table_size_ * 2;
+  auto new_hash_table = std::make_shared<PoolBuffer>(pool_);
+
+  RETURN_NOT_OK(new_hash_table->Resize(sizeof(hash_slot_t) * new_size));
+  int32_t* new_hash_slots = reinterpret_cast<int32_t*>(new_hash_table->mutable_data());
+  std::fill(new_hash_slots, new_hash_slots + new_size, kHashSlotEmpty);
+  int new_mod_bitmask = new_size - 1;
+
+  for (int i = 0; i < hash_table_size_; ++i) {
+    hash_slot_t index = hash_slots_[i];
+
+    if (index == kHashSlotEmpty) { continue; }
+
+    // Compute the hash value mod the new table size to start looking for an
+    // empty slot
+    Scalar value = GetDictionaryValue(static_cast<int64_t>(index));
+
+    // Find an empty slot in the new hash table
+    int j = HashValue(value) & new_mod_bitmask;
+    hash_slot_t slot = new_hash_slots[j];
+
+    while (kHashSlotEmpty != slot && SlotDifferent(slot, value)) {
+      ++j;
+      if (j == new_size) { j = 0; }
+      slot = new_hash_slots[j];
+    }
+
+    // Copy the old slot index to the new hash table
+    new_hash_slots[j] = index;
+  }
+
+  hash_table_ = new_hash_table;
+  hash_slots_ = reinterpret_cast<int32_t*>(hash_table_->mutable_data());
+  hash_table_size_ = new_size;
+  mod_bitmask_ = new_size - 1;
+
+  return Status::OK();
+}
+
+template <typename T, typename Scalar>
+Scalar DictionaryBuilder<T, Scalar>::GetDictionaryValue(int64_t index) {
+  const Scalar* data = reinterpret_cast<const Scalar*>(dict_builder_.data()->data());
+  return data[index];
+}
+
+template <typename T, typename Scalar>
+int DictionaryBuilder<T, Scalar>::HashValue(const Scalar& value) {
+  return HashUtil::Hash(&value, sizeof(Scalar), 0);
+}
+
+template <typename T, typename Scalar>
+bool DictionaryBuilder<T, Scalar>::SlotDifferent(hash_slot_t index, const Scalar&
value) {
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return other != value;
+}
+
+template <typename T, typename Scalar>
+Status DictionaryBuilder<T, Scalar>::AppendDictionary(const Scalar& value) {
+  return dict_builder_.Append(value);
+}
+
+#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                       \
+  template <>                                                                     
   \
+  WrappedBinary DictionaryBuilder<Type, WrappedBinary>::GetDictionaryValue(       
   \
+      int64_t index) {                                                                \
+    int32_t v_len;                                                                    \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), &v_len);
  \
+    return WrappedBinary(v, v_len);                                                   \
+  }                                                                                   \
+                                                                                      \
+  template <>                                                                     
   \
+  int DictionaryBuilder<Type, WrappedBinary>::HashValue(const WrappedBinary& value)
{ \
+    return HashUtil::Hash(value.ptr_, value.length_, 0);                              \
+  }                                                                                   \
+                                                                                      \
+  template <>                                                                     
   \
+  bool DictionaryBuilder<Type, WrappedBinary>::SlotDifferent(                     
   \
+      hash_slot_t index, const WrappedBinary& value) {                              
 \
+    int32_t other_length;                                                             \
+    const uint8_t* other_value =                                                      \
+        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);   
       \
+    return !(other_length == value.length_ &&                                   
     \
+             0 == memcmp(other_value, value.ptr_, value.length_));                    \
+  }                                                                                   \
+                                                                                      \
+  template <>                                                                     
   \
+  Status DictionaryBuilder<Type, WrappedBinary>::AppendDictionary(                
   \
+      const WrappedBinary& value) {                                                 
 \
+    return dict_builder_.Append(value.ptr_, value.length_);                           \
+  }
+
+BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
+BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
+
+template class DictionaryBuilder<UInt8Type>;
+template class DictionaryBuilder<UInt16Type>;
+template class DictionaryBuilder<UInt32Type>;
+template class DictionaryBuilder<UInt64Type>;
+template class DictionaryBuilder<Int8Type>;
+template class DictionaryBuilder<Int16Type>;
+template class DictionaryBuilder<Int32Type>;
+template class DictionaryBuilder<Int64Type>;
+template class DictionaryBuilder<Date32Type>;
+template class DictionaryBuilder<Date64Type>;
+template class DictionaryBuilder<Time32Type>;
+template class DictionaryBuilder<Time64Type>;
+template class DictionaryBuilder<TimestampType>;
+template class DictionaryBuilder<FloatType>;
+template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<BinaryType, WrappedBinary>;
+template class DictionaryBuilder<StringType, WrappedBinary>;
+
+// ----------------------------------------------------------------------
 // DecimalBuilder
 DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
type)
     : FixedSizeBinaryBuilder(pool, type),
@@ -830,6 +1030,17 @@ Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
   return Status::OK();
 }
 
+const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
+  const int32_t* offsets = reinterpret_cast<const int32_t*>(offset_builder_.data());
+  int32_t offset = offsets[i];
+  if (i == (length_ - 1)) {
+    *out_length = static_cast<int32_t>(value_builder_->length()) - offset;
+  } else {
+    *out_length = offsets[i + 1] - offset;
+  }
+  return byte_builder_->data()->data() + offset;
+}
+
 StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {}
 
 Status StringBuilder::Finish(std::shared_ptr<Array>* out) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index d77223e..75fb6ef 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -569,6 +569,11 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
 
   Status Finish(std::shared_ptr<Array>* out) override;
 
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
+
  protected:
   UInt8Builder* byte_builder_;
 };
@@ -675,6 +680,86 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
 };
 
 // ----------------------------------------------------------------------
+// Dictionary builder
+
+// Based on Apache Parquet-cpp's DictEncoder
+
+// Initially 1024 elements
+static constexpr int kInitialHashTableSize = 1 << 10;
+
+typedef int32_t hash_slot_t;
+static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
+
+// The maximum load factor for the hash table before resizing.
+static constexpr double kMaxHashTableLoad = 0.7;
+
+template <typename T, typename Scalar = typename T::c_type>
+class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
+ public:
+  explicit DictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
type);
+
+  template <typename T1 = T, typename Scalar1 = Scalar>
+  explicit DictionaryBuilder(
+      typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type
pool)
+      : DictionaryBuilder<T1, Scalar1>(pool, TypeTraits<T1>::type_singleton())
{}
+
+  Status Append(const Scalar& value);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status Finish(std::shared_ptr<Array>* out) override;
+
+ protected:
+  Status DoubleTableSize();
+  Scalar GetDictionaryValue(int64_t index);
+  int HashValue(const Scalar& value);
+  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
+  Status AppendDictionary(const Scalar& value);
+
+  std::shared_ptr<PoolBuffer> hash_table_;
+  int32_t* hash_slots_;
+
+  /// Size of the table. Must be a power of 2.
+  int hash_table_size_;
+
+  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
+  // hash_table_size_, but uses far fewer CPU cycles
+  int mod_bitmask_;
+
+  typename TypeTraits<T>::BuilderType dict_builder_;
+  AdaptiveUIntBuilder values_builder_;
+};
+
+// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
+struct WrappedBinary {
+  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {}
+
+  const uint8_t* ptr_;
+  int32_t length_;
+};
+
+class ARROW_EXPORT StringDictionaryBuilder
+    : public DictionaryBuilder<StringType, WrappedBinary> {
+ public:
+  using DictionaryBuilder::DictionaryBuilder;
+
+  using DictionaryBuilder::Append;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
+        static_cast<int32_t>(value.size())));
+  }
+};
+
+// ----------------------------------------------------------------------
 // Helper functions
 
 Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
type,

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/io/io-hdfs-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc
index 74f8042..b8203f0 100644
--- a/cpp/src/arrow/io/io-hdfs-test.cc
+++ b/cpp/src/arrow/io/io-hdfs-test.cc
@@ -87,9 +87,10 @@ class TestHdfsClient : public ::testing::Test {
     LibHdfsShim* driver_shim;
 
     client_ = nullptr;
-    scratch_dir_ = boost::filesystem::unique_path(
-        boost::filesystem::temp_directory_path() / "arrow-hdfs/scratch-%%%%")
-                       .string();
+    scratch_dir_ =
+        boost::filesystem::unique_path(
+            boost::filesystem::temp_directory_path() / "arrow-hdfs/scratch-%%%%")
+            .string();
 
     loaded_driver_ = false;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
index b385929..1aecdbc 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
@@ -80,7 +80,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) {  // NOLINT
non-const
     int32_t metadata_length;
     int64_t body_length;
     if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
-            default_memory_pool())
+             default_memory_pool())
              .ok()) {
       state.SkipWithError("Failed to write!");
     }
@@ -101,7 +101,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) {  // NOLINT
non-const r
   int32_t metadata_length;
   int64_t body_length;
   if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
-          default_memory_pool())
+           default_memory_pool())
            .ok()) {
     state.SkipWithError("Failed to write!");
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 97e2bb7..2f334f0 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -464,8 +464,9 @@ class FixedWidthBytesConverter
   inline Status AppendItem(const OwnedRef& item) {
     PyObject* bytes_obj;
     OwnedRef tmp;
-    Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedSizeBinaryType>(
-        typed_builder_->type())->byte_width();
+    Py_ssize_t expected_length =
+        std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
+            ->byte_width();
     if (item.obj() == Py_None) {
       RETURN_NOT_OK(typed_builder_->AppendNull());
       return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 8338800..9ae8374 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -407,7 +407,7 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType {
 
   Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
-  static std::string name() {return "fixed_size_binary"; }
+  static std::string name() { return "fixed_size_binary"; }
 
   std::vector<BufferDescr> GetBufferLayout() const override;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/9e4906f7/cpp/src/arrow/util/hash-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index ffe1a9d..4c049c1 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -41,8 +41,8 @@ class HashUtil {
   /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
   static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
     DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
-    uint32_t words = bytes / sizeof(uint32_t);
-    bytes = bytes % sizeof(uint32_t);
+    uint32_t words = static_cast<uint32_t>(bytes / sizeof(uint32_t));
+    bytes = static_cast<int32_t>(bytes % sizeof(uint32_t));
 
     const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
     while (words--) {
@@ -201,7 +201,7 @@ class HashUtil {
     DCHECK_GT(bytes, 0);
     uint64_t hash_u64 = hash | ((uint64_t)hash << 32);
     hash_u64 = FnvHash64(data, bytes, hash_u64);
-    return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF);
+    return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
   }
 
   /// Computes the hash value for data.  Will call either CrcHash or MurmurHash
@@ -241,7 +241,7 @@ class HashUtil {
     // randomness of the constants) that any subset of bit positions of
     // Rehash32to32(hash1) is equal to the same subset of bit positions
     // Rehash32to32(hash2) is minimal.
-    return (static_cast<uint64_t>(hash) * m + a) >> 32;
+    return static_cast<uint32_t>((static_cast<uint64_t>(hash) * m + a) >>
32);
   }
 
   static inline uint64_t Rehash32to64(const uint32_t hash) {


Mime
View raw message