arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From u..@apache.org
Subject arrow git commit: ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder
Date Mon, 11 Sep 2017 10:47:41 GMT
Repository: arrow
Updated Branches:
  refs/heads/master a45b04712 -> 94f6247c4


ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes #1073 from xhochy/ARROW-1508 and squashes the following commits:

cefee2c [Uwe L. Korn] ninja format
c471bfa [Uwe L. Korn] Review comments
4bf354e [Uwe L. Korn] Use non-deprecated interfaces
2538926 [Uwe L. Korn] ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/94f6247c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/94f6247c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/94f6247c

Branch: refs/heads/master
Commit: 94f6247c4ef49b9b4fe1cccdc172810c55c34a10
Parents: a45b047
Author: Uwe L. Korn <uwelk@xhochy.com>
Authored: Mon Sep 11 12:46:37 2017 +0200
Committer: Uwe L. Korn <uwelk@xhochy.com>
Committed: Mon Sep 11 12:46:37 2017 +0200

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc | 84 ++++++++++++++++++++++++++++++++++++++++
 cpp/src/arrow/array.h       |  1 +
 cpp/src/arrow/builder.cc    | 71 ++++++++++++++++++++++++++++++++-
 cpp/src/arrow/builder.h     | 11 ++++++
 4 files changed, 165 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index c92c23d..a9596c5 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1764,6 +1764,90 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  std::vector<uint8_t> test{12, 12, 11, 12};
+  std::vector<uint8_t> test2{12, 12, 11, 11};
+  ASSERT_OK(builder.Append(test.data()));
+  ASSERT_OK(builder.Append(test2.data()));
+  ASSERT_OK(builder.Append(test.data()));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  ASSERT_OK(fsb_builder.Append(test.data()));
+  ASSERT_OK(fsb_builder.Append(test2.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
+                               static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(value.data()));
+    ASSERT_OK(fsb_builder.Append(value.data()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  std::vector<uint8_t> known_value{12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value.data()));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build an array with different byte width
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
+  std::vector<uint8_t> value{100, 1, 1, 1, 1};
+  ASSERT_OK(fsb_builder.Append(value.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
+}
+
 // ----------------------------------------------------------------------
 // List tests
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index bfeedd2..994270d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -512,6 +512,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
                        int64_t null_count = 0, int64_t offset = 0);
 
   const uint8_t* GetValue(int64_t i) const;
+  const uint8_t* Value(int64_t i) const { return GetValue(i); }
 
   int32_t byte_width() const { return byte_width_; }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 7966241..5945677 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -26,6 +26,7 @@
 
 #include "arrow/array.h"
 #include "arrow/buffer.h"
+#include "arrow/compare.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/type.h"
@@ -818,10 +819,25 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>&
type,
       hash_table_(new PoolBuffer(pool)),
       hash_slots_(nullptr),
       dict_builder_(type, pool),
+      values_builder_(pool),
+      byte_width_(-1) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : ArrayBuilder(type, pool),
+      hash_table_(new PoolBuffer(pool)),
+      hash_slots_(nullptr),
+      dict_builder_(type, pool),
       values_builder_(pool) {
   if (!::arrow::CpuInfo::initialized()) {
     ::arrow::CpuInfo::Init();
   }
+  byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
 }
 
 #ifndef ARROW_NO_DEPRECATED_API
@@ -918,6 +934,24 @@ Status DictionaryBuilder<T>::AppendArray(const Array& array)
{
   return Status::OK();
 }
 
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array)
{
+  if (!type_->Equals(*array.type())) {
+    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+  }
+
+  const FixedSizeBinaryArray& numeric_array =
+      static_cast<const FixedSizeBinaryArray&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
 template <typename T>
 Status DictionaryBuilder<T>::AppendNull() {
   return values_builder_.AppendNull();
@@ -975,17 +1009,35 @@ typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
   return data[index];
 }
 
+template <>
+const uint8_t* DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index)
{
+  return dict_builder_.GetValue(index);
+}
+
 template <typename T>
 int DictionaryBuilder<T>::HashValue(const Scalar& value) {
   return HashUtil::Hash(&value, sizeof(Scalar), 0);
 }
 
+template <>
+int DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value) {
+  return HashUtil::Hash(value, byte_width_, 0);
+}
+
 template <typename T>
 bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& value)
{
   const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
   return other != value;
 }
 
+template <>
+bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index,
+                                                           const Scalar& value) {
+  int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width();
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return memcmp(other, value, width) != 0;
+}
+
 template <typename T>
 Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) {
   return dict_builder_.Append(value);
@@ -1052,6 +1104,7 @@ template class DictionaryBuilder<Time64Type>;
 template class DictionaryBuilder<TimestampType>;
 template class DictionaryBuilder<FloatType>;
 template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<FixedSizeBinaryType>;
 template class DictionaryBuilder<BinaryType>;
 template class DictionaryBuilder<StringType>;
 
@@ -1315,6 +1368,11 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>*
out) {
   return Status::OK();
 }
 
+const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
+  const uint8_t* data_ptr = byte_builder_.data();
+  return data_ptr + i * byte_width_;
+}
+
 // ----------------------------------------------------------------------
 // Struct
 
@@ -1438,6 +1496,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
     DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
     DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
     DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
+    DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
     default:
       return Status::NotImplemented(type->ToString());
   }
@@ -1472,8 +1531,12 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool*
pool,
     DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
     DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder);
     DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder);
+    DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
     default:
-      return Status::NotImplemented(type->ToString());
+      std::stringstream ss;
+      ss << "Cannot encode array of type " << type->ToString();
+      ss << " to dictionary";
+      return Status::NotImplemented(ss.str());
   }
 }
 #define DICTIONARY_COLUMN_CASE(ENUM, BuilderType)                             \
@@ -1516,8 +1579,12 @@ Status EncodeColumnToDictionary(const Column& input, MemoryPool*
pool,
     DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
     DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder);
     DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder);
+    DICTIONARY_COLUMN_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
     default:
-      return Status::NotImplemented(type->ToString());
+      std::stringstream ss;
+      ss << "Cannot encode column of type " << type->ToString();
+      ss << " to dictionary";
+      return Status::NotImplemented(ss.str());
   }
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 3e8289f..bf7b317 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -756,6 +756,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
   /// \return size of values buffer so far
   int64_t value_data_length() const { return byte_builder_.length(); }
 
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i) const;
+
  protected:
   int32_t byte_width_;
   BufferBuilder byte_builder_;
@@ -866,6 +871,11 @@ struct DictionaryScalar<StringType> {
   using type = WrappedBinary;
 };
 
+template <>
+struct DictionaryScalar<FixedSizeBinaryType> {
+  using type = uint8_t const*;
+};
+
 }  // namespace internal
 
 /// \brief Array builder for created encoded DictionaryArray from dense array
@@ -921,6 +931,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
 
   typename TypeTraits<T>::BuilderType dict_builder_;
   AdaptiveIntBuilder values_builder_;
+  int32_t byte_width_;
 };
 
 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {


Mime
View raw message