arrow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From w...@apache.org
Subject arrow git commit: ARROW-206: Expose a C++ api to compare ranges of slots between two arrays
Date Mon, 23 May 2016 20:57:44 GMT
Repository: arrow
Updated Branches:
  refs/heads/master 703546787 -> cd1d770ed


ARROW-206: Expose a C++ api to compare ranges of slots between two arrays

@wesm the need for this grew out of @fengguangyuan PR to add struct type (#66) and struct
builder.  I considered a different APIs before settling on this:
1.  Add an API that took the parent bitmask (this potentially has possibility of being the
most performant, but would have a more awkward contract then provided here)
2.  Add an equality comparison for a single slot (leaves the least amount of room for optimization
but it would be the simplest to implement).
3.  This API which potentially leaves some room for optimization but I think places the least
requirements on the caller.

Let me know if you would prefer a different API.

WIP because I need to add more unit tests (I also need to think about if it is worth mirroring
the EqualsExact in addition to the Equals method).  Which I should get to by the end of the
weekend.

@fengguangyuan let me know if this makes sense to you as a way forward on your PR

Author: Micah Kornfield <emkornfield@gmail.com>

Closes #80 from emkornfield/emk_add_equality and squashes the following commits:

d5ae777 [Micah Kornfield] remove todo, its handled by type_traits
f963639 [Micah Kornfield] add in check for null arrays
f5c6bd5 [Micah Kornfield] make format/lint check
dcbaad4 [Micah Kornfield] unittests passing
318855d [Micah Kornfield] working primitive tests
dadb244 [Micah Kornfield] wip expose range equality to to allow for nested comparisons


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/cd1d770e
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/cd1d770e
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/cd1d770e

Branch: refs/heads/master
Commit: cd1d770ede57f08b8be2f2b42f2f629eb5106098
Parents: 7035467
Author: Micah Kornfield <emkornfield@gmail.com>
Authored: Mon May 23 13:55:51 2016 -0700
Committer: Wes McKinney <wesm@apache.org>
Committed: Mon May 23 13:55:51 2016 -0700

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc           | 29 +++++++++++++++++
 cpp/src/arrow/array.cc                |  7 ++++
 cpp/src/arrow/array.h                 |  9 +++++-
 cpp/src/arrow/types/list-test.cc      | 36 +++++++++++++++++++++
 cpp/src/arrow/types/list.cc           | 26 +++++++++++++++
 cpp/src/arrow/types/list.h            |  3 ++
 cpp/src/arrow/types/primitive-test.cc | 51 ++++++++++++++++++++++++++++++
 cpp/src/arrow/types/primitive.cc      | 17 +++++++++-
 cpp/src/arrow/types/primitive.h       | 35 ++++++++++++++++----
 9 files changed, 205 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index b4c7279..3b47363 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -56,6 +56,35 @@ TEST_F(TestArray, TestLength) {
   ASSERT_EQ(arr->length(), 100);
 }
 
+ArrayPtr MakeArrayFromValidBytes(const std::vector<uint8_t>& v, MemoryPool* pool)
{
+  int32_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0);
+  std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(v);
+
+  BufferBuilder value_builder(pool);
+  for (size_t i = 0; i < v.size(); ++i) {
+    value_builder.Append<int32_t>(0);
+  }
+
+  ArrayPtr arr(new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf));
+  return arr;
+}
+
+TEST_F(TestArray, TestEquality) {
+  auto array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_);
+  auto equal_array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_);
+  auto unequal_array = MakeArrayFromValidBytes({1, 1, 1, 1, 0, 1, 0, 0}, pool_);
+
+  EXPECT_TRUE(array->Equals(array));
+  EXPECT_TRUE(array->Equals(equal_array));
+  EXPECT_TRUE(equal_array->Equals(array));
+  EXPECT_FALSE(equal_array->Equals(unequal_array));
+  EXPECT_FALSE(unequal_array->Equals(equal_array));
+  EXPECT_TRUE(array->RangeEquals(4, 8, 4, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(0, 4, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(0, 8, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
+}
+
 TEST_F(TestArray, TestIsNull) {
   // clang-format off
   std::vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0,

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index c6b9b15..d6b081f 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -58,4 +58,11 @@ bool NullArray::Equals(const std::shared_ptr<Array>& arr) const
{
   return arr->length() == length_;
 }
 
+bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
+    const std::shared_ptr<Array>& arr) const {
+  if (!arr) { return false; }
+  if (Type::NA != arr->type_enum()) { return false; }
+  return true;
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index f98c4c2..76dc0f5 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -59,6 +59,12 @@ class Array {
 
   bool EqualsExact(const Array& arr) const;
   virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;
+
+  // Compare if the range of slots specified are equal for the given array and
+  // this array.  end_idx exclusive.  This methods does not bounds check.
+  virtual bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const = 0;
+
   // Determines if the array is internally consistent.  Defaults to always
   // returning Status::OK.  This can be an expensive check.
   virtual Status Validate() const;
@@ -85,10 +91,11 @@ class NullArray : public Array {
   explicit NullArray(int32_t length) : NullArray(std::make_shared<NullType>(), length)
{}
 
   bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
+      const std::shared_ptr<Array>& arr) const override;
 };
 
 typedef std::shared_ptr<Array> ArrayPtr;
-
 }  // namespace arrow
 
 #endif

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/list-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
index 6a8ad9a..2e41b4a 100644
--- a/cpp/src/arrow/types/list-test.cc
+++ b/cpp/src/arrow/types/list-test.cc
@@ -86,6 +86,42 @@ class TestListBuilder : public TestBuilder {
   shared_ptr<ListArray> result_;
 };
 
+TEST_F(TestListBuilder, Equality) {
+  Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());
+
+  ArrayPtr array, equal_array, unequal_array;
+  vector<int32_t> equal_offsets = {0, 1, 2, 5};
+  vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2};
+  vector<int32_t> unequal_offsets = {0, 1, 4};
+  vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};
+
+  // setup two equal arrays
+  ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size()));
+  ASSERT_OK(vb->Append(equal_values.data(), equal_values.size()));
+  array = builder_->Finish();
+  ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size()));
+  ASSERT_OK(vb->Append(equal_values.data(), equal_values.size()));
+  equal_array = builder_->Finish();
+  // now an unequal one
+  ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size()));
+  ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size()));
+  unequal_array = builder_->Finish();
+
+  // Test array equality
+  EXPECT_TRUE(array->Equals(array));
+  EXPECT_TRUE(array->Equals(equal_array));
+  EXPECT_TRUE(equal_array->Equals(array));
+  EXPECT_FALSE(equal_array->Equals(unequal_array));
+  EXPECT_FALSE(unequal_array->Equals(equal_array));
+
+  // Test range equality
+  EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
+  EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
+  EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array));
+}
+
 TEST_F(TestListBuilder, TestResize) {}
 
 TEST_F(TestListBuilder, TestAppendNull) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/list.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc
index 76e7fe5..6334054 100644
--- a/cpp/src/arrow/types/list.cc
+++ b/cpp/src/arrow/types/list.cc
@@ -44,6 +44,32 @@ bool ListArray::Equals(const std::shared_ptr<Array>& arr) const
{
   return EqualsExact(*static_cast<const ListArray*>(arr.get()));
 }
 
+bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+    const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  const auto other = static_cast<ListArray*>(arr.get());
+  for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+    const bool is_null = IsNull(i);
+    if (is_null != arr->IsNull(o_i)) { return false; }
+    if (is_null) continue;
+    const int32_t begin_offset = offset(i);
+    const int32_t end_offset = offset(i + 1);
+    const int32_t other_begin_offset = other->offset(o_i);
+    const int32_t other_end_offset = other->offset(o_i + 1);
+    // Underlying can't be equal if the size isn't equal
+    if (end_offset - begin_offset != other_end_offset - other_begin_offset) {
+      return false;
+    }
+    if (!values_->RangeEquals(
+            begin_offset, end_offset, other_begin_offset, other->values())) {
+      return false;
+    }
+  }
+  return true;
+}
+
 Status ListArray::Validate() const {
   if (length_ < 0) { return Status::Invalid("Length was negative"); }
   if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); }

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index a020b8a..0a39416 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -72,6 +72,9 @@ class ListArray : public Array {
   bool EqualsExact(const ListArray& other) const;
   bool Equals(const std::shared_ptr<Array>& arr) const override;
 
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const ArrayPtr& arr) const override;
+
  protected:
   std::shared_ptr<Buffer> offset_buf_;
   const int32_t* offsets_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/primitive-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index 2b4c087..87eb0fe 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -304,6 +304,57 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
   ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
 }
 
+template <class T, class Builder>
+Status MakeArray(const vector<uint8_t>& valid_bytes, const vector<T>&
draws, int size,
+    Builder* builder, ArrayPtr* out) {
+  // Append the first 1000
+  for (int i = 0; i < size; ++i) {
+    if (valid_bytes[i] > 0) {
+      RETURN_NOT_OK(builder->Append(draws[i]));
+    } else {
+      RETURN_NOT_OK(builder->AppendNull());
+    }
+  }
+  *out = builder->Finish();
+  return Status::OK();
+}
+
+TYPED_TEST(TestPrimitiveBuilder, Equality) {
+  DECL_T();
+
+  const int size = 1000;
+  this->RandomData(size);
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;
+  ArrayPtr array, equal_array, unequal_array;
+  auto builder = this->builder_.get();
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array));
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array));
+
+  // Make the not equal array by negating the first valid element with itself.
+  const auto first_valid = std::find_if(
+      valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; });
+  const int first_valid_idx = std::distance(valid_bytes.begin(), first_valid);
+  // This should be true with a very high probability, but might introduce flakiness
+  ASSERT_LT(first_valid_idx, size - 1);
+  draws[first_valid_idx] = ~*reinterpret_cast<int64_t*>(&draws[first_valid_idx]);
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array));
+
+  // test normal equality
+  EXPECT_TRUE(array->Equals(array));
+  EXPECT_TRUE(array->Equals(equal_array));
+  EXPECT_TRUE(equal_array->Equals(array));
+  EXPECT_FALSE(equal_array->Equals(unequal_array));
+  EXPECT_FALSE(unequal_array->Equals(equal_array));
+
+  // Test range equality
+  EXPECT_FALSE(array->RangeEquals(0, first_valid_idx + 1, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(first_valid_idx, size, first_valid_idx, unequal_array));
+  EXPECT_TRUE(array->RangeEquals(0, first_valid_idx, 0, unequal_array));
+  EXPECT_TRUE(
+      array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array));
+}
+
 TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
   DECL_T();
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 57a3f1e..8e6c0f8 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -185,10 +185,25 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const
{
   }
 }
 
-bool BooleanArray::Equals(const std::shared_ptr<Array>& arr) const {
+bool BooleanArray::Equals(const ArrayPtr& arr) const {
   if (this == arr.get()) return true;
   if (Type::BOOL != arr->type_enum()) { return false; }
   return EqualsExact(*static_cast<const BooleanArray*>(arr.get()));
 }
 
+bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx,
+    int32_t other_start_idx, const ArrayPtr& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  const auto other = static_cast<BooleanArray*>(arr.get());
+  for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+    const bool is_null = IsNull(i);
+    if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i)))
{
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/cd1d770e/cpp/src/arrow/types/primitive.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index fc45f6c..9597fc8 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -66,6 +66,22 @@ class PrimitiveArray : public Array {
       return PrimitiveArray::EqualsExact(*static_cast<const PrimitiveArray*>(&other));
\
     }                                                                                  \
                                                                                        \
+    bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,      \
+        const ArrayPtr& arr) const override {                                       
  \
+      if (this == arr.get()) { return true; }                                          \
+      if (!arr) { return false; }                                                      \
+      if (this->type_enum() != arr->type_enum()) { return false; }                
    \
+      const auto other = static_cast<NAME*>(arr.get());                           
    \
+      for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {  
 \
+        const bool is_null = IsNull(i);                                                \
+        if (is_null != arr->IsNull(o_i) ||                                           
 \
+            (!is_null && Value(i) != other->Value(o_i))) {                   
         \
+          return false;                                                                \
+        }                                                                              \
+      }                                                                                \
+      return true;                                                                     \
+    }                                                                                  \
+                                                                                       \
     const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_); }   
    \
                                                                                        \
     T Value(int i) const { return raw_data()[i]; }                                     \
@@ -95,8 +111,10 @@ class PrimitiveBuilder : public ArrayBuilder {
   using ArrayBuilder::Advance;
 
   // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
-  void AppendNulls(const uint8_t* valid_bytes, int32_t length) {
+  Status AppendNulls(const uint8_t* valid_bytes, int32_t length) {
+    RETURN_NOT_OK(Reserve(length));
     UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
   }
 
   Status AppendNull() {
@@ -139,9 +157,10 @@ class NumericBuilder : public PrimitiveBuilder<T> {
   using PrimitiveBuilder<T>::Reserve;
 
   // Scalar append.
-  void Append(value_type val) {
-    ArrayBuilder::Reserve(1);
+  Status Append(value_type val) {
+    RETURN_NOT_OK(ArrayBuilder::Reserve(1));
     UnsafeAppend(val);
+    return Status::OK();
   }
 
   // Does not capacity-check; make sure to call Reserve beforehand
@@ -248,7 +267,9 @@ class BooleanArray : public PrimitiveArray {
       int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   bool EqualsExact(const BooleanArray& other) const;
-  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool Equals(const ArrayPtr& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const ArrayPtr& arr) const override;
 
   const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_);
}
 
@@ -274,7 +295,8 @@ class BooleanBuilder : public PrimitiveBuilder<BooleanType> {
   using PrimitiveBuilder<BooleanType>::Append;
 
   // Scalar append
-  void Append(bool val) {
+  Status Append(bool val) {
+    Reserve(1);
     util::set_bit(null_bitmap_data_, length_);
     if (val) {
       util::set_bit(raw_data_, length_);
@@ -282,9 +304,10 @@ class BooleanBuilder : public PrimitiveBuilder<BooleanType> {
       util::clear_bit(raw_data_, length_);
     }
     ++length_;
+    return Status::OK();
   }
 
-  void Append(uint8_t val) { Append(static_cast<bool>(val)); }
+  Status Append(uint8_t val) { return Append(static_cast<bool>(val)); }
 };
 
 }  // namespace arrow


Mime
View raw message