From commits-return-8572-archive-asf-public=cust-asf.ponee.io@kudu.apache.org Tue Mar 10 21:41:53 2020 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id AF503180665 for ; Tue, 10 Mar 2020 22:41:52 +0100 (CET) Received: (qmail 3954 invoked by uid 500); 10 Mar 2020 21:41:52 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 3887 invoked by uid 99); 10 Mar 2020 21:41:52 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 10 Mar 2020 21:41:52 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id E91B28DACD; Tue, 10 Mar 2020 21:41:51 +0000 (UTC) Date: Tue, 10 Mar 2020 21:41:53 +0000 To: "commits@kudu.apache.org" Subject: [kudu] 02/03: [util] Import "Or" function to BlockBloomFilter from Impala MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit From: adar@apache.org In-Reply-To: <158387651184.16967.8681847610914044191@gitbox.apache.org> References: <158387651184.16967.8681847610914044191@gitbox.apache.org> X-Git-Host: gitbox.apache.org X-Git-Repo: kudu X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Rev: 885d7946c771ea875919d479c67b2ddac5c5103e X-Git-NotificationType: diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated Message-Id: <20200310214151.E91B28DACD@gitbox.apache.org> This is an automated email from the ASF dual-hosted git repository. adar pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git commit 885d7946c771ea875919d479c67b2ddac5c5103e Author: Bankim Bhavsar AuthorDate: Tue Mar 3 16:18:17 2020 -0800 [util] Import "Or" function to BlockBloomFilter from Impala Impala will be switching to using the Block Bloom filter from kudu-util. "Or" function was missing and this change adds it. Note that original implementation for OrEqualArrayAvx() in Impala is targeted for AVX and not AVX2, however AVX2 is super-set of AVX instructions and there is already provision in the Block Bloom filter to separate out AVX2 v/s non-AVX2 (SSE) code. Hence don't see need to add separate AVX specific file/implementation. Change-Id: Ibe5f9311f73dcff883dd2cce18fd558e7d57d14f Reviewed-on: http://gerrit.cloudera.org:8080/15373 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo Reviewed-by: Alexey Serbin --- src/kudu/util/block_bloom_filter-test.cc | 33 +++++++++++++++++++ src/kudu/util/block_bloom_filter.cc | 56 ++++++++++++++++++++++++++++++++ src/kudu/util/block_bloom_filter.h | 28 ++++++++++++++-- src/kudu/util/block_bloom_filter_avx2.cc | 20 +++++++++++- 4 files changed, 134 insertions(+), 3 deletions(-) diff --git a/src/kudu/util/block_bloom_filter-test.cc b/src/kudu/util/block_bloom_filter-test.cc index 4c4d418..b9ed9f2 100644 --- a/src/kudu/util/block_bloom_filter-test.cc +++ b/src/kudu/util/block_bloom_filter-test.cc @@ -286,4 +286,37 @@ TEST_F(BlockBloomFilterTest, MinSpaceForFpp) { } } } + +TEST_F(BlockBloomFilterTest, Or) { + BlockBloomFilter* bf1 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01)); + BlockBloomFilter* bf2 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01)); + + for (int i = 60; i < 80; ++i) bf2->Insert(i); + for (int i = 0; i < 10; ++i) bf1->Insert(i); + + ASSERT_OK(bf1->Or(*bf2)); + for (int i = 0; i < 10; ++i) ASSERT_TRUE(bf1->Find(i)) << i; + for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf1->Find(i)) << i; + + // Insert another value to aggregated BloomFilter. + for (int i = 11; i < 50; ++i) bf1->Insert(i); + + for (int i = 11; i < 50; ++i) ASSERT_TRUE(bf1->Find(i)) << i; + ASSERT_FALSE(bf1->Find(81)); + + // Check that AlwaysFalse() is updated correctly. + BlockBloomFilter* bf3 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01)); + BlockBloomFilter* always_false = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01)); + ASSERT_OK(bf3->Or(*always_false)); + EXPECT_TRUE(bf3->AlwaysFalse()); + ASSERT_OK(bf3->Or(*bf2)); + EXPECT_FALSE(bf3->AlwaysFalse()); + + // Invalid argument test cases. + BlockBloomFilter* bf4 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100, 0.01)); + BlockBloomFilter* bf5 = CreateBloomFilter(BlockBloomFilter::MinLogSpace(100000, 0.01)); + Status s = bf4->Or(*bf5); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "Directory size don't match"); +} } // namespace kudu diff --git a/src/kudu/util/block_bloom_filter.cc b/src/kudu/util/block_bloom_filter.cc index e09a7cc..acaf358 100644 --- a/src/kudu/util/block_bloom_filter.cc +++ b/src/kudu/util/block_bloom_filter.cc @@ -47,6 +47,9 @@ namespace kudu { constexpr uint32_t BlockBloomFilter::kRehash[8] __attribute__((aligned(32))); const base::CPU BlockBloomFilter::kCpu = base::CPU(); +// constexpr data member requires initialization in the class declaration. +// Hence no duplicate initialization in the definition here. +constexpr BlockBloomFilter* const BlockBloomFilter::kAlwaysTrueFilter; BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_allocator) : always_false_(true), @@ -60,13 +63,16 @@ BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_all if (has_avx2()) { bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsertAVX2; bucket_find_func_ptr_ = &BlockBloomFilter::BucketFindAVX2; + or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArrayAVX2; } else { bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert; bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind; + or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray; } #else bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert; bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind; + or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray; #endif } @@ -259,6 +265,56 @@ bool BlockBloomFilter::operator!=(const BlockBloomFilter& rhs) const { return !(rhs == *this); } +void BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out) { + // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not + // written in a way that is very friendly to auto-vectorization. Instead, we manually + // vectorize, increasing the speed by up to 56x. + const __m128i* simd_in = reinterpret_cast(in); + const __m128i* const simd_in_end = reinterpret_cast(in + n); + __m128i* simd_out = reinterpret_cast<__m128i*>(out); + // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i) + // == 16, we can do two _mm_or_si128's in each iteration without checking array + // bounds. + while (simd_in != simd_in_end) { + for (int i = 0; i < 2; ++i, ++simd_in, ++simd_out) { + _mm_storeu_si128( + simd_out, _mm_or_si128(_mm_loadu_si128(simd_out), _mm_loadu_si128(simd_in))); + } + } +} + +Status BlockBloomFilter::Or(const BlockBloomFilter& other) { + // AlwaysTrueFilter is a special case implemented with a nullptr. + // Hence Or'ing with an AlwaysTrueFilter will result in a Bloom filter that also + // always returns true which'll require destructing this Bloom filter. + // Moreover for a reference "other" to be an AlwaysTrueFilter the reference needs + // to be created from a nullptr and so we get into undefined behavior territory. + // Comparing AlwaysTrueFilter with "&other" results in a compiler warning for + // comparing a non-null argument "other" with NULL [-Wnonnull-compare]. + // For above reasons, guard against it. + CHECK_NE(kAlwaysTrueFilter, &other); + + if (this == &other) { + // No op. + return Status::OK(); + } + if (directory_size() != other.directory_size()) { + return Status::InvalidArgument(Substitute("Directory size don't match. this: $0, other: $1", + directory_size(), other.directory_size())); + } + if (other.AlwaysFalse()) { + // Nothing to do. + return Status::OK(); + } + + (*or_equal_array_func_ptr_)(directory_size(), + reinterpret_cast(other.directory_), + reinterpret_cast(directory_)); + always_false_ = false; + return Status::OK(); +} + shared_ptr DefaultBlockBloomFilterBufferAllocator::GetSingletonSharedPtr() { // Meyer's Singleton. diff --git a/src/kudu/util/block_bloom_filter.h b/src/kudu/util/block_bloom_filter.h index a218dec..8a0fdf8 100644 --- a/src/kudu/util/block_bloom_filter.h +++ b/src/kudu/util/block_bloom_filter.h @@ -148,6 +148,21 @@ class BlockBloomFilter { bool operator==(const BlockBloomFilter& rhs) const; bool operator!=(const BlockBloomFilter& rhs) const; + // Computes the logical OR of this filter with 'other' and stores the result in this + // filter. + // Notes: + // - The directory sizes of the Bloom filters must match. + // - Or'ing with kAlwaysTrueFilter is disallowed. + Status Or(const BlockBloomFilter& other); + + // Returns whether the Bloom filter is empty and hence would return false for all lookups. + bool AlwaysFalse() const { + return always_false_; + } + + // Representation of a filter which allows all elements to pass. + static constexpr BlockBloomFilter* const kAlwaysTrueFilter = nullptr; + private: // always_false_ is true when the bloom filter hasn't had any elements inserted. bool always_false_; @@ -190,7 +205,7 @@ class BlockBloomFilter { // Helper function for public Init() variants. Status InitInternal(int log_space_bytes, HashAlgorithm hash_algorithm, uint32_t hash_seed); - // Same as Insert(), but skips the CPU check and assumes that AVX is not available. + // Same as Insert(), but skips the CPU check and assumes that AVX2 is not available. void InsertNoAvx2(uint32_t hash) noexcept; // Does the actual work of Insert(). bucket_idx is the index of the bucket to insert @@ -199,8 +214,11 @@ class BlockBloomFilter { bool BucketFind(uint32_t bucket_idx, uint32_t hash) const noexcept; + // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n'. + static void OrEqualArray(size_t n, const uint8_t* __restrict__ in, uint8_t* __restrict__ out); + #ifdef USE_AVX2 - // Same as Insert(), but skips the CPU check and assumes that AVX is available. + // Same as Insert(), but skips the CPU check and assumes that AVX2 is available. void InsertAvx2(uint32_t hash) noexcept __attribute__((__target__("avx2"))); // A faster SIMD version of BucketInsert(). @@ -210,12 +228,18 @@ class BlockBloomFilter { // A faster SIMD version of BucketFind(). bool BucketFindAVX2(uint32_t bucket_idx, uint32_t hash) const noexcept __attribute__((__target__("avx2"))); + + // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX2 + // instructions. 'n' must be a multiple of 32. + static void OrEqualArrayAVX2(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out) __attribute__((target("avx2"))); #endif // Function pointers initialized in constructor to avoid run-time cost // in hot-path of Find and Insert operations. decltype(&BlockBloomFilter::BucketInsert) bucket_insert_func_ptr_; decltype(&BlockBloomFilter::BucketFind) bucket_find_func_ptr_; + decltype(&BlockBloomFilter::OrEqualArray) or_equal_array_func_ptr_; // Returns amount of space used in log2 bytes. int log_space_bytes() const { diff --git a/src/kudu/util/block_bloom_filter_avx2.cc b/src/kudu/util/block_bloom_filter_avx2.cc index e10b6cc..93c3ff6 100644 --- a/src/kudu/util/block_bloom_filter_avx2.cc +++ b/src/kudu/util/block_bloom_filter_avx2.cc @@ -24,9 +24,14 @@ #include "kudu/util/block_bloom_filter.h" -#include #include +#include +#include +#include + +#include + #include "kudu/gutil/port.h" namespace kudu { @@ -75,4 +80,17 @@ void BlockBloomFilter::InsertAvx2(const uint32_t hash) noexcept { BucketInsertAVX2(bucket_idx, hash); } +void BlockBloomFilter::OrEqualArrayAVX2(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out) { + constexpr size_t kAVXRegisterBytes = sizeof(__m256d); + DCHECK_EQ(n % kAVXRegisterBytes, 0) << "Invalid Bloom filter directory size"; + const uint8_t* const in_end = in + n; + for (; in != in_end; (in += kAVXRegisterBytes), (out += kAVXRegisterBytes)) { + const double* double_in = reinterpret_cast(in); + double* double_out = reinterpret_cast(out); + _mm256_storeu_pd(double_out, + _mm256_or_pd(_mm256_loadu_pd(double_out), _mm256_loadu_pd(double_in))); + } +} + } // namespace kudu