This is an automated email from the ASF dual-hosted git repository.
adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 5b290cd bitmap: faster implementation of iteration over a bitmap
5b290cd is described below
commit 5b290cd4439d19ad8e5773cf69f7b994adc1f2e9
Author: Todd Lipcon <todd@apache.org>
AuthorDate: Mon Mar 23 13:18:42 2020 -0700
bitmap: faster implementation of iteration over a bitmap
This new implementation iterates 64 bits at a time instead of 8,
and uses a templated callback function instead of using an "iterator"
approach. In an upcoming patch, I found this method to be measurably
faster than the previous TrueBitIterator code.
I compared the included benchmark using the old TrueBitIterator vs the
new implementation:
Old:
Performance counter stats for 'build/latest/bin/bitmap-test --gtest_filter=*Bench* --gtest_repeat=10':
1,263.17 msec task-clock # 0.998 CPUs utilized
254 context-switches # 0.201 K/sec
0 cpu-migrations # 0.000 K/sec
1,808 page-faults # 0.001 M/sec
3,561,984,981 cycles # 2.820 GHz
5,878,012,694 instructions # 1.65 insn per cycle
1,301,336,934 branches # 1030.218 M/sec
22,275,166 branch-misses # 1.71% of all branches
1.265635106 seconds time elapsed
1.255785000 seconds user
0.007992000 seconds sys
new:
Performance counter stats for 'build/latest/bin/bitmap-test --gtest_filter=*Bench* --gtest_repeat=10':
595.63 msec task-clock # 0.996 CPUs utilized
254 context-switches # 0.426 K/sec
0 cpu-migrations # 0.000 K/sec
1,808 page-faults # 0.003 M/sec
1,662,752,028 cycles # 2.792 GHz
3,666,713,168 instructions # 2.21 insn per cycle
328,412,721 branches # 551.374 M/sec
2,267,098 branch-misses # 0.69% of all branches
0.598305718 seconds time elapsed
0.588236000 seconds user
0.007962000 seconds sys
Change-Id: Ibe772a7dd92faf9f99115148ad4cc7df542d1c76
Reviewed-on: http://gerrit.cloudera.org:8080/15539
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <awong@cloudera.com>
---
src/kudu/util/bitmap-test.cc | 80 +++++++++++++++++++++------
src/kudu/util/bitmap.h | 129 +++++++++++++++++++++++--------------------
2 files changed, 134 insertions(+), 75 deletions(-)
diff --git a/src/kudu/util/bitmap-test.cc b/src/kudu/util/bitmap-test.cc
index b6a280a..c897736 100644
--- a/src/kudu/util/bitmap-test.cc
+++ b/src/kudu/util/bitmap-test.cc
@@ -19,26 +19,23 @@
#include <cstdint>
#include <cstring>
+#include <set>
#include <vector>
#include <gtest/gtest.h>
#include "kudu/gutil/strings/join.h"
+#include "kudu/util/faststring.h"
+#include "kudu/util/random.h"
+#include "kudu/util/random_util.h"
namespace kudu {
-static int ReadBackBitmap(uint8_t *bm, size_t bits,
- std::vector<size_t> *result) {
- int iters = 0;
- for (TrueBitIterator iter(bm, bits);
- !iter.done();
- ++iter) {
- size_t val = *iter;
- result->push_back(val);
-
- iters++;
- }
- return iters;
+static void ReadBackBitmap(uint8_t *bm, size_t bits,
+ std::vector<size_t> *result) {
+ ForEachSetBit(bm, bits, [&](size_t b) {
+ result->push_back(b);
+ });
}
TEST(TestBitMap, TestIteration) {
@@ -56,8 +53,7 @@ TEST(TestBitMap, TestIteration) {
std::vector<size_t> read_back;
- int iters = ReadBackBitmap(bm, sizeof(bm)*8, &read_back);
- ASSERT_EQ(6, iters);
+ ReadBackBitmap(bm, sizeof(bm)*8, &read_back);
ASSERT_EQ("0,8,31,32,33,63", JoinElements(read_back, ","));
}
@@ -69,11 +65,63 @@ TEST(TestBitMap, TestIteration2) {
std::vector<size_t> read_back;
- int iters = ReadBackBitmap(bm, 3, &read_back);
- ASSERT_EQ(1, iters);
+ ReadBackBitmap(bm, 3, &read_back);
ASSERT_EQ("1", JoinElements(read_back, ","));
}
+struct RandomBitmap {
+ RandomBitmap(int n_bits, double set_ratio) {
+ bm.resize(BitmapSize(n_bits));
+ Random r(GetRandomSeed32());
+ for (int i = 0; i < n_bits; i++) {
+ bool set = r.NextDoubleFraction() < set_ratio;
+ BitmapChange(bm.data(), i, set);
+ if (set) {
+ set_bits.insert(i);
+ }
+ }
+ }
+
+ faststring bm;
+ std::set<int> set_bits;
+};
+
+TEST(TestBitMap, TestIterationRandom) {
+ Random r(GetRandomSeed32());
+ const auto kNumBits = 1 + r.Uniform(100);
+ RandomBitmap bm(kNumBits, r.NextDoubleFraction());
+
+ std::set<int> remaining = bm.set_bits;
+
+ // Iterate over the set bits and remove them from the set. When we're
+ // done we should have none left in the set.
+ ForEachSetBit(bm.bm.data(), kNumBits,
+ [&](size_t b) {
+ EXPECT_EQ(1, remaining.erase(b));
+ });
+ EXPECT_EQ(remaining.size(), 0);
+}
+
+
+
+TEST(TestBitMap, Benchmark) {
+ static constexpr int kNumBits = 1000;
+ static constexpr int kNumTrials = 1000;
+ for (int frac = 0; frac <= 100; frac++) {
+ RandomBitmap bm(kNumBits, frac/100.0);
+
+ volatile int sink = 0;
+ for (int i = 0; i < kNumTrials; i++) {
+ int sum = 0;
+ ForEachSetBit(bm.bm.data(), kNumBits,
+ [&](size_t b) {
+ sum += b;
+ });
+ sink += sum;
+ }
+ }
+}
+
TEST(TestBitMap, TestSetAndTestBits) {
uint8_t bm[1];
memset(bm, 0, sizeof(bm));
diff --git a/src/kudu/util/bitmap.h b/src/kudu/util/bitmap.h
index c29ba09..0c67970 100644
--- a/src/kudu/util/bitmap.h
+++ b/src/kudu/util/bitmap.h
@@ -176,72 +176,83 @@ class BitmapIterator {
const uint8_t *map_;
};
-// Iterator which yields the set bits in a bitmap.
-// Example usage:
-// for (TrueBitIterator iter(bitmap, n_bits);
-// !iter.done();
-// ++iter) {
-// int next_onebit_position = *iter;
-// }
-class TrueBitIterator {
- public:
- TrueBitIterator(const uint8_t *bitmap, size_t n_bits)
- : bitmap_(bitmap),
- cur_byte_(0),
- cur_byte_idx_(0),
- n_bits_(n_bits),
- n_bytes_(BitmapSize(n_bits_)),
- bit_idx_(0) {
- if (n_bits_ == 0) {
- cur_byte_idx_ = 1; // sets done
- } else {
- cur_byte_ = bitmap[0];
- AdvanceToNextOneBit();
+// Iterate over the bits in 'bitmap' and call 'func' for each set bit.
+// 'func' should take a single argument which is the bit's index.
+template<class F>
+void ForEachSetBit(const uint8_t* __restrict__ bitmap,
+ int n_bits,
+ const F& func);
+
+// Iterate over the bits in 'bitmap' and call 'func' for each unset bit.
+// 'func' should take a single argument which is the bit's index.
+template<class F>
+void ForEachUnsetBit(const uint8_t* __restrict__ bitmap,
+ int n_bits,
+ const F& func);
+
+
+////////////////////////////////////////////////////////////
+// Implementation details
+////////////////////////////////////////////////////////////
+
+template<bool SET, class F>
+inline void ForEachBit(const uint8_t* bitmap,
+ int n_bits,
+ const F& func) {
+ int rem = n_bits;
+ int base_idx = 0;
+ while (rem >= 64) {
+ uint64_t w = UnalignedLoad<uint64_t>(bitmap);
+ if (!SET) {
+ w = ~w;
}
+ bitmap += sizeof(uint64_t);
+
+ // Within each word, keep flipping the least significant non-zero bit and adding
+ // the bit index to the output until none are set.
+ //
+ // Get the count up front so that the loop can be unrolled without dependencies.
+ // The 'tzcnt' instruction that's generated here has a latency of 3 so unrolling
+ // and avoiding any cross-iteration dependencies is beneficial.
+ int tot_count = Bits::CountOnes64withPopcount(w);
+#pragma unroll(3)
+ while (tot_count--) {
+ func(base_idx + Bits::FindLSBSetNonZero64(w));
+ w &= w - 1;
+ }
+ base_idx += 64;
+ rem -= 64;
}
- TrueBitIterator &operator ++() {
- DCHECK(!done());
- DCHECK(cur_byte_ & 1);
- cur_byte_ &= (~1);
- AdvanceToNextOneBit();
- return *this;
- }
-
- bool done() const {
- return cur_byte_idx_ >= n_bytes_;
- }
-
- size_t operator *() const {
- DCHECK(!done());
- return bit_idx_;
- }
-
- private:
- void AdvanceToNextOneBit() {
- while (cur_byte_ == 0) {
- cur_byte_idx_++;
- if (cur_byte_idx_ >= n_bytes_) return;
- cur_byte_ = bitmap_[cur_byte_idx_];
- bit_idx_ = cur_byte_idx_ * 8;
+ while (rem > 0) {
+ uint8_t b = *bitmap++;
+ if (!SET) {
+ b = ~b;
}
- DVLOG(2) << "Found next nonzero byte at " << cur_byte_idx_
- << " val=" << cur_byte_;
-
- DCHECK_NE(cur_byte_, 0);
- int set_bit = Bits::FindLSBSetNonZero(cur_byte_);
- bit_idx_ += set_bit;
- cur_byte_ >>= set_bit;
+ while (b) {
+ int idx = base_idx + Bits::FindLSBSetNonZero(b);
+ if (idx >= n_bits) break;
+ func(idx);
+ b &= b - 1;
+ }
+ base_idx += 8;
+ rem -= 8;
}
+}
- const uint8_t *bitmap_;
- uint8_t cur_byte_;
- uint8_t cur_byte_idx_;
+template<class F>
+inline void ForEachSetBit(const uint8_t* __restrict__ bitmap,
+ int n_bits,
+ const F& func) {
+ ForEachBit<true, F>(bitmap, n_bits, func);
+}
- const size_t n_bits_;
- const size_t n_bytes_;
- size_t bit_idx_;
-};
+template<class F>
+inline void ForEachUnsetBit(const uint8_t* __restrict__ bitmap,
+ int n_bits,
+ const F& func) {
+ ForEachBit<false, F>(bitmap, n_bits, func);
+}
} // namespace kudu
|