From commits-return-9089-archive-asf-public=cust-asf.ponee.io@kudu.apache.org Tue Aug 25 04:59:33 2020 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mailroute1-lw-us.apache.org (mailroute1-lw-us.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with ESMTPS id 5108C18037A for ; Tue, 25 Aug 2020 06:59:33 +0200 (CEST) Received: from mail.apache.org (localhost [127.0.0.1]) by mailroute1-lw-us.apache.org (ASF Mail Server at mailroute1-lw-us.apache.org) with SMTP id 7E4EA1243A0 for ; Tue, 25 Aug 2020 04:59:32 +0000 (UTC) Received: (qmail 36866 invoked by uid 500); 25 Aug 2020 04:59:32 -0000 Mailing-List: contact commits-help@kudu.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@kudu.apache.org Delivered-To: mailing list commits@kudu.apache.org Received: (qmail 36857 invoked by uid 99); 25 Aug 2020 04:59:32 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 25 Aug 2020 04:59:32 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 0E1BA85A68; Tue, 25 Aug 2020 04:59:32 +0000 (UTC) Date: Tue, 25 Aug 2020 04:59:31 +0000 To: "commits@kudu.apache.org" Subject: [kudu] branch master updated: Block Bloom filter false positive rate correction MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <159833157198.1992.11133579374578085728@gitbox.apache.org> From: bankim@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: kudu X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 6ab55f6e13c09ab688ab23ee2164b98929ccf26c X-Git-Newrev: d1190c2b06a6eef91b21fd4a0b5fb76534b4e9f9 X-Git-Rev: d1190c2b06a6eef91b21fd4a0b5fb76534b4e9f9 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This 
is an automated email from the ASF dual-hosted git repository. bankim pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git The following commit(s) were added to refs/heads/master by this push: new d1190c2 Block Bloom filter false positive rate correction d1190c2 is described below commit d1190c2b06a6eef91b21fd4a0b5fb76534b4e9f9 Author: Jim Apple AuthorDate: Tue Jul 28 15:54:52 2020 -0700 Block Bloom filter false positive rate correction Block Bloom filters have a higher false positive rate than a standard Bloom filter, due to the uneven distribution of keys between buckets. This patch changes the code to match the theory, using an approximation from the paper that introduced block Bloom filters, "Cache-, Hash- and Space-Efficient Bloom Filters" by Putze et al. In scan_predicate.cc, filters are created with BlockBloomFilter::MinLogSpace. Prior to this patch, that method will sometimes return a value that is lower than the true answer, leading to smaller filters and higher false positive probabilities than expected. This patch corrects BlockBloomFilter::MinLogSpace, leading to filters having the expected false positive rate by dint of their larger size. The performance impact here is dependent on the extent to which a scan is bottlenecked by heap space for the filter vs. compute time for the scan predicate application to filter false positives. For a false positive probability of 1%, as is currently set in scan_predicate.cc, this patch leads to an increase in filter size of about 10% and a decrease in filter false positive probability by 50%. However, this is obscured by the coarseness of the fact that filters are constrained to have a size in bytes that is a power of two. Loosening that restriction is potential future work. 
Change-Id: I58a5191ee86edfa6e4f415829a6a647f586e9af6 Reviewed-on: http://gerrit.cloudera.org:8080/16248 Tested-by: Kudu Jenkins Reviewed-by: Bankim Bhavsar --- src/kudu/util/block_bloom_filter-test.cc | 38 ++++++++---- src/kudu/util/block_bloom_filter.cc | 100 ++++++++++++++++++++++++------- 2 files changed, 106 insertions(+), 32 deletions(-) diff --git a/src/kudu/util/block_bloom_filter-test.cc b/src/kudu/util/block_bloom_filter-test.cc index 3a50060..1b6878c 100644 --- a/src/kudu/util/block_bloom_filter-test.cc +++ b/src/kudu/util/block_bloom_filter-test.cc @@ -33,6 +33,7 @@ #include "kudu/util/hash.pb.h" #include "kudu/util/memory/arena.h" +#include "kudu/util/random.h" #include "kudu/util/slice.h" #include "kudu/util/status.h" #include "kudu/util/test_macros.h" @@ -181,22 +182,27 @@ TEST_F(BlockBloomFilterTest, CumulativeFind) { // The empirical false positives we find when looking for random items is with a constant // factor of the false positive probability the Bloom filter was constructed for. TEST_F(BlockBloomFilterTest, FindInvalid) { - static const int find_limit = 1 << 20; + // We use a deterministic pseudorandom number generator with a set seed. The reason is + // that with a run-dependent seed, there will always be inputs that can fail. That's a + // potential argument for this to be a benchmark rather than a test, although the + // measured quantity would be not time but deviation from predicted fpp. 
+ Random rgen(867 + 5309); + static const int find_limit = 1 << 22; unordered_set to_find; while (to_find.size() < find_limit) { - to_find.insert(MakeRand()); + to_find.insert(rgen.Next64()); } static const int max_log_ndv = 19; unordered_set to_insert; while (to_insert.size() < (1ULL << max_log_ndv)) { - const auto candidate = MakeRand(); + const auto candidate = rgen.Next64(); if (to_find.find(candidate) == to_find.end()) { to_insert.insert(candidate); } } vector shuffled_insert(to_insert.begin(), to_insert.end()); for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) { - for (int log_fpp = 4; log_fpp < 15; ++log_fpp) { + for (int log_fpp = 4; log_fpp < 12; ++log_fpp) { double fpp = 1.0 / (1 << log_fpp); const size_t ndv = 1 << log_ndv; const int log_heap_space = BlockBloomFilter::MinLogSpace(ndv, fpp); @@ -211,24 +217,32 @@ TEST_F(BlockBloomFilterTest, FindInvalid) { found += bf->Find(i); } EXPECT_LE(found, find_limit * fpp * 2) - << "Too many false positives with -log2(fpp) = " << log_fpp; + << "Too many false positives with -log2(fpp) = " << log_fpp + << " and log_ndv = " << log_ndv << " and log_heap_space = " << log_heap_space; // Because the space is rounded up to a power of 2, we might actually get a lower // fpp than the one passed to MinLogSpace(). const double expected_fpp = BlockBloomFilter::FalsePositiveProb(ndv, log_heap_space); - EXPECT_GE(found, find_limit * expected_fpp) - << "Too few false positives with -log2(fpp) = " << log_fpp; - EXPECT_LE(found, find_limit * expected_fpp * 16) - << "Too many false positives with -log2(fpp) = " << log_fpp; + // Fudge factors are present because filter characteristics are true in the limit, + // and will deviate for small samples. 
+ EXPECT_GE(found, find_limit * expected_fpp * 0.75) + << "Too few false positives with -log2(fpp) = " << log_fpp + << " expected_fpp = " << expected_fpp; + EXPECT_LE(found, find_limit * expected_fpp * 1.25) + << "Too many false positives with -log2(fpp) = " << log_fpp + << " expected_fpp = " << expected_fpp; } } } // Test that MaxNdv() and MinLogSpace() are dual TEST_F(BlockBloomFilterTest, MinSpaceMaxNdv) { - for (int log2fpp = -2; log2fpp >= -63; --log2fpp) { + for (int log2fpp = -2; log2fpp >= -30; --log2fpp) { const double fpp = pow(2, log2fpp); for (int given_log_space = 8; given_log_space < 30; ++given_log_space) { const size_t derived_ndv = BlockBloomFilter::MaxNdv(given_log_space, fpp); + // If NO values can be added without exceeding fpp, then the space needed is + // trivially zero. This becomes a useless test; skip to the next iteration. + if (0 == derived_ndv) continue; int derived_log_space = BlockBloomFilter::MinLogSpace(derived_ndv, fpp); EXPECT_EQ(derived_log_space, given_log_space) << "fpp: " << fpp @@ -270,8 +284,8 @@ TEST_F(BlockBloomFilterTest, MinSpaceEdgeCase) { // Check that MinLogSpace() and FalsePositiveProb() are dual TEST_F(BlockBloomFilterTest, MinSpaceForFpp) { - for (size_t ndv = 10000; ndv < 100 * 1000 * 1000; ndv *= 1.01) { - for (double fpp = 0.1; fpp > pow(2, -20); fpp *= 0.99) { // NOLINT: loop on double + for (size_t ndv = 10000; ndv < 100 * 1000 * 1000; ndv *= 1.1) { + for (double fpp = 0.1; fpp > pow(2, -20); fpp *= 0.9) { // NOLINT: loop on double // When contructing a Bloom filter, we can request a particular fpp by calling // MinLogSpace(). 
const int min_log_space = BlockBloomFilter::MinLogSpace(ndv, fpp); diff --git a/src/kudu/util/block_bloom_filter.cc b/src/kudu/util/block_bloom_filter.cc index 16a5b7b..cfb31ff 100644 --- a/src/kudu/util/block_bloom_filter.cc +++ b/src/kudu/util/block_bloom_filter.cc @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -214,32 +215,91 @@ bool BlockBloomFilter::BucketFind( return true; } -// The following three methods are derived from -// -// fpp = (1 - exp(-kBucketWords * ndv/space))^kBucketWords -// -// where space is in bits. +// This implements the false positive probability in Putze et al.'s "Cache-, hash-and +// space-efficient bloom filters", equation 3. +double BlockBloomFilter::FalsePositiveProb(const size_t ndv, const int log_space_bytes) { + static constexpr double kWordBits = 1 << kLogBucketWordBits; + const double bytes = static_cast(1L << log_space_bytes); + if (ndv == 0) return 0.0; + if (bytes <= 0) return 1.0; + // This short-cuts a slowly-converging sum for very dense filters + if (ndv / (bytes * CHAR_BIT) > 2) return 1.0; + + double result = 0; + // lam is the usual parameter to the Poisson's PMF. Following the notation in the paper, + // lam is B/c, where B is the number of bits in a bucket and c is the number of bits per + // distinct value + const double lam = kBucketWords * kWordBits / ((bytes * CHAR_BIT) / ndv); + // Some of the calculations are done in log-space to increase numerical stability + const double loglam = log(lam); + + // 750 iterations are sufficient to cause the sum to converge in all of the tests. In + // other words, setting the iterations higher than 750 will give the same result as + // leaving it at 750. + static constexpr uint64_t kBloomFppIters = 750; + for (uint64_t j = 0; j < kBloomFppIters; ++j) { + // We start with the highest value of i, since the values we're adding to result are + // mostly smaller at high i, and this increases accuracy to sum from the smallest + // values up. 
+ const double i = static_cast(kBloomFppIters - 1 - j); + // The PMF of the Poisson distribution is lam^i * exp(-lam) / i!. In logspace, using + // lgamma for the log of the factorial function: + double logp = i * loglam - lam - lgamma(i + 1); + // The f_inner part of the equation in the paper is the probability of a single + // collision in the bucket. Since there are kBucketWords non-overlapping lanes in each + // bucket, the log of this probability is: + const double logfinner = kBucketWords * log(1.0 - pow(1.0 - 1.0 / kWordBits, i)); + // Here we are forced out of log-space calculations + result += exp(logp + logfinner); + } + return (result > 1.0) ? 1.0 : result; +} + size_t BlockBloomFilter::MaxNdv(const int log_space_bytes, const double fpp) { DCHECK(log_space_bytes > 0 && log_space_bytes < 61); DCHECK(0 < fpp && fpp < 1); - static const double ik = 1.0 / kBucketWords; - return -1 * ik * static_cast(1ULL << (log_space_bytes + 3)) * log(1 - pow(fpp, ik)); + // Starting with an exponential search, we find bounds for how many distinct values a + // filter of size (1 << log_space_bytes) can hold before it exceeds a false positive + // probability of fpp. + size_t too_many = 1; + while (FalsePositiveProb(too_many, log_space_bytes) <= fpp) { + too_many *= 2; + } + // From here forward, we have the invariant that FalsePositiveProb(too_many, + // log_space_bytes) > fpp + size_t too_few = too_many / 2; + // Invariant for too_few: FalsePositiveProb(too_few, log_space_bytes) <= fpp + + constexpr size_t kProximity = 1; + // Simple binary search. If this is too slow, the required proximity of too_few and + // too_many can be raised from 1 to something higher. 
+ while (too_many - too_few > kProximity) { + const size_t mid = (too_many + too_few) / 2; + const double mid_fpp = FalsePositiveProb(mid, log_space_bytes); + if (mid_fpp <= fpp) { + too_few = mid; + } else { + too_many = mid; + } + } + DCHECK_LE(FalsePositiveProb(too_few, log_space_bytes), fpp); + DCHECK_GT(FalsePositiveProb(too_few + kProximity, log_space_bytes), fpp); + return too_few; } int BlockBloomFilter::MinLogSpace(const size_t ndv, const double fpp) { - static const double k = kBucketWords; - if (0 == ndv) return 0; - // m is the number of bits we would need to get the fpp specified - const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k)); - - // Handle case where ndv == 1 => ceil(log2(m/8)) < 0. - return std::max(0, static_cast(ceil(log2(m / 8)))); -} - -double BlockBloomFilter::FalsePositiveProb(const size_t ndv, const int log_space_bytes) { - return pow(1 - exp((-1.0 * static_cast(kBucketWords) * static_cast(ndv)) - / static_cast(1ULL << (log_space_bytes + 3))), - kBucketWords); + int low = 0; + int high = 64; + while (high > low + 1) { + int mid = (high + low) / 2; + const double candidate = FalsePositiveProb(ndv, mid); + if (candidate <= fpp) { + high = mid; + } else { + low = mid; + } + } + return high; } void BlockBloomFilter::InsertNoAvx2(const uint32_t hash) noexcept {