commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aherb...@apache.org
Subject [commons-collections] 07/07: Revert CountingBloomFilter to ignore counts from another filter.
Date Mon, 24 Feb 2020 22:42:33 GMT
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git

commit e08f0be55b9e17d9e03e6f311b76044cb786cd7a
Author: Alex Herbert <aherbert@apache.org>
AuthorDate: Mon Feb 24 22:42:20 2020 +0000

    Revert CountingBloomFilter to ignore counts from another filter.
    
    Partially removes changes made in commit:
    6ad69bedd3436a75883e66bd130c77df884be98b
    
    The class requires a revision to handle add/subtract of another
    CountingBloomFilter. Restore the tests to check that Counting filters
    are merged as if another non-counting filter type.
    
    Remove the javadoc from the CountingBloomFilter methods that state it
    uses the counts when merge/remove are called with a CountingBloomFilter
    as this is not what the functionality currently performs.
    
    Fix the merge with a hasher to only increment the count by 1 even if the
    hasher contains duplicates. Add test to verify this works as documented.
    This matches the remove functionality which removes duplicates before
    subtraction.
---
 .../bloomfilter/CountingBloomFilter.java           | 109 ++++++-------
 .../bloomfilter/CountingBloomFilterTest.java       | 172 +++++++++++++++------
 2 files changed, 180 insertions(+), 101 deletions(-)

diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
index d3df6c2..a55b0b5 100644
--- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
+++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java
@@ -19,7 +19,6 @@ package org.apache.commons.collections4.bloomfilter;
 import java.util.AbstractMap;
 import java.util.BitSet;
 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.Map;
 import java.util.PrimitiveIterator.OfInt;
 import java.util.function.Consumer;
@@ -151,115 +150,117 @@ public class CountingBloomFilter extends AbstractBloomFilter {
     }
 
     /**
-     * Merge this Bloom filter with the other creating a new filter. The counts for
-     * bits that are on in the other filter are incremented.
+     * {@inheritDoc}
      * <p>
-     * For each bit that is turned on in the other filter; if the other filter is
-     * also a CountingBloomFilter the count is added to this filter, otherwise the
-     * count is incremented by one.
+     * Note: If the other filter is also a CountingBloomFilter the counts are not used.
      * </p>
      *
-     * @param other the other filter.
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
      */
     @Override
     public void merge(final BloomFilter other) {
         verifyShape(other);
         if (other instanceof CountingBloomFilter) {
-            // TODO - This should use the entrySet and increment each key by the count
-            merge(((CountingBloomFilter) other).counts.keySet().iterator());
+            // Only use the keys and not the counts
+            ((CountingBloomFilter) other).counts.keySet().forEach(this::addIndex);
         } else {
-            merge(BitSet.valueOf(other.getBits()).stream().iterator());
+            BitSet.valueOf(other.getBits()).stream().forEach(this::addIndex);
         }
     }
 
+    /**
+     * {@inheritDoc}
+     *
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
+     */
     @Override
     public void merge(final Hasher hasher) {
         verifyHasher(hasher);
-        merge(hasher.getBits(getShape()));
+        toStream(hasher).forEach(this::addIndex);
     }
 
     /**
-     * Merges an iterator of set bits into this filter.
-     * @param iter the iterator of bits to set.
+     * Increments the count for the bit index.
+     *
+     * @param idx the index.
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
      */
-    private void merge(final Iterator<Integer> iter) {
-        iter.forEachRemaining(idx -> {
-            final Integer val = counts.get(idx);
-            if (val == null) {
-                counts.put(idx, 1 );
-            } else if (val == Integer.MAX_VALUE) {
-                throw new IllegalStateException("Overflow on index " + idx);
-            } else {
-                counts.put(idx, val + 1);
-            }
-        });
+    private void addIndex(int idx) {
+        final Integer val = counts.get(idx);
+        if (val == null) {
+            counts.put(idx, 1);
+        } else if (val == Integer.MAX_VALUE) {
+            throw new IllegalStateException("Overflow on index " + idx);
+        } else {
+            counts.put(idx, val + 1);
+        }
     }
 
     /**
-     * Decrements the counts for the bits that are on in the other BloomFilter from this
-     * one.
-     *
+     * Removes the other Bloom filter from this one.
      * <p>
      * For each bit that is turned on in the other filter the count is decremented by 1.
      * </p>
      *
      * @param other the other filter.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
     public void remove(final BloomFilter other) {
         verifyShape(other);
         if (other instanceof CountingBloomFilter) {
-            // TODO - This should use the entrySet and decrement each key by the count
-            remove(((CountingBloomFilter) other).counts.keySet().stream());
+            // Only use the keys and not the counts
+            ((CountingBloomFilter) other).counts.keySet().forEach(this::subtractIndex);
         } else {
-            remove(BitSet.valueOf(other.getBits()).stream().boxed());
+            BitSet.valueOf(other.getBits()).stream().forEach(this::subtractIndex);
         }
     }
 
     /**
      * Decrements the counts for the bits that are on in the hasher from this
      * Bloom filter.
-     *
      * <p>
-     * For each bit that is turned on in the other filter the count is decremented by 1.
+     * For each bit that is identified by the hasher the count is decremented by 1.
+     * Duplicate bits are ignored.
      * </p>
      *
      * @param hasher the hasher to generate bits.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
     public void remove(final Hasher hasher) {
         verifyHasher(hasher);
-        final Set<Integer> lst = new HashSet<>();
-        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add);
-        remove(lst.stream());
+        toStream(hasher).forEach(this::subtractIndex);
     }
 
     /**
-     * Decrements the counts for the bits specified in the Integer stream.
+     * Decrements the count for the bit index.
      *
-     * @param idxStream The stream of bit counts to decrement.
+     * @param idx the index.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
-    private void remove(final Stream<Integer> idxStream) {
-        idxStream.forEach(idx -> {
-            // TODO - This doubles up removal of the index.
-            // The first part will not throw if the count decrements past zero.
-            // The second part will throw if the count decrements past zero.
-            final Integer val = counts.get(idx);
-            if (val != null) {
-                if (val - 1 == 0) {
-                    counts.remove(idx);
-                } else {
-                    counts.put(idx, val - 1);
-                }
-            }
-            if (val == null || val == 0) {
-                throw new IllegalStateException("Underflow on index " + idx);
-            } else if (val - 1 == 0) {
+    private void subtractIndex(int idx) {
+        final Integer val = counts.get(idx);
+        if (val != null) {
+            if (val == 1) {
                 counts.remove(idx);
             } else {
                 counts.put(idx, val - 1);
             }
-        });
+        } else {
+            throw new IllegalStateException("Underflow on index " + idx);
+        }
     }
 
+    /**
+     * Convert the hasher to a stream. Duplicates indices are removed.
+     *
+     * @param hasher the hasher
+     * @return the stream
+     */
+    private Stream<Integer> toStream(Hasher hasher) {
+        final Set<Integer> lst = new HashSet<>();
+        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add);
+        return lst.stream();
+    }
 
     @Override
     public String toString() {
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java
b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java
index a42f3f6..28220a7 100644
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java
+++ b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java
@@ -24,12 +24,13 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.PrimitiveIterator.OfInt;
 
+import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
 import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
 import org.apache.commons.collections4.bloomfilter.hasher.Shape;
 import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -140,33 +141,35 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest
{
      * Tests that merge correctly updates the counts when a CountingBloomFilter is passed.
      */
     @Test
-    @Ignore
     public void mergeTest_Counts() {
-        final int[] c1 = {1, 10, 100, 0};
-        final int[] c2 = {4, 0, 7, 2};
+        final int[] expected = {
+            0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 0
+        };
+        final List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
 
-        final Map<Integer, Integer> map1 = new HashMap<>();
-        for (int i = 0; i < c1.length; i++) {
-            map1.put(i, c1[i]);
-        }
-        final Map<Integer, Integer> map2 = new HashMap<>();
-        for (int i = 0; i < c2.length; i++) {
-            map2.put(i, c2[i]);
-        }
+        final CountingBloomFilter bf = createFilter(hasher, shape);
 
-        final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape);
-        final BloomFilter bf2 = new CountingBloomFilter(map2, shape);
+        final List<Integer> lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27);
+        final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape);
+        final BloomFilter bf2 = createFilter(hasher2, shape);
 
-        bf1.merge(bf2);
+        bf.merge(bf2);
 
-        assertEquals(4, bf1.cardinality());
+        assertEquals(27, bf.getCounts().count());
+        assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get());
+        assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get());
 
         final Map<Integer, Integer> m = new HashMap<>();
-        bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
-
-        // This currently fails as the counts from the second filter are ignored.
-        for (int i = 0; i < c1.length; i++) {
-            assertEquals(c1[i] + c2[i], m.get(i).intValue());
+        bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
+        for (int i = 0; i < 29; i++) {
+            if (m.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue());
+            }
         }
     }
 
@@ -238,7 +241,7 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest {
     }
 
     /**
-     * Test that merge correctly updates the counts when a Hasher is passed
+     * Test that merge correctly updates the counts when a Hasher is passed.
      */
     @Test
     public void mergeTest_Shape_Hasher_Count() {
@@ -274,41 +277,81 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest
{
     }
 
     /**
-     * Tests that when removing a counting Bloom filter the counts are correctly updated.
+     * Test that merge correctly updates the counts when a Hasher is passed with duplicates.
      */
     @Test
-    @Ignore
-    public void removeTest_Counting() {
-        final int[] c1 = {1, 10, 100, 0};
-        final int[] c2 = {1, 0, 7, 2};
+    public void mergeTest_Shape_Hasher_Duplicates() {
+        final int[] expected = {
+            0, 2, 1, 2, 1
+        };
 
-        // If left alone the test fails with an exception as there is underflow
-        // for (0 - 2). Fix this but the underflow behaviour is subject to change.
-        c2[c2.length - 1] = 0;
+        final List<Integer> lst = Arrays.asList(1, 2, 3);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
 
-        final Map<Integer, Integer> map1 = new HashMap<>();
-        for (int i = 0; i < c1.length; i++) {
-            map1.put(i, c1[i]);
-        }
-        final Map<Integer, Integer> map2 = new HashMap<>();
-        for (int i = 0; i < c2.length; i++) {
-            map2.put(i, c2[i]);
-        }
+        final CountingBloomFilter bf = createFilter(hasher, shape);
+
+        final Hasher hasher2 = new Hasher() {
+            @Override
+            public OfInt getBits(Shape shape) {
+                // Deliberately return duplicates
+                return Arrays.asList(1, 1, 1, 1, 3, 3, 4).stream().mapToInt(Integer::intValue).iterator();
+            }
 
-        final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape);
-        final BloomFilter bf2 = new CountingBloomFilter(map2, shape);
+            @Override
+            public HashFunctionIdentity getHashFunctionIdentity() {
+                return shape.getHashFunctionIdentity();
+            }
+
+            @Override
+            public boolean isEmpty() {
+                return false;
+            }
+        };
 
-        bf1.remove(bf2);
+        bf.merge(hasher2);
 
-        assertEquals(2, bf1.cardinality());
+        assertEquals(4, bf.getCounts().count());
 
         final Map<Integer, Integer> m = new HashMap<>();
-        bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
+        bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
+        for (int i = 0; i < expected.length; i++) {
+            if (m.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue());
+            }
+        }
+    }
 
-        // This currently fails as the counts from the second filter are ignored
-        // and only 1 is subtracted for each index.
-        for (int i = 0; i < c1.length; i++) {
-            assertEquals(Math.max(0, c1[i] - c2[i]), m.getOrDefault(i, 0).intValue());
+    /**
+     * Tests that when removing a counting Bloom filter the counts are correctly updated.
+     */
+    @Test
+    public void removeTest_Counting() {
+        final int[] values = {
+            0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1
+        };
+        final Map<Integer, Integer> map = new HashMap<>();
+        for (int i = 1; i < values.length; i++) {
+            map.put(i, values[i]);
+        }
+
+        final CountingBloomFilter bf = new CountingBloomFilter(map, shape);
+
+        final List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        final BloomFilter bf2 = new CountingBloomFilter(hasher, shape);
+
+        bf.remove(bf2);
+        assertEquals(17, bf.cardinality());
+        final Map<Integer, Integer> map2 = new HashMap<>();
+        bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue()));
+
+        for (int i = 11; i < values.length; i++) {
+            assertNotNull(map2.get(i));
+            assertEquals(1, map2.get(i).intValue());
         }
     }
 
@@ -344,6 +387,41 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest
{
     }
 
     /**
+     * Tests that removing a hasher update the counts properly if there are duplicates.
+     */
+    @Test
+    public void removeTest_Hasher_Duplicates() {
+        final int[] values = {
+            0, 3, 2, 1
+        };
+        final int[] expected = {
+            0, 2, 1, 0
+        };
+        final Map<Integer, Integer> map = new HashMap<>();
+        for (int i = 1; i < values.length; i++) {
+            map.put(i, values[i]);
+        }
+
+        final CountingBloomFilter bf = new CountingBloomFilter(map, shape);
+
+        final List<Integer> lst = Arrays.asList(1, 1, 1, 1, 1, 1, 2, 3, 3, 3);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+
+        bf.remove(hasher);
+        assertEquals(2, bf.cardinality());
+        final Map<Integer, Integer> map2 = new HashMap<>();
+        bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue()));
+
+        for (int i = 0; i < expected.length; i++) {
+            if (map2.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], map2.get(i).intValue());
+            }
+        }
+    }
+
+    /**
      * Tests that when removing a standard Bloom filter the counts are correctly updated.
      */
     @Test


Mime
View raw message