jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thom...@apache.org
Subject svn commit: r1221775 - in /jackrabbit/sandbox/microkernel/src: main/java/org/apache/jackrabbit/mk/mem/ main/java/org/apache/jackrabbit/mk/util/ test/java/org/apache/jackrabbit/mk/util/
Date Wed, 21 Dec 2011 15:47:16 GMT
Author: thomasm
Date: Wed Dec 21 15:47:15 2011
New Revision: 1221775

URL: http://svn.apache.org/viewvc?rev=1221775&view=rev
Log:
Moved bloom filter implementation to a utility class and added test cases.

Added:
    jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
    jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
Modified:
    jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListLarge.java
    jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListSmall.java

Modified: jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListLarge.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListLarge.java?rev=1221775&r1=1221774&r2=1221775&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListLarge.java
(original)
+++ jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListLarge.java
Wed Dec 21 15:47:15 2011
@@ -23,6 +23,7 @@ import java.util.Iterator;
 import org.apache.jackrabbit.mk.json.JsopTokenizer;
 import org.apache.jackrabbit.mk.json.JsopWriter;
 import org.apache.jackrabbit.mk.mem.NodeImpl.ChildVisitor;
+import org.apache.jackrabbit.mk.util.BloomFilterUtils;
 import org.apache.jackrabbit.mk.util.ExceptionFactory;
 import org.apache.jackrabbit.mk.util.IOUtils;
 import org.apache.jackrabbit.mk.util.StringUtils;
@@ -193,9 +194,7 @@ public class NodeListLarge implements No
         byte[] nameFilter;
 
         boolean possiblyContains(String name) {
-            int h = name.hashCode();
-            int b = nameFilter[(h >> 3) & (nameFilter.length - 1)] & (1 <<
(h & 7));
-            return b != 0;
+            return BloomFilterUtils.probablyContains(nameFilter, name);
         }
 
     }

Modified: jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListSmall.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListSmall.java?rev=1221775&r1=1221774&r2=1221775&view=diff
==============================================================================
--- jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListSmall.java
(original)
+++ jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/mem/NodeListSmall.java
Wed Dec 21 15:47:15 2011
@@ -25,6 +25,7 @@ import org.apache.jackrabbit.mk.json.Jso
 import org.apache.jackrabbit.mk.json.JsopWriter;
 import org.apache.jackrabbit.mk.mem.NodeImpl.ChildVisitor;
 import org.apache.jackrabbit.mk.util.ArrayUtils;
+import org.apache.jackrabbit.mk.util.BloomFilterUtils;
 import org.apache.jackrabbit.mk.util.ExceptionFactory;
 import org.apache.jackrabbit.mk.util.IOUtils;
 import org.apache.jackrabbit.mk.util.StringCache;
@@ -180,11 +181,9 @@ public class NodeListSmall implements No
     }
 
     public byte[] getNameFilter(NodeMap map) {
-        int len = IOUtils.nextPowerOf2(Math.min(64, map.getMaxMemoryChildren() / 4));
-        byte[] data = new byte[len];
+        byte[] data = BloomFilterUtils.createFilter(map.getMaxMemoryChildren(), 64);
         for (String n : names) {
-            int h = n.hashCode();
-            data[(h >> 3) & (data.length - 1)] |= 1 << (h & 7);
+            BloomFilterUtils.add(data, n);
         }
         return data;
     }

Added: jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java?rev=1221775&view=auto
==============================================================================
--- jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
(added)
+++ jackrabbit/sandbox/microkernel/src/main/java/org/apache/jackrabbit/mk/util/BloomFilterUtils.java
Wed Dec 21 15:47:15 2011
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.mk.util;
+
+/**
+ * Bloom filter utilities.
+ */
+public class BloomFilterUtils {
+
+    /**
+     * The multiply and shift constants for the supplemental hash function.
+     */
+    private static final int MUL = 257, SHIFT = 8;
+
+    /**
+     * The number of bits needed per stored element.
+     * Using the formula m = - (n * ln(p)) / (ln(2)^2) as described in
+     * http://en.wikipedia.org/wiki/Bloom_filter
+     * (simplified, as we used a fixed K: 2).
+     */
+    private static final double BIT_FACTOR = -Math.log(0.02) / Math.pow(Math.log(2), 2);
+
+    /**
+     * Create a bloom filter array for the given number of elements.
+     *
+     * @param count the number of entries
+     * @param maxBytes the maximum number of bytes
+     * @return the empty bloom filter
+     */
+    public static byte[] createFilter(int elementCount, int maxBytes) {
+        int bits = (int) (elementCount * BIT_FACTOR) + 7;
+        return new byte[Math.min(maxBytes, bits / 8)];
+    }
+
+    /**
+     * Add the key.
+     *
+     * @param bloom the bloom filter
+     * @param key the key
+     */
+    public static void add(byte[] bloom, Object key) {
+        int len = bloom.length;
+        if (len > 0) {
+            int h1 = hash(key.hashCode()), h2 = hash(h1);
+            bloom[(h1 >>> 3) % len] |= 1 << (h1 & 7);
+            bloom[(h2 >>> 3) % len] |= 1 << (h2 & 7);
+        }
+    }
+
+    /**
+     * Check whether the given key is probably in the set. This method never
+     * returns false if the key is in the set, but possibly returns true even if
+     * it isn't.
+     *
+     * @param bloom the bloom filter
+     * @param key the key
+     * @return true if the given key is probably in the set
+     */
+    public static boolean probablyContains(byte[] bloom, Object key) {
+        int len = bloom.length;
+        if (len == 0) {
+            return true;
+        }
+        int h1 = hash(key.hashCode()), h2 = hash(h1);
+        int x = bloom[(h1 >>> 3) % len] & (1 << (h1 & 7));
+        if (x != 0) {
+            x = bloom[(h2 >>> 3) % len] & (1 << (h2 & 7));
+        }
+        return x != 0;
+    }
+
+    /**
+     * Get the hash value for the given key. The returned hash value is
+     * stretched so that it should work well even for relatively bad hashCode
+     * implementations.
+     *
+     * @param key the key
+     * @return the hash value
+     */
+    private static int hash(int oldHash) {
+        return oldHash ^ ((oldHash * MUL) >> SHIFT);
+    }
+
+}

Added: jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java?rev=1221775&view=auto
==============================================================================
--- jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
(added)
+++ jackrabbit/sandbox/microkernel/src/test/java/org/apache/jackrabbit/mk/util/BloomFilterUtilsTest.java
Wed Dec 21 15:47:15 2011
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.mk.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import java.math.BigInteger;
+import org.junit.Test;
+
+/**
+ * Test the bloom filter utility class.
+ */
+public class BloomFilterUtilsTest {
+
+    /**
+     * Program to calculate the best shift and multiply constants.
+     */
+    public static void main(String... args) {
+        int best = Integer.MAX_VALUE;
+        for (int mul = 1; mul < 100000; mul++) {
+            if (!BigInteger.valueOf(mul).isProbablePrime(10)) {
+                continue;
+            }
+            for (int shift = 0; shift < 32; shift++) {
+                byte[] bloom = BloomFilterUtils.createFilter(20, 64);
+                for (int i = 0; i < 20; i++) {
+                    String k = String.valueOf(i);
+                    int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
+                    add(bloom, h1, h2);
+                }
+                for (int i = 0; i < 20; i++) {
+                    String k = String.valueOf(i);
+                    int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
+                    assertTrue(probablyContains(bloom, h1, h2));
+                }
+                int falsePositives = 0;
+                for (int i = 20; i < 100000; i++) {
+                    String k = String.valueOf(i);
+                    int h1 = hash(k.hashCode(), mul, shift), h2 = hash(h1, mul, shift);
+                    if (probablyContains(bloom, h1, h2)) {
+                        falsePositives++;
+                    }
+                }
+                if (falsePositives < best) {
+                    best = falsePositives;
+                    System.out.println("mul: " + mul + " shift: "
+                            + shift + " falsePositives: " + best);
+                }
+            }
+        }
+    }
+
+    private static int hash(int oldHash, int mul, int shift) {
+        return oldHash ^ ((oldHash * mul) >> shift);
+    }
+
+    private static void add(byte[] bloom, int h1, int h2) {
+        int len = bloom.length;
+        if (len > 0) {
+            bloom[(h1 >>> 3) % len] |= 1 << (h1 & 7);
+            bloom[(h2 >>> 3) % len] |= 1 << (h2 & 7);
+        }
+    }
+
+    private static boolean probablyContains(byte[] bloom, int h1, int h2) {
+        int len = bloom.length;
+        if (len == 0) {
+            return true;
+        }
+        int x = bloom[(h1 >>> 3) % len] & (1 << (h1 & 7));
+        if (x != 0) {
+            x = bloom[(h2 >>> 3) % len] & (1 << (h2 & 7));
+        }
+        return x != 0;
+    }
+
+    @Test
+    public void size() {
+        byte[] bloom = BloomFilterUtils.createFilter(100, 64);
+        assertEquals(64, bloom.length);
+        bloom = BloomFilterUtils.createFilter(10, 64);
+        assertEquals(11, bloom.length);
+        bloom = BloomFilterUtils.createFilter(0, 64);
+        assertEquals(0, bloom.length);
+        bloom = BloomFilterUtils.createFilter(1, 64);
+        assertEquals(1, bloom.length);
+    }
+
+    @Test
+    public void probability() {
+        byte[] bloom = BloomFilterUtils.createFilter(20, 64);
+        for (int i = 0; i < 20; i++) {
+            BloomFilterUtils.add(bloom, String.valueOf(i));
+        }
+        for (int i = 0; i < 20; i++) {
+            assertTrue(BloomFilterUtils.probablyContains(bloom, String.valueOf(i)));
+        }
+        int falsePositives = 0;
+        for (int i = 20; i < 100000; i++) {
+            if (BloomFilterUtils.probablyContains(bloom, String.valueOf(i))) {
+                falsePositives++;
+            }
+        }
+        assertEquals(113, falsePositives);
+    }
+    @Test
+    public void negativeHashCode() {
+        BloomFilterUtils.add(new byte[0], new Object() {
+            public int hashCode() {
+                return -1;
+            }
+        });
+    }
+
+}



Mime
View raw message