hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From j...@apache.org
Subject svn commit: r676728 - in /hadoop/hbase/trunk: CHANGES.txt src/java/org/onelab/filter/BloomFilter.java src/java/org/onelab/filter/Filter.java src/test/org/onelab/test/TestFilter.java
Date Mon, 14 Jul 2008 20:46:55 GMT
Author: jimk
Date: Mon Jul 14 13:46:55 2008
New Revision: 676728

URL: http://svn.apache.org/viewvc?rev=676728&view=rev
Log:
HBASE-744   BloomFilter serialization/deserialization broken

Modified:
    hadoop/hbase/trunk/CHANGES.txt
    hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java
    hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java
    hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java

Modified: hadoop/hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=676728&r1=676727&r2=676728&view=diff
==============================================================================
--- hadoop/hbase/trunk/CHANGES.txt (original)
+++ hadoop/hbase/trunk/CHANGES.txt Mon Jul 14 13:46:55 2008
@@ -183,6 +183,7 @@
    HBASE-742   Rename getMetainfo in HTable as getTableDescriptor
    HBASE-739   HBaseAdmin.createTable() using old HTableDescription doesn't work
                (Izaak Rubin via Stack)
+   HBASE-744   BloomFilter serialization/deserialization broken
 
   IMPROVEMENTS
    HBASE-559   MR example job to count table rows

Modified: hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java?rev=676728&r1=676727&r2=676728&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java (original)
+++ hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java Mon Jul 14 13:46:55 2008
@@ -226,6 +226,7 @@
   @Override
   public void readFields(DataInput in) throws IOException {
     super.readFields(in);
+    bits = new BitSet(this.vectorSize);
     byte[] bytes = new byte[getNBytes()];
     in.readFully(bytes);
     for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
@@ -233,9 +234,6 @@
         bitIndex = 0;
         byteIndex++;
       }
-      if (bitIndex == 0) {
-        bytes[byteIndex] = 0;
-      }
       if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
         bits.set(i);
       }

Modified: hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java?rev=676728&r1=676727&r2=676728&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java (original)
+++ hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java Mon Jul 14 13:46:55 2008
@@ -76,13 +76,13 @@
  */
 public abstract class Filter implements Writable {
   /** The vector size of <i>this</i> filter. */
-  int vectorSize;
+  protected int vectorSize;
 
   /** The hash function used to map a key to several positions in the vector. */
   protected HashFunction hash;
 
   /** The number of hash function to consider. */
-  int nbHash;
+  protected int nbHash;
 
   protected Filter() {}
   

Modified: hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java
URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java?rev=676728&r1=676727&r2=676728&view=diff
==============================================================================
--- hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java (original)
+++ hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java Mon Jul 14 13:46:55 2008
@@ -48,9 +48,16 @@
  */
 package org.onelab.test;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import junit.framework.TestCase;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.onelab.filter.*;
 
 /**
@@ -61,22 +68,196 @@
  * @version 1.0 - 8 Feb. 07
  */
 public class TestFilter extends TestCase {
+  private static final Log LOG = LogFactory.getLog(TestFilter.class);
   
   /** Test a BloomFilter
    * @throws UnsupportedEncodingException
+   * @throws IOException
    */
-  public void testBloomFilter() throws UnsupportedEncodingException {
-    Filter bf = new BloomFilter(8, 2);
-    Key key = new StringKey("toto");
-    Key k2 = new StringKey("lulu");
-    Key k3 = new StringKey("mama");
-    bf.add(key);
-    bf.add(k2);
-    bf.add(k3);
-    assertTrue(bf.membershipTest(key));
-    assertTrue(bf.membershipTest(new StringKey("graknyl")));
-    assertFalse(bf.membershipTest(new StringKey("xyzzy")));
-    assertFalse(bf.membershipTest(new StringKey("abcd")));
+  public void testBloomFilter() throws UnsupportedEncodingException,
+  IOException {
+    final StringKey[] inserted = {
+        new StringKey("wmjwjzyv"),
+        new StringKey("baietibz"),
+        new StringKey("guhsgxnv"),
+        new StringKey("mhnqycto"),
+        new StringKey("xcyqafgz"),
+        new StringKey("zidoamgb"),
+        new StringKey("tftfirzd"),
+        new StringKey("okapqlrg"),
+        new StringKey("yccwzwsq"),
+        new StringKey("qmonufqu"),
+        new StringKey("wlsctews"),
+        new StringKey("mksdhqri"),
+        new StringKey("wxxllokj"),
+        new StringKey("eviuqpls"),
+        new StringKey("bavotqmj"),
+        new StringKey("yibqzhdl"),
+        new StringKey("csfqmsyr"),
+        new StringKey("guxliyuh"),
+        new StringKey("pzicietj"),
+        new StringKey("qdwgrqwo"),
+        new StringKey("ujfzecmi"),
+        new StringKey("dzeqfvfi"),
+        new StringKey("phoegsij"),
+        new StringKey("bvudfcou"),
+        new StringKey("dowzmciz"),
+        new StringKey("etvhkizp"),
+        new StringKey("rzurqycg"),
+        new StringKey("krqfxuge"),
+        new StringKey("gflcohtd"),
+        new StringKey("fcrcxtps"),
+        new StringKey("qrtovxdq"),
+        new StringKey("aypxwrwi"),
+        new StringKey("dckpyznr"),
+        new StringKey("mdaawnpz"),
+        new StringKey("pakdfvca"),
+        new StringKey("xjglfbez"),
+        new StringKey("xdsecofi"),
+        new StringKey("sjlrfcab"),
+        new StringKey("ebcjawxv"),
+        new StringKey("hkafkjmy"),
+        new StringKey("oimmwaxo"),
+        new StringKey("qcuzrazo"),
+        new StringKey("nqydfkwk"),
+        new StringKey("frybvmlb"),
+        new StringKey("amxmaqws"),
+        new StringKey("gtkovkgx"),
+        new StringKey("vgwxrwss"),
+        new StringKey("xrhzmcep"),
+        new StringKey("tafwziil"),
+        new StringKey("erjmncnv"),
+        new StringKey("heyzqzrn"),
+        new StringKey("sowvyhtu"),
+        new StringKey("heeixgzy"),
+        new StringKey("ktcahcob"),
+        new StringKey("ljhbybgg"),
+        new StringKey("jiqfcksl"),
+        new StringKey("anjdkjhm"),
+        new StringKey("uzcgcuxp"),
+        new StringKey("vzdhjqla"),
+        new StringKey("svhgwwzq"),
+        new StringKey("zhswvhbp"),
+        new StringKey("ueceybwy"),
+        new StringKey("czkqykcw"),
+        new StringKey("ctisayir"),
+        new StringKey("hppbgciu"),
+        new StringKey("nhzgljfk"),
+        new StringKey("vaziqllf"),
+        new StringKey("narvrrij"),
+        new StringKey("kcevbbqi"),
+        new StringKey("qymuaqnp"),
+        new StringKey("pwqpfhsr"),
+        new StringKey("peyeicuk"),
+        new StringKey("kudlwihi"),
+        new StringKey("pkmqejlm"),
+        new StringKey("ylwzjftl"),
+        new StringKey("rhqrlqar"),
+        new StringKey("xmftvzsp"),
+        new StringKey("iaemtihk"),
+        new StringKey("ymsbrqcu"),
+        new StringKey("yfnlcxto"),
+        new StringKey("nluqopqh"),
+        new StringKey("wmrzhtox"),
+        new StringKey("qnffhqbl"),
+        new StringKey("zypqpnbw"),
+        new StringKey("oiokhatd"),
+        new StringKey("mdraddiu"),
+        new StringKey("zqoatltt"),
+        new StringKey("ewhulbtm"),
+        new StringKey("nmswpsdf"),
+        new StringKey("xsjeteqe"),
+        new StringKey("ufubcbma"),
+        new StringKey("phyxvrds"),
+        new StringKey("vhnfldap"),
+        new StringKey("zrrlycmg"),
+        new StringKey("becotcjx"),
+        new StringKey("wvbubokn"),
+        new StringKey("avkgiopr"),
+        new StringKey("mbqqxmrv"),
+        new StringKey("ibplgvuu"),
+        new StringKey("dghvpkgc")
+    };
+
+    final StringKey[] notInserted = {
+        new StringKey("abcdefgh"),
+        new StringKey("ijklmnop"),
+        new StringKey("qrstuvwx"),
+        new StringKey("yzabcdef")
+    };
+
+    /* 
+     * Bloom filters are very sensitive to the number of elements inserted into
+     * them.
+     * 
+     * If m denotes the number of bits in the Bloom filter (vectorSize),
+     * n denotes the number of elements inserted into the Bloom filter and
+     * k represents the number of hash functions used (nbHash), then
+     * according to Broder and Mitzenmacher,
+     * 
+     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf
)
+     * 
+     * the probability of false positives is minimized when k is
+     * approximately ln(2) * m/n.
+     * 
+     * If we fix the number of hash functions and know the number of entries,
+     * then the optimal vector size m = (k * n) / ln(2)
+     */
+    final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;
+    BloomFilter bf = new BloomFilter(
+        (int) Math.ceil(
+            (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
+            Math.log(2.0)),
+            DEFAULT_NUMBER_OF_HASH_FUNCTIONS
+    );
+    
+    for (int i = 0; i < inserted.length; i++) {
+      bf.add(inserted[i]);
+    }
+    
+    // Verify that there are no false negatives and few (if any) false positives
+    
+    checkFalsePositivesNegatives(bf, inserted, notInserted);
+
+    // Test serialization/deserialization
+    
+    LOG.info("Checking serialization/deserialization");
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    DataOutputStream out = new DataOutputStream(baos);
+    bf.write(out);
+    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
+    DataInputStream in = new DataInputStream(bais);
+    bf = new BloomFilter();
+    bf.readFields(in);
+   
+    // Verify that there are no false negatives and few (if any) false positives
+    
+    checkFalsePositivesNegatives(bf, inserted, notInserted);
+  }
+  
+  private void checkFalsePositivesNegatives(BloomFilter bf,
+      StringKey[] inserted, StringKey[] notInserted) {
+    // Test membership for values we inserted. Should not get false negatives
+    
+    LOG.info("Checking for false negatives");
+    for (int i = 0; i < inserted.length; i++) {
+      if (!bf.membershipTest(inserted[i])) {
+        LOG.error("false negative for: " + inserted[i]);
+        fail();
+      }
+    }
+    
+    // Test membership for values we did not insert. It is possible to get
+    // false positives
+    
+    LOG.info("Checking for false positives");
+    for (int i = 0; i < notInserted.length; i++) {
+      if(bf.membershipTest(notInserted[i])) {
+        LOG.error("false positive for: " + notInserted[i]);
+        fail();
+      }
+    }
+    LOG.info("Success!");
   }
   
   /** Test a CountingBloomFilter



Mime
View raw message