Return-Path: Delivered-To: apmail-hadoop-hbase-commits-archive@locus.apache.org Received: (qmail 51894 invoked from network); 14 Jul 2008 20:47:26 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 14 Jul 2008 20:47:26 -0000 Received: (qmail 84730 invoked by uid 500); 14 Jul 2008 20:47:26 -0000 Delivered-To: apmail-hadoop-hbase-commits-archive@hadoop.apache.org Received: (qmail 84717 invoked by uid 500); 14 Jul 2008 20:47:26 -0000 Mailing-List: contact hbase-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hbase-dev@hadoop.apache.org Delivered-To: mailing list hbase-commits@hadoop.apache.org Received: (qmail 84708 invoked by uid 99); 14 Jul 2008 20:47:26 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 14 Jul 2008 13:47:26 -0700 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 14 Jul 2008 20:46:32 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 161782388A06; Mon, 14 Jul 2008 13:46:56 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r676728 - in /hadoop/hbase/trunk: CHANGES.txt src/java/org/onelab/filter/BloomFilter.java src/java/org/onelab/filter/Filter.java src/test/org/onelab/test/TestFilter.java Date: Mon, 14 Jul 2008 20:46:55 -0000 To: hbase-commits@hadoop.apache.org From: jimk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20080714204656.161782388A06@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jimk Date: Mon Jul 14 13:46:55 2008 New Revision: 676728 URL: http://svn.apache.org/viewvc?rev=676728&view=rev Log: HBASE-744 BloomFilter serialization/deserialization broken Modified: hadoop/hbase/trunk/CHANGES.txt hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java Modified: hadoop/hbase/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=676728&r1=676727&r2=676728&view=diff ============================================================================== --- hadoop/hbase/trunk/CHANGES.txt (original) +++ hadoop/hbase/trunk/CHANGES.txt Mon Jul 14 13:46:55 2008 @@ -183,6 +183,7 @@ HBASE-742 Rename getMetainfo in HTable as getTableDescriptor HBASE-739 HBaseAdmin.createTable() using old HTableDescription doesn't work (Izaak Rubin via Stack) + HBASE-744 BloomFilter serialization/deserialization broken IMPROVEMENTS HBASE-559 MR example job to count table rows Modified: hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java?rev=676728&r1=676727&r2=676728&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java (original) +++ hadoop/hbase/trunk/src/java/org/onelab/filter/BloomFilter.java Mon Jul 14 13:46:55 2008 @@ -226,6 +226,7 @@ @Override public void readFields(DataInput in) throws IOException { super.readFields(in); + bits = new BitSet(this.vectorSize); byte[] bytes = new byte[getNBytes()]; in.readFully(bytes); for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { @@ -233,9 +234,6 @@ bitIndex = 0; byteIndex++; } - if (bitIndex == 0) { - bytes[byteIndex] = 0; - } if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { bits.set(i); } Modified: hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java?rev=676728&r1=676727&r2=676728&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java (original) +++ hadoop/hbase/trunk/src/java/org/onelab/filter/Filter.java Mon Jul 14 13:46:55 2008 @@ -76,13 +76,13 @@ */ public abstract class Filter implements Writable { /** The vector size of this filter. */ - int vectorSize; + protected int vectorSize; /** The hash function used to map a key to several positions in the vector. */ protected HashFunction hash; /** The number of hash function to consider. */ - int nbHash; + protected int nbHash; protected Filter() {} Modified: hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java?rev=676728&r1=676727&r2=676728&view=diff ============================================================================== --- hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java (original) +++ hadoop/hbase/trunk/src/test/org/onelab/test/TestFilter.java Mon Jul 14 13:46:55 2008 @@ -48,9 +48,16 @@ */ package org.onelab.test; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; import java.io.UnsupportedEncodingException; import junit.framework.TestCase; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.onelab.filter.*; /** @@ -61,22 +68,196 @@ * @version 1.0 - 8 Feb. 07 */ public class TestFilter extends TestCase { + private static final Log LOG = LogFactory.getLog(TestFilter.class); /** Test a BloomFilter * @throws UnsupportedEncodingException + * @throws IOException */ - public void testBloomFilter() throws UnsupportedEncodingException { - Filter bf = new BloomFilter(8, 2); - Key key = new StringKey("toto"); - Key k2 = new StringKey("lulu"); - Key k3 = new StringKey("mama"); - bf.add(key); - bf.add(k2); - bf.add(k3); - assertTrue(bf.membershipTest(key)); - assertTrue(bf.membershipTest(new StringKey("graknyl"))); - assertFalse(bf.membershipTest(new StringKey("xyzzy"))); - assertFalse(bf.membershipTest(new StringKey("abcd"))); + public void testBloomFilter() throws UnsupportedEncodingException, + IOException { + final StringKey[] inserted = { + new StringKey("wmjwjzyv"), + new StringKey("baietibz"), + new StringKey("guhsgxnv"), + new StringKey("mhnqycto"), + new StringKey("xcyqafgz"), + new StringKey("zidoamgb"), + new StringKey("tftfirzd"), + new StringKey("okapqlrg"), + new StringKey("yccwzwsq"), + new StringKey("qmonufqu"), + new StringKey("wlsctews"), + new StringKey("mksdhqri"), + new StringKey("wxxllokj"), + new StringKey("eviuqpls"), + new StringKey("bavotqmj"), + new StringKey("yibqzhdl"), + new StringKey("csfqmsyr"), + new StringKey("guxliyuh"), + new StringKey("pzicietj"), + new StringKey("qdwgrqwo"), + new StringKey("ujfzecmi"), + new StringKey("dzeqfvfi"), + new StringKey("phoegsij"), + new StringKey("bvudfcou"), + new StringKey("dowzmciz"), + new StringKey("etvhkizp"), + new StringKey("rzurqycg"), + new StringKey("krqfxuge"), + new StringKey("gflcohtd"), + new StringKey("fcrcxtps"), + new StringKey("qrtovxdq"), + new StringKey("aypxwrwi"), + new StringKey("dckpyznr"), + new StringKey("mdaawnpz"), + new StringKey("pakdfvca"), + new StringKey("xjglfbez"), + new StringKey("xdsecofi"), + new StringKey("sjlrfcab"), + new StringKey("ebcjawxv"), + new StringKey("hkafkjmy"), + new StringKey("oimmwaxo"), + new StringKey("qcuzrazo"), + new StringKey("nqydfkwk"), + new StringKey("frybvmlb"), + new StringKey("amxmaqws"), + new StringKey("gtkovkgx"), + new StringKey("vgwxrwss"), + new StringKey("xrhzmcep"), + new StringKey("tafwziil"), + new StringKey("erjmncnv"), + new StringKey("heyzqzrn"), + new StringKey("sowvyhtu"), + new StringKey("heeixgzy"), + new StringKey("ktcahcob"), + new StringKey("ljhbybgg"), + new StringKey("jiqfcksl"), + new StringKey("anjdkjhm"), + new StringKey("uzcgcuxp"), + new StringKey("vzdhjqla"), + new StringKey("svhgwwzq"), + new StringKey("zhswvhbp"), + new StringKey("ueceybwy"), + new StringKey("czkqykcw"), + new StringKey("ctisayir"), + new StringKey("hppbgciu"), + new StringKey("nhzgljfk"), + new StringKey("vaziqllf"), + new StringKey("narvrrij"), + new StringKey("kcevbbqi"), + new StringKey("qymuaqnp"), + new StringKey("pwqpfhsr"), + new StringKey("peyeicuk"), + new StringKey("kudlwihi"), + new StringKey("pkmqejlm"), + new StringKey("ylwzjftl"), + new StringKey("rhqrlqar"), + new StringKey("xmftvzsp"), + new StringKey("iaemtihk"), + new StringKey("ymsbrqcu"), + new StringKey("yfnlcxto"), + new StringKey("nluqopqh"), + new StringKey("wmrzhtox"), + new StringKey("qnffhqbl"), + new StringKey("zypqpnbw"), + new StringKey("oiokhatd"), + new StringKey("mdraddiu"), + new StringKey("zqoatltt"), + new StringKey("ewhulbtm"), + new StringKey("nmswpsdf"), + new StringKey("xsjeteqe"), + new StringKey("ufubcbma"), + new StringKey("phyxvrds"), + new StringKey("vhnfldap"), + new StringKey("zrrlycmg"), + new StringKey("becotcjx"), + new StringKey("wvbubokn"), + new StringKey("avkgiopr"), + new StringKey("mbqqxmrv"), + new StringKey("ibplgvuu"), + new StringKey("dghvpkgc") + }; + + final StringKey[] notInserted = { + new StringKey("abcdefgh"), + new StringKey("ijklmnop"), + new StringKey("qrstuvwx"), + new StringKey("yzabcdef") + }; + + /* + * Bloom filters are very sensitive to the number of elements inserted into + * them. + * + * If m denotes the number of bits in the Bloom filter (vectorSize), + * n denotes the number of elements inserted into the Bloom filter and + * k represents the number of hash functions used (nbHash), then + * according to Broder and Mitzenmacher, + * + * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) + * + * the probability of false positives is minimized when k is + * approximately ln(2) * m/n. + * + * If we fix the number of hash functions and know the number of entries, + * then the optimal vector size m = (k * n) / ln(2) + */ + final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4; + BloomFilter bf = new BloomFilter( + (int) Math.ceil( + (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) / + Math.log(2.0)), + DEFAULT_NUMBER_OF_HASH_FUNCTIONS + ); + + for (int i = 0; i < inserted.length; i++) { + bf.add(inserted[i]); + } + + // Verify that there are no false negatives and few (if any) false positives + + checkFalsePositivesNegatives(bf, inserted, notInserted); + + // Test serialization/deserialization + + LOG.info("Checking serialization/deserialization"); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(baos); + bf.write(out); + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + DataInputStream in = new DataInputStream(bais); + bf = new BloomFilter(); + bf.readFields(in); + + // Verify that there are no false negatives and few (if any) false positives + + checkFalsePositivesNegatives(bf, inserted, notInserted); + } + + private void checkFalsePositivesNegatives(BloomFilter bf, + StringKey[] inserted, StringKey[] notInserted) { + // Test membership for values we inserted. Should not get false negatives + + LOG.info("Checking for false negatives"); + for (int i = 0; i < inserted.length; i++) { + if (!bf.membershipTest(inserted[i])) { + LOG.error("false negative for: " + inserted[i]); + fail(); + } + } + + // Test membership for values we did not insert. It is possible to get + // false positives + + LOG.info("Checking for false positives"); + for (int i = 0; i < notInserted.length; i++) { + if(bf.membershipTest(notInserted[i])) { + LOG.error("false positive for: " + notInserted[i]); + fail(); + } + } + LOG.info("Success!"); } /** Test a CountingBloomFilter