Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 5301 invoked from network); 27 Aug 2007 23:35:50 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 27 Aug 2007 23:35:50 -0000 Received: (qmail 33448 invoked by uid 500); 27 Aug 2007 23:35:46 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 33424 invoked by uid 500); 27 Aug 2007 23:35:46 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 33415 invoked by uid 99); 27 Aug 2007 23:35:46 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 27 Aug 2007 16:35:46 -0700 X-ASF-Spam-Status: No, hits=-100.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 27 Aug 2007 23:35:38 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 7249C1A9832; Mon, 27 Aug 2007 16:35:18 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r570270 - in /lucene/hadoop/trunk/src/contrib/hbase: ./ src/java/org/apache/hadoop/hbase/ src/test/org/apache/hadoop/hbase/ Date: Mon, 27 Aug 2007 23:35:17 -0000 To: hadoop-commits@lucene.apache.org From: jimk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20070827233518.7249C1A9832@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jimk Date: Mon Aug 27 16:35:15 2007 New Revision: 570270 URL: http://svn.apache.org/viewvc?rev=570270&view=rev Log: HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom filter types Modified: 
lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original) +++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Mon Aug 27 16:35:15 2007 @@ -25,6 +25,8 @@ IMPROVEMENTS HADOOP-1737 Make HColumnDescriptor data publically members settable HADOOP-1746 Clean up findbugs warnings + HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom + filter types Below are the list of changes before 2007-08-18 Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java Mon Aug 27 16:35:15 2007 @@ -27,28 +27,46 @@ /** * Supplied as a parameter to HColumnDescriptor to specify what kind of - * bloom filter to use for a column, and its configuration parameters + * bloom filter to use for a column, and its configuration parameters. 
+ * + * There is no way to automatically determine the vector size and the number of + * hash functions to use. In particular, bloom filters are very sensitive to the + * number of elements inserted into them. For HBase, the number of entries + * depends on the size of the data stored in the column. Currently the default + * region size is 64MB, so the number of entries is approximately + * 64MB / (average value size for column). + * + * If m denotes the number of bits in the Bloom filter (vectorSize), + * n denotes the number of elements inserted into the Bloom filter and + * k represents the number of hash functions used (nbHash), then according to + * Broder and Mitzenmacher, + * + * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) + * + * the probability of false positives is minimized when k is approximately + * m/n ln(2). + * */ public class BloomFilterDescriptor implements WritableComparable { + private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0; /* * Specify the kind of bloom filter that will be instantiated */ - /** - * Bloom filter, as defined by Bloom in 1970. - */ - public static final int BLOOMFILTER = 1; - - /** - * counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. - */ - public static final int COUNTING_BLOOMFILTER = 2; - - /** - * retouched Bloom filter, as defined in the CoNEXT 2006 paper. - */ - public static final int RETOUCHED_BLOOMFILTER = 3; + /** The type of bloom filter */ + public static enum BloomFilterType { + /** Bloom filter, as defined by Bloom in 1970. */ + BLOOMFILTER, + /** + * Counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. + */ + COUNTING_BLOOMFILTER, + /** + * Retouched Bloom filter, as defined in the CoNEXT 2006 paper. 
+ */ + RETOUCHED_BLOOMFILTER + } /** Default constructor - used in conjunction with Writable */ public BloomFilterDescriptor() { @@ -56,11 +74,41 @@ } /** + * Creates a BloomFilterDescriptor for the specified type of filter, fixes + * the number of hash functions to 4 and computes a vector size using: + * + * vectorSize = ceil((4 * n) / ln(2)) + * + * @param type + * @param numberOfEntries + */ + public BloomFilterDescriptor(final BloomFilterType type, + final int numberOfEntries) { + + switch(type) { + case BLOOMFILTER: + case COUNTING_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: + this.filterType = type; + break; + + default: + throw new IllegalArgumentException("Invalid bloom filter type: " + type); + } + this.nbHash = (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS; + this.vectorSize = (int) Math.ceil( + (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * numberOfEntries)) / + Math.log(2.0)); + } + + /** * @param type The kind of bloom filter to use. * @param vectorSize The vector size of this filter. * @param nbHash The number of hash functions to consider. 
*/ - public BloomFilterDescriptor(int type, int vectorSize, int nbHash) { + public BloomFilterDescriptor(final BloomFilterType type, final int vectorSize, + final int nbHash) { + switch(type) { case BLOOMFILTER: case COUNTING_BLOOMFILTER: @@ -75,7 +123,7 @@ this.nbHash = nbHash; } - int filterType; + BloomFilterType filterType; int vectorSize; int nbHash; @@ -113,7 +161,7 @@ /** {@inheritDoc} */ @Override public int hashCode() { - int result = Integer.valueOf(this.filterType).hashCode(); + int result = this.filterType.hashCode(); result ^= Integer.valueOf(this.vectorSize).hashCode(); result ^= Integer.valueOf(this.nbHash).hashCode(); return result; @@ -123,14 +171,15 @@ /** {@inheritDoc} */ public void readFields(DataInput in) throws IOException { - filterType = in.readInt(); + int ordinal = in.readInt(); + this.filterType = BloomFilterType.values()[ordinal]; vectorSize = in.readInt(); nbHash = in.readInt(); } /** {@inheritDoc} */ public void write(DataOutput out) throws IOException { - out.writeInt(filterType); + out.writeInt(filterType.ordinal()); out.writeInt(vectorSize); out.writeInt(nbHash); } @@ -140,7 +189,7 @@ /** {@inheritDoc} */ public int compareTo(Object o) { BloomFilterDescriptor other = (BloomFilterDescriptor)o; - int result = this.filterType - other.filterType; + int result = this.filterType.ordinal() - other.filterType.ordinal(); if(result == 0) { result = this.vectorSize - other.vectorSize; Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java Mon Aug 
27 16:35:15 2007 @@ -31,6 +31,11 @@ /** * An HColumnDescriptor contains information about a column family such as the * number of versions, compression settings, etc. + * + * It is used as input when creating a table or adding a column. Once set, the + * parameters that specify a column cannot be changed without deleting the + * column and recreating it. If there is data stored in the column, it will be + * deleted when the column is deleted. */ public class HColumnDescriptor implements WritableComparable { Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java Mon Aug 27 16:35:15 2007 @@ -316,18 +316,21 @@ if (LOG.isDebugEnabled()) { LOG.debug("loading bloom filter for " + this.storeName); } + + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; - switch(family.getBloomFilter().filterType) { + switch(type) { - case BloomFilterDescriptor.BLOOMFILTER: + case BLOOMFILTER: bloomFilter = new BloomFilter(); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(); } FSDataInputStream in = fs.open(filterFile); @@ -339,20 +342,23 @@ LOG.debug("creating bloom filter for " + this.storeName); } - switch(family.getBloomFilter().filterType) { + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; + + switch(type) { - case BloomFilterDescriptor.BLOOMFILTER: + case 
BLOOMFILTER: bloomFilter = new BloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); Modified: lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java Mon Aug 27 16:35:15 2007 @@ -19,18 +19,16 @@ */ package org.apache.hadoop.hbase; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; - +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; /** Tests per-column bloom filters */ public class TestBloomFilters extends HBaseClusterTestCase { + static final Log LOG = LogFactory.getLog(TestBloomFilters.class); + private static final Text CONTENTS = new Text("contents:"); - private HTableDescriptor desc = null; - private HTable table = null; - private static final Text[] rows = { new Text("wmjwjzyv"), new Text("baietibz"), @@ -144,28 +142,40 @@ /** constructor */ public TestBloomFilters() { super(); - conf.set("hbase.hregion.maxunflushed", "90"); // flush cache every 100 writes + conf.set("hbase.hregion.memcache.flush.size", "100");// flush cache every 100 bytes conf.set("hbase.regionserver.maxlogentries", 
"90"); // and roll log too - Logger.getLogger(HRegion.class).setLevel(Level.DEBUG); - Logger.getLogger(HStore.class).setLevel(Level.DEBUG); } - /** {@inheritDoc} */ - @Override - public void setUp() { + /** Test that specifies explicit parameters for the bloom filter */ + public void testExplicitParameters() { + HTable table = null; try { - super.setUp(); - this.desc = new HTableDescriptor("test"); + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( // if we insert 1000 values + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 12499, // number of bits + 4 // number of hash functions + ); + desc.addFamily( - new HColumnDescriptor(CONTENTS, 1, HColumnDescriptor.CompressionType.NONE, - false, Integer.MAX_VALUE, - new BloomFilterDescriptor( // if we insert 1000 values - BloomFilterDescriptor.BLOOMFILTER, // plain old bloom filter - 12499, // number of bits - 4 // number of hash functions - ))); // false positive = 0.0000001 + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + HBaseAdmin admin = new HBaseAdmin(conf); admin.createTable(desc); + + // Open table + table = new HTable(conf, desc.getName()); // Store some values @@ -181,10 +191,78 @@ e.printStackTrace(); fail(); } + try { + // Give cache flusher and log roller a chance to run + // Otherwise we'll never hit the bloom filter, just the memcache + Thread.sleep(conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000) * 2); + + } catch (InterruptedException e) { + // ignore + } + + + try { + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { + LOG.info("non existant key: 
" + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } } + + /** Test that uses computed parameters for the bloom filter */ + public void testComputedParameters() { + HTable table = null; + try { + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 1000 // estimated number of entries + ); + LOG.info("vector size: " + bloomFilter.vectorSize); + + desc.addFamily( + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + + HBaseAdmin admin = new HBaseAdmin(conf); + admin.createTable(desc); + + // Open table + + table = new HTable(conf, desc.getName()); + + // Store some values - /** the test */ - public void testBloomFilters() { + for(int i = 0; i < 100; i++) { + Text row = rows[i]; + String value = row.toString(); + long lockid = table.startUpdate(rows[i]); + table.put(lockid, CONTENTS, value.getBytes(HConstants.UTF8_ENCODING)); + table.commit(lockid); + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } try { // Give cache flusher and log roller a chance to run // Otherwise we'll never hit the bloom filter, just the memcache @@ -195,11 +273,13 @@ } try { - for(int i = 0; i < testKeys.length; i++) { - byte[] value = table.get(testKeys[i], CONTENTS); - if(value != null && value.length != 0) { - System.err.println("non existant key: " + testKeys[i] + - " returned value: " + new String(value, HConstants.UTF8_ENCODING)); + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { 
+ LOG.info("non existant key: " + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } } } } catch (Exception e) {