Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 5301 invoked from network); 27 Aug 2007 23:35:50 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 27 Aug 2007 23:35:50 -0000 Received: (qmail 33448 invoked by uid 500); 27 Aug 2007 23:35:46 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 33424 invoked by uid 500); 27 Aug 2007 23:35:46 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 33415 invoked by uid 99); 27 Aug 2007 23:35:46 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 27 Aug 2007 16:35:46 -0700 X-ASF-Spam-Status: No, hits=-100.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 27 Aug 2007 23:35:38 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 7249C1A9832; Mon, 27 Aug 2007 16:35:18 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r570270 - in /lucene/hadoop/trunk/src/contrib/hbase: ./ src/java/org/apache/hadoop/hbase/ src/test/org/apache/hadoop/hbase/ Date: Mon, 27 Aug 2007 23:35:17 -0000 To: hadoop-commits@lucene.apache.org From: jimk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20070827233518.7249C1A9832@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jimk Date: Mon Aug 27 16:35:15 2007 New Revision: 570270 URL: http://svn.apache.org/viewvc?rev=570270&view=rev Log: HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom filter types Modified: 
lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original) +++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Mon Aug 27 16:35:15 2007 @@ -25,6 +25,8 @@ IMPROVEMENTS HADOOP-1737 Make HColumnDescriptor data publically members settable HADOOP-1746 Clean up findbugs warnings + HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom + filter types Below are the list of changes before 2007-08-18 Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java Mon Aug 27 16:35:15 2007 @@ -27,28 +27,46 @@ /** * Supplied as a parameter to HColumnDescriptor to specify what kind of - * bloom filter to use for a column, and its configuration parameters + * bloom filter to use for a column, and its configuration parameters. 
+ * + * There is no way to automatically determine the vector size and the number of + * hash functions to use. In particular, bloom filters are very sensitive to the + * number of elements inserted into them. For HBase, the number of entries + * depends on the size of the data stored in the column. Currently the default + * region size is 64MB, so the number of entries is approximately + * 64MB / (average value size for column). + * + * If m denotes the number of bits in the Bloom filter (vectorSize), + * n denotes the number of elements inserted into the Bloom filter and + * k represents the number of hash functions used (nbHash), then according to + * Broder and Mitzenmacher, + * + * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) + * + * the probability of false positives is minimized when k is approximately + * m/n ln(2). + * */ public class BloomFilterDescriptor implements WritableComparable { + private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0; /* * Specify the kind of bloom filter that will be instantiated */ - /** - * Bloom filter, as defined by Bloom in 1970. - */ - public static final int BLOOMFILTER = 1; - - /** - * counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. - */ - public static final int COUNTING_BLOOMFILTER = 2; - - /** - * retouched Bloom filter, as defined in the CoNEXT 2006 paper. - */ - public static final int RETOUCHED_BLOOMFILTER = 3; + /** The type of bloom filter */ + public static enum BloomFilterType { + /** Bloom filter, as defined by Bloom in 1970. */ + BLOOMFILTER, + /** + * Counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. + */ + COUNTING_BLOOMFILTER, + /** + * Retouched Bloom filter, as defined in the CoNEXT 2006 paper. 
+ */ + RETOUCHED_BLOOMFILTER + } /** Default constructor - used in conjunction with Writable */ public BloomFilterDescriptor() { @@ -56,11 +74,41 @@ } /** + * Creates a BloomFilterDescriptor for the specified type of filter, fixes + * the number of hash functions to 4 and computes a vector size using: + * + * vectorSize = ceil((4 * n) / ln(2)) + * + * @param type + * @param numberOfEntries + */ + public BloomFilterDescriptor(final BloomFilterType type, + final int numberOfEntries) { + + switch(type) { + case BLOOMFILTER: + case COUNTING_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: + this.filterType = type; + break; + + default: + throw new IllegalArgumentException("Invalid bloom filter type: " + type); + } + this.nbHash = (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS; + this.vectorSize = (int) Math.ceil( + (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * numberOfEntries)) / + Math.log(2.0)); + } + + /** * @param type The kind of bloom filter to use. * @param vectorSize The vector size of this filter. * @param nbHash The number of hash functions to consider. 
*/ - public BloomFilterDescriptor(int type, int vectorSize, int nbHash) { + public BloomFilterDescriptor(final BloomFilterType type, final int vectorSize, + final int nbHash) { + switch(type) { case BLOOMFILTER: case COUNTING_BLOOMFILTER: @@ -75,7 +123,7 @@ this.nbHash = nbHash; } - int filterType; + BloomFilterType filterType; int vectorSize; int nbHash; @@ -113,7 +161,7 @@ /** {@inheritDoc} */ @Override public int hashCode() { - int result = Integer.valueOf(this.filterType).hashCode(); + int result = this.filterType.hashCode(); result ^= Integer.valueOf(this.vectorSize).hashCode(); result ^= Integer.valueOf(this.nbHash).hashCode(); return result; @@ -123,14 +171,15 @@ /** {@inheritDoc} */ public void readFields(DataInput in) throws IOException { - filterType = in.readInt(); + int ordinal = in.readInt(); + this.filterType = BloomFilterType.values()[ordinal]; vectorSize = in.readInt(); nbHash = in.readInt(); } /** {@inheritDoc} */ public void write(DataOutput out) throws IOException { - out.writeInt(filterType); + out.writeInt(filterType.ordinal()); out.writeInt(vectorSize); out.writeInt(nbHash); } @@ -140,7 +189,7 @@ /** {@inheritDoc} */ public int compareTo(Object o) { BloomFilterDescriptor other = (BloomFilterDescriptor)o; - int result = this.filterType - other.filterType; + int result = this.filterType.ordinal() - other.filterType.ordinal(); if(result == 0) { result = this.vectorSize - other.vectorSize; Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java Mon Aug 
27 16:35:15 2007 @@ -31,6 +31,11 @@ /** * An HColumnDescriptor contains information about a column family such as the * number of versions, compression settings, etc. + * + * It is used as input when creating a table or adding a column. Once set, the + * parameters that specify a column cannot be changed without deleting the + * column and recreating it. If there is data stored in the column, it will be + * deleted when the column is deleted. */ public class HColumnDescriptor implements WritableComparable { Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HStore.java Mon Aug 27 16:35:15 2007 @@ -316,18 +316,21 @@ if (LOG.isDebugEnabled()) { LOG.debug("loading bloom filter for " + this.storeName); } + + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; - switch(family.getBloomFilter().filterType) { + switch(type) { - case BloomFilterDescriptor.BLOOMFILTER: + case BLOOMFILTER: bloomFilter = new BloomFilter(); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(); } FSDataInputStream in = fs.open(filterFile); @@ -339,20 +342,23 @@ LOG.debug("creating bloom filter for " + this.storeName); } - switch(family.getBloomFilter().filterType) { + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; + + switch(type) { - case BloomFilterDescriptor.BLOOMFILTER: + case 
BLOOMFILTER: bloomFilter = new BloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); Modified: lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java?rev=570270&r1=570269&r2=570270&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestBloomFilters.java Mon Aug 27 16:35:15 2007 @@ -19,18 +19,16 @@ */ package org.apache.hadoop.hbase; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; - +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; /** Tests per-column bloom filters */ public class TestBloomFilters extends HBaseClusterTestCase { + static final Log LOG = LogFactory.getLog(TestBloomFilters.class); + private static final Text CONTENTS = new Text("contents:"); - private HTableDescriptor desc = null; - private HTable table = null; - private static final Text[] rows = { new Text("wmjwjzyv"), new Text("baietibz"), @@ -144,28 +142,40 @@ /** constructor */ public TestBloomFilters() { super(); - conf.set("hbase.hregion.maxunflushed", "90"); // flush cache every 100 writes + conf.set("hbase.hregion.memcache.flush.size", "100");// flush cache every 100 bytes conf.set("hbase.regionserver.maxlogentries", 
"90"); // and roll log too - Logger.getLogger(HRegion.class).setLevel(Level.DEBUG); - Logger.getLogger(HStore.class).setLevel(Level.DEBUG); } - /** {@inheritDoc} */ - @Override - public void setUp() { + /** Test that specifies explicit parameters for the bloom filter */ + public void testExplicitParameters() { + HTable table = null; try { - super.setUp(); - this.desc = new HTableDescriptor("test"); + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( // if we insert 1000 values + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 12499, // number of bits + 4 // number of hash functions + ); + desc.addFamily( - new HColumnDescriptor(CONTENTS, 1, HColumnDescriptor.CompressionType.NONE, - false, Integer.MAX_VALUE, - new BloomFilterDescriptor( // if we insert 1000 values - BloomFilterDescriptor.BLOOMFILTER, // plain old bloom filter - 12499, // number of bits - 4 // number of hash functions - ))); // false positive = 0.0000001 + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + HBaseAdmin admin = new HBaseAdmin(conf); admin.createTable(desc); + + // Open table + table = new HTable(conf, desc.getName()); // Store some values @@ -181,10 +191,78 @@ e.printStackTrace(); fail(); } + try { + // Give cache flusher and log roller a chance to run + // Otherwise we'll never hit the bloom filter, just the memcache + Thread.sleep(conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000) * 2); + + } catch (InterruptedException e) { + // ignore + } + + + try { + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { + LOG.info("non existant key: 
" + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } } + + /** Test that uses computed parameters for the bloom filter */ + public void testComputedParameters() { + HTable table = null; + try { + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 1000 // estimated number of entries + ); + LOG.info("vector size: " + bloomFilter.vectorSize); + + desc.addFamily( + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + + HBaseAdmin admin = new HBaseAdmin(conf); + admin.createTable(desc); + + // Open table + + table = new HTable(conf, desc.getName()); + + // Store some values - /** the test */ - public void testBloomFilters() { + for(int i = 0; i < 100; i++) { + Text row = rows[i]; + String value = row.toString(); + long lockid = table.startUpdate(rows[i]); + table.put(lockid, CONTENTS, value.getBytes(HConstants.UTF8_ENCODING)); + table.commit(lockid); + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } try { // Give cache flusher and log roller a chance to run // Otherwise we'll never hit the bloom filter, just the memcache @@ -195,11 +273,13 @@ } try { - for(int i = 0; i < testKeys.length; i++) { - byte[] value = table.get(testKeys[i], CONTENTS); - if(value != null && value.length != 0) { - System.err.println("non existant key: " + testKeys[i] + - " returned value: " + new String(value, HConstants.UTF8_ENCODING)); + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { 
+ LOG.info("non existant key: " + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } } } } catch (Exception e) {