hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From li...@apache.org
Subject svn commit: r1546425 - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/client/ main/java/org/apache/hadoop/hbase/io/ main/java/org/apache/hadoop/hbase/io/hfile/ main/java/org/apache/hadoop/hbase/io/h...
Date Thu, 28 Nov 2013 18:13:20 GMT
Author: liyin
Date: Thu Nov 28 18:13:19 2013
New Revision: 1546425

URL: http://svn.apache.org/r1546425
Log:
[HBASE-9815] Add Histogram representative of row key distribution inside a region.

Author: manukranthk

Summary: Using Histogram of row key distribution inside a region, we can perform cost estimation of various scan operations and pro-actively optimize the parallelism of the scan operations.

Test Plan: Unit Tests

Reviewers: rshroff, aaiyer, liyintang

Reviewed By: liyintang

CC: hbase-eng@, san, liyintang, adela, gauravm

Differential Revision: https://phabricator.fb.com/D1004829

Task ID: 2905536

Added:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HFileHistogram.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HiveBasedNumericHistogram.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/NumericHistogram.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/UniformSplitHFileHistogram.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHFileHistogramE2E.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHiveBasedNumericHistogram.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestUniformSplitHistogram.java
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HTable.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/Scan.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/HbaseObjectWritable.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/HFile.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Store.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Bytes.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java Thu Nov 28 18:13:19 2013
@@ -900,6 +900,8 @@ public final class HConstants {
 
   public static final String CLIENT_SIDE_SCAN = "hbase.client.side.scan";
   public static final boolean DEFAULT_CLIENT_SIDE_SCAN = false;
+  public static final String USE_HFILEHISTOGRAM = "hbase.client.hfilehistogram.enabled";
+  public static final boolean DEFAULT_USE_HFILEHISTOGRAM = true;
 
   private HConstants() {
     // Can't be instantiated with this constructor.

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HTable.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HTable.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HTable.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HTable.java Thu Nov 28 18:13:19 2013
@@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.UnknownSc
 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
 import org.apache.hadoop.hbase.io.hfile.Compression;
 import org.apache.hadoop.hbase.io.hfile.PreloadThreadPool;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.Bucket;
 import org.apache.hadoop.hbase.ipc.HBaseRPCOptions;
 import org.apache.hadoop.hbase.ipc.ProfilingData;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -1490,4 +1491,53 @@ public class HTable implements HTableInt
   public void endBatchedLoad() throws IOException {
     connection.endBatchedLoad(tableName, this.options);
   }
+
+  /**
+   * Returns the List of buckets which represent the histogram for the region
+   * the row belongs to.
+   * Some notes regarding the buckets :
+   * The Bucket boundaries may not align with the boundaries of the Region.
+   * The Bucket Boundaries will look as follows :
+   * [0x00,0x00, ... 0x00] -> [some byte array] -> ... -> [some byte array]
+   * -> [0xff, 0xff, ... 0xff]
+   *
+   * @param row
+   * @return will be either null or at least will contain
+   * one element
+   * @throws IOException
+   */
+  public List<Bucket> getHistogram(final byte[] row) throws IOException {
+    return this.getConnectionAndResetOperationContext()
+        .getRegionServerWithRetries(
+        new ServerCallable<List<Bucket>>(connection,
+            tableName, row, this.options) {
+          public List<Bucket> call() throws IOException {
+            return server.getHistogram(
+                location.getRegionInfo().getRegionName());
+          }
+        }
+    );
+  }
+
+  /**
+   * Returns the List of buckets which represent the histogram for the column
+   * family in the region the row belongs to.
+   * Also see {@link #getHistogram(byte[])}
+   * @param row
+   * @return
+   * @throws IOException
+   */
+  public List<Bucket> getHistogramForColumnFamily(final byte[] row,
+      final byte[] cf) throws IOException {
+    return this.getConnectionAndResetOperationContext()
+        .getRegionServerWithRetries(
+            new ServerCallable<List<Bucket>>(connection,
+                tableName, row, this.options) {
+              public List<Bucket> call() throws IOException {
+                return server.getHistogramForStore(
+                    location.getRegionInfo().getRegionName(), cf);
+              }
+        }
+    );
+  }
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/Scan.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/Scan.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/Scan.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/Scan.java Thu Nov 28 18:13:19 2013
@@ -26,7 +26,6 @@ import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.filter.Filter;
 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
 import org.apache.hadoop.hbase.io.TimeRange;
-import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableFactories;

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/HbaseObjectWritable.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/HbaseObjectWritable.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/HbaseObjectWritable.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/HbaseObjectWritable.java Thu Nov 28 18:13:19 2013
@@ -71,6 +71,7 @@ import org.apache.hadoop.hbase.filter.Sk
 import org.apache.hadoop.hbase.filter.ValueFilter;
 import org.apache.hadoop.hbase.filter.WhileMatchFilter;
 import org.apache.hadoop.hbase.filter.WritableByteArrayComparable;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram;
 import org.apache.hadoop.hbase.ipc.HBaseRPCOptions;
 import org.apache.hadoop.hbase.ipc.ProfilingData;
 import org.apache.hadoop.hbase.master.AssignmentPlan;
@@ -216,6 +217,7 @@ public class HbaseObjectWritable impleme
 
     addToMap(MultiAction.class, code++);
     addToMap(MultiResponse.class, code++);
+    addToMap(HFileHistogram.Bucket.class, code++);
   }
 
   private Class<?> declaredClass;

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/HFile.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/HFile.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/HFile.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/HFile.java Thu Nov 28 18:13:19 2013
@@ -27,7 +27,6 @@ import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
@@ -392,6 +391,8 @@ public class HFile {
   /** The configuration key for HFile version to use for new files */
   public static final String FORMAT_VERSION_KEY = "hfile.format.version";
 
+  public static final String HFILEHISTOGRAM_METABLOCK = "hfile.histogram.metaentry";
+
   public static int getFormatVersion(Configuration conf) {
     int version = conf.getInt(FORMAT_VERSION_KEY, MAX_FORMAT_VERSION);
     checkFormatVersion(version);

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HFileHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HFileHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HFileHistogram.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HFileHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,249 @@
+/**
+ * Copyright The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Captures histogram of statistics about the distribution of rows in the HFile.
+ * This needs to be serialized onto the HFile.
+ */
+public interface HFileHistogram {
+  /**
+   * This enum provides the set of additional stats the Histogram can store.
+   * (TODO) manukranthk : Integrate HFileStats.
+   */
+  public static enum HFileStat {
+    KEYVALUECOUNT
+  }
+
+  public final String HFILEHISTOGRAM_BINCOUNT = "hfile.histogram.bin.count";
+  public final int DEFAULT_HFILEHISTOGRAM_BINCOUNT = 100;
+
+  public static class Bucket implements Writable {
+    private byte[] startRow;
+    private byte[] endRow;
+    private double numRows;
+    private Map<HFileStat, Double> hfileStats;
+
+    private Bucket(byte[] startRow, byte[] endRow, double numRows,
+        Map<HFileStat, Double> hfileStats) {
+      this.startRow = startRow;
+      this.endRow = endRow;
+      this.numRows = numRows;
+      this.hfileStats = hfileStats;
+    }
+
+    public Bucket() {
+    }
+
+    public double getCount() {
+      return numRows;
+    }
+
+    /**
+     * Returns the number of key values that this bucket holds.
+     *
+     * @return
+     */
+    public double getNumKvs() {
+      return this.hfileStats.get(HFileStat.KEYVALUECOUNT);
+    }
+
+    /**
+     * @return returns a copy of the endRow
+     */
+    public byte[] getEndRow() {
+      return Bytes.copyOfByteArray(this.endRow);
+    }
+
+    /**
+     * @return returns a copy of last
+     */
+    public byte[] getStartRow() {
+      return Bytes.copyOfByteArray(this.startRow);
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      this.startRow = Bytes.readByteArray(in);
+      this.endRow = Bytes.readByteArray(in);
+      this.numRows = in.readDouble();
+      int numStats = in.readInt();
+      this.hfileStats = new TreeMap<HFileStat, Double>();
+      for (int i = 0; i < numStats; i++) {
+        String ordinal = Bytes.toString(Bytes.readByteArray(in));
+        double val = in.readDouble();
+        hfileStats.put(HFileStat.valueOf(ordinal), val);
+      }
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      Bytes.writeByteArray(out, this.startRow);
+      Bytes.writeByteArray(out, this.endRow);
+      out.writeDouble(numRows);
+      out.writeInt(this.hfileStats.size());
+      for (Entry<HFileStat, Double> entry : hfileStats.entrySet()) {
+        Bytes.writeByteArray(out, Bytes.toBytes(entry.getKey().name()));
+        out.writeDouble(entry.getValue());
+      }
+    }
+
+    public String print() {
+      StringBuilder sb = new StringBuilder(3 * this.startRow.length);
+      sb.append("Bucket : ");
+      sb.append(" , startRow : ");
+      sb.append(Bytes.toStringBinary(this.startRow));
+      sb.append(" , endRow : ");
+      sb.append(Bytes.toStringBinary(this.endRow));
+      sb.append(" , count : ");
+      sb.append(this.numRows);
+      return sb.toString();
+    }
+
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + Arrays.hashCode(endRow);
+      long temp;
+      temp = Double.doubleToLongBits(numRows);
+      result = prime * result + (int) (temp ^ (temp >>> 32));
+      result = prime * result + Arrays.hashCode(startRow);
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj)
+        return true;
+      if (obj == null)
+        return false;
+      if (getClass() != obj.getClass())
+        return false;
+      Bucket other = (Bucket) obj;
+      if (!Arrays.equals(endRow, other.endRow))
+        return false;
+      if (Double.doubleToLongBits(numRows) != Double
+          .doubleToLongBits(other.numRows))
+        return false;
+      if (!Arrays.equals(startRow, other.startRow))
+        return false;
+      return true;
+    }
+
+    public static class Builder {
+      private byte[] startRow;
+      private byte[] endRow;
+      private double numRows;
+      private Map<HFileStat, Double> hfileStats;
+
+      public Builder() {
+        this.hfileStats = new HashMap<HFileStat, Double>();
+      }
+
+      public Builder setStartRow(byte[] startRow) {
+        this.startRow = Bytes.copyOfByteArray(startRow);
+        return this;
+      }
+
+      public Builder setEndRow(byte[] endRow) {
+        this.endRow = Bytes.copyOfByteArray(endRow);
+        return this;
+      }
+
+      public Builder setNumRows(double numRows) {
+        this.numRows = numRows;
+        return this;
+      }
+
+      public Builder addHFileStat(HFileStat stat, Double count) {
+        this.hfileStats.put(stat, count);
+        return this;
+      }
+
+      public Bucket create() {
+        return new Bucket(this.startRow, this.endRow, this.numRows,
+            this.hfileStats);
+      }
+    }
+  }
+
+  /**
+   * Adds a row to the Histogram.
+   *
+   * @param kv
+   */
+  public void add(KeyValue kv);
+
+  /**
+   * Gets the set of Buckets from the Histogram. The buckets will be a
+   * representation of the Equi-Depth histogram stored inside the Region.
+   *
+   * @return
+   */
+  public List<Bucket> getUniformBuckets();
+
+  /**
+   * Serializes the Histogram to he written onto the HFile and stored in the
+   * meta block.
+   *
+   * @return
+   */
+  public Writable serialize();
+
+  /**
+   * Composes a list of HFileHistograms and returns a HFileHistogram which is a
+   * merge of all the given Histograms. Assumes that the HFileHistogram objects
+   * in the list are of the same type as this object.
+   *
+   * @param histograms
+   * @return
+   */
+  public HFileHistogram compose(List<HFileHistogram> histograms);
+
+  /**
+   * Method to deserialize the histogram from the HFile. This is the inverse of
+   * the serialize function.
+   *
+   * @param buf
+   * @return
+   * @throws IOException
+   */
+  public HFileHistogram deserialize(ByteBuffer buf) throws IOException;
+
+  public HFileHistogram merge(HFileHistogram h2);
+
+  public int getBinCount();
+}

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HiveBasedNumericHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HiveBasedNumericHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HiveBasedNumericHistogram.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/HiveBasedNumericHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,465 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * Adapted from NumericHistogram from hive
+ * Reference : hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/
+ *             generic/NumericHistogram.java
+ *
+ */
+
+/**
+ * A generic, re-usable histogram class that supports partial aggregations. The
+ * algorithm is a heuristic adapted from the following paper: Yael Ben-Haim and
+ * Elad Tom-Tov, "A streaming parallel decision tree algorithm", J. Machine
+ * Learning Research 11 (2010), pp. 849--872. Although there are no
+ * approximation guarantees, it appears to work well with adequate data and a
+ * large (e.g., 20-80) number of histogram bins.
+ */
+public class HiveBasedNumericHistogram implements NumericHistogram {
+  /**
+   * The Coord class defines a histogram bin, which is just an (x,y) pair.
+   */
+  protected static class Coord implements Comparable<Coord> {
+    double x;
+    double y;
+    Map<Enum<?>, Double> stats;
+
+    public int compareTo(Coord o) {
+      if (x < o.x) {
+        return -1;
+      }
+      if (x > o.x) {
+        return 1;
+      }
+      return 0;
+    }
+  };
+
+  // Class variables
+  private int nbins;
+  private int nusedbins;
+  private ArrayList<Coord> bins;
+  private Random prng;
+  private final double minusInfinity;
+  private final double infinity;
+
+  /**
+   * Creates a new histogram object. Note that the allocate() or merge() method
+   * must be called before the histogram can be used.
+   */
+  public HiveBasedNumericHistogram(double minusInfinity, double infinity) {
+    nbins = 0;
+    nusedbins = 0;
+    bins = null;
+    this.minusInfinity = minusInfinity;
+    this.infinity = infinity;
+
+    // init the RNG for breaking ties in histogram merging. A fixed seed is
+    // specified here
+    // to aid testing, but can be eliminated to use a time-based seed (which
+    // would
+    // make the algorithm non-deterministic).
+    prng = new Random(31183);
+  }
+
+  protected HiveBasedNumericHistogram(double minusInfinity2, double infinity2,
+      List<Bucket> nbuckets) {
+    this.minusInfinity = minusInfinity2;
+    this.infinity = infinity2;
+
+  }
+
+  /**
+   * Resets a histogram object to its initial state. allocate() or merge() must
+   * be called again before use.
+   */
+  public void reset() {
+    bins = null;
+    nbins = nusedbins = 0;
+  }
+
+  /**
+   * Returns the number of bins currently being used by the histogram.
+   */
+  public int getUsedBins() {
+    return nusedbins;
+  }
+
+  /**
+   * Returns true if this histogram object has been initialized by calling
+   * merge() or allocate().
+   */
+  public boolean isReady() {
+    return nbins != 0;
+  }
+
+  /**
+   * Returns a particular histogram bin.
+   */
+  public Coord getBin(int b) {
+    return bins.get(b);
+  }
+
+  /**
+   * Sets the number of histogram bins to use for approximating data.
+   *
+   * @param num_bins
+   *          Number of non-uniform-width histogram bins to use
+   */
+  public void allocate(int numBins) {
+    nbins = numBins;
+    bins = new ArrayList<Coord>();
+    nusedbins = 0;
+  }
+
+  /**
+   * Takes a serialized histogram created by the serialize() method and merges
+   * it with the current histogram object.
+   *
+   * @param other
+   *          A serialized histogram created by the serialize() method
+   * @see #merge
+   */
+  public NumericHistogram merge(NumericHistogram hist) {
+    Preconditions.checkNotNull(hist);
+    Preconditions.checkArgument(hist instanceof HiveBasedNumericHistogram);
+    HiveBasedNumericHistogram other = (HiveBasedNumericHistogram)hist;
+
+    if (nbins == 0 || nusedbins == 0) {
+      // Our aggregation buffer has nothing in it, so just copy over 'other'
+      // by deserializing the ArrayList of (x,y) pairs into an array of Coord
+      // objects
+      nbins = other.nbins;
+      nusedbins = other.nusedbins;
+      bins = new ArrayList<Coord>(nusedbins);
+      for (int i = 0; i < other.nusedbins; i++) {
+        Coord bin = new Coord();
+        bin.x = other.bins.get(i).x;
+        bin.y = other.bins.get(i).y;
+        bins.add(bin);
+      }
+    } else {
+      // The aggregation buffer already contains a partial histogram. Therefore,
+      // we need
+      // to merge histograms using Algorithm #2 from the Ben-Haim and Tom-Tov
+      // paper.
+
+      ArrayList<Coord> tmp_bins = new ArrayList<Coord>(nusedbins
+          + other.nusedbins);
+      // Copy all the histogram bins from us and 'other' into an overstuffed
+      // histogram
+      for (int i = 0; i < nusedbins; i++) {
+        Coord bin = new Coord();
+        bin.x = bins.get(i).x;
+        bin.y = bins.get(i).y;
+        tmp_bins.add(bin);
+      }
+      for (int j = 0; j < other.nusedbins; j++) {
+        Coord bin = new Coord();
+        bin.x = other.bins.get(j).x;
+        bin.y = other.bins.get(j).y;
+        tmp_bins.add(bin);
+      }
+      Collections.sort(tmp_bins);
+
+      // Now trim the overstuffed histogram down to the correct number of bins
+      bins = tmp_bins;
+      nusedbins += other.nusedbins;
+      trim();
+    }
+    return this;
+  }
+
+  /**
+   * Adds a new data point to the histogram approximation. Make sure you have
+   * called either allocate() or merge() first. This method implements Algorithm
+   * #1 from Ben-Haim and Tom-Tov,
+   * "A Streaming Parallel Decision Tree Algorithm", JMLR 2010.
+   *
+   * @param v
+   *          The data point to add to the histogram approximation.
+   */
+  public void add(double v) {
+    // Binary search to find the closest bucket that v should go into.
+    // 'bin' should be interpreted as the bin to shift right in order to
+    // accomodate
+    // v. As a result, bin is in the range [0,N], where N means that the value v
+    // is
+    // greater than all the N bins currently in the histogram. It is also
+    // possible that
+    // a bucket centered at 'v' already exists, so this must be checked in the
+    // next step.
+    int bin = 0;
+    for (int l = 0, r = nusedbins; l < r;) {
+      bin = (l + r) / 2;
+      if (bins.get(bin).x > v) {
+        r = bin;
+      } else {
+        if (bins.get(bin).x < v) {
+          l = ++bin;
+        } else {
+          break; // break loop on equal comparator
+        }
+      }
+    }
+
+    // If we found an exact bin match for value v, then just increment that
+    // bin's count.
+    // Otherwise, we need to insert a new bin and trim the resulting histogram
+    // back to size.
+    // A possible optimization here might be to set some threshold under which
+    // 'v' is just
+    // assumed to be equal to the closest bin -- if fabs(v-bins[bin].x) <
+    // THRESHOLD, then
+    // just increment 'bin'. This is not done now because we don't want to make
+    // any
+    // assumptions about the range of numeric data being analyzed.
+    if (bin < nusedbins && bins.get(bin).x == v) {
+      bins.get(bin).y++;
+    } else {
+      Coord newBin = new Coord();
+      newBin.x = v;
+      newBin.y = 1;
+      bins.add(bin, newBin);
+
+      // Trim the bins down to the correct number of bins.
+      if (++nusedbins > nbins) {
+        trim();
+      }
+    }
+
+  }
+
+  /**
+   * Trims a histogram down to 'nbins' bins by iteratively merging the closest
+   * bins. If two pairs of bins are equally close to each other, decide
+   * uniformly at random which pair to merge, based on a PRNG.
+   */
+  private void trim() {
+    while (nusedbins > nbins) {
+      // Find the closest pair of bins in terms of x coordinates.
+      // Break ties randomly.
+      double smallestdiff = bins.get(1).x - bins.get(0).x;
+      int smallestdiffloc = 0, smallestdiffcount = 1;
+      for (int i = 1; i < nusedbins - 1; i++) {
+        double diff = bins.get(i + 1).x - bins.get(i).x;
+        if (diff < smallestdiff) {
+          smallestdiff = diff;
+          smallestdiffloc = i;
+          smallestdiffcount = 1;
+        } else {
+          if (diff == smallestdiff
+              && prng.nextDouble() <= (1.0 / ++smallestdiffcount)) {
+            smallestdiffloc = i;
+          }
+        }
+      }
+
+      // Merge the two closest bins into their average x location, weighted by
+      // their heights.
+      // The height of the new bin is the sum of the heights of the old bins.
+      // double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y;
+      // bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d;
+      // bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d *
+      // bins[smallestdiffloc+1].y;
+      // bins[smallestdiffloc].y = d;
+
+      double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc + 1).y;
+      Coord smallestdiffbin = bins.get(smallestdiffloc);
+      smallestdiffbin.x *= smallestdiffbin.y / d;
+      smallestdiffbin.x += bins.get(smallestdiffloc + 1).x / d
+          * bins.get(smallestdiffloc + 1).y;
+      smallestdiffbin.y = d;
+      // Shift the remaining bins left one position
+      bins.remove(smallestdiffloc + 1);
+      nusedbins--;
+    }
+  }
+
+  /**
+   * Gets an approximate quantile value from the current histogram. Some popular
+   * quantiles are 0.5 (median), 0.95, and 0.98.
+   *
+   * @param q
+   *          The requested quantile, must be strictly within the range (0,1).
+   * @return The quantile value.
+   */
+  public double quantile(double q) {
+    assert (bins != null && nusedbins > 0 && nbins > 0);
+    double sum = 0, csum = 0;
+    int b;
+    for (b = 0; b < nusedbins; b++) {
+      sum += bins.get(b).y;
+    }
+    for (b = 0; b < nusedbins; b++) {
+      csum += bins.get(b).y;
+      if (csum / sum >= q) {
+        if (b == 0) {
+          return bins.get(b).x;
+        }
+
+        csum -= bins.get(b).y;
+        double r = bins.get(b - 1).x + (q * sum - csum)
+            * (bins.get(b).x - bins.get(b - 1).x) / (bins.get(b).y);
+        return r;
+      }
+    }
+    return -1; // for Xlint, code will never reach here
+  }
+
+  public int getNumBins() {
+    return bins == null ? 0 : bins.size();
+  }
+
+  /**
+   * Gives the sum for points from [-infinity, pi]
+   *
+   * @param i
+   * @return
+   */
+  public double sum(int i) {
+    double sum = 0;
+    for (int j = i - 1; j >= 0; j--) {
+      sum += this.bins.get(j).y;
+    }
+    return sum + (bins.get(i).y / 2);
+  }
+
+  /**
+   * Following the Algorithm 4 : Uniform Procedure in the paper. Returns a
+   * HiveBasedNumericHistogram which has B - 1 points such that each range
+   * [-infinity, p1], [p1, p2] .. [pB, infinity] have the same number of
+   * elements.
+   *
+   * @return
+   */
+  public HiveBasedNumericHistogram uniform(int B) {
+    double sum = 0.0;
+    HiveBasedNumericHistogram hist = new HiveBasedNumericHistogram(
+        this.minusInfinity, this.infinity);
+    for (int j = 0; j < this.nusedbins; j++) {
+      sum += bins.get(j).y;
+    }
+    if (this.nusedbins == 0)
+      return hist;
+    double[] partialSums = new double[this.nusedbins];
+    partialSums[0] = this.bins.get(0).y / 2;
+    for (int j = 1; j < this.nusedbins; j++) {
+      partialSums[j] = partialSums[j - 1]
+          + (this.bins.get(j - 1).y + this.bins.get(j).y) / 2;
+    }
+    hist.allocate(this.nusedbins);
+    hist.bins = new ArrayList<Coord>();
+    for (int j = 0; j < (B - 1); j++) {
+      double s = sum * (j + 1) / B;
+      int i = 0;
+      double d = s - partialSums[this.nusedbins - 1];
+      for (; i < (this.nusedbins - 1); i++) {
+        if (s >= partialSums[i] && s < partialSums[i + 1]) {
+          d = s - partialSums[i];
+          break;
+        }
+      }
+      double endVal = 0;
+      double endKey = this.infinity;
+      if (i < (this.nusedbins - 1)) {
+        endVal = this.bins.get(i + 1).y;
+        endKey = this.bins.get(i + 1).x;
+      }
+      double a = endVal - this.bins.get(i).y;
+      a = (a == 0) ? 1 : a;
+      double b = 2 * this.bins.get(i).y;
+      double c = -2 * d;
+      double det = b * b - 2 * a * c;
+      double z = (-b + Math.sqrt(det)) / (2 * a);
+      Coord newBin = new Coord();
+      newBin.x = this.bins.get(i).x + (endKey - this.bins.get(i).x) * z;
+      newBin.y = sum / this.nusedbins;
+      hist.bins.add(newBin);
+    }
+    hist.nusedbins = hist.bins.size();
+    hist.nbins = hist.nusedbins;
+    return hist;
+  }
+
+  private List<Bucket> getBuckets(HiveBasedNumericHistogram hist) {
+    List<Bucket> buckets = Lists.newArrayList();
+    if (hist.bins.size() == 0) {
+      return buckets;
+    }
+    buckets.add(new Bucket(this.minusInfinity, hist.bins.get(0).x, hist.bins
+        .get(0).y / 2, hist.bins.get(0).stats));
+    if (hist.bins.size() != 1) {
+      for (int i = 1; i < hist.bins.size(); i++) {
+        buckets.add(new Bucket(hist.bins.get(i - 1).x, hist.bins.get(i).x,
+            (hist.bins.get(i - 1).y + hist.bins.get(i).y) / 2,
+            hist.bins.get(i).stats));
+      }
+    }
+    buckets.add(new Bucket(hist.bins.get(hist.bins.size() - 1).x,
+        this.infinity, hist.bins.get(hist.bins.size() - 1).y / 2, hist.bins
+            .get(hist.bins.size() - 1).stats));
+    return buckets;
+  }
+
+  public static HiveBasedNumericHistogram getHistogram(List<Bucket> buckets) {
+    if (buckets.size() <= 1)
+      return null;
+    double min = buckets.get(0).getStart();
+    double max = buckets.get(buckets.size() - 1).getEnd();
+    HiveBasedNumericHistogram ret = new HiveBasedNumericHistogram(min, max);
+    ret.allocate(buckets.size() - 1);
+    for (int i = 1; i < buckets.size(); i++) {
+      Coord c = new Coord();
+      Bucket b = buckets.get(i);
+      Bucket prevB = buckets.get(i - 1);
+      c.x = b.getStart();
+      c.y = (prevB.getCount() + b.getCount()) / 2;
+      ret.bins.add(c);
+    }
+    ret.nusedbins = ret.bins.size();
+    return ret;
+  }
+
+  /**
+   * Constructs a uniform histogram and returns the list of buckets.
+   */
+  public List<Bucket> getUniformBuckets() {
+    return getBuckets(uniform(this.getUsedBins()));
+  }
+
+  public List<Bucket> getOriginalBuckets() {
+    return getBuckets(this);
+  }
+
+  @Override
+  public int getBinCount() {
+    return this.nbins;
+  }
+}

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/NumericHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/NumericHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/NumericHistogram.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/NumericHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,129 @@
+/**
+ * Copyright The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.HFileStat;
+
+public interface NumericHistogram {
+  public static class Bucket {
+    private double count;
+    private double start;
+    private double end;
+    private Map<Enum<?>, Double> stats;
+
+    public Bucket(double start, double end, double count,
+        Map<Enum<?>, Double> stats) {
+      this.count = count;
+      this.start = start;
+      this.end = end;
+      this.stats = stats;
+    }
+
+    public double getCount() {
+      return count;
+    }
+
+    public double getStart() {
+      return start;
+    }
+
+    public double getEnd() {
+      return end;
+    }
+
+    public double getStat(HFileStat stat) {
+      return this.stats.get(stat);
+    }
+
+    public static class Builder {
+      private double startRow;
+      private double endRow;
+      private double numRows;
+      private Map<Enum<?>, Double> hfileStats;
+
+      public Builder() {
+        this.hfileStats = new HashMap<Enum<?>, Double>();
+      }
+
+      public Builder setStartRow(double startRow) {
+        this.startRow = startRow;
+        return this;
+      }
+
+      public Builder setEndRow(double endRow) {
+        this.endRow = endRow;
+        return this;
+      }
+
+      public Builder setNumRows(double numRows) {
+        this.numRows = numRows;
+        return this;
+      }
+
+      public Builder addHFileStat(HFileStat stat, Double count) {
+        this.hfileStats.put(stat, count);
+        return this;
+      }
+
+      public Bucket create() {
+        return new Bucket(this.startRow, this.endRow, this.numRows,
+            this.hfileStats);
+      }
+    }
+
+    public String print() {
+      StringBuilder sb = new StringBuilder();
+      sb.append("Bucket : ");
+      sb.append(" start: ");
+      sb.append(this.start);
+      sb.append(" end: ");
+      sb.append(this.end);
+      sb.append(" count: ");
+      sb.append(this.count);
+      return sb.toString();
+    }
+  }
+
+  /**
+   * Adds a double value into the histogram.
+   * @param dataPoint
+   */
+  public void add(double dataPoint);
+
+  /**
+   * Returns the list of buckets which represent the equi-depth histogram.
+   * @return
+   */
+  public List<Bucket> getUniformBuckets();
+
+  /**
+   * Clears the state and allocates the histogram.
+   * @param num_bins
+   */
+  public void allocate(int num_bins);
+
+  public int getBinCount();
+
+  public NumericHistogram merge(NumericHistogram hist);
+}

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/UniformSplitHFileHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/UniformSplitHFileHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/UniformSplitHFileHistogram.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/io/hfile/histogram/UniformSplitHFileHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,252 @@
+/**
+ * Copyright The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.Writable;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * Uniform split histogram splits the range uniformly while creating the
+ * Histogram i.e. if we split range [a00, z00] into 25 parts, we should get
+ * [a00, b00], [b00, c00], ... , [y00, z00]
+ *
+ * This layer is provided to be able to easily swap other underlyingHistogram
+ * implementations in the future.
+ */
+public class UniformSplitHFileHistogram implements HFileHistogram {
+  protected NumericHistogram underlyingHistogram;
+  // TODO manukranthk : make this configurable.
+  int padding = 8;
+
+  public UniformSplitHFileHistogram(int binCount) {
+    this.underlyingHistogram = new HiveBasedNumericHistogram(
+        getMinusInfinity(), getInfinity());
+    this.underlyingHistogram.allocate(binCount);
+  }
+
+  private UniformSplitHFileHistogram(List<Bucket> buckets) {
+    List<NumericHistogram.Bucket> nbuckets = Lists.newArrayList();
+    for (Bucket b : buckets) {
+      nbuckets.add(this.getFromHFileHistogramBucket(b));
+    }
+    this.underlyingHistogram = HiveBasedNumericHistogram.getHistogram(nbuckets);
+  }
+
+  @Override
+  public void add(KeyValue kv) {
+    double val = convertBytesToDouble(kv.getRow());
+    underlyingHistogram.add(val);
+  }
+
+  private double getInfinity() {
+    return new BigInteger(getInfinityArr()).doubleValue();
+  }
+
+  /**
+   * This returns the maximum number that we can represent using padding bytes.
+   * Returns {0x00, 0xff, 0xff .... 0xff }
+   *                <----  padding  ---->
+   * @return
+   */
+  private byte[] getInfinityArr() {
+    byte[] row = new byte[1];
+    row[0] = (byte) 0;
+    return Bytes.appendToTail(row, padding, (byte)0xFF);
+  }
+
+  private double getMinusInfinity() {
+    return 0.0;
+  }
+
+  /**
+   * Bytes are are sorted lexicographically, so for the purposes of
+   * HFileHistogram, we need to convert a byte[] to a double so that it still
+   * compares correctly.
+   *
+   * We initially take the first 'padding' amount of bytes and convert the bytes
+   * into a BigInteger assuming the byte[] was in 2's complement representation
+   *
+   * We will add an extra 0 at the start so that we don't have to deal with -ve
+   * numbers.
+   *
+   * @param row
+   * @return
+   */
+  protected double convertBytesToDouble(byte[] row) {
+    byte[] tmpRow = Bytes.head(row, Math.min(row.length, padding));
+    byte[] newRow = Bytes.padTail(tmpRow, padding - tmpRow.length);
+    // To avoid messing with 2's complement.
+    newRow = Bytes.padHead(newRow, 1);
+    return new BigInteger(newRow).doubleValue();
+  }
+
+  /**
+   * Double is converted to Bytes in a similar manner.
+   *
+   * @param d
+   * @return
+   */
+  protected byte[] convertDoubleToBytes(double d) {
+    BigDecimal tmpDecimal = new BigDecimal(d);
+    BigInteger tmp = tmpDecimal.toBigInteger();
+    byte[] arr = tmp.toByteArray();
+    if (arr[0] == 0) {
+      // to represent {0xff, 0xff}, big integer uses {0x00, 0xff, 0xff}
+      // due to the one's compliment representation.
+      Preconditions.checkArgument(arr.length == 1 || arr[1] != 0);
+      arr = Bytes.tail(arr, arr.length - 1);
+    }
+    if (arr.length > padding) {
+      // Can happen due to loose precision guarentee in double.
+      // while doing the conversion,
+      // {0x00, 0xff, ... , 0xff, 0xff}=>double=>{0x01, 0x00, ... , 0x00, 0x00}
+      // might happen.
+      arr = Bytes.tail(getInfinityArr(), padding);
+    }
+    return Bytes.padHead(arr, padding - arr.length);
+  }
+
+  @Override
+  public List<Bucket> getUniformBuckets() {
+    List<NumericHistogram.Bucket> buckets = this.underlyingHistogram
+        .getUniformBuckets();
+    List<Bucket> ret = Lists.newArrayList();
+    for (NumericHistogram.Bucket b : buckets) {
+      ret.add(getFromNumericHistogramBucket(b));
+    }
+    return ret;
+  }
+
+  public static HFileHistogram getHistogram(List<Bucket> buckets) {
+    HFileHistogram ret = new UniformSplitHFileHistogram(buckets);
+    return ret;
+  }
+
+  private NumericHistogram.Bucket getFromHFileHistogramBucket(
+      HFileHistogram.Bucket bucket) {
+    NumericHistogram.Bucket b = (new NumericHistogram.Bucket.Builder())
+        .setStartRow(convertBytesToDouble(bucket.getStartRow()))
+        .setEndRow(convertBytesToDouble(bucket.getEndRow()))
+        .setNumRows(bucket.getCount()).create();
+    return b;
+  }
+
+  private HFileHistogram.Bucket getFromNumericHistogramBucket(
+      NumericHistogram.Bucket bucket) {
+    Bucket b = (new Bucket.Builder())
+        .setStartRow(this.convertDoubleToBytes(bucket.getStart()))
+        .setEndRow(this.convertDoubleToBytes(bucket.getEnd()))
+        .setNumRows(bucket.getCount()).create();
+    return b;
+  }
+
+  @Override
+  public Writable serialize() {
+    return new Writable() {
+      List<Bucket> buckets;
+
+      public Writable setVal(List<Bucket> buckets) {
+        this.buckets = buckets;
+        return this;
+      }
+
+      @Override
+      public void readFields(DataInput in) throws IOException {
+        int len = in.readInt();
+        buckets = new ArrayList<Bucket>(len);
+        for (int i = 0; i < len; i++) {
+          Bucket b = new Bucket();
+          b.readFields(in);
+          buckets.add(b);
+        }
+      }
+
+      @Override
+      public void write(DataOutput out) throws IOException {
+        out.writeInt(buckets.size());
+        for (Bucket bucket : buckets) {
+          bucket.write(out);
+        }
+      }
+    }.setVal(getUniformBuckets());
+  }
+
+  /**
+   * Modifies the elements in the list of histograms.
+   */
+  @Override
+  public HFileHistogram compose(List<HFileHistogram> histograms) {
+    if (histograms.size() <= 0)
+      return null;
+    HFileHistogram h = histograms.get(0);
+    int binCnt = h.getBinCount();
+    HFileHistogram ret = new UniformSplitHFileHistogram(binCnt);
+    for (HFileHistogram h2 : histograms) {
+      ret = ret.merge(h2);
+    }
+    return ret;
+  }
+
+  @Override
+  public HFileHistogram deserialize(ByteBuffer buf) throws IOException {
+    ByteArrayInputStream bais = new ByteArrayInputStream(buf.array(),
+        buf.arrayOffset(), buf.limit());
+    DataInput in = new DataInputStream(bais);
+    int len = in.readInt();
+    List<Bucket> buckets = new ArrayList<Bucket>(len);
+    for (int i = 0; i < len; i++) {
+      Bucket b = new Bucket();
+      b.readFields(in);
+      buckets.add(b);
+    }
+    bais.close();
+    if (buckets.size() == 0) return null;
+    HFileHistogram ret = getHistogram(buckets);
+    return ret;
+  }
+
+  @Override
+  public HFileHistogram merge(HFileHistogram h2) {
+    Preconditions.checkNotNull(h2);
+    Preconditions.checkArgument(h2 instanceof UniformSplitHFileHistogram);
+    UniformSplitHFileHistogram h = (UniformSplitHFileHistogram) h2;
+    this.underlyingHistogram.merge(h.underlyingHistogram);
+    return this;
+  }
+
+  @Override
+  public int getBinCount() {
+    return this.underlyingHistogram.getBinCount();
+  }
+}

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java Thu Nov 28 18:13:19 2013
@@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.client.Pu
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.RowMutations;
 import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.Bucket;
 import org.apache.hadoop.hbase.master.AssignmentPlan;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.io.MapWritable;
@@ -435,4 +436,24 @@ public interface HRegionInterface extend
    *
    */
   public void setHDFSQuorumReadTimeoutMillis(long timeoutMillis);
+
+  /**
+   * Returns the list of buckets which represent the uniform depth histogram
+   * for a given region.
+   * @param regionName
+   * @return
+   * @throws IOException
+   */
+  public List<Bucket> getHistogram(byte[] regionName) throws IOException;
+
+  /**
+   * Returns the list of buckets which represent the uniform depth histogram
+   * for a given store.
+   * @param regionName
+   * @param family
+   * @return
+   * @throws IOException
+   */
+  public List<Bucket> getHistogramForStore(byte[] regionName, byte[] family)
+      throws IOException;
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java Thu Nov 28 18:13:19 2013
@@ -85,6 +85,7 @@ import org.apache.hadoop.hbase.io.Refere
 import org.apache.hadoop.hbase.io.hfile.BlockCache;
 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
 import org.apache.hadoop.hbase.io.hfile.L2Cache;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram;
 import org.apache.hadoop.hbase.ipc.HRegionInterface;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
@@ -4033,6 +4034,15 @@ public class HRegion implements HeapSize
     }
   };
 
+  public HFileHistogram getHistogram() throws IOException {
+    List<HFileHistogram> histograms = new ArrayList<HFileHistogram>();
+    if (stores.size() == 0) return null;
+    for (Store s : stores.values()) {
+      histograms.add(s.getHistogram());
+    }
+    HFileHistogram h = histograms.get(0).compose(histograms);
+    return h;
+  }
 
   /**
    * Facility for dumping and compacting catalog tables.

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Nov 28 18:13:19 2013
@@ -114,6 +114,7 @@ import org.apache.hadoop.hbase.io.hfile.
 import org.apache.hadoop.hbase.io.hfile.LruBlockCache;
 import org.apache.hadoop.hbase.io.hfile.LruBlockCache.CacheStats;
 import org.apache.hadoop.hbase.io.hfile.PreloadThreadPool;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.Bucket;
 import org.apache.hadoop.hbase.ipc.HBaseRPC;
 import org.apache.hadoop.hbase.ipc.HBaseRPCErrorHandler;
 import org.apache.hadoop.hbase.ipc.HBaseRPCOptions;
@@ -3814,6 +3815,21 @@ public class HRegionServer implements HR
     }
     return 0;
   }
+
+  @Override
+  public List<Bucket> getHistogram(byte[] regionName) throws IOException {
+    checkOpen();
+    HRegion region = getRegion(regionName);
+    return region.getHistogram().getUniformBuckets();
+  }
+
+  @Override
+  public List<Bucket> getHistogramForStore(byte[] regionName, byte[] family)
+      throws IOException {
+    checkOpen();
+    HRegion region = getRegion(regionName);
+    return region.getStore(family).getHistogram().getUniformBuckets();
+  }
 }
     boolean origProfiling = enableServerSideProfilingForAllCalls.get();
     boolean newProfiling = conf.getBoolean(

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Store.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Store.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Store.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/Store.java Thu Nov 28 18:13:19 2013
@@ -62,6 +62,8 @@ import org.apache.hadoop.hbase.io.hfile.
 import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
 import org.apache.hadoop.hbase.io.hfile.HFileScanner;
 import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram;
+import org.apache.hadoop.hbase.io.hfile.histogram.UniformSplitHFileHistogram;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.regionserver.compactionhook.CompactionHook;
@@ -162,6 +164,7 @@ public class Store extends SchemaConfigu
   private CompactionHook compactHook = null;
 
   private final HRegionInfo info;
+  private boolean writeHFileHistogram = false;
 
   // This should account for the Store's non static variables. So, when there
   // is an addition to the member variables to Store, this value should be
@@ -274,6 +277,9 @@ public class Store extends SchemaConfigu
       Store.closeCheckInterval = conf.getInt(
           "hbase.hstore.close.check.interval", 10*1000*1000 /* 10 MB */);
     }
+
+    writeHFileHistogram  = conf.getBoolean(HConstants.USE_HFILEHISTOGRAM,
+        HConstants.DEFAULT_USE_HFILEHISTOGRAM);
   }
   /**
    * Constructor
@@ -762,7 +768,9 @@ public class Store extends SchemaConfigu
         this.region.getSmallestReadPoint(),
         Long.MIN_VALUE, getAggregator(),
         flashBackQueryLimit); // include all deletes
-
+    HFileHistogram hist = new UniformSplitHFileHistogram(
+        this.conf.getInt(HFileHistogram.HFILEHISTOGRAM_BINCOUNT,
+            HFileHistogram.DEFAULT_HFILEHISTOGRAM_BINCOUNT));
     String fileName;
     try {
       // TODO:  We can fail in the below block before we complete adding this
@@ -776,12 +784,20 @@ public class Store extends SchemaConfigu
         writer.setTimeRangeTracker(snapshotTimeRangeTracker);
         fileName = writer.getPath().getName();
         try {
+          byte[] lastRow = new byte[0];
           final List<KeyValue> kvs = new ArrayList<KeyValue>();
           boolean hasMore;
           do {
             hasMore = scanner.next(kvs);
             if (!kvs.isEmpty()) {
               for (KeyValue kv : kvs) {
+                if (writeHFileHistogram) {
+                  byte[] thisRow = kv.getRow();
+                  if (!Bytes.equals(lastRow, thisRow)) {
+                    hist.add(kv);
+                  }
+                  lastRow = thisRow;
+                }
                 // If we know that this KV is going to be included always, then let us
                 // set its memstoreTS to 0. This will help us save space when writing to disk.
                 if (kv.getMemstoreTS() <= smallestReadPoint) {
@@ -802,6 +818,9 @@ public class Store extends SchemaConfigu
           // hfile.  The hfile is current up to and including logCacheFlushId.
           status.setStatus("Flushing " + this + ": appending metadata");
           writer.appendMetadata(EnvironmentEdgeManager.currentTimeMillis(), logCacheFlushId, false);
+          if (writeHFileHistogram) {
+            writer.appendHFileHistogram(hist);
+          }
           status.setStatus("Flushing " + this + ": closing flushed file");
           writer.close();
           InjectionHandler.processEventIO(InjectionEvent.STOREFILE_AFTER_WRITE_CLOSE, writer.getPath());
@@ -1331,6 +1350,9 @@ public class Store extends SchemaConfigu
     // Find the smallest read point across all the Scanners.
     long smallestReadPoint = region.getSmallestReadPoint();
     MultiVersionConsistencyControl.setThreadReadPoint(smallestReadPoint);
+    HFileHistogram hist = new UniformSplitHFileHistogram(
+        this.conf.getInt(HFileHistogram.HFILEHISTOGRAM_BINCOUNT,
+            HFileHistogram.DEFAULT_HFILEHISTOGRAM_BINCOUNT));
     try {
       InternalScanner scanner = null;
       try {
@@ -1356,6 +1378,7 @@ public class Store extends SchemaConfigu
         }
         KeyValueContext kvContext = new KeyValueContext();
 
+        byte[] lastRow = new byte[0];
         do {
           hasMore = scanner.next(kvs, 1, kvContext);
           if (!kvs.isEmpty()) {
@@ -1364,6 +1387,13 @@ public class Store extends SchemaConfigu
             }
             // output to writer:
             for (KeyValue kv : kvs) {
+              if (writeHFileHistogram) {
+                byte[] thisRow = kv.getRow();
+                if (!Bytes.equals(lastRow, thisRow)) {
+                  hist.add(kv);
+                }
+                lastRow = thisRow;
+              }
               if (kv.getMemstoreTS() <= smallestReadPoint) {
                 kv.setMemstoreTS(0);
               }
@@ -1411,12 +1441,29 @@ public class Store extends SchemaConfigu
           minFlushTime = HConstants.NO_MIN_FLUSH_TIME;
         }
         writer.appendMetadata(minFlushTime, maxCompactingSequcenceId, majorCompaction);
+        if (writeHFileHistogram) {
+          writer.appendHFileHistogram(hist);
+        }
         writer.close();
       }
     }
     return writer;
   }
 
+  private HFileHistogram hist = null;
+  public HFileHistogram getHistogram() throws IOException {
+    if (hist != null) return hist;
+    List<HFileHistogram> histograms = new ArrayList<HFileHistogram>();
+    if (storefiles.size() == 0) return null;
+    for (StoreFile file : this.storefiles) {
+      HFileHistogram hist = file.getHistogram();
+      if (hist != null) histograms.add(hist);
+    }
+    HFileHistogram h = histograms.get(0).compose(histograms);
+    this.hist = h;
+    return hist;
+  }
+
   /**
    * Validates a store file by opening and closing it. In HFileV2 this should
    * not be an expensive operation.

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java Thu Nov 28 18:13:19 2013
@@ -58,6 +58,8 @@ import org.apache.hadoop.hbase.io.hfile.
 import org.apache.hadoop.hbase.io.hfile.HFileWriterV1;
 import org.apache.hadoop.hbase.io.hfile.HFileWriterV2;
 import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram;
+import org.apache.hadoop.hbase.io.hfile.histogram.UniformSplitHFileHistogram;
 import org.apache.hadoop.hbase.regionserver.metrics.SchemaConfigured;
 import org.apache.hadoop.hbase.regionserver.metrics.SchemaMetrics;
 import org.apache.hadoop.hbase.util.BloomFilter;
@@ -216,6 +218,8 @@ public class StoreFile extends SchemaCon
   // the last modification time stamp
   private long modificationTimeStamp = 0L;
 
+  private HFileHistogram histogram = null;
+
   /**
    * Constructor, loads a reader and it's indices, etc. May allocate a
    * substantial amount of ram depending on the underlying files (10-20MB?).
@@ -1146,6 +1150,15 @@ public class StoreFile extends SchemaCon
       includeInTimeRangeTracker(kv);
     }
 
+    /**
+     * Appends HFileHistogram to the HFile. This function is to be called only
+     * once with the Histogram that is constructed after compaction.
+     */
+    public void appendHFileHistogram(HFileHistogram histogram) {
+      writer.appendMetaBlock(HFile.HFILEHISTOGRAM_METABLOCK,
+          histogram.serialize());
+    }
+
     public Path getPath() {
       return this.writer.getPath();
     }
@@ -1832,4 +1845,15 @@ public class StoreFile extends SchemaCon
       });
   }
 
+  public HFileHistogram getHistogram() throws IOException {
+    if (histogram != null) return histogram;
+    ByteBuffer buf = this.reader.reader.getMetaBlock(
+        HFile.HFILEHISTOGRAM_METABLOCK, false);
+    histogram = new UniformSplitHFileHistogram(
+        this.conf.getInt(HFileHistogram.HFILEHISTOGRAM_BINCOUNT,
+            HFileHistogram.DEFAULT_HFILEHISTOGRAM_BINCOUNT));
+    histogram = histogram.deserialize(buf);
+    return histogram;
+  }
+
 }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Bytes.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Bytes.java?rev=1546425&r1=1546424&r2=1546425&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Bytes.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/Bytes.java Thu Nov 28 18:13:19 2013
@@ -1102,9 +1102,22 @@ public class Bytes {
    * @return Value in <code>a</code> plus <code>length</code> appended 0 bytes
    */
   public static byte [] padTail(final byte [] a, final int length) {
+    return appendToTail(a, length, (byte)0);
+  }
+
+  /**
+   * Appends length bytes to the end of the array and returns the new array
+   * Fills byte b in the newly allocated space in the byte[].
+   * @param a array
+   * @param length new array size
+   * @param b byte to write to the tail.
+   * @return Value in <code>a</code> plus <code>length</code> appended 0 bytes
+   */
+  public static byte [] appendToTail(final byte [] a, final int length, byte b)
+  {
     byte [] padding = new byte[length];
     for (int i = 0; i < length; i++) {
-      padding[i] = 0;
+      padding[i] = b;
     }
     return add(a,padding);
   }
@@ -1477,4 +1490,10 @@ public class Bytes {
   public static boolean isNonEmpty(ByteBuffer b) {
     return b != null && b.remaining() > 0;
   }
+
+  public static byte[] copyOfByteArray(byte[] arr) {
+    byte[] tmp = new byte[arr.length];
+    System.arraycopy(arr, 0, tmp, 0, arr.length);
+    return tmp;
+  }
 }

Added: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHFileHistogramE2E.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHFileHistogramE2E.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHFileHistogramE2E.java (added)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHFileHistogramE2E.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,141 @@
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.Bucket;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestHFileHistogramE2E {
+  private static final byte[] TABLE =
+      Bytes.toBytes("TestHFileHistogramE2ESingleStore");
+  private static final byte[] FAMILY = Bytes.toBytes("family");
+  private static final byte[] TABLE2 =
+      Bytes.toBytes("TestHistogramSerDeE2E");
+  private static final Log LOG = LogFactory.getLog(TestHFileHistogramE2E.class);
+  private HBaseTestingUtility util = new HBaseTestingUtility();
+  private final int numBuckets = 100;
+
+  @Before
+  public void setUp() throws Exception {
+    util.getConfiguration().setInt(HFileHistogram.HFILEHISTOGRAM_BINCOUNT,
+        numBuckets);
+    util.startMiniCluster(3);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    util.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testSingleStore() throws IOException {
+    HTable table = util.createTable(TABLE, FAMILY);
+    util.loadTable(table, FAMILY);
+    util.flush(TABLE);
+    assertTrue(util.getHBaseCluster().getRegions(TABLE).size() == 1);
+    HRegion region = util.getHBaseCluster().getRegions(TABLE).get(0);
+    HFileHistogram hist = region.getHistogram();
+    assertTrue(hist != null);
+    boolean first = true;
+    List<Bucket> buckets = hist.getUniformBuckets();
+    assertTrue(buckets != null);
+    assertTrue(buckets.size() > 0);
+    Bucket prevBucket = buckets.get(0);
+    for (Bucket b : buckets) {
+      if (first) {
+        first = false;
+        prevBucket = b;
+        continue;
+      }
+      assertTrue(Bytes.compareTo(b.getStartRow(), prevBucket.getEndRow()) >= 0);
+      assertTrue(Bytes.compareTo(b.getEndRow(), prevBucket.getStartRow()) > 0);
+    }
+  }
+
+  @Test
+  public void testHistogramSerDeE2E() throws IOException {
+    HTable table = util.createTable(TABLE2, FAMILY);
+    util.loadTable(table, FAMILY);
+    util.flush(TABLE2);
+    assertTrue(util.getHBaseCluster().getRegions(TABLE2).size() == 1);
+    HRegion region = util.getHBaseCluster().getRegions(TABLE2).get(0);
+    List<Bucket> buckets = region.getHistogram().getUniformBuckets();
+    assertTrue(buckets != null);
+    assertTrue(buckets.size() > 0);
+    List<Bucket> serBuckets = table.getHistogramForColumnFamily(
+        region.getStartKey(), FAMILY);
+    assertTrue(serBuckets != null);
+    assertTrue(serBuckets.size() > 0);
+    assertTrue(compareBuckets(buckets, serBuckets));
+  }
+
+  public boolean compareBuckets(List<Bucket> buckets1, List<Bucket> buckets2) {
+    int len1 = buckets1.size();
+    int len2 = buckets2.size();
+    assertTrue(len1 == len2);
+    for (int i=0; i<len1; i++) {
+      Bucket b1 = buckets1.get(i);
+      Bucket b2 = buckets2.get(i);
+      if (!b1.equals(b2)) return false;
+    }
+    return true;
+  }
+
+  private List<byte[]> putRandomKVs(HTable table, int numEntries, int rowSize)
+      throws IOException {
+    List<byte[]> inputList = new ArrayList<byte[]>();
+    // The error estimation holds for more than 10000 entries.
+    // We wouldn't be using this feature if it weren't bigger than that.
+    Random r = new Random();
+    for (int i = 0; i < numEntries; i++) {
+      byte[] arr = new byte[rowSize];
+      r.nextBytes(arr);
+      KeyValue kv = new KeyValue(arr, (long)0);
+      inputList.add(kv.getRow());
+      table.put(new Put(kv.getRow()).add(FAMILY, null, kv.getRow()));
+      if (i%10000 == 0) {
+        table.flushCommits();
+        util.flush();
+      }
+    }
+    return inputList;
+  }
+
+  @Test
+  public void testHistogramError() throws IOException {
+    byte[] TABLE3 = Bytes.toBytes("testHistogramError");
+    HTable table = util.createTable(TABLE3, FAMILY);
+    util.flush(TABLE2);
+    Random r = new Random();
+    int numEntries = 100000 + r.nextInt(100000);
+    int expectedBucketCnt = numEntries/numBuckets;
+    List<byte[]> inputList = putRandomKVs(table, numEntries, 15);
+    Collections.sort(inputList, Bytes.BYTES_COMPARATOR);
+    List<HRegion> regions = util.getHBaseCluster().getRegions(TABLE3);
+    assertTrue(regions.size() == 1);
+    HRegion region = regions.get(0);
+    List<Bucket> lst = table.getHistogram(region.getStartKey());
+    assertTrue(lst.size() > 0);
+
+    TestUniformSplitHistogram.checkError(inputList, lst,
+        0.2, expectedBucketCnt);
+  }
+}

Added: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHiveBasedNumericHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHiveBasedNumericHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHiveBasedNumericHistogram.java (added)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestHiveBasedNumericHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,82 @@
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import static org.junit.Assert.*;
+
+import java.util.Random;
+
+import org.apache.hadoop.hbase.io.hfile.histogram.HiveBasedNumericHistogram.Coord;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestHiveBasedNumericHistogram {
+  private HiveBasedNumericHistogram hist;
+  double min = 0;
+  double max = 10000;
+  int count = 1000;
+  int buckets = 100;
+  Random r;
+  @Before
+  public void setUp() throws Exception {
+    setup(min, max, buckets, count);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+
+  }
+
+  private void setup(double min, double max, int buckets, int count) {
+    hist = new HiveBasedNumericHistogram(min, max);
+    r = new Random();
+    // Inserting elements into the histogram.
+    hist.allocate(buckets);
+    for (int i = 0; i<count; i++) {
+      hist.add(r.nextDouble() * max);
+    }
+    assertTrue(hist.getNumBins() == 100);
+  }
+
+  @Test
+  public void testInsertionsAndSum() {
+    double sum = 0;
+    for (int i = 0; i< (hist.getUsedBins() - 1); i++) {
+      Coord bin1 = hist.getBin(i);
+      Coord bin2 = hist.getBin(i+1);
+      if (i == 0) sum += bin1.y;
+      sum += bin2.y;
+      assertTrue(bin1.x <= bin2.x);
+    }
+    assertTrue(sum == count);
+  }
+
+  @Test
+  public void testUniform() {
+    HiveBasedNumericHistogram uniformhist = this.hist.uniform(buckets);
+    for (int i = 0; i < (uniformhist.getUsedBins() - 1); i++) {
+      Coord bin1 = uniformhist.getBin(i);
+      Coord bin2 = uniformhist.getBin(i+1);
+      assertTrue(bin1.x <= bin2.x);
+    }
+  }
+
+  @Test
+  /**
+   * Testing the following case:
+   *
+   */
+  public void testSpecificTest1() {
+    this.min = 0;
+    this.max = 10;
+    this.count = 10;
+    this.buckets = 10;
+    hist = new HiveBasedNumericHistogram(0, 10);
+    hist.allocate(10);
+    for (int i=0; i<10; i++) {
+      hist.add(0);
+    }
+    assertTrue(hist.getUsedBins() == 1);
+    assertTrue(hist.getBin(0).x == 0);
+    assertTrue(hist.getBin(0).y == 10);
+  }
+}

Added: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestUniformSplitHistogram.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestUniformSplitHistogram.java?rev=1546425&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestUniformSplitHistogram.java (added)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/io/hfile/histogram/TestUniformSplitHistogram.java Thu Nov 28 18:13:19 2013
@@ -0,0 +1,118 @@
+package org.apache.hadoop.hbase.io.hfile.histogram;
+
+import static org.junit.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.io.hfile.histogram.HFileHistogram.Bucket;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestUniformSplitHistogram {
+
+  private final static Log LOG =
+      LogFactory.getLog(TestUniformSplitHistogram.class);
+  @Before
+  public void setUp() throws Exception {
+  }
+
+  @After
+  public void tearDown() throws Exception {
+  }
+
+  @Test
+  public void testUniformHistogram() {
+    UniformSplitHFileHistogram hist = new UniformSplitHFileHistogram(100);
+    Random r = new Random();
+    int size = 10;
+    for (int i = 0; i < 100; i++) {
+      byte[] arr = new byte[size];
+      r.nextBytes(arr);
+      KeyValue kv = new KeyValue(arr, (long)0);
+      hist.add(kv);
+    }
+    List<Bucket> lst = hist.getUniformBuckets();
+    assertTrue(lst.size() > 0);
+    Bucket prevBucket = null;
+    for (Bucket b : lst) {
+      if (prevBucket != null) {
+        assertTrue(Bytes.toStringBinary(b.getStartRow())
+            + " not greater than "
+            + Bytes.toStringBinary(prevBucket.getStartRow()),
+            Bytes.compareTo(b.getStartRow(), prevBucket.getStartRow()) > 0);
+        assertTrue(Bytes.toStringBinary(b.getEndRow())
+            + " not greater than "
+            + Bytes.toStringBinary(prevBucket.getEndRow()),
+            Bytes.compareTo(b.getEndRow(), prevBucket.getEndRow()) >= 0);
+        assertTrue(Bytes.toStringBinary(b.getEndRow())
+            + " not greater than "
+            + Bytes.toStringBinary(prevBucket.getStartRow()),
+            Bytes.compareTo(b.getEndRow(), prevBucket.getStartRow()) >= 0);
+      }
+      prevBucket = b;
+    }
+  }
+
+  @Test
+  public void testUniformHistogramError() {
+    for (int numRuns = 0; numRuns < 100; numRuns++) {
+      int numBuckets = 100;
+      UniformSplitHFileHistogram hist = new UniformSplitHFileHistogram(numBuckets);
+      Random r = new Random();
+      int size = 10;
+      List<byte[]> inputList = new ArrayList<byte[]>();
+      // The error estimation holds for more than 10000 entries.
+      // We wouldn't be using this feature if it weren't bigger than that.
+      int numEntries = 10000 + r.nextInt(10000);
+      int expectedBucketCnt = numEntries/numBuckets;
+      for (int i = 0; i < numEntries; i++) {
+        byte[] arr = new byte[size];
+        r.nextBytes(arr);
+        KeyValue kv = new KeyValue(arr, (long)0);
+        inputList.add(kv.getRow());
+        hist.add(kv);
+      }
+      List<Bucket> lst = hist.getUniformBuckets();
+
+      // 20 error is an observation, this test gives an estimate of how much
+      // error you can expect.
+      checkError(inputList, lst, 0.2, expectedBucketCnt);
+    }
+  }
+
+  public static void checkError(List<byte[]> inputList, List<Bucket> lst,
+      double errorPct, int expectedBucketCnt) {
+    Collections.sort(inputList, Bytes.BYTES_COMPARATOR);
+    assertTrue(lst.size() > 0);
+    int numEntries = inputList.size();
+    int i = 0;
+    int j = i;
+    int bucketIndex = 0;
+    int error = 0;
+    for (Bucket b : lst) {
+      while (i<numEntries && isWithinBucket(b, inputList.get(i))) {
+        i++;
+      }
+      LOG.debug("Bucket #" + bucketIndex++ + ", Actual : "
+          + (i-j) + ", From Bucket :" + b.getCount());
+      error += Math.abs(((i-j) - expectedBucketCnt));
+      j = i;
+    }
+    LOG.debug("numEntries : " + numEntries + ", error :" + error
+        + "expectedNBucketCnt : " + expectedBucketCnt);
+    assertTrue(error/(double)numEntries < 0.2);
+  }
+
+  public static boolean isWithinBucket(Bucket b, byte[] row) {
+    return (Bytes.compareTo(b.getStartRow(), row) <= 0
+        && Bytes.compareTo(b.getEndRow(), row) >= 0);
+  }
+}



Mime
View raw message