hbase-commits mailing list archives

From te...@apache.org
Subject svn commit: r1443289 [3/6] - in /hbase/trunk: ./ hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/ hbase-common/src/main/java/org/apache/hadoop/hbase/util/ hbase-common/src/main/java/org/apache/hadoop/hbase/util/test/ hbase-common/src/mai...
Date Thu, 07 Feb 2013 00:36:27 GMT
Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnNodeWriter.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.column;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.ByteRange;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Strings;
+import org.apache.hbase.codec.prefixtree.PrefixTreeBlockMeta;
+import org.apache.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
+import org.apache.hbase.util.vint.UFIntTool;
+import org.apache.hbase.util.vint.UVIntTool;
+
+/**
+ * Column nodes can be either family nodes or qualifier nodes, as both sections encode similarly.
+ * The family and qualifier sections of the data block are made of 1 or more of these nodes.
+ * <p/>
+ * Each node is composed of 3 sections:<br/>
+ * <li>tokenLength: UVInt (normally 1 byte) indicating the number of token bytes
+ * <li>token[]: the actual token bytes
+ * <li>parentStartPosition: the offset of this node's parent (the next node when reading right to
+ * left) from the start of the family or qualifier section
+ */
+@InterfaceAudience.Private
+public class ColumnNodeWriter {
+
+  /************* fields ****************************/
+
+  protected TokenizerNode builderNode;
+  protected PrefixTreeBlockMeta blockMeta;
+
+  protected boolean familyVsQualifier;
+
+  protected int tokenLength;
+  protected byte[] token;
+  protected int parentStartPosition;
+
+
+  /*************** construct **************************/
+
+  public ColumnNodeWriter(PrefixTreeBlockMeta blockMeta, TokenizerNode builderNode,
+      boolean familyVsQualifier) {
+    this.blockMeta = blockMeta;
+    this.builderNode = builderNode;
+    this.familyVsQualifier = familyVsQualifier;
+    calculateTokenLength();
+  }
+
+
+  /************* methods *******************************/
+
+  public boolean isRoot() {
+    return parentStartPosition == 0;
+  }
+
+  private void calculateTokenLength() {
+    tokenLength = builderNode.getTokenLength();
+    token = new byte[tokenLength];
+  }
+
+  /**
+   * This method is called before blockMeta.qualifierOffsetWidth is known, so we pass in a
+   * placeholder.
+   * @param offsetWidthPlaceholder the placeholder
+   * @return node width
+   */
+  public int getWidthUsingPlaceholderForOffsetWidth(int offsetWidthPlaceholder) {
+    int width = 0;
+    width += UVIntTool.numBytes(tokenLength);
+    width += token.length;
+    width += offsetWidthPlaceholder;
+    return width;
+  }
+
+  public void writeBytes(OutputStream os) throws IOException {
+    int parentOffsetWidth;
+    if (familyVsQualifier) {
+      parentOffsetWidth = blockMeta.getFamilyOffsetWidth();
+    } else {
+      parentOffsetWidth = blockMeta.getQualifierOffsetWidth();
+    }
+    UVIntTool.writeBytes(tokenLength, os);
+    os.write(token);
+    UFIntTool.writeBytes(parentOffsetWidth, parentStartPosition, os);
+  }
+
+  public void setTokenBytes(ByteRange source) {
+    source.deepCopySubRangeTo(0, tokenLength, token, 0);
+  }
+
+
+  /****************** standard methods ************************/
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(Strings.padFront(builderNode.getOutputArrayOffset() + "", ' ', 3) + ",");
+    sb.append("[");
+    sb.append(Bytes.toString(token));
+    sb.append("]->");
+    sb.append(parentStartPosition);
+    return sb.toString();
+  }
+
+
+  /************************** get/set ***********************/
+
+  public void setParentStartPosition(int parentStartPosition) {
+    this.parentStartPosition = parentStartPosition;
+  }
+
+}
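
For illustration, a self-contained sketch of the three-section node layout described in the
class comment above. The single-byte varint and big-endian fixed-width writes here are
simplified stand-ins; the actual UVIntTool and UFIntTool wire formats are defined elsewhere in
this patch, so treat this as a picture of the layout rather than of the exact encoding.

import java.io.ByteArrayOutputStream;
import java.io.IOException;

public class ColumnNodeLayoutSketch {

  // single-byte varint case (values < 128), the common path for short tokens
  static void writeUVInt(ByteArrayOutputStream os, int value) {
    if (value < 0 || value > 127) {
      throw new IllegalArgumentException("sketch only covers the single-byte case");
    }
    os.write(value);
  }

  // big-endian fixed-width unsigned int; width is chosen by the section writer
  static void writeUFInt(ByteArrayOutputStream os, int width, int value) {
    for (int shift = (width - 1) * 8; shift >= 0; shift -= 8) {
      os.write((value >>> shift) & 0xFF);
    }
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    byte[] token = { 'q', '1' };
    int parentStartPosition = 0; // 0 means this node is the root
    int parentOffsetWidth = 1;   // from blockMeta.getQualifierOffsetWidth()
    writeUVInt(os, token.length);                            // tokenLength
    os.write(token);                                         // token[]
    writeUFInt(os, parentOffsetWidth, parentStartPosition);  // parent offset
    System.out.println("node width = " + os.size());         // 1 + 2 + 1 = 4 bytes
  }
}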

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/column/ColumnSectionWriter.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.column;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.CollectionUtils;
+import org.apache.hbase.codec.prefixtree.PrefixTreeBlockMeta;
+import org.apache.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
+import org.apache.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
+import org.apache.hbase.util.vint.UFIntTool;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Takes the tokenized family or qualifier data and flattens it into a stream of bytes. The family
+ * section is written after the row section, and the qualifier section after the family section.
+ * <p/>
+ * The family and qualifier tries, or "column tries", are structured differently than the row trie.
+ * The trie cannot be reassembled without external data about the offsets of the leaf nodes, and
+ * these external pointers are stored in the nubs and leaves of the row trie. For each cell in a
+ * row, the row trie contains a list of offsets into the column sections (along with pointers to
+ * timestamps and other per-cell fields). These offsets point to the last column node/token that
+ * comprises the column name. To assemble the column name, the trie is traversed in reverse (right
+ * to left), with the rightmost tokens pointing to the start of their "parent" node, which is the
+ * node to the left.
+ * <p/>
+ * This choice was made to reduce the size of the column trie by storing the minimum amount of
+ * offset data. As a result, to find a specific qualifier within a row, you must do a binary search
+ * of the column nodes, reassembling each one as you search. Future versions of the PrefixTree might
+ * encode the columns in both a forward and reverse trie, which would convert binary searches into
+ * more efficient trie searches, benefiting wide rows.
+ */
+@InterfaceAudience.Private
+public class ColumnSectionWriter {
+
+  public static final int EXPECTED_NUBS_PLUS_LEAVES = 100;
+
+  /****************** fields ****************************/
+
+  private PrefixTreeBlockMeta blockMeta;
+
+  private boolean familyVsQualifier;
+  private Tokenizer tokenizer;
+  private int numBytes = 0;
+  private ArrayList<TokenizerNode> nonLeaves;
+  private ArrayList<TokenizerNode> leaves;
+  private ArrayList<TokenizerNode> allNodes;
+  private ArrayList<ColumnNodeWriter> columnNodeWriters;
+  private List<Integer> outputArrayOffsets;
+
+
+  /*********************** construct *********************/
+
+  public ColumnSectionWriter() {
+    this.nonLeaves = Lists.newArrayList();
+    this.leaves = Lists.newArrayList();
+    this.outputArrayOffsets = Lists.newArrayList();
+  }
+
+  public ColumnSectionWriter(PrefixTreeBlockMeta blockMeta, Tokenizer builder,
+      boolean familyVsQualifier) {
+    this();// init collections
+    reconstruct(blockMeta, builder, familyVsQualifier);
+  }
+
+  public void reconstruct(PrefixTreeBlockMeta blockMeta, Tokenizer builder,
+      boolean familyVsQualifier) {
+    this.blockMeta = blockMeta;
+    this.tokenizer = builder;
+    this.familyVsQualifier = familyVsQualifier;
+  }
+
+  public void reset() {
+    numBytes = 0;
+    nonLeaves.clear();
+    leaves.clear();
+    outputArrayOffsets.clear();
+  }
+
+
+  /****************** methods *******************************/
+
+  public ColumnSectionWriter compile() {
+    if (familyVsQualifier) {
+      // do nothing. max family length fixed at Byte.MAX_VALUE
+    } else {
+      blockMeta.setMaxQualifierLength(tokenizer.getMaxElementLength());
+    }
+
+    tokenizer.setNodeFirstInsertionIndexes();
+
+    tokenizer.appendNodes(nonLeaves, true, false);
+
+    tokenizer.appendNodes(leaves, false, true);
+
+    allNodes = Lists.newArrayListWithCapacity(nonLeaves.size() + leaves.size());
+    allNodes.addAll(nonLeaves);
+    allNodes.addAll(leaves);
+
+    columnNodeWriters = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(allNodes));
+    for (int i = 0; i < allNodes.size(); ++i) {
+      TokenizerNode node = allNodes.get(i);
+      columnNodeWriters.add(new ColumnNodeWriter(blockMeta, node, familyVsQualifier));
+    }
+
+    // leaf widths are known at this point, so add them up
+    int totalBytesWithoutOffsets = 0;
+    for (int i = allNodes.size() - 1; i >= 0; --i) {
+      ColumnNodeWriter columnNodeWriter = columnNodeWriters.get(i);
+      // leaves store all but their first token byte
+      totalBytesWithoutOffsets += columnNodeWriter.getWidthUsingPlaceholderForOffsetWidth(0);
+    }
+
+    // figure out how wide our offset FInts are
+    int parentOffsetWidth = 0;
+    while (true) {
+      ++parentOffsetWidth;
+      int numBytesFinder = totalBytesWithoutOffsets + parentOffsetWidth * allNodes.size();
+      if (numBytesFinder < UFIntTool.maxValueForNumBytes(parentOffsetWidth)) {// it fits
+        numBytes = numBytesFinder;
+        break;
+      }
+    }
+    if (familyVsQualifier) {
+      blockMeta.setFamilyOffsetWidth(parentOffsetWidth);
+    } else {
+      blockMeta.setQualifierOffsetWidth(parentOffsetWidth);
+    }
+
+    int forwardIndex = 0;
+    for (int i = 0; i < allNodes.size(); ++i) {
+      TokenizerNode node = allNodes.get(i);
+      ColumnNodeWriter columnNodeWriter = columnNodeWriters.get(i);
+      int fullNodeWidth = columnNodeWriter
+          .getWidthUsingPlaceholderForOffsetWidth(parentOffsetWidth);
+      node.setOutputArrayOffset(forwardIndex);
+      columnNodeWriter.setTokenBytes(node.getToken());
+      if (node.isRoot()) {
+        columnNodeWriter.setParentStartPosition(0);
+      } else {
+        columnNodeWriter.setParentStartPosition(node.getParent().getOutputArrayOffset());
+      }
+      forwardIndex += fullNodeWidth;
+    }
+
+    tokenizer.appendOutputArrayOffsets(outputArrayOffsets);
+
+    return this;
+  }
+
+  public void writeBytes(OutputStream os) throws IOException {
+    for (ColumnNodeWriter columnNodeWriter : columnNodeWriters) {
+      columnNodeWriter.writeBytes(os);
+    }
+  }
+
+
+  /************* get/set **************************/
+
+  public ArrayList<ColumnNodeWriter> getColumnNodeWriters() {
+    return columnNodeWriters;
+  }
+
+  public int getNumBytes() {
+    return numBytes;
+  }
+
+  public int getOutputArrayOffset(int sortedIndex) {
+    return outputArrayOffsets.get(sortedIndex);
+  }
+
+  public ArrayList<TokenizerNode> getNonLeaves() {
+    return nonLeaves;
+  }
+
+  public ArrayList<TokenizerNode> getLeaves() {
+    return leaves;
+  }
+
+}
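
For illustration, the offset-width search from compile() above, isolated as a pure function.
The offsets themselves add bytes to the section, so the width is grown until the section it
produces stays addressable; the maxValueForNumBytes semantics are assumed from the method name.

public class OffsetWidthSketch {

  // assumed UFIntTool semantics: largest unsigned value representable in numBytes bytes
  static long maxValueForNumBytes(int numBytes) {
    return (1L << (8 * numBytes)) - 1;
  }

  static int findOffsetWidth(int totalBytesWithoutOffsets, int numNodes) {
    int width = 0;
    while (true) {
      ++width;
      long sectionBytes = totalBytesWithoutOffsets + (long) width * numNodes;
      if (sectionBytes < maxValueForNumBytes(width)) {
        return width; // every offset within the section now fits in `width` bytes
      }
    }
  }

  public static void main(String[] args) {
    // 300 nodes totaling 6000 bytes without offsets: 1-byte offsets can only address
    // 255 bytes, so the search settles on 2-byte offsets.
    System.out.println(findOffsetWidth(6000, 300)); // prints 2
  }
}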

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/CellTypeEncoder.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.other;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * Detect if every KV has the same KeyValue.Type, in which case we don't need to store it for each
+ * KV. If allSameType holds during conversion to byte[], we can store the "onlyType" in blockMeta
+ * rather than repeating it for each cell, saving 1 byte per cell.
+ */
+@InterfaceAudience.Private
+public class CellTypeEncoder {
+
+  /************* fields *********************/
+
+  protected boolean pendingFirstType = true;
+  protected boolean allSameType = true;
+  protected byte onlyType;
+
+
+  /************* construct *********************/
+
+  public void reset() {
+    pendingFirstType = true;
+    allSameType = true;
+  }
+
+
+  /************* methods *************************/
+
+  public void add(byte type) {
+    if (pendingFirstType) {
+      onlyType = type;
+      pendingFirstType = false;
+    } else if (onlyType != type) {
+      allSameType = false;
+    }
+  }
+
+
+  /**************** get/set **************************/
+
+  public boolean areAllSameType() {
+    return allSameType;
+  }
+
+  public byte getOnlyType() {
+    return onlyType;
+  }
+
+}
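
For illustration, a short usage sketch of the encoder above; the two type byte values are
hypothetical stand-ins for KeyValue.Type codes.

import org.apache.hbase.codec.prefixtree.encode.other.CellTypeEncoder;

public class CellTypeEncoderDemo {
  public static void main(String[] args) {
    CellTypeEncoder encoder = new CellTypeEncoder();
    byte put = 4;    // hypothetical type code, for illustration only
    byte delete = 8; // hypothetical type code, for illustration only
    encoder.add(put);
    encoder.add(put);
    // every cell so far shares one type, so the block could store getOnlyType() once
    System.out.println(encoder.areAllSameType()); // true
    encoder.add(delete); // a second type appears
    System.out.println(encoder.areAllSameType()); // false: store 1 type byte per cell
  }
}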

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/LongEncoder.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/LongEncoder.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/LongEncoder.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/other/LongEncoder.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.other;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.ArrayUtils;
+import org.apache.hadoop.hbase.util.CollectionUtils;
+import org.apache.hbase.util.vint.UFIntTool;
+
+import com.google.common.base.Joiner;
+
+/**
+ * Used to de-duplicate, sort, minimize/diff, and serialize timestamps and mvccVersions from a
+ * collection of Cells.
+ *
+ * 1. add longs to a HashSet for fast de-duplication
+ * 2. keep track of the min and max
+ * 3. copy all values to a new long[]
+ * 4. Arrays.sort the long[]
+ * 5. calculate maxDelta = max - min
+ * 6. determine FInt width based on maxDelta
+ * 7. PrefixTreeEncoder binary searches to find index of each value
+ */
+@InterfaceAudience.Private
+public class LongEncoder {
+
+  /****************** fields ****************************/
+
+  protected HashSet<Long> uniqueValues;
+  protected long[] sortedUniqueValues;
+  protected long min, max, maxDelta;
+
+  protected int bytesPerDelta;
+  protected int bytesPerIndex;
+  protected int totalCompressedBytes;
+
+
+  /****************** construct ****************************/
+
+  public LongEncoder() {
+    this.uniqueValues = new HashSet<Long>();
+  }
+
+  public void reset() {
+    uniqueValues.clear();
+    sortedUniqueValues = null;
+    min = Long.MAX_VALUE;
+    max = Long.MIN_VALUE;
+    maxDelta = Long.MIN_VALUE;
+    bytesPerIndex = 0;
+    bytesPerDelta = 0;
+    totalCompressedBytes = 0;
+  }
+
+
+  /************* methods ***************************/
+
+  public void add(long timestamp) {
+    uniqueValues.add(timestamp);
+  }
+
+  public LongEncoder compile() {
+    int numUnique = uniqueValues.size();
+    if (numUnique == 1) {
+      min = CollectionUtils.getFirst(uniqueValues);
+      sortedUniqueValues = new long[] { min };
+      return this;
+    }
+
+    sortedUniqueValues = new long[numUnique];
+    int lastIndex = -1;
+    for (long value : uniqueValues) {
+      sortedUniqueValues[++lastIndex] = value;
+    }
+    Arrays.sort(sortedUniqueValues);
+    min = ArrayUtils.getFirst(sortedUniqueValues);
+    max = ArrayUtils.getLast(sortedUniqueValues);
+    maxDelta = max - min;
+    if (maxDelta > 0) {
+      bytesPerDelta = UFIntTool.numBytes(maxDelta);
+    } else {
+      bytesPerDelta = 0;
+    }
+
+    int maxIndex = numUnique - 1;
+    bytesPerIndex = UFIntTool.numBytes(maxIndex);
+
+    totalCompressedBytes = numUnique * bytesPerDelta;
+
+    return this;
+  }
+
+  public long getDelta(int index) {
+    if (sortedUniqueValues.length == 0) {
+      return 0;
+    }
+    return sortedUniqueValues[index] - min;
+  }
+
+  public int getIndex(long value) {
+    // should always find an exact match
+    return Arrays.binarySearch(sortedUniqueValues, value);
+  }
+
+  public void writeBytes(OutputStream os) throws IOException {
+    for (int i = 0; i < sortedUniqueValues.length; ++i) {
+      long delta = sortedUniqueValues[i] - min;
+      UFIntTool.writeBytes(bytesPerDelta, delta, os);
+    }
+  }
+
+  // convenience method for tests
+  public byte[] getByteArray() throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    writeBytes(baos);
+    return baos.toByteArray();
+  }
+
+  public int getOutputArrayLength() {
+    return sortedUniqueValues.length * bytesPerDelta;
+  }
+
+  public int getNumUniqueValues() {
+    return sortedUniqueValues.length;
+  }
+
+
+  /******************* Object methods **********************/
+
+  @Override
+  public String toString() {
+    if (ArrayUtils.isEmpty(sortedUniqueValues)) {
+      return "[]";
+    }
+    return "[" + Joiner.on(",").join(ArrayUtils.toList(sortedUniqueValues)) + "]";
+  }
+
+
+  /******************** get/set **************************/
+
+  public long getMin() {
+    return min;
+  }
+
+  public int getBytesPerDelta() {
+    return bytesPerDelta;
+  }
+
+  public int getBytesPerIndex() {
+    return bytesPerIndex;
+  }
+
+  public int getTotalCompressedBytes() {
+    return totalCompressedBytes;
+  }
+
+  public long[] getSortedUniqueTimestamps() {
+    return sortedUniqueValues;
+  }
+
+}
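
For illustration, the numbered steps from the class comment inlined for a handful of
timestamps. The numBytes() helper assumes UFIntTool sizes a value at one byte per 8 bits,
matching the way compile() derives bytesPerDelta from maxDelta.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TimestampDeltaSketch {

  // assumed UFIntTool.numBytes behavior: smallest byte count whose max value holds maxValue
  static int numBytes(long maxValue) {
    int bytes = 1;
    while (maxValue > ((1L << (8 * bytes)) - 1)) {
      ++bytes;
    }
    return bytes;
  }

  public static void main(String[] args) {
    long[] raw = { 1360195200000L, 1360195200000L, 1360195201000L, 1360195260000L };
    Set<Long> unique = new HashSet<Long>();
    for (long ts : raw) {
      unique.add(ts);                 // step 1: de-duplicate
    }
    long[] sorted = new long[unique.size()];
    int i = 0;
    for (long ts : unique) {
      sorted[i++] = ts;               // step 3: copy to a new long[]
    }
    Arrays.sort(sorted);              // step 4: sort
    long min = sorted[0];             // step 2's min falls out of the sort
    long maxDelta = sorted[sorted.length - 1] - min;  // step 5
    int bytesPerDelta = numBytes(maxDelta);           // step 6
    // step 7: each cell stores a small index instead of 8 raw timestamp bytes
    System.out.println("unique=" + sorted.length
        + " bytesPerDelta=" + bytesPerDelta
        + " indexOfSecondTs=" + Arrays.binarySearch(sorted, 1360195201000L));
  }
}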

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowNodeWriter.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowNodeWriter.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowNodeWriter.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowNodeWriter.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.row;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.ByteRangeTool;
+import org.apache.hadoop.hbase.util.CollectionUtils;
+import org.apache.hbase.codec.prefixtree.PrefixTreeBlockMeta;
+import org.apache.hbase.codec.prefixtree.encode.PrefixTreeEncoder;
+import org.apache.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
+import org.apache.hbase.util.vint.UFIntTool;
+import org.apache.hbase.util.vint.UVIntTool;
+
+/**
+ * Serializes the fields comprising one node of the row trie, which can be a branch, nub, or leaf.
+ * Please see the write() method for the order in which data is written.
+ */
+@InterfaceAudience.Private
+public class RowNodeWriter {
+  protected static final Log LOG = LogFactory.getLog(RowNodeWriter.class);
+
+  /********************* fields ******************************/
+
+  protected PrefixTreeEncoder prefixTreeEncoder;
+  protected PrefixTreeBlockMeta blockMeta;
+  protected TokenizerNode tokenizerNode;
+
+  protected int tokenWidth;
+  protected int fanOut;
+  protected int numCells;
+
+  protected int width;
+
+
+  /*********************** construct *************************/
+
+  public RowNodeWriter(PrefixTreeEncoder keyValueBuilder, TokenizerNode tokenizerNode) {
+    reconstruct(keyValueBuilder, tokenizerNode);
+  }
+
+  public void reconstruct(PrefixTreeEncoder prefixTreeEncoder, TokenizerNode tokenizerNode) {
+    this.prefixTreeEncoder = prefixTreeEncoder;
+    reset(tokenizerNode);
+  }
+
+  public void reset(TokenizerNode node) {
+    this.blockMeta = prefixTreeEncoder.getBlockMeta();// changes between blocks
+    this.tokenizerNode = node;
+    this.tokenWidth = 0;
+    this.fanOut = 0;
+    this.numCells = 0;
+    this.width = 0;
+    calculateOffsetsAndLengths();
+  }
+
+
+  /********************* methods ****************************/
+
+  protected void calculateOffsetsAndLengths() {
+    tokenWidth = tokenizerNode.getTokenLength();
+    if (!tokenizerNode.isRoot()) {
+      --tokenWidth;// non-root nodes omit their first token byte, which is held in the parent's fan
+    }
+    fanOut = CollectionUtils.nullSafeSize(tokenizerNode.getChildren());
+    numCells = tokenizerNode.getNumOccurrences();
+  }
+
+  public int calculateWidth() {
+    calculateWidthOverrideOffsetWidth(blockMeta.getNextNodeOffsetWidth());
+    return width;
+  }
+
+  public int calculateWidthOverrideOffsetWidth(int offsetWidth) {
+    width = 0;
+    width += UVIntTool.numBytes(tokenWidth);
+    width += tokenWidth;
+
+    width += UVIntTool.numBytes(fanOut);
+    width += fanOut;
+
+    width += UVIntTool.numBytes(numCells);
+
+    if (tokenizerNode.hasOccurrences()) {
+      int fixedBytesPerCell = blockMeta.getFamilyOffsetWidth()
+        + blockMeta.getQualifierOffsetWidth()
+        + blockMeta.getTimestampIndexWidth()
+        + blockMeta.getMvccVersionIndexWidth()
+        + blockMeta.getKeyValueTypeWidth()
+        + blockMeta.getValueOffsetWidth()
+        + blockMeta.getValueLengthWidth();
+      width += numCells * fixedBytesPerCell;
+    }
+
+    if (!tokenizerNode.isLeaf()) {
+      width += fanOut * offsetWidth;
+    }
+
+    return width;
+  }
+
+
+  /*********************** writing the compiled structure to the OutputStream ***************/
+
+  public void write(OutputStream os) throws IOException {
+    //info about this row trie node
+    writeRowToken(os);
+    writeFan(os);
+    writeNumCells(os);
+
+    //UFInt indexes and offsets for each cell in the row (if nub or leaf)
+    writeFamilyNodeOffsets(os);
+    writeQualifierNodeOffsets(os);
+    writeTimestampIndexes(os);
+    writeMvccVersionIndexes(os);
+    writeCellTypes(os);
+    writeValueOffsets(os);
+    writeValueLengths(os);
+
+    //offsets to the children of this row trie node (if branch or nub)
+    writeNextRowTrieNodeOffsets(os);
+  }
+
+
+  /**
+   * Row node token, fan, and numCells. Written once at the beginning of each row node. These 3
+   * fields can reproduce all the row keys that compose the block.
+   */
+
+  /**
+   * UVInt: tokenWidth
+   * bytes: token
+   */
+  protected void writeRowToken(OutputStream os) throws IOException {
+    UVIntTool.writeBytes(tokenWidth, os);
+    int tokenStartIndex = tokenizerNode.isRoot() ? 0 : 1;
+    ByteRangeTool.write(os, tokenizerNode.getToken(), tokenStartIndex);
+  }
+
+  /**
+   * UVInt: numFanBytes/fanOut
+   * bytes: each fan byte
+   */
+  public void writeFan(OutputStream os) throws IOException {
+    UVIntTool.writeBytes(fanOut, os);
+    if (fanOut <= 0) {
+      return;
+    }
+    ArrayList<TokenizerNode> children = tokenizerNode.getChildren();
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      os.write(child.getToken().get(0));// first byte of each child's token
+    }
+  }
+
+  /**
+   * UVInt: numCells, the number of cells in this row which will be 0 for branch nodes
+   */
+  protected void writeNumCells(OutputStream os) throws IOException {
+    UVIntTool.writeBytes(numCells, os);
+  }
+
+
+  /**
+   * The following methods write data for each cell in the row, mostly consisting of indexes or
+   * offsets into the timestamp/column data structures that are written in the middle of the block.
+   * We use {@link UFIntTool} to encode these indexes/offsets to allow random access during a binary
+   * search of a particular column/timestamp combination.
+   * <p/>
+   * Branch nodes will not have any data in these sections.
+   */
+
+  protected void writeFamilyNodeOffsets(OutputStream os) throws IOException {
+    if (blockMeta.getFamilyOffsetWidth() <= 0) {
+      return;
+    }
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = PrefixTreeEncoder.MULITPLE_FAMILIES_POSSIBLE
+          ? tokenizerNode.getFirstInsertionIndex() + i : 0;
+      int sortedIndex = prefixTreeEncoder.getFamilySorter().getSortedIndexForInsertionId(
+        cellInsertionIndex);
+      int indexedFamilyOffset = prefixTreeEncoder.getFamilyWriter().getOutputArrayOffset(
+        sortedIndex);
+      UFIntTool.writeBytes(blockMeta.getFamilyOffsetWidth(), indexedFamilyOffset, os);
+    }
+  }
+
+  protected void writeQualifierNodeOffsets(OutputStream os) throws IOException {
+    if (blockMeta.getQualifierOffsetWidth() <= 0) {
+      return;
+    }
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      int sortedIndex = prefixTreeEncoder.getQualifierSorter().getSortedIndexForInsertionId(
+        cellInsertionIndex);
+      int indexedQualifierOffset = prefixTreeEncoder.getQualifierWriter().getOutputArrayOffset(
+        sortedIndex);
+      UFIntTool.writeBytes(blockMeta.getQualifierOffsetWidth(), indexedQualifierOffset, os);
+    }
+  }
+
+  protected void writeTimestampIndexes(OutputStream os) throws IOException {
+    if (blockMeta.getTimestampIndexWidth() <= 0) {
+      return;
+    }
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      long timestamp = prefixTreeEncoder.getTimestamps()[cellInsertionIndex];
+      int timestampIndex = prefixTreeEncoder.getTimestampEncoder().getIndex(timestamp);
+      UFIntTool.writeBytes(blockMeta.getTimestampIndexWidth(), timestampIndex, os);
+    }
+  }
+
+  protected void writeMvccVersionIndexes(OutputStream os) throws IOException {
+    if (blockMeta.getMvccVersionIndexWidth() <= 0) {
+      return;
+    }
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      long mvccVersion = prefixTreeEncoder.getMvccVersions()[cellInsertionIndex];
+      int mvccVersionIndex = prefixTreeEncoder.getMvccVersionEncoder().getIndex(mvccVersion);
+      UFIntTool.writeBytes(blockMeta.getMvccVersionIndexWidth(), mvccVersionIndex, os);
+    }
+  }
+
+  protected void writeCellTypes(OutputStream os) throws IOException {
+    if (blockMeta.isAllSameType()) {
+      return;
+    }
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      os.write(prefixTreeEncoder.getTypeBytes()[cellInsertionIndex]);
+    }
+  }
+
+  protected void writeValueOffsets(OutputStream os) throws IOException {
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      long valueStartIndex = prefixTreeEncoder.getValueOffset(cellInsertionIndex);
+      UFIntTool.writeBytes(blockMeta.getValueOffsetWidth(), valueStartIndex, os);
+    }
+  }
+
+  protected void writeValueLengths(OutputStream os) throws IOException {
+    for (int i = 0; i < numCells; ++i) {
+      int cellInsertionIndex = tokenizerNode.getFirstInsertionIndex() + i;
+      int valueLength = prefixTreeEncoder.getValueLength(cellInsertionIndex);
+      UFIntTool.writeBytes(blockMeta.getValueLengthWidth(), valueLength, os);
+    }
+  }
+
+
+  /**
+   * If this node is a branch or a nub, the last things we append are the UFInt offsets to the
+   * child row nodes.
+   */
+  protected void writeNextRowTrieNodeOffsets(OutputStream os) throws IOException {
+    ArrayList<TokenizerNode> children = tokenizerNode.getChildren();
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      int distanceToChild = tokenizerNode.getNegativeIndex() - child.getNegativeIndex();
+      UFIntTool.writeBytes(blockMeta.getNextNodeOffsetWidth(), distanceToChild, os);
+    }
+  }
+}
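
For illustration, the width arithmetic from calculateWidthOverrideOffsetWidth() restated as a
pure function. The 1-byte UVInt sizes assume small values (the common case), and the
fixedBytesPerCell argument stands in for the sum of the per-cell index/offset widths that the
real method reads from blockMeta.

public class RowNodeWidthSketch {

  static int rowNodeWidth(int tokenWidth, int fanOut, int numCells,
      int fixedBytesPerCell, int nextNodeOffsetWidth, boolean isLeaf) {
    int width = 0;
    width += 1 + tokenWidth; // UVInt(tokenWidth) + token bytes
    width += 1 + fanOut;     // UVInt(fanOut) + one first-byte per child
    width += 1;              // UVInt(numCells)
    width += numCells * fixedBytesPerCell;   // per-cell references (nubs and leaves)
    if (!isLeaf) {
      width += fanOut * nextNodeOffsetWidth; // child offsets (branches and nubs)
    }
    return width;
  }

  public static void main(String[] args) {
    // a nub with a 3-byte token, 2 children, 1 cell, 7 fixed bytes per cell, and
    // 2-byte child offsets: 4 + 3 + 1 + 7 + 4 = 19 bytes
    System.out.println(rowNodeWidth(3, 2, 1, 7, 2, false));
  }
}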

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowSectionWriter.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowSectionWriter.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowSectionWriter.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/row/RowSectionWriter.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.row;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hbase.codec.prefixtree.PrefixTreeBlockMeta;
+import org.apache.hbase.codec.prefixtree.encode.PrefixTreeEncoder;
+import org.apache.hbase.codec.prefixtree.encode.tokenize.TokenizerNode;
+import org.apache.hbase.util.vint.UFIntTool;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Most of the complexity of the PrefixTree is contained in the "row section". It contains the row
+ * key trie structure used to search and recreate all the row keys. Each nub and leaf in this trie
+ * also contains references to offsets in the other sections of the data block that enable the
+ * decoder to match a row key with its qualifier, timestamp, type, value, etc.
+ * <p>
+ * The row section is a concatenated collection of {@link RowNodeWriter}s. See that class for the
+ * internals of each row node.
+ */
+@InterfaceAudience.Private
+public class RowSectionWriter {
+
+  /***************** fields **************************/
+
+  protected PrefixTreeEncoder prefixTreeEncoder;
+
+  protected PrefixTreeBlockMeta blockMeta;
+
+  protected int numBytes;
+
+  protected ArrayList<TokenizerNode> nonLeaves;
+  protected ArrayList<TokenizerNode> leaves;
+
+  protected ArrayList<RowNodeWriter> leafWriters;
+  protected ArrayList<RowNodeWriter> nonLeafWriters;
+
+  protected int numLeafWriters;
+  protected int numNonLeafWriters;
+
+
+  /********************* construct **********************/
+
+  public RowSectionWriter() {
+    this.nonLeaves = Lists.newArrayList();
+    this.leaves = Lists.newArrayList();
+    this.leafWriters = Lists.newArrayList();
+    this.nonLeafWriters = Lists.newArrayList();
+  }
+
+  public RowSectionWriter(PrefixTreeEncoder prefixTreeEncoder) {
+    reconstruct(prefixTreeEncoder);
+  }
+
+  public void reconstruct(PrefixTreeEncoder prefixTreeEncoder) {
+    this.prefixTreeEncoder = prefixTreeEncoder;
+    this.blockMeta = prefixTreeEncoder.getBlockMeta();
+    reset();
+  }
+
+  public void reset() {
+    numBytes = 0;
+    nonLeaves.clear();
+    leaves.clear();
+    numLeafWriters = 0;
+    numNonLeafWriters = 0;
+  }
+
+
+  /****************** methods *******************************/
+
+  public RowSectionWriter compile() {
+    blockMeta.setMaxRowLength(prefixTreeEncoder.getRowTokenizer().getMaxElementLength());
+    prefixTreeEncoder.getRowTokenizer().setNodeFirstInsertionIndexes();
+
+    prefixTreeEncoder.getRowTokenizer().appendNodes(nonLeaves, true, false);
+    prefixTreeEncoder.getRowTokenizer().appendNodes(leaves, false, true);
+
+    // track the starting position of each node in final output
+    int negativeIndex = 0;
+
+    // create leaf writer nodes
+    // leaf widths are known at this point, so add them up
+    int totalLeafBytes = 0;
+    for (int i = leaves.size() - 1; i >= 0; --i) {
+      TokenizerNode leaf = leaves.get(i);
+      RowNodeWriter leafWriter = initializeWriter(leafWriters, numLeafWriters, leaf);
+      ++numLeafWriters;
+      // leaves store all but their first token byte
+      int leafNodeWidth = leafWriter.calculateWidthOverrideOffsetWidth(0);
+      totalLeafBytes += leafNodeWidth;
+      negativeIndex += leafNodeWidth;
+      leaf.setNegativeIndex(negativeIndex);
+    }
+
+    int totalNonLeafBytesWithoutOffsets = 0;
+    int totalChildPointers = 0;
+    for (int i = nonLeaves.size() - 1; i >= 0; --i) {
+      TokenizerNode nonLeaf = nonLeaves.get(i);
+      RowNodeWriter nonLeafWriter = initializeWriter(nonLeafWriters, numNonLeafWriters, nonLeaf);
+      ++numNonLeafWriters;
+      totalNonLeafBytesWithoutOffsets += nonLeafWriter.calculateWidthOverrideOffsetWidth(0);
+      totalChildPointers += nonLeaf.getNumChildren();
+    }
+
+    // figure out how wide our offset FInts are
+    int offsetWidth = 0;
+    while (true) {
+      ++offsetWidth;
+      int offsetBytes = totalChildPointers * offsetWidth;
+      int totalRowBytes = totalNonLeafBytesWithoutOffsets + offsetBytes + totalLeafBytes;
+      if (totalRowBytes < UFIntTool.maxValueForNumBytes(offsetWidth)) {
+        // it fits
+        numBytes = totalRowBytes;
+        break;
+      }
+    }
+    blockMeta.setNextNodeOffsetWidth(offsetWidth);
+
+    // populate negativeIndexes
+    for (int i = nonLeaves.size() - 1; i >= 0; --i) {
+      TokenizerNode nonLeaf = nonLeaves.get(i);
+      int writerIndex = nonLeaves.size() - i - 1;
+      RowNodeWriter nonLeafWriter = nonLeafWriters.get(writerIndex);
+      int nodeWidth = nonLeafWriter.calculateWidth();
+      negativeIndex += nodeWidth;
+      nonLeaf.setNegativeIndex(negativeIndex);
+    }
+
+    return this;
+  }
+
+  protected RowNodeWriter initializeWriter(List<RowNodeWriter> list, int index,
+      TokenizerNode builderNode) {
+    // check if there is an existing writer we can recycle
+    if (index >= list.size()) {
+      // there are not enough existing writers, so add a new one which will be retrieved below
+      list.add(new RowNodeWriter(prefixTreeEncoder, builderNode));
+    }
+    RowNodeWriter rowNodeWriter = list.get(index);
+    rowNodeWriter.reset(builderNode);
+    return rowNodeWriter;
+  }
+
+
+  public void writeBytes(OutputStream os) throws IOException {
+    for (int i = numNonLeafWriters - 1; i >= 0; --i) {
+      RowNodeWriter nonLeafWriter = nonLeafWriters.get(i);
+      nonLeafWriter.write(os);
+    }
+    // mirrors the non-leaf loop above; kept separate for clarity
+    for (int i = numLeafWriters - 1; i >= 0; --i) {
+      RowNodeWriter leafWriter = leafWriters.get(i);
+      leafWriter.write(os);
+    }
+  }
+
+
+  /***************** static ******************************/
+
+  protected static ArrayList<TokenizerNode> filterByLeafAndReverse(
+      ArrayList<TokenizerNode> ins, boolean leaves) {
+    ArrayList<TokenizerNode> outs = Lists.newArrayList();
+    for (int i = ins.size() - 1; i >= 0; --i) {
+      TokenizerNode n = ins.get(i);
+      if ((n.isLeaf() && leaves) || (!n.isLeaf() && !leaves)) {
+        outs.add(ins.get(i));
+      }
+    }
+    return outs;
+  }
+
+
+  /************* get/set **************************/
+
+  public int getNumBytes() {
+    return numBytes;
+  }
+
+  public ArrayList<TokenizerNode> getNonLeaves() {
+    return nonLeaves;
+  }
+
+  public ArrayList<TokenizerNode> getLeaves() {
+    return leaves;
+  }
+
+  public ArrayList<RowNodeWriter> getNonLeafWriters() {
+    return nonLeafWriters;
+  }
+
+  public ArrayList<RowNodeWriter> getLeafWriters() {
+    return leafWriters;
+  }
+
+}
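
For illustration, a sketch of the negative-index arithmetic used by compile() above and by
RowNodeWriter.writeNextRowTrieNodeOffsets(): nodes record how far before the end of the
section they start, and a parent stores the difference between its own negative index and a
child's. The numbers reuse the 55-byte example from the TokenizerNode field comments below.

public class NegativeIndexSketch {
  public static void main(String[] args) {
    int sectionLength = 55;
    int parentNegativeIndex = 30; // parent starts 30 bytes before the section end
    int childNegativeIndex = 9;   // child starts 9 bytes before the section end
    int parentStart = sectionLength - parentNegativeIndex; // absolute position 25
    int childStart = sectionLength - childNegativeIndex;   // absolute position 46
    int distanceToChild = parentNegativeIndex - childNegativeIndex; // stored value: 21
    // a decoder can then jump forward: child position = parent position + distance
    System.out.println(childStart == parentStart + distanceToChild); // true
  }
}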

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenDepthComparator.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenDepthComparator.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenDepthComparator.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenDepthComparator.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.tokenize;
+
+import java.util.Comparator;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * Determines the order of nodes in the output array.  It may be possible to optimize this further.
+ */
+@InterfaceAudience.Private
+public class TokenDepthComparator implements Comparator<TokenizerNode> {
+
+  @Override
+  public int compare(TokenizerNode a, TokenizerNode b) {
+    if (a == null) {
+      throw new IllegalArgumentException("a cannot be null");
+    }
+    if (b == null) {
+      throw new IllegalArgumentException("b cannot be null");
+    }
+
+    // put leaves at the end
+    if (!a.isLeaf() && b.isLeaf()) {
+      return -1;
+    }
+    if (a.isLeaf() && !b.isLeaf()) {
+      return 1;
+    }
+
+    if (a.isLeaf() && b.isLeaf()) {// keep leaves in sorted order (for debuggability)
+      return a.getId() < b.getId() ? -1 : 1;
+    }
+
+    // compare depth
+    if (a.getTokenOffset() < b.getTokenOffset()) {
+      return -1;
+    }
+    if (a.getTokenOffset() > b.getTokenOffset()) {
+      return 1;
+    }
+
+    // if same depth, return lower id first. ids are unique
+    return a.getId() < b.getId() ? -1 : 1;
+  }
+
+}
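
For illustration, the same three-tier ordering policy applied to a minimal stand-in type,
which makes it easy to verify in isolation (real TokenizerNode instances require a live
Tokenizer to construct).

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class DepthOrderingSketch {

  static class Node {
    final boolean leaf;
    final int tokenOffset;
    final long id;
    Node(boolean leaf, int tokenOffset, long id) {
      this.leaf = leaf;
      this.tokenOffset = tokenOffset;
      this.id = id;
    }
    public String toString() {
      return (leaf ? "leaf" : "branch") + id;
    }
  }

  public static void main(String[] args) {
    List<Node> nodes = new ArrayList<Node>();
    nodes.add(new Node(true, 3, 4));  // leaf, deep
    nodes.add(new Node(false, 2, 3)); // branch, deeper than root
    nodes.add(new Node(false, 0, 1)); // branch at the root
    nodes.add(new Node(true, 3, 2));  // leaf with a lower id
    Collections.sort(nodes, new Comparator<Node>() {
      public int compare(Node a, Node b) {
        if (a.leaf != b.leaf) {
          return a.leaf ? 1 : -1;                        // leaves at the end
        }
        if (a.leaf) {
          return a.id < b.id ? -1 : 1;                   // leaves in insertion order
        }
        if (a.tokenOffset != b.tokenOffset) {
          return a.tokenOffset < b.tokenOffset ? -1 : 1; // shallower branches first
        }
        return a.id < b.id ? -1 : 1;                     // tie-break on unique id
      }
    });
    System.out.println(nodes); // [branch1, branch3, leaf2, leaf4]
  }
}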

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/Tokenizer.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.tokenize;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.ArrayUtils;
+import org.apache.hadoop.hbase.util.ByteRange;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.CollectionUtils;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Data structure used in the first stage of PrefixTree encoding:
+ * <li>accepts a sorted stream of ByteRanges
+ * <li>splits them into a set of tokens, each held by a {@link TokenizerNode}
+ * <li>connects the TokenizerNodes via standard Java references
+ * <li>keeps a pool of TokenizerNodes and a reusable byte[] for holding all token content
+ * <p><br>
+ * Mainly used for turning Cell rowKeys into a trie, but also used for family and qualifier
+ * encoding.
+ */
+@InterfaceAudience.Private
+public class Tokenizer {
+
+  /***************** fields **************************/
+
+  protected int numArraysAdded = 0;
+  protected long lastNodeId = -1;
+  protected ArrayList<TokenizerNode> nodes;
+  protected int numNodes;
+  protected TokenizerNode root;
+  protected byte[] tokens;
+  protected int tokensLength;
+
+  protected int maxElementLength = 0;
+  // number of levels in the tree assuming root level is 0
+  protected int treeDepth = 0;
+
+
+  /******************* construct *******************/
+
+  public Tokenizer() {
+    this.nodes = Lists.newArrayList();
+    this.tokens = new byte[0];
+  }
+
+  public void reset() {
+    numArraysAdded = 0;
+    lastNodeId = -1;
+    numNodes = 0;
+    tokensLength = 0;
+    root = null;
+    maxElementLength = 0;
+    treeDepth = 0;
+  }
+
+
+  /***************** building *************************/
+
+  public void addAll(ArrayList<ByteRange> sortedByteRanges) {
+    for (int i = 0; i < sortedByteRanges.size(); ++i) {
+      ByteRange byteRange = sortedByteRanges.get(i);
+      addSorted(byteRange);
+    }
+  }
+
+  public void addSorted(final ByteRange bytes) {
+    ++numArraysAdded;
+    if (bytes.getLength() > maxElementLength) {
+      maxElementLength = bytes.getLength();
+    }
+    if (root == null) {
+      // nodeDepth of firstNode (non-root) is 1
+      root = addNode(null, 1, 0, bytes, 0);
+    } else {
+      root.addSorted(bytes);
+    }
+  }
+
+  public void incrementNumOccurrencesOfLatestValue() {
+    CollectionUtils.getLast(nodes).incrementNumOccurrences(1);
+  }
+
+  protected long nextNodeId() {
+    return ++lastNodeId;
+  }
+
+  protected TokenizerNode addNode(TokenizerNode parent, int nodeDepth, int tokenStartOffset,
+      final ByteRange token, int inputTokenOffset) {
+    int inputTokenLength = token.getLength() - inputTokenOffset;
+    int tokenOffset = appendTokenAndRepointByteRange(token, inputTokenOffset);
+    TokenizerNode node = null;
+    if (nodes.size() <= numNodes) {
+      node = new TokenizerNode(this, parent, nodeDepth, tokenStartOffset, tokenOffset,
+          inputTokenLength);
+      nodes.add(node);
+    } else {
+      node = nodes.get(numNodes);
+      node.reset();
+      node.reconstruct(this, parent, nodeDepth, tokenStartOffset, tokenOffset, inputTokenLength);
+    }
+    ++numNodes;
+    return node;
+  }
+
+  protected int appendTokenAndRepointByteRange(final ByteRange token, int inputTokenOffset) {
+    int newOffset = tokensLength;
+    int inputTokenLength = token.getLength() - inputTokenOffset;
+    int newMinimum = tokensLength + inputTokenLength;
+    tokens = ArrayUtils.growIfNecessary(tokens, newMinimum, 2 * newMinimum);
+    token.deepCopySubRangeTo(inputTokenOffset, inputTokenLength, tokens, tokensLength);
+    tokensLength += inputTokenLength;
+    return newOffset;
+  }
+
+  protected void submitMaxNodeDepthCandidate(int nodeDepth) {
+    if (nodeDepth > treeDepth) {
+      treeDepth = nodeDepth;
+    }
+  }
+
+
+  /********************* read ********************/
+
+  public int getNumAdded() {
+    return numArraysAdded;
+  }
+
+  // for debugging
+  public ArrayList<TokenizerNode> getNodes(boolean includeNonLeaves, boolean includeLeaves) {
+    ArrayList<TokenizerNode> nodes = Lists.newArrayList();
+    root.appendNodesToExternalList(nodes, includeNonLeaves, includeLeaves);
+    return nodes;
+  }
+
+  public void appendNodes(List<TokenizerNode> appendTo, boolean includeNonLeaves,
+      boolean includeLeaves) {
+    root.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
+  }
+
+  public List<byte[]> getArrays() {
+    List<TokenizerNode> nodes = new ArrayList<TokenizerNode>();
+    root.appendNodesToExternalList(nodes, true, true);
+    List<byte[]> byteArrays = Lists.newArrayListWithCapacity(CollectionUtils.nullSafeSize(nodes));
+    for (int i = 0; i < nodes.size(); ++i) {
+      TokenizerNode node = nodes.get(i);
+      for (int j = 0; j < node.getNumOccurrences(); ++j) {
+        byte[] byteArray = node.getNewByteArray();
+        byteArrays.add(byteArray);
+      }
+    }
+    return byteArrays;
+  }
+
+  //currently unused, but working and possibly useful in the future
+  public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
+      int keyLength) {
+    root.getNode(resultHolder, key, keyOffset, keyLength);
+  }
+
+
+  /********************** write ***************************/
+
+  public Tokenizer setNodeFirstInsertionIndexes() {
+    root.setInsertionIndexes(0);
+    return this;
+  }
+
+  public Tokenizer appendOutputArrayOffsets(List<Integer> offsets) {
+    root.appendOutputArrayOffsets(offsets);
+    return this;
+  }
+
+
+  /********************* print/debug ********************/
+
+  protected static final boolean INCLUDE_FULL_TREE_IN_TO_STRING = false;
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getStructuralString());
+    if (INCLUDE_FULL_TREE_IN_TO_STRING) {
+      for (byte[] bytes : getArrays()) {
+        if (sb.length() > 0) {
+          sb.append("\n");
+        }
+        sb.append(Bytes.toString(bytes));
+      }
+    }
+    return sb.toString();
+  }
+
+  public String getStructuralString() {
+    List<TokenizerNode> nodes = getNodes(true, true);
+    StringBuilder sb = new StringBuilder();
+    for (TokenizerNode node : nodes) {
+      String line = node.getPaddedTokenAndOccurrenceString();
+      sb.append(line + "\n");
+    }
+    return sb.toString();
+  }
+
+
+  /****************** get/set ************************/
+
+  public TokenizerNode getRoot() {
+    return root;
+  }
+
+  public int getMaxElementLength() {
+    return maxElementLength;
+  }
+
+  public int getTreeDepth() {
+    return treeDepth;
+  }
+
+}
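
For illustration, a usage sketch that feeds the example inputs from the TokenizerNode javadoc
below through this Tokenizer. The ByteRange(byte[]) constructor is an assumption about that
class's API, and the exact output format comes from getPaddedTokenAndOccurrenceString(); the
printed structure should show the branch/leaf/nub breakdown described in that javadoc.

import org.apache.hadoop.hbase.util.ByteRange;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hbase.codec.prefixtree.encode.tokenize.Tokenizer;

public class TokenizerDemo {
  public static void main(String[] args) {
    Tokenizer tokenizer = new Tokenizer();
    String[] sortedInputs = { "AAA", "AAA", "AAB", "AAB", "AAB", "AABQQ", "AABQQ" };
    for (String s : sortedInputs) {
      tokenizer.addSorted(new ByteRange(Bytes.toBytes(s))); // inputs must arrive sorted
    }
    // expected trie: branch "AA" (0 occurrences), leaf "A" (2), nub "B" (3), leaf "QQ" (2)
    System.out.println(tokenizer.getStructuralString());
  }
}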

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerNode.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerNode.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerNode.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerNode.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,632 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.tokenize;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.ByteRange;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.CollectionUtils;
+import org.apache.hadoop.hbase.util.Strings;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Individual node in a Trie structure.  Each node is one of 3 types:
+ * <li>Branch: an internal trie node that may have a token and must have multiple children, but does
+ * not represent an actual input byte[], hence its numOccurrences is 0
+ * <li>Leaf: a node with no children and where numOccurrences is >= 1.  Its token represents the
+ * last bytes in the input byte[]s.
+ * <li>Nub: a combination of a branch and leaf.  Its token represents the last bytes of input
+ * byte[]s and has numOccurrences >= 1, but it also has child nodes which represent input byte[]s
+ * that add bytes to this node's input byte[].
+ * <br/><br/>
+ * Example inputs (numInputs=7):
+ * 0: AAA
+ * 1: AAA
+ * 2: AAB
+ * 3: AAB
+ * 4: AAB
+ * 5: AABQQ
+ * 6: AABQQ
+ * <br/><br/>
+ * Resulting TokenizerNodes:
+ * AA <- branch, numOccurrences=0, tokenStartOffset=0, token.length=2
+ * A  <- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1
+ * B  <- nub, numOccurrences=3, tokenStartOffset=2, token.length=1
+ * QQ <- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2
+ * <br/><br/>
+ * numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2
+ */
+@InterfaceAudience.Private
+public class TokenizerNode {
+
+  /*
+   * Ref to data structure wrapper
+   */
+  protected Tokenizer builder;
+
+  /******************************************************************
+   * Tree content/structure used during tokenization
+   ******************************************************************/
+
+  /*
+   * ref to parent trie node
+   */
+  protected TokenizerNode parent;
+
+  /*
+   * node depth in trie, irrespective of each node's token length
+   */
+  protected int nodeDepth;
+
+  /*
+   * start index of this token in original byte[]
+   */
+  protected int tokenStartOffset;
+
+  /*
+   * bytes for this trie node.  can be length 0 in root node
+   */
+  protected ByteRange token;
+
+  /*
+   * A count of occurrences in the input byte[]s, not the trie structure. 0 for branch nodes, 1+ for
+   * nubs and leaves. If the same byte[] is added to the trie multiple times, this is the only thing
+   * that changes in the tokenizer. As a result, duplicate byte[]s are very inexpensive to encode.
+   */
+  protected int numOccurrences;
+
+  /*
+   * The maximum fan-out of a byte[] trie is 256, so there are a maximum of 256
+   * child nodes.
+   */
+  protected ArrayList<TokenizerNode> children;
+
+
+  /*
+   * Fields used later in the encoding process for sorting the nodes into the order they'll be
+   * written to the output byte[].  With these fields, the TokenizerNode and therefore Tokenizer
+   * are not generic data structures but instead are specific to HBase PrefixTree encoding. 
+   */
+
+  /*
+   * unique id assigned to each TokenizerNode
+   */
+  protected long id;
+
+  /*
+   * set >=0 for nubs and leaves
+   */
+  protected int firstInsertionIndex = -1;
+
+  /*
+   * A positive value indicating how many bytes before the end of the block this node will start.
+   * If the section is 55 bytes and negativeIndex is 9, then the node will start at 46.
+   */
+  protected int negativeIndex = 0;
+
+  /*
+   * The offset in the output array at which to start writing this node's token bytes.  Influenced
+   * by the lengths of all tokens sorted before this one.
+   */
+  protected int outputArrayOffset = -1;
+
+
+  /*********************** construct *****************************/
+
+  public TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth,
+      int tokenStartOffset, int tokenOffset, int tokenLength) {
+    this.token = new ByteRange();
+    reconstruct(builder, parent, nodeDepth, tokenStartOffset, tokenOffset, tokenLength);
+    this.children = Lists.newArrayList();
+  }
+
+  /*
+   * Sub-constructor for initializing all fields without allocating a new object.  Used by the
+   * regular constructor.
+   */
+  public void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth,
+      int tokenStartOffset, int tokenOffset, int tokenLength) {
+    this.builder = builder;
+    this.id = builder.nextNodeId();
+    this.parent = parent;
+    this.nodeDepth = nodeDepth;
+    builder.submitMaxNodeDepthCandidate(nodeDepth);
+    this.tokenStartOffset = tokenStartOffset;
+    this.token.set(builder.tokens, tokenOffset, tokenLength);
+    this.numOccurrences = 1;
+  }
+
+  /*
+   * Clear the state of this node so that it looks like it was just allocated.
+   */
+  public void reset() {
+    builder = null;
+    parent = null;
+    nodeDepth = 0;
+    tokenStartOffset = 0;
+    token.clear();
+    numOccurrences = 0;
+    children.clear();// branches & nubs
+
+    // ids/offsets. used during writing to byte[]
+    id = 0;
+    firstInsertionIndex = -1;// set >=0 for nubs and leaves
+    negativeIndex = 0;
+    outputArrayOffset = -1;
+  }
+
+
+  /************************* building *********************************/
+
+  /*
+   * <li>Only public method used during the tokenization process
+   * <li>Requires that the input ByteRange sort after the previous, and therefore after all previous
+   * inputs
+   * <li>Only looks at bytes of the input array that align with this node's token
+   */
+  public void addSorted(final ByteRange bytes) {// recursively build the tree
+
+    /*
+     * Recurse deeper into the existing trie structure
+     */
+    if (matchesToken(bytes) && CollectionUtils.notEmpty(children)) {
+      TokenizerNode lastChild = CollectionUtils.getLast(children);
+      if (lastChild.partiallyMatchesToken(bytes)) {
+        lastChild.addSorted(bytes);
+        return;
+      }
+    }
+
+    /*
+     * Recursion ended.  We must either
+     * <li>1: increment numOccurrences if this input was equal to the previous
+     * <li>2: convert this node from a leaf to a nub, and add a new child leaf
+     * <li>3: split this node into a branch and leaf, and then add a second leaf
+     */
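+    /*
+     * Illustrative walkthrough (sketch), using the example inputs from the class javadoc: after
+     * AAB has been added three times, adding AAB again is case 1; adding AABQQ is case 2 (the
+     * AAB leaf becomes a nub with a QQ child); adding AABQX after that would be case 3 at the QQ
+     * leaf, splitting it into a Q branch with Q and X leaf children.
+     */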
+
+    // add it as a child of this node
+    int numIdenticalTokenBytes = numIdenticalBytes(bytes);// should be <= token.length
+    int tailOffset = tokenStartOffset + numIdenticalTokenBytes;
+    int tailLength = bytes.getLength() - tailOffset;
+
+    if (numIdenticalTokenBytes == token.getLength()) {
+      if (tailLength == 0) {// identical to this node (case 1)
+        incrementNumOccurrences(1);
+      } else {// identical to this node, but with a few extra tailing bytes. (leaf -> nub) (case 2)
+        int childNodeDepth = nodeDepth + 1;
+        int childTokenStartOffset = tokenStartOffset + numIdenticalTokenBytes;
+        TokenizerNode newChildNode = builder.addNode(this, childNodeDepth, childTokenStartOffset,
+          bytes, tailOffset);
+        addChild(newChildNode);
+      }
+    } else {
+      // token only partially matched (0 < numIdenticalTokenBytes < token.length), so split this
+      // node into branch/leaf and then add a second leaf (case 3)
+      split(numIdenticalTokenBytes, bytes);
+    }
+  }
+
+
+  protected void addChild(TokenizerNode node) {
+    node.setParent(this);
+    children.add(node);
+  }
+
+
+  /**
+   * Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the
+   * method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The output
+   * will be 3 nodes:<br/>
+   * <li>1: B <- branch
+   * <li>2: AA <- leaf
+   * <li>3: OO <- leaf
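+ * <br/><br/>
+ * Resulting field values (sketch): B keeps tokenStartOffset=0 and becomes a branch
+ * (numOccurrences=0); AA inherits the old numOccurrences; both AA and OO get tokenStartOffset=1
+ * and a nodeDepth one greater than B's.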
+   *
+   * @param numTokenBytesToRetain => 1 (the B)
+   * @param bytes => BOO
+   */
+  protected void split(int numTokenBytesToRetain, final ByteRange bytes) {
+    int childNodeDepth = nodeDepth;
+    int childTokenStartOffset = tokenStartOffset + numTokenBytesToRetain;
+
+    //create leaf AA
+    TokenizerNode firstChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
+      token, numTokenBytesToRetain);
+    firstChild.setNumOccurrences(numOccurrences);// do before clearing this node's numOccurrences
+    token.setLength(numTokenBytesToRetain);//shorten current token from BAA to B
+    numOccurrences = 0;//current node is now a branch
+
+    moveChildrenToDifferentParent(firstChild);//move this node's children under the new leaf (AA)
+    addChild(firstChild);//add the new leaf (AA) to the branch's (B's) children
+
+    //create leaf OO
+    TokenizerNode secondChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
+      bytes, tokenStartOffset + numTokenBytesToRetain);
+    addChild(secondChild);//add the new leaf (OO) to the branch's (B's) children
+
+    // we inserted branch node B as a new level above/before the two children, so increment the
+    // depths of the children below
+    firstChild.incrementNodeDepthRecursively();
+    secondChild.incrementNodeDepthRecursively();
+  }
+
+
+  protected void incrementNodeDepthRecursively() {
+    ++nodeDepth;
+    builder.submitMaxNodeDepthCandidate(nodeDepth);
+    for (int i = 0; i < children.size(); ++i) {
+      children.get(i).incrementNodeDepthRecursively();
+    }
+  }
+
+
+  protected void moveChildrenToDifferentParent(TokenizerNode newParent) {
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      child.setParent(newParent);
+      newParent.children.add(child);
+    }
+    children.clear();
+  }
+
+
+  /************************ byte[] utils *************************/
+
+  protected boolean partiallyMatchesToken(ByteRange bytes) {
+    return numIdenticalBytes(bytes) > 0;
+  }
+
+  protected boolean matchesToken(ByteRange bytes) {
+    return numIdenticalBytes(bytes) == getTokenLength();
+  }
+
+  protected int numIdenticalBytes(ByteRange bytes) {
+    return token.numEqualPrefixBytes(bytes, tokenStartOffset);
+  }
+
+
+  /***************** moving nodes around ************************/
+
+  public void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves,
+      boolean includeLeaves) {
+    if (includeNonLeaves && !isLeaf() || includeLeaves && isLeaf()) {
+      appendTo.add(this);
+    }
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      child.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
+    }
+  }
+
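+  /*
+   * Assigns each nub and leaf the insertion index of its first occurrence among the sorted
+   * inputs, in depth-first order, and returns the next unused index. Using the class javadoc
+   * example (sketch): calling setInsertionIndexes(0) on the root assigns A index 0, B index 2,
+   * QQ index 5, and returns 7.
+   */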
+  public int setInsertionIndexes(int nextIndex) {
+    int newNextIndex = nextIndex;
+    if (hasOccurrences()) {
+      setFirstInsertionIndex(nextIndex);
+      newNextIndex += numOccurrences;
+    }
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      newNextIndex = child.setInsertionIndexes(newNextIndex);
+    }
+    return newNextIndex;
+  }
+
+  public void appendOutputArrayOffsets(List<Integer> offsets) {
+    if (hasOccurrences()) {
+      offsets.add(outputArrayOffset);
+    }
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      child.appendOutputArrayOffsets(offsets);
+    }
+  }
+
+
+  /***************** searching *********************************/
+
+  /*
+   * Do a trie style search through the tokenizer.  One option for looking up families or
+   * qualifiers during encoding, but currently unused in favor of tracking this information as the
+   * inputs are added.
+   *
+   * Keeping this code pending further performance testing.
+   */
+  public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
+      int keyLength) {
+    int thisNodeDepthPlusLength = tokenStartOffset + token.getLength();
+
+    // quick check: at a leaf, a key longer than this node's coverage cannot match (this shortcut
+    // may not work for binary search)
+    if (CollectionUtils.isEmpty(children)) {
+      if (thisNodeDepthPlusLength < keyLength) {// the trie ran out of bytes before the key did
+        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
+        return;
+      }
+    }
+
+    // all token bytes must match
+    for (int i = 0; i < token.getLength(); ++i) {
+      if (key[tokenStartOffset + keyOffset + i] != token.get(i)) {
+        // TODO return whether it's before or after so we can binary search
+        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
+        return;
+      }
+    }
+
+    if (thisNodeDepthPlusLength == keyLength && numOccurrences > 0) {
+      resultHolder.set(TokenizerRowSearchPosition.MATCH, this);// MATCH
+      return;
+    }
+
+    if (CollectionUtils.notEmpty(children)) {
+      // TODO binary search the children
+      for (int i = 0; i < children.size(); ++i) {
+        TokenizerNode child = children.get(i);
+        child.getNode(resultHolder, key, keyOffset, keyLength);
+        if (resultHolder.isMatch()) {
+          return;
+        } else if (resultHolder.getDifference() == TokenizerRowSearchPosition.BEFORE) {
+          // passed it, so it doesn't exist
+          resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
+          return;
+        }
+        // key is still AFTER the current node, so continue searching
+      }
+    }
+
+    // checked all children (or there were no children), and didn't find it
+    resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
+    return;
+  }
+
+
+  /****************** writing back to byte[]'s *************************/
+
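+  /*
+   * Reconstructs the full input byte[] represented by this node by filling in its token bytes and
+   * recursing up through its parents. Using the class javadoc example (sketch), calling this on
+   * the QQ leaf returns the bytes of "AABQQ".
+   */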
+  public byte[] getNewByteArray() {
+    byte[] arrayToFill = new byte[tokenStartOffset + token.getLength()];
+    fillInBytes(arrayToFill);
+    return arrayToFill;
+  }
+
+  public void fillInBytes(byte[] arrayToFill) {
+    for (int i = 0; i < token.getLength(); ++i) {
+      arrayToFill[tokenStartOffset + i] = token.get(i);
+    }
+    if (parent != null) {
+      parent.fillInBytes(arrayToFill);
+    }
+  }
+
+
+  /************************** printing ***********************/
+
+  @Override
+  public String toString() {
+    String s = "";
+    if (parent == null) {
+      s += "R ";
+    } else {
+      s += getBnlIndicator(false) + " " + Bytes.toString(parent.getNewByteArray());
+    }
+    s += "[" + Bytes.toString(token.deepCopyToNewArray()) + "]";
+    if (numOccurrences > 0) {
+      s += "x" + numOccurrences;
+    }
+    return s;
+  }
+
+  public String getPaddedTokenAndOccurrenceString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getBnlIndicator(true));
+    sb.append(Strings.padFront(numOccurrences + "", ' ', 3));
+    sb.append(Strings.padFront(nodeDepth + "", ' ', 3));
+    if (outputArrayOffset >= 0) {
+      sb.append(Strings.padFront(outputArrayOffset + "", ' ', 3));
+    }
+    sb.append("  ");
+    for (int i = 0; i < tokenStartOffset; ++i) {
+      sb.append(" ");
+    }
+    sb.append(Bytes.toString(token.deepCopyToNewArray()).replaceAll(" ", "_"));
+    return sb.toString();
+  }
+
+  public String getBnlIndicator(boolean indent) {
+    if (indent) {
+      if (isNub()) {
+        return " N ";
+      }
+      return isBranch() ? "B  " : "  L";
+    }
+    if (isNub()) {
+      return "N";
+    }
+    return isBranch() ? "B" : "L";
+  }
+
+
+  /********************** count different node types ********************/
+
+  public int getNumBranchNodesIncludingThisNode() {
+    if (isLeaf()) {
+      return 0;
+    }
+    int totalFromThisPlusChildren = isBranch() ? 1 : 0;
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      totalFromThisPlusChildren += child.getNumBranchNodesIncludingThisNode();
+    }
+    return totalFromThisPlusChildren;
+  }
+
+  public int getNumNubNodesIncludingThisNode() {
+    if (isLeaf()) {
+      return 0;
+    }
+    int totalFromThisPlusChildren = isNub() ? 1 : 0;
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      totalFromThisPlusChildren += child.getNumNubNodesIncludingThisNode();
+    }
+    return totalFromThisPlusChildren;
+  }
+
+  public int getNumLeafNodesIncludingThisNode() {
+    if (isLeaf()) {
+      return 1;
+    }
+    int totalFromChildren = 0;
+    for (int i = 0; i < children.size(); ++i) {
+      TokenizerNode child = children.get(i);
+      totalFromChildren += child.getNumLeafNodesIncludingThisNode();
+    }
+    return totalFromChildren;
+  }
+
+
+  /*********************** simple read-only methods *******************************/
+
+  public int getNodeDepth() {
+    return nodeDepth;
+  }
+
+  public int getTokenLength() {
+    return token.getLength();
+  }
+
+  public boolean hasOccurrences() {
+    return numOccurrences > 0;
+  }
+
+  public boolean isRoot() {
+    return this.parent == null;
+  }
+
+  public int getNumChildren() {
+    return CollectionUtils.nullSafeSize(children);
+  }
+
+  public TokenizerNode getLastChild() {
+    if (CollectionUtils.isEmpty(children)) {
+      return null;
+    }
+    return CollectionUtils.getLast(children);
+  }
+
+  public boolean isLeaf() {
+    return CollectionUtils.isEmpty(children) && hasOccurrences();
+  }
+
+  public boolean isBranch() {
+    return CollectionUtils.notEmpty(children) && !hasOccurrences();
+  }
+
+  public boolean isNub() {
+    return CollectionUtils.notEmpty(children) && hasOccurrences();
+  }
+
+
+  /********************** simple mutation methods *************************/
+
+  /**
+   * Each occurrence beyond the first indicates a repeat of the previous entry.  An external class
+   * can call this directly, without going through repeat detection, when some external mechanism
+   * already knows the input is a repeat.  PtEncoder uses this when adding cells to a row if it
+   * knows the new cells are part of the current row.
+   * @param d increment by this amount
+   */
+  public void incrementNumOccurrences(int d) {
+    numOccurrences += d;
+  }
+
+
+  /************************* autogenerated get/set ******************/
+
+  public int getTokenOffset() {
+    return tokenStartOffset;
+  }
+
+  public TokenizerNode getParent() {
+    return parent;
+  }
+
+  public ByteRange getToken() {
+    return token;
+  }
+
+  public int getNumOccurrences() {
+    return numOccurrences;
+  }
+
+  public void setParent(TokenizerNode parent) {
+    this.parent = parent;
+  }
+
+  public void setNumOccurrences(int numOccurrences) {
+    this.numOccurrences = numOccurrences;
+  }
+
+  public ArrayList<TokenizerNode> getChildren() {
+    return children;
+  }
+
+  public long getId() {
+    return id;
+  }
+
+  public int getFirstInsertionIndex() {
+    return firstInsertionIndex;
+  }
+
+  public void setFirstInsertionIndex(int firstInsertionIndex) {
+    this.firstInsertionIndex = firstInsertionIndex;
+  }
+
+  public int getNegativeIndex() {
+    return negativeIndex;
+  }
+
+  public void setNegativeIndex(int negativeIndex) {
+    this.negativeIndex = negativeIndex;
+  }
+
+  public int getOutputArrayOffset() {
+    return outputArrayOffset;
+  }
+
+  public void setOutputArrayOffset(int outputArrayOffset) {
+    this.outputArrayOffset = outputArrayOffset;
+  }
+
+  public void setId(long id) {
+    this.id = id;
+  }
+
+  public void setBuilder(Tokenizer builder) {
+    this.builder = builder;
+  }
+
+  public void setTokenOffset(int tokenOffset) {
+    this.tokenStartOffset = tokenOffset;
+  }
+
+  public void setToken(ByteRange token) {
+    this.token = token;
+  }
+
+}

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchPosition.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchPosition.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchPosition.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchPosition.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.tokenize;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+
+/**
+ * Warning: currently unused, but the code is valid.  Pending performance testing on more data
+ * sets.
+ * <p/>
+ * Indicates where the key is relative to our current position in the tree.  For example, the
+ * current tree node is "BEFORE" the key we are seeking.
+ */
+@InterfaceAudience.Private
+public enum TokenizerRowSearchPosition {
+
+  AFTER,//the key is after this tree node, so keep searching
+  BEFORE,//in a binary search, this tells us to back up
+  MATCH,//the current node is a full match
+  NO_MATCH,//might as well return a value more informative than null
+
+}

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchResult.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchResult.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchResult.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/encode/tokenize/TokenizerRowSearchResult.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.encode.tokenize;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+
+/**
+ * Holder for the result of recursively searching a PtBuilder.
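+ * <p/>
+ * Intended call pattern (sketch; "root" is a hypothetical root TokenizerNode):
+ * <pre>
+ * TokenizerRowSearchResult holder = new TokenizerRowSearchResult();
+ * root.getNode(holder, key, 0, key.length);
+ * if (holder.isMatch()) {
+ *   TokenizerNode node = holder.getMatchingNode();
+ * }
+ * </pre>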
+ */
+@InterfaceAudience.Private
+public class TokenizerRowSearchResult{
+
+  /************ fields ************************/
+
+  protected TokenizerRowSearchPosition difference;
+  protected TokenizerNode matchingNode;
+
+
+  /*************** construct *****************/
+
+  public TokenizerRowSearchResult() {
+  }
+
+  public TokenizerRowSearchResult(TokenizerRowSearchPosition difference) {
+    this.difference = difference;
+  }
+
+  public TokenizerRowSearchResult(TokenizerNode matchingNode) {
+    this.difference = TokenizerRowSearchPosition.MATCH;
+    this.matchingNode = matchingNode;
+  }
+
+
+  /*************** methods **********************/
+
+  public boolean isMatch() {
+    return TokenizerRowSearchPosition.MATCH == difference;
+  }
+
+
+  /************* get/set ***************************/
+
+  public TokenizerRowSearchPosition getDifference() {
+    return difference;
+  }
+
+  public TokenizerNode getMatchingNode() {
+    return matchingNode;
+  }
+
+  public void set(TokenizerRowSearchPosition difference, TokenizerNode matchingNode) {
+    this.difference = difference;
+    this.matchingNode = matchingNode;
+  }
+
+}

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellScanner.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellScanner.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellScanner.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.scanner;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hbase.Cell;
+
+/**
+ * Alternate name may be CellInputStream
+ * <p/>
+ * An interface for iterating through a sequence of cells. Similar to Java's Iterator, but without
+ * the hasNext() or remove() methods. The hasNext() method is problematic because it may require
+ * actually loading the next object, which in turn requires storing the previous object somewhere.
+ * The core data block decoder should be as fast as possible, so we push the complexity and
+ * performance expense of concurrently tracking multiple cells to layers above the CellScanner.
+ * <p/>
+ * The getCurrent() method will return a reference to a Cell implementation.  This reference may
+ * or may not point to a reusable cell implementation, so users of the CellScanner should not, for
+ * example, accumulate a List of Cells.  All of the references may point to the same object, which
+ * would be the latest state of the underlying Cell.  In short, the Cell is mutable.
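+ * <p/>
+ * A caller that must retain cells should therefore copy them first (sketch; the deepCopy helper
+ * is hypothetical, not part of this interface):
+ * <pre>
+ * while (scanner.next()) {
+ *   cellList.add(deepCopy(scanner.getCurrent()));// copy the mutable cell before storing it
+ * }
+ * </pre>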
+ * <p/>
+ * At a minimum, an implementation will need to be able to advance from one cell to the next in a
+ * LinkedList fashion.  Convenience methods such as nextQualifier(), nextFamily(), and nextRow()
+ * (not declared here) could all be implemented by calling next(); however, if the
+ * DataBlockEncoding supports random access into the block, then it may provide smarter versions
+ * of these methods.
+ * <p/>
+ * Typical usage:
+ * 
+ * <pre>
+ * while (scanner.next()) {
+ *   Cell cell = scanner.getCurrent();
+ *   // do something
+ * }
+ * </pre>
+ */
+@InterfaceAudience.Private
+public interface CellScanner{
+
+  /**
+   * Reset any state in the scanner so it appears it was freshly opened.
+   */
+  void resetToBeforeFirstEntry();
+
+  /**
+   * @return the current Cell which may be mutable
+   */
+  Cell getCurrent();
+
+  /**
+   * Advance the scanner 1 cell.
+   * @return true if the next cell is found and getCurrent() will return a valid Cell
+   */
+  boolean next();
+
+}

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellSearcher.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellSearcher.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellSearcher.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/CellSearcher.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.scanner;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hbase.Cell;
+import org.apache.hbase.cell.CellScannerPosition;
+
+/**
+ * Methods for seeking to a random {@link Cell} inside a sorted collection of cells. Indicates that
+ * the implementation is able to navigate between cells without iterating through every cell.
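+ * <p/>
+ * A sketch of one intended call pattern, using only methods and return values documented below:
+ * <pre>
+ * CellScannerPosition p = searcher.positionAtOrAfter(key);
+ * if (p == CellScannerPosition.AT || p == CellScannerPosition.AFTER) {
+ *   Cell cell = searcher.getCurrent();// the exact key, or the closest cell after it
+ * }
+ * </pre>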
+ */
+@InterfaceAudience.Private
+public interface CellSearcher extends ReversibleCellScanner {
+
+  /**
+   * Do everything within this scanner's power to find the key. Look forward and backwards.
+   * <p/>
+   * Abort as soon as we know it can't be found, possibly leaving the Searcher in an invalid state.
+   * <p/>
+   * @param key position the CellScanner exactly on this key
+   * @return true if the cell existed and getCurrent() holds a valid cell
+   */
+  boolean positionAt(Cell key);
+
+  /**
+   * Same as positionAt(..), but go to the extra effort of finding the previous key if there's no
+   * exact match.
+   * <p/>
+   * @param key position the CellScanner on this key or the closest cell before
+   * @return AT if exact match<br/>
+   *         BEFORE if on last cell before key<br/>
+   *         BEFORE_FIRST if key was before the first cell in this scanner's scope
+   */
+  CellScannerPosition positionAtOrBefore(Cell key);
+
+  /**
+   * Same as positionAt(..), but go to the extra effort of finding the next key if there's no exact
+   * match.
+   * <p/>
+   * @param key position the CellScanner on this key or the closest cell after
+   * @return AT if exact match<br/>
+   *         AFTER if on first cell after key<br/>
+   *         AFTER_LAST if key was after the last cell in this scanner's scope
+   */
+  CellScannerPosition positionAtOrAfter(Cell key);
+
+  /**
+   * Note: Added for backwards compatibility with 
+   * {@link org.apache.hadoop.hbase.regionserver.KeyValueScanner#reseek}
+   * <p/>
+   * Look for the key, but only look after the current position. Probably not needed for an
+   * efficient tree implementation, but is important for implementations without random access such
+   * as unencoded KeyValue blocks.
+   * <p/>
+   * @param key position the CellScanner exactly on this key
+   * @return true if getCurrent() holds a valid cell
+   */
+  boolean seekForwardTo(Cell key);
+
+  /**
+   * Same as seekForwardTo(..), but go to the extra effort of finding the previous key if there's
+   * no exact match.
+   * <p/>
+   * @param key position the CellScanner on this key or the closest cell before
+   * @return AT if exact match<br/>
+   *         BEFORE if on last cell before key<br/>
+   *         BEFORE_FIRST if key was before the first cell in this scanner's scope
+   */
+  CellScannerPosition seekForwardToOrBefore(Cell key);
+
+  /**
+   * Same as seekForwardTo(..), but go to the extra effort of finding the next key if there's no
+   * exact match.
+   * <p/>
+   * @param key position the CellScanner on this key or the closest cell after
+   * @return AT if exact match<br/>
+   *         AFTER if on first cell after key<br/>
+   *         AFTER_LAST if key was after the last cell in this scanner's scope
+   */
+  CellScannerPosition seekForwardToOrAfter(Cell key);
+
+  /**
+   * Note: This may not be appropriate to have in the interface.  Need to investigate.
+   * <p/>
+   * Position the scanner in an invalid state after the last cell: CellScannerPosition.AFTER_LAST.
+   * This is used by tests and for handling certain edge cases.
+   */
+  void positionAfterLastCell();
+
+}

Added: hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/ReversibleCellScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/ReversibleCellScanner.java?rev=1443289&view=auto
==============================================================================
--- hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/ReversibleCellScanner.java (added)
+++ hbase/trunk/hbase-prefix-tree/src/main/java/org/apache/hbase/codec/prefixtree/scanner/ReversibleCellScanner.java Thu Feb  7 00:36:24 2013
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hbase.codec.prefixtree.scanner;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * An extension of CellScanner indicating the scanner supports iterating backwards through cells.
+ * <p>
+ * Note: This was not added to suggest that HBase should support client-facing reverse Scanners, but
+ * because some {@link CellSearcher} implementations, namely PrefixTree, need a method of backing up
+ * if the positionAt(..) method goes past the requested cell.
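+ * <p>
+ * For example (sketch), an implementation whose internal seek lands one cell past the requested
+ * cell can recover with:
+ * <pre>
+ * if (wentPastRequestedCell) {// hypothetical condition tracked by the implementation
+ *   scanner.previous();// back up to the last cell before the requested one
+ * }
+ * </pre>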
+ */
+@InterfaceAudience.Private
+public interface ReversibleCellScanner extends CellScanner {
+
+  /**
+   * Try to position the scanner one Cell before the current position.
+   * @return true if the operation was successful, meaning getCurrent() will return a valid
+   *         Cell.<br/>
+   *         false if there were no previous cells, meaning getCurrent() will return null.
+   *         Scanner position will be {@link org.apache.hbase.cell.CellScannerPosition#BEFORE_FIRST}
+   */
+  boolean previous();
+
+  /**
+   * Try to position the scanner in the row before the current row.
+   * @param endOfRow true to position on the last cell in the previous row; false for the first
+   *          cell in the previous row
+   * @return true if the operation was successful, meaning getCurrent() will return a valid
+   *         Cell.<br/>
+   *         false if there were no previous cells, meaning getCurrent() will return null.
+   *         Scanner position will be {@link org.apache.hbase.cell.CellScannerPosition#BEFORE_FIRST}
+   */
+  boolean previousRow(boolean endOfRow);
+
+}


