hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apurt...@apache.org
Subject hbase git commit: HBASE-15357 TableInputFormatBase getSplitKey does not handle signed bytes correctly (Nathan Schile)
Date Wed, 04 May 2016 23:58:29 GMT
Repository: hbase
Updated Branches:
  refs/heads/0.98 a2bc1a254 -> 176bd1cd3


HBASE-15357 TableInputFormatBase getSplitKey does not handle signed bytes correctly (Nathan Schile)

Conflicts:
	hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/176bd1cd
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/176bd1cd
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/176bd1cd

Branch: refs/heads/0.98
Commit: 176bd1cd3c4d2a271dbfb81f411364691712c2b7
Parents: a2bc1a2
Author: tedyu <yuzhihong@gmail.com>
Authored: Fri Apr 29 20:55:47 2016 -0700
Committer: Andrew Purtell <apurtell@apache.org>
Committed: Wed May 4 16:15:21 2016 -0700

----------------------------------------------------------------------
 .../hbase/mapreduce/TableInputFormatBase.java   | 115 +++++++++----------
 .../mapreduce/TestTableInputFormatScan1.java    |  39 ++++---
 2 files changed, 79 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/176bd1cd/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
index 5b27383..55365af 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java
@@ -23,6 +23,7 @@ import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 
@@ -35,6 +36,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.classification.InterfaceStability;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.Scan;
@@ -304,20 +306,26 @@ extends InputFormat<ImmutableBytesWritable, Result> {
     int count = 0;
     while (count < list.size()) {
       TableSplit ts = (TableSplit)list.get(count);
+      TableName tableName = ts.getTable();
       String regionLocation = ts.getRegionLocation();
       long regionSize = ts.getLength();
       if (regionSize >= dataSkewThreshold) {
         // if the current region size is large than the data skew threshold,
         // split the region into two MapReduce input splits.
         byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
-         //Set the size of child TableSplit as 1/2 of the region size. The exact size of the
-         // MapReduce input splits is not far off.
-        TableSplit t1 = new TableSplit(table.getName(), ts.getStartRow(), splitKey, regionLocation,
-                regionSize / 2);
-        TableSplit t2 = new TableSplit(table.getName(), splitKey, ts.getEndRow(), regionLocation,
-                regionSize - regionSize / 2);
-        resultList.add(t1);
-        resultList.add(t2);
+        if (Arrays.equals(ts.getEndRow(), splitKey)) {
+          // Not splitting since the end key is the same as the split key
+          resultList.add(ts);
+        } else {
+          //Set the size of child TableSplit as 1/2 of the region size. The exact size of the
+          // MapReduce input splits is not far off.
+          TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey,
+              regionLocation, regionSize / 2);
+          TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(), regionLocation,
+              regionSize - regionSize / 2);
+          resultList.add(t1);
+          resultList.add(t2);
+        }
         count++;
       } else if (regionSize >= average) {
         // if the region size between average size and data skew threshold size,
@@ -353,11 +361,40 @@ extends InputFormat<ImmutableBytesWritable, Result> {
    * select a split point in the region. The selection of the split point is based on an uniform
    * distribution assumption for the keys in a region.
    * Here are some examples:
-   * startKey: aaabcdefg  endKey: aaafff    split point: aaad
-   * startKey: 111000  endKey: 1125790    split point: 111b
-   * startKey: 1110  endKey: 1120    split point: 111_
-   * startKey: binary key { 13, -19, 126, 127 }, endKey: binary key { 13, -19, 127, 0 },
-   * split point: binary key { 13, -19, 127, -64 }
+   *
+   * <table>
+   *   <tr>
+   *     <th>start key</th>
+   *     <th>end key</th>
+   *     <th>is text</th>
+   *     <th>split point</th>
+   *   </tr>
+   *   <tr>
+   *     <td>'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'</td>
+   *     <td>'a', 'a', 'a', 'f', 'f', 'f'</td>
+   *     <td>true</td>
+   *     <td>'a', 'a', 'a', 'd', 'd', -78, 50, -77, 51</td>
+   *   </tr>
+   *   <tr>
+   *     <td>'1', '1', '1', '0', '0', '0'</td>
+   *     <td>'1', '1', '2', '5', '7', '9', '0'</td>
+   *     <td>true</td>
+   *     <td>'1', '1', '1', -78, -77, -76, -104</td>
+   *   </tr>
+   *   <tr>
+   *     <td>'1', '1', '1', '0'</td>
+   *     <td>'1', '1', '2', '0'</td>
+   *     <td>true</td>
+   *     <td>'1', '1', '1', -80</td>
+   *   </tr>
+   *   <tr>
+   *     <td>13, -19, 126, 127</td>
+   *     <td>13, -19, 127, 0</td>
+   *     <td>false</td>
+   *     <td>13, -19, 126, -65</td>
+   *   </tr>
+   * </table>
+   *
    * Set this function as "public static", make it easier for test.
    *
    * @param start Start key of the region
@@ -375,8 +412,8 @@ extends InputFormat<ImmutableBytesWritable, Result> {
       upperLimitByte = '~';
       lowerLimitByte = ' ';
     } else {
-      upperLimitByte = Byte.MAX_VALUE;
-      lowerLimitByte = Byte.MIN_VALUE;
+      upperLimitByte = -1;
+      lowerLimitByte = 0;
     }
     // For special case
     // Example 1 : startkey=null, endkey="hhhqqqwww", splitKey="h"
@@ -395,52 +432,8 @@ extends InputFormat<ImmutableBytesWritable, Result> {
       }
       return result;
     }
-    // A list to store bytes in split key
-    List<Byte> resultBytesList = new ArrayList<Byte>();
-    int maxLength = start.length > end.length ? start.length : end.length;
-    for (int i = 0; i < maxLength; i++) {
-      //calculate the midpoint byte between the first difference
-      //for example: "11ae" and "11chw", the midpoint is "11b"
-      //another example: "11ae" and "11bhw", the first different byte is 'a' and 'b',
-      // there is no midpoint between 'a' and 'b', so we need to check the next byte.
-      if (start[i] == end[i]) {
-        resultBytesList.add(start[i]);
-        //For special case like: startKey="aaa", endKey="aaaz", splitKey="aaaM"
-        if (i + 1 == start.length) {
-          resultBytesList.add((byte) ((lowerLimitByte + end[i + 1]) / 2));
-          break;
-        }
-      } else {
-        //if the two bytes differ by 1, like ['a','b'], We need to check the next byte to find
-        // the midpoint.
-        if ((int)end[i] - (int)start[i] == 1) {
-          //get next byte after the first difference
-          byte startNextByte = (i + 1 < start.length) ? start[i + 1] : lowerLimitByte;
-          byte endNextByte = (i + 1 < end.length) ? end[i + 1] : lowerLimitByte;
-          int byteRange = (upperLimitByte - startNextByte) + (endNextByte - lowerLimitByte) + 1;
-          int halfRange = byteRange / 2;
-          if ((int)startNextByte + halfRange > (int)upperLimitByte) {
-            resultBytesList.add(end[i]);
-            resultBytesList.add((byte) (startNextByte + halfRange - upperLimitByte +
-                    lowerLimitByte));
-          } else {
-            resultBytesList.add(start[i]);
-            resultBytesList.add((byte) (startNextByte + halfRange));
-          }
-        } else {
-          //calculate the midpoint key by the fist different byte (normal case),
-          // like "11ae" and "11chw", the midpoint is "11b"
-          resultBytesList.add((byte) ((start[i] + end[i]) / 2));
-        }
-        break;
-      }
-    }
-    //transform the List of bytes to byte[]
-    byte result[] = new byte[resultBytesList.size()];
-    for (int k = 0; k < resultBytesList.size(); k++) {
-      result[k] = (byte) resultBytesList.get(k);
-    }
-    return result;
+
+    return Bytes.split(start, end, false, 1)[1];
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/176bd1cd/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java
index 143b70c..e29bcb7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java
@@ -127,48 +127,45 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase {
   @Test
   public void testGetSplitsPoint() throws IOException, InterruptedException,
   ClassNotFoundException {
-    // Test Case 1: "aaabcdef" and "aaaff", split point is "aaad".
     byte[] start1 = { 'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f' };
     byte[] end1 = { 'a', 'a', 'a', 'f', 'f' };
-    byte[] splitPoint1 = { 'a', 'a', 'a', 'd' };
+    byte[] splitPoint1 = { 'a', 'a', 'a', 'd', 'd', -78, 50, -77  };
     testGetSplitKey(start1, end1, splitPoint1, true);
 
-    // Test Case 2: "111000" and "1125790", split point is "111b".
     byte[] start2 = { '1', '1', '1', '0', '0', '0' };
     byte[] end2 = { '1', '1', '2', '5', '7', '9', '0' };
-    byte[] splitPoint2 = { '1', '1', '1', 'b' };
+    byte[] splitPoint2 = { '1', '1', '1',  -78, -77, -76, -104 };
     testGetSplitKey(start2, end2, splitPoint2, true);
 
-    // Test Case 3: "aaaaaa" and "aab", split point is "aaap".
     byte[] start3 = { 'a', 'a', 'a', 'a', 'a', 'a' };
     byte[] end3 = { 'a', 'a', 'b' };
-    byte[] splitPoint3 = { 'a', 'a', 'a', 'p' };
+    byte[] splitPoint3 = { 'a', 'a', 'a', -80, -80, -80 };
     testGetSplitKey(start3, end3, splitPoint3, true);
 
-    // Test Case 4: "aaa" and "aaaz", split point is "aaaM".
     byte[] start4 = { 'a', 'a', 'a' };
     byte[] end4 = { 'a', 'a', 'a', 'z' };
-    byte[] splitPoint4 = { 'a', 'a', 'a', 'M' };
+    byte[] splitPoint4 = { 'a', 'a', 'a', '=' };
     testGetSplitKey(start4, end4, splitPoint4, true);
 
-    // Test Case 5: "aaa" and "aaba", split point is "aaap".
     byte[] start5 = { 'a', 'a', 'a' };
     byte[] end5 = { 'a', 'a', 'b', 'a' };
-    byte[] splitPoint5 = { 'a', 'a', 'a', 'p' };
+    byte[] splitPoint5 = { 'a', 'a', 'a', -80 };
     testGetSplitKey(start5, end5, splitPoint5, true);
 
     // Test Case 6: empty key and "hhhqqqwww", split point is "h"
     byte[] start6 = {};
     byte[] end6 = { 'h', 'h', 'h', 'q', 'q', 'q', 'w', 'w' };
-    byte[] splitPoint6 = { 'h' };
-    testGetSplitKey(start6, end6, splitPoint6, true);
+    byte[] splitPointText6 = { 'h' };
+    byte[] splitPointBinary6 = { 104 };
+    testGetSplitKey(start6, end6, splitPointText6, true);
+    testGetSplitKey(start6, end6, splitPointBinary6, false);
 
     // Test Case 7: "ffffaaa" and empty key, split point depends on the mode we choose(text key or
     // binary key).
     byte[] start7 = { 'f', 'f', 'f', 'f', 'a', 'a', 'a' };
     byte[] end7 = {};
     byte[] splitPointText7 = { 'f', '~', '~', '~', '~', '~', '~'  };
-    byte[] splitPointBinary7 = { 'f', 127, 127, 127, 127, 127, 127  };
+    byte[] splitPointBinary7 = { 'f', -1, -1, -1, -1, -1, -1  };
     testGetSplitKey(start7, end7, splitPointText7, true);
     testGetSplitKey(start7, end7, splitPointBinary7, false);
 
@@ -184,7 +181,21 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase {
     // Test Case 9: Binary Key example
     byte[] start9 = { 13, -19, 126, 127 };
     byte[] end9 = { 13, -19, 127, 0 };
-    byte[] splitPoint9 = { 13, -19, 127, -64 };
+    byte[] splitPoint9 = { 13, -19, 126, -65 };
     testGetSplitKey(start9, end9, splitPoint9, false);
+
+    // Test Case 10: Binary key split when the start key is an unsigned byte and the end byte is a
+    // signed byte
+    byte[] start10 = { 'x' };
+    byte[] end10 = { -128 };
+    byte[] splitPoint10 = { '|' };
+    testGetSplitKey(start10, end10, splitPoint10, false);
+
+    // Test Case 11: Binary key split when the start key is an signed byte and the end byte is a
+    // signed byte
+    byte[] start11 = { -100 };
+    byte[] end11 = { -90 };
+    byte[] splitPoint11 = { -95 };
+    testGetSplitKey(start11, end11, splitPoint11, false);
   }
 }


Mime
View raw message