hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ser...@apache.org
Subject [55/70] [abbrv] hive git commit: HIVE-14573: Vectorization: Implement StringExpr::find() (Teddy Choi, reviewed by Gopal V)
Date Tue, 07 Feb 2017 20:59:23 GMT
HIVE-14573: Vectorization: Implement StringExpr::find() (Teddy Choi, reviewed by Gopal V)

Signed-off-by: Gopal V <gopalv@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/fe9a6d53
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/fe9a6d53
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/fe9a6d53

Branch: refs/heads/hive-14535
Commit: fe9a6d5387154cec1c3fa5d82fda570103155656
Parents: ea9e851
Author: Teddy Choi <tchoi@hortonworks.com>
Authored: Mon Feb 6 17:00:52 2017 -0800
Committer: Gopal V <gopalv@apache.org>
Committed: Mon Feb 6 17:02:08 2017 -0800

----------------------------------------------------------------------
 .../vectorization/AbstractExpression.java       | 20 +++++-
 .../vectorization/VectorizedLikeBench.java      | 67 ++++++++++++++++++++
 ...AbstractFilterStringColLikeStringScalar.java | 25 ++------
 .../FilterStringColLikeStringScalar.java        | 11 +---
 .../ql/exec/vector/expressions/StringExpr.java  | 60 ++++++++++++++++++
 .../exec/vector/expressions/TestStringExpr.java | 60 ++++++++++++++++++
 6 files changed, 214 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
index 94af3e0..879b437 100644
--- a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/AbstractExpression.java
@@ -13,6 +13,7 @@
  */
 package org.apache.hive.benchmark.vectorization;
 
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
@@ -35,7 +36,7 @@ import java.util.concurrent.TimeUnit;
 @BenchmarkMode(Mode.AverageTime)
 @Fork(1)
 @State(Scope.Thread)
-@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
 public abstract class AbstractExpression {
   private static final int DEFAULT_ITER_TIME = 1000000;
   protected VectorExpression expression;
@@ -59,6 +60,9 @@ public abstract class AbstractExpression {
   @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
   public void bench() {
     for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+      rowBatch.selectedInUse = false;
+      rowBatch.size = VectorizedRowBatch.DEFAULT_SIZE;
+
       expression.evaluate(rowBatch);
     }
   }
@@ -147,4 +151,18 @@ public abstract class AbstractExpression {
     return columnVector;
   }
 
+  protected BytesColumnVector getBytesColumnVector() {
+    BytesColumnVector columnVector = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    Random random = new Random();
+    int length = 16;
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      columnVector.vector[i] = new byte[length];
+      columnVector.start[i] = 0;
+      columnVector.length[i] = length;
+      for (int j = 0; j < length; j++) {
+        columnVector.vector[i][j] = (byte)(random.nextInt(+ 'c' - 'a' + 1) + 'a');
+      }
+    }
+    return columnVector;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
new file mode 100644
index 0000000..136c01b
--- /dev/null
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/vectorization/VectorizedLikeBench.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.benchmark.vectorization;
+
+import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * This test measures the performance for vectorization.
+ * <p/>
+ * This test uses JMH framework for benchmarking.
+ * You may execute this benchmark tool using JMH command line in different ways:
+ * <p/>
+ * To use the settings shown in the main() function, use:
+ * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ * <p/>
+ * To use the default settings used by JMH, use:
+ * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ * <p/>
+ * To specify different parameters, use:
+ * - This command will use 10 warm-up iterations, 5 test iterations, and 2 forks. And it
will
+ * display the Average Time (avgt) in Microseconds (us)
+ * - Benchmark mode. Available modes are:
+ * [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all]
+ * - Output time unit. Available time units are: [m, s, ms, us, ns].
+ * <p/>
+ * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.vectorization.VectorizedLikeBench
+ * -wi 10 -i 5 -f 2 -bm avgt -tu us
+ */
+@State(Scope.Benchmark)
+public class VectorizedLikeBench {
+  public static class FilterStringColLikeStringScalarBench extends AbstractExpression {
+    @Override
+    public void setup() {
+      rowBatch = buildRowBatch(null, 1, getBytesColumnVector());
+      expression = new FilterStringColLikeStringScalar(0, "%aabb%".getBytes(StandardCharsets.UTF_8));
+    }
+  }
+
+  public static void main(String[] args) throws RunnerException {
+    Options opt = new OptionsBuilder().include(".*" + VectorizedLikeBench.class.getSimpleName()
+
+        ".*").build();
+    new Runner(opt).run();
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
index b49ff39..3208520 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
@@ -21,9 +21,9 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
-import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.StringTokenizer;
@@ -296,16 +296,10 @@ public abstract class AbstractFilterStringColLikeStringScalar extends
VectorExpr
    * Matches the middle of each string to its pattern.
    */
   protected static final class MiddleChecker implements Checker {
-    final byte[] byteSub;
-    final int lenSub;
+    final StringExpr.Finder finder;
 
     MiddleChecker(String pattern) {
-      try {
-        byteSub = pattern.getBytes("UTF-8");
-        lenSub = byteSub.length;
-      } catch (UnsupportedEncodingException e) {
-        throw new RuntimeException(e);
-      }
+      finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
     }
 
     public boolean check(byte[] byteS, int start, int len) {
@@ -316,16 +310,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends
VectorExpr
      * Returns absolute offset of the match
      */
     public int index(byte[] byteS, int start, int len) {
-      if (len < lenSub) {
-        return -1;
-      }
-      int end = start + len - lenSub + 1;
-      for (int i = start; i < end; i++) {
-        if (StringExpr.equal(byteSub, 0, lenSub, byteS, i, lenSub)) {
-          return i;
-        }
-      }
-      return -1;
+      return finder.find(byteS, start, len);
     }
   }
 
@@ -469,7 +454,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends
VectorExpr
     CharBuffer charBuffer;
 
     public FastUTF8Decoder() {
-      decoder = Charset.forName("UTF-8").newDecoder()
+      decoder = StandardCharsets.UTF_8.newDecoder()
           .onMalformedInput(CodingErrorAction.REPLACE)
           .onUnmappableCharacter(CodingErrorAction.REPLACE);
       byteBuffer = ByteBuffer.allocate(4);

http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index ad9f75e..6f58160 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -18,10 +18,9 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
-import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFLike;
-import org.apache.hadoop.io.Text;
 
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.List;
 import java.util.regex.Matcher;
@@ -45,13 +44,9 @@ public class FilterStringColLikeStringScalar extends AbstractFilterStringColLike
     super();
   }
 
-  public FilterStringColLikeStringScalar(int colNum, byte[] likePattern) throws HiveException
{
+  public FilterStringColLikeStringScalar(int colNum, byte[] likePattern) {
     super(colNum, null);
-    try {
-      super.setPattern(new String(likePattern, "UTF-8"));
-    } catch (Exception ex) {
-      throw new HiveException(ex);
-    }
+    super.setPattern(new String(likePattern, StandardCharsets.UTF_8));
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
----------------------------------------------------------------------
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
index 90817a5..f2ae9bc 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -351,4 +351,64 @@ public class StringExpr {
       return Arrays.copyOf(bytes, j);
     }
   }
+
+  /*
+   * Compiles the given pattern with a proper algorithm.
+   */
+  public static Finder compile(byte[] pattern) {
+    return new BoyerMooreHorspool(pattern);
+  }
+
+  /*
+   * A finder finds the first index of its pattern in a given byte array.
+   * Its thread-safety depends on its implementation.
+   */
+  public interface Finder {
+    int find(byte[] input, int start, int len);
+  }
+
+  /*
+   * StringExpr uses Boyer Moore Horspool algorithm to find faster.
+   * It is thread-safe, because it holds final member instances only.
+   * See https://en.wikipedia.org/wiki/Boyer–Moore–Horspool_algorithm .
+   */
+  private static class BoyerMooreHorspool implements Finder {
+    private static final int MAX_BYTE = 0xff;
+    private final long[] shift = new long[MAX_BYTE];
+    private final byte[] pattern;
+    private final int plen;
+
+    public BoyerMooreHorspool(byte[] pattern) {
+      this.pattern = pattern;
+      this.plen = pattern.length;
+      Arrays.fill(shift, plen);
+      for (int i = 0; i < plen - 1; i++) {
+        shift[pattern[i] & MAX_BYTE] = plen - i - 1;
+      }
+    }
+
+    public int find(byte[] input, int start, int len) {
+      if (pattern.length == 0) {
+        return 0;
+      }
+
+      final int end = start + len;
+      int next = start + plen - 1;
+      final int plen = this.plen;
+      final byte[] pattern = this.pattern;
+      while (next < end) {
+        int s_tmp = next;
+        int p_tmp = plen - 1;
+        while (input[s_tmp] == pattern[p_tmp]) {
+          p_tmp--;
+          if (p_tmp < 0) {
+            return s_tmp;
+          }
+          s_tmp--;
+        }
+        next += shift[input[next] & MAX_BYTE];
+      }
+      return -1;
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/fe9a6d53/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
----------------------------------------------------------------------
diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
new file mode 100644
index 0000000..63c210a
--- /dev/null
+++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.*;
+
+public class TestStringExpr {
+  @Test
+  public void test() throws Exception {
+    StringExpr.Finder pattern = compile("pattern");
+    assertNotNull(pattern);
+
+    StringExpr.Finder patternOneChar = compile("g");
+    assertNotNull(patternOneChar);
+
+    StringExpr.Finder patternZero = compile("");
+    assertNotNull(patternZero);
+
+    String input1 = "string that contains a patterN...";
+    String input2 = "string that contains a pattern...";
+    String input3 = "pattern at the start of a string";
+    String input4 = "string that ends with a pattern";
+
+    assertEquals("Testing invalid match", -1, find(pattern, input1));
+    assertEquals("Testing valid match", 23, find(pattern, input2));
+    assertEquals("Testing single-character match", 5, find(patternOneChar, input1));
+    assertEquals("Testing zero-length pattern", 0, find(patternZero, input1));
+    assertEquals("Testing match at start of string", 0, find(pattern, input3));
+    assertEquals("Testing match at end of string", 24, find(pattern, input4));
+  }
+
+  private StringExpr.Finder compile(String pattern) {
+    return StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
+  }
+
+  private int find(StringExpr.Finder finder, String string) {
+    byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
+    return finder.find(bytes, 0, bytes.length);
+  }
+}
\ No newline at end of file


Mime
View raw message