hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hashut...@apache.org
Subject svn commit: r1477814 - in /hive/branches/vectorization/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Date Tue, 30 Apr 2013 20:31:30 GMT
Author: hashutosh
Date: Tue Apr 30 20:31:30 2013
New Revision: 1477814

URL: http://svn.apache.org/r1477814
Log:
HIVE-4385 : Implement vectorized LIKE filter (Jitendra Nath Pandey via Ashutosh Chauhan)

Added:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
Modified:
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java?rev=1477814&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java Tue Apr 30 20:31:30 2013
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+
+/**
+ * Evaluate LIKE filter on a batch for a vector of strings.
+ */
+public class FilterStringColLikeStringScalar extends VectorExpression {
+  private int colNum;
+  private Text likePattern;
+  private Text s;
+  private UDFLike likeFunc;
+
+  public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
+    this.colNum = colNum;
+    this.likePattern = likePattern;
+    likeFunc = new UDFLike();
+    s = new Text();
+  }
+
+  /*
+   * This vectorized version of LIKE calls the standard LIKE
+   * function code. In the future, as an optimization, consider
+   * unwinding some of that logic here, e.g. to determine
+   * if the LIKE pattern is a simple one like 'abc%' so that
+   * can be executed more efficiently as a special case.
+   */
+
+  private boolean like(byte[] bytes, int start, int len) {
+    s.set(bytes, start, len);
+    return (likeFunc.evaluate(s, likePattern)).get();
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] length = inputColVector.length;
+    int[] start = inputColVector.start;
+
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero Repeating property will not change.
+        if (!like(vector[0], start[0], length[0])) {
+
+          //Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (like(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (like(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        //All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!like(vector[0], start[0], length[0])) {
+
+            //Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+           if (like(vector[i], start[i], length[i])) {
+             sel[newSize++] = i;
+           }
+          }
+        }
+
+        //Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (like(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+
+        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+         * future operations. So selectedInUse will remain false.
+         */
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+}

Modified: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1477814&r1=1477813&r2=1477814&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (original)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Tue Apr 30 20:31:30 2013
@@ -31,6 +31,7 @@ import org.junit.Test;
 
 import java.io.UnsupportedEncodingException;
 import java.util.Arrays;
+import org.apache.hadoop.io.Text;
 
 /**
  * Test vectorized expression and filter evaluation for strings.
@@ -64,14 +65,14 @@ public class TestVectorStringExpressions
     red2 = new byte[red.length];
     System.arraycopy(red, 0, red2, 0, red.length);
   }
-  
+
   // add some multi-byte characters to test length routine later.
   // total characters = 4; byte length = 10
   static void addMultiByteChars(byte[] b) {
     int i = 0;
     b[i++] = (byte) 0x41; // letter "A" (1 byte)
     b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes)
-    b[i++] = (byte) 0x80; 
+    b[i++] = (byte) 0x80;
     b[i++] = (byte) 0xE2; // Euro sign (3 bytes)
     b[i++] = (byte) 0x82;
     b[i++] = (byte) 0xAC;
@@ -80,9 +81,9 @@ public class TestVectorStringExpressions
     b[i++] = (byte) 0xAD;
     b[i++] = (byte) 0xA2;
   }
-  
+
   @Test
-  // Load a BytesColumnVector by copying in large data, enough to force 
+  // Load a BytesColumnVector by copying in large data, enough to force
   // the buffer to expand.
   public void testLoadBytesColumnVectorByValueLargeData()  {
     BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
@@ -102,7 +103,7 @@ public class TestVectorStringExpressions
     }
     Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE);
   }
-  
+
   @Test
   // set values by reference, copy the data out, and verify equality
   public void testLoadBytesColumnVectorByRef() {
@@ -208,8 +209,8 @@ public class TestVectorStringExpressions
   }
   
   VectorizedRowBatch makeStringBatchMixedCharSize() {
-    // create a new batch with one char column (for input) 
-    // and one long column (for output)
+
+    // create a new batch with one char column (for input) and one long column (for output)

     VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
     BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
     batch.cols[0] = v;
@@ -289,10 +290,12 @@ public class TestVectorStringExpressions
   
   @Test
   public void testColUpper() {
+
     // no nulls, not repeating
     
-    // We don't test all the combinations because (at least currently)
-    // the logic is inherited to be the same as testColLower, which checks all the cases).
+    /* We don't test all the combinations because (at least currently)
+     * the logic is inherited to be the same as testColLower, which checks all the cases).
+     */
     VectorizedRowBatch batch = makeStringBatchMixedCase();
     StringUpper expr = new StringUpper(0, 1);
     batch.cols[0].noNulls = true;
@@ -332,7 +335,7 @@ public class TestVectorStringExpressions
     Assert.assertTrue(outCol.isRepeating);
     Assert.assertFalse(outCol.noNulls);
     Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
-    
+
     // no nulls, is repeating
     batch = makeStringBatchMixedCharSize();
     batch.cols[0].isRepeating = true;
@@ -341,6 +344,60 @@ public class TestVectorStringExpressions
     outCol = (LongColumnVector) batch.cols[1];
     Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
     Assert.assertTrue(outCol.isRepeating);
-    Assert.assertTrue(outCol.noNulls);   
+    Assert.assertTrue(outCol.noNulls);
+  }
+
+  @Test
+  public void testStringLike() {
+
+    // has nulls, not repeating
+    VectorizedRowBatch batch;
+    Text pattern;
+    int initialBatchSize;
+    batch = makeStringBatchMixedCharSize();
+    pattern = new Text(mixPercentPattern);
+    FilterStringColLikeStringScalar expr = new FilterStringColLikeStringScalar(0, pattern);
+    expr.evaluate(batch);
+
+    // verify that the beginning entry is the only one that matches
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+
+    // verify that the beginning entry is the only one that matches
+    Assert.assertEquals(1, batch.size);
+    Assert.assertEquals(0, batch.selected[0]);
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    initialBatchSize = batch.size;
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+
+    // all rows qualify
+    Assert.assertEquals(initialBatchSize, batch.size);
+
+    // same, but repeating value is null
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].isNull[0] = true;
+    expr.evaluate(batch);
+
+    // no rows qualify
+    Assert.assertEquals(0, batch.size);
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    initialBatchSize = batch.size;
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+
+    // all rows qualify
+    Assert.assertEquals(initialBatchSize, batch.size);
   }
 }



Mime
View raw message