From: hashutosh@apache.org
To: commits@hive.apache.org
Reply-To: hive-dev@hive.apache.org
Date: Wed, 06 Nov 2013 16:43:36 -0000
Subject: svn commit: r1539392 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/ java/org/apache/hadoop/hive/ql/exec/vector/expressions/ java/org/apache/hadoop/hive/ql/optimizer/physical/ test/org/apache/hadoop/hive/ql/exec/vector/ test/org/ap...

Author: hashutosh
Date: Wed Nov  6 16:43:36 2013
New Revision: 1539392

URL: http://svn.apache.org/r1539392
Log:
HIVE-5583 : Implement support for IN (list-of-constants) filter in vectorized mode (Eric Hanson via Ashutosh Chauhan)

Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
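For orientation before the diffs: this change teaches the vectorized execution path to compile a predicate such as WHERE col IN (5, 20) into one of the new Filter*ColumnInList expressions, which probe a pre-built cuckoo-hash set per row instead of invoking the generic IN UDF. A minimal sketch of the new expression in action, modeled on the unit tests at the end of this commit (the batch wiring here is simplified illustration, not code from the patch):

    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterLongColumnInList;

    public class InFilterSketch {
      public static void main(String[] args) {
        // A one-column batch holding the long values 5, 17, 20.
        VectorizedRowBatch batch = new VectorizedRowBatch(1);
        LongColumnVector col = new LongColumnVector();
        batch.cols[0] = col;
        col.vector[0] = 5;
        col.vector[1] = 17;
        col.vector[2] = 20;
        batch.size = 3;

        // Filter on "column 0 IN (5, 20)".
        FilterLongColumnInList in = new FilterLongColumnInList(0);
        in.setInListValues(new long[] {5, 20});
        in.evaluate(batch);

        // Rows 0 and 2 survive: batch.size == 2 and batch.selected holds
        // their indexes, the same protocol the tests below assert.
        System.out.println(batch.size + " rows pass");
      }
    }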
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java Wed Nov  6 16:43:36 2013
@@ -76,6 +76,7 @@ import org.apache.hadoop.hive.ql.plan.Ex
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
 import org.apache.hadoop.hive.ql.udf.UDFConv;
 import org.apache.hadoop.hive.ql.udf.UDFHex;
 import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
@@ -557,6 +558,8 @@ public class VectorizationContext {
     //First handle special cases
     if (udf instanceof GenericUDFBetween) {
       return getBetweenFilterExpression(childExpr);
+    } else if (udf instanceof GenericUDFIn) {
+      return getInFilterExpression(childExpr);
     } else if (udf instanceof GenericUDFBridge) {
       VectorExpression v = getGenericUDFBridgeVectorExpression((GenericUDFBridge) udf, childExpr, mode);
       if (v != null) {
@@ -579,6 +582,104 @@ public class VectorizationContext {
   }
 
   /**
+   * Create a filter expression for column IN ( <list-of-constants> )
+   * @param childExpr
+   * @return
+   */
+  private VectorExpression getInFilterExpression(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    ExprNodeDesc colExpr = childExpr.get(0);
+    String colType = colExpr.getTypeString();
+
+    // prepare arguments for createVectorExpression
+    List<ExprNodeDesc> childrenForInList =
+        foldConstantsForUnaryExprs(childExpr.subList(1, childExpr.size()));
+
+    // Remove nulls. This is safe because "value IN ( <list> )" is never true for a NULL member
+    // of <list>, under SQL semantics, because value = NULL is always false.
+    childrenForInList = removeNullListEntries(childrenForInList);
+    VectorExpression expr = null;
+
+    // determine class
+    Class<?> cl = null;
+    if (isIntFamily(colType)) {
+      cl = FilterLongColumnInList.class;
+      long[] inVals = new long[childrenForInList.size()];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getIntFamilyScalarAsLong((ExprNodeConstantDesc) childrenForInList.get(i));
+      }
+      FilterLongColumnInList f = (FilterLongColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (colType.equals("timestamp")) {
+      cl = FilterLongColumnInList.class;
+      long[] inVals = new long[childrenForInList.size()];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getTimestampScalar(childrenForInList.get(i));
+      }
+      FilterLongColumnInList f = (FilterLongColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (colType.equals("string")) {
+      cl = FilterStringColumnInList.class;
+      byte[][] inVals = new byte[childrenForInList.size()][];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getStringScalarAsByteArray((ExprNodeConstantDesc) childrenForInList.get(i));
+      }
+      FilterStringColumnInList f = (FilterStringColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (isFloatFamily(colType)) {
+      cl = FilterDoubleColumnInList.class;
+      double[] inValsD = new double[childrenForInList.size()];
+      for (int i = 0; i != inValsD.length; i++) {
+        inValsD[i] = getNumericScalarAsDouble(childrenForInList.get(i));
+      }
+      FilterDoubleColumnInList f = (FilterDoubleColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inValsD);
+      expr = f;
+    } else {
+      throw new HiveException("Type " + colType + " not supported for IN in vectorized mode");
+    }
+    return expr;
+  }
+
+  // Return a version of the input IN list with the NULL entries removed.
+  private List<ExprNodeDesc> removeNullListEntries(List<ExprNodeDesc> childrenForInList) {
+    boolean hasNulls = false;
+    for (ExprNodeDesc e : childrenForInList) {
+      if (e instanceof ExprNodeNullDesc) {
+        hasNulls = true;
+        break;
+      }
+    }
+    if (!hasNulls) {
+      return childrenForInList;
+    } else {
+      List<ExprNodeDesc> nullFreeList = new ArrayList<ExprNodeDesc>();
+      for (ExprNodeDesc e : childrenForInList) {
+        if (!(e instanceof ExprNodeNullDesc)) {
+          nullFreeList.add(e);
+        }
+      }
+      return nullFreeList;
+    }
+  }
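To see why dropping NULL constants is safe here, compare the predicate with and without the NULL entry under standard SQL three-valued logic (a hand-worked example, not from the patch):

    1 IN (1, NULL)  ->  TRUE      1 IN (1)  ->  TRUE
    2 IN (1, NULL)  ->  NULL      2 IN (1)  ->  FALSE

A filter keeps a row only when the predicate is TRUE, and both NULL and FALSE discard it, so the rewritten list selects exactly the same rows. The same rewrite would not be valid under NOT IN, where NOT (NULL) is NULL but NOT (FALSE) is TRUE.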
" + "Expecting string."); } + private long evaluateCastToTimestamp(ExprNodeDesc expr) throws HiveException { + ExprNodeGenericFuncDesc expr2 = (ExprNodeGenericFuncDesc) expr; + ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2); + ObjectInspector output = evaluator.initialize(null); + Object constant = evaluator.evaluate(null); + Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output); + + if (!(java instanceof Timestamp)) { + throw new HiveException("Udf: failed to convert to timestamp"); + } + Timestamp ts = (Timestamp) java; + long result = ts.getTime(); + result *= 1000000; // shift left 6 digits to make room for nanos below ms precision + result += ts.getNanos() % 1000000; // add in nanos, after removing the ms portion + return result; + } private Constructor getConstructor(Class cl) throws HiveException { try { Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java?rev=1539392&view=auto ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java (added) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java Wed Nov 6 16:43:36 2013 @@ -0,0 +1,435 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import java.util.Random; + +/** + * A high-performance set implementation used to support fast set membership testing, + * using Cuckoo hashing. This is used to support fast tests of the form + * + * column IN ( 20) { + throw new RuntimeException("Too many rehashes"); + } + updateHashSalt(); + + // Save original values + if (prev1 == null) { + prev1 = t1; + prev1 = t2; + } + t1 = new byte[n][]; + t2 = new byte[n][]; + for (byte[] v : prev1) { + if (v != null) { + byte[] x = tryInsert(v); + if (x != null) { + rehash(); + return; + } + } + } + for (byte[] v : prev2) { + if (v != null) { + byte[] x = tryInsert(v); + if (x != null) { + rehash(); + return; + } + } + } + + // We succeeded in adding all the values, so + // clear the previous values recorded. + prev1 = null; + prev2 = null; + } + + /** + * This is adapted from the org.apache.hadoop.util.hash.JenkinsHash package. + * The interface needed to be modified to suit the use here, by adding + * a start offset parameter to the hash function. + * + * In the future, folding this back into the original Hadoop package should + * be considered. This could could them import that package and use it. + * The original comments from the source are below. 

   private Constructor getConstructor(Class cl) throws HiveException {
     try {

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ * column IN ( <list-of-values> )
+ */
+public class CuckooSetBytes {

[...]

+  private void rehash() {
+    rehashCount++;
+    if (rehashCount > 20) {
+      throw new RuntimeException("Too many rehashes");
+    }
+    updateHashSalt();
+
+    // Save original values
+    if (prev1 == null) {
+      prev1 = t1;
+      prev2 = t2;
+    }
+    t1 = new byte[n][];
+    t2 = new byte[n][];
+    for (byte[] v : prev1) {
+      if (v != null) {
+        byte[] x = tryInsert(v);
+        if (x != null) {
+          rehash();
+          return;
+        }
+      }
+    }
+    for (byte[] v : prev2) {
+      if (v != null) {
+        byte[] x = tryInsert(v);
+        if (x != null) {
+          rehash();
+          return;
+        }
+      }
+    }
+
+    // We succeeded in adding all the values, so
+    // clear the previous values recorded.
+    prev1 = null;
+    prev2 = null;
+  }
+
+  /**
+   * This is adapted from the org.apache.hadoop.util.hash.JenkinsHash package.
+   * The interface needed to be modified to suit the use here, by adding
+   * a start offset parameter to the hash function.
+   *
+   * In the future, folding this back into the original Hadoop package should
+   * be considered. This code could then import that package and use it.
+   * The original comments from the source are below.
+   *
+   * taken from  hashlittle() -- hash a variable-length key into a 32-bit value
+   *
+   * @param key the key (the unaligned variable-length array of bytes)
+   * @param nbytes number of bytes to include in hash
+   * @param initval can be any integer value
+   * @return a 32-bit value.  Every bit of the key affects every bit of the
+   * return value.  Two keys differing by one or two bits will have totally
+   * different hash values.
+   *
+   * <p>The best hash table sizes are powers of 2.  There is no need to do mod
+   * a prime (mod is sooo slow!).  If you need less than 32 bits, use a bitmask.
+   * For example, if you need only 10 bits, do
+   * h = (h & hashmask(10));
+   * In which case, the hash table should have hashsize(10) elements.
+   *
+   * <p>If you are hashing n strings byte[][] k, do it like this:
+   * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+   *
+   * <p>By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+   * code any way you wish, private, educational, or commercial.  It's free.
+   *
+   * <p>
Use for hash table lookup, or anything where one collision in 2^^32 is + * acceptable. Do NOT use for cryptographic purposes. + */ + @SuppressWarnings("fallthrough") + private int hash(byte[] key, int start, int nbytes, int initval) { + int length = nbytes; + long a, b, c; // We use longs because we don't have unsigned ints + a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK; + int offset = start; + for (; length > 12; offset += 12, length -= 12) { + a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK; + a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK; + b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK; + c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + + /* + * mix -- mix 3 32-bit values reversibly. + * This is reversible, so any information in (a,b,c) before mix() is + * still in (a,b,c) after mix(). + * + * If four pairs of (a,b,c) inputs are run through mix(), or through + * mix() in reverse, there are at least 32 bits of the output that + * are sometimes the same for one pair and different for another pair. + * + * This was tested for: + * - pairs that differed by one bit, by two bits, in any combination + * of top bits of (a,b,c), or in any combination of bottom bits of + * (a,b,c). + * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + * is commonly produced by subtraction) look like a single 1-bit + * difference. + * - the base values were pseudorandom, all zero but one bit set, or + * all zero plus a counter that starts at zero. + * + * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that + * satisfy this are + * 4 6 8 16 19 4 + * 9 15 3 18 27 15 + * 14 9 3 7 17 3 + * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for + * "differ" defined as + with a one-bit base and a two-bit delta. I + * used http://burtleburtle.net/bob/hash/avalanche.html to choose + * the operations, constants, and arrangements of the variables. + * + * This does not achieve avalanche. There are input bits of (a,b,c) + * that fail to affect some output bits of (a,b,c), especially of a. + * The most thoroughly mixed value is c, but it doesn't really even + * achieve avalanche in c. + * + * This allows some parallelism. Read-after-writes are good at doubling + * the number of bits affected, so the goal of mixing pulls in the + * opposite direction as the goal of parallelism. I did what I could. + * Rotates seem to cost as much as shifts on every machine I could lay + * my hands on, and rotates are much kinder to the top and bottom bits, + * so I used rotates. 
+ * + * #define mix(a,b,c) \ + * { \ + * a -= c; a ^= rot(c, 4); c += b; \ + * b -= a; b ^= rot(a, 6); a += c; \ + * c -= b; c ^= rot(b, 8); b += a; \ + * a -= c; a ^= rot(c,16); c += b; \ + * b -= a; b ^= rot(a,19); a += c; \ + * c -= b; c ^= rot(b, 4); b += a; \ + * } + * + * mix(a,b,c); + */ + a = (a - c) & INT_MASK; a ^= rot(c, 4); c = (c + b) & INT_MASK; + b = (b - a) & INT_MASK; b ^= rot(a, 6); a = (a + c) & INT_MASK; + c = (c - b) & INT_MASK; c ^= rot(b, 8); b = (b + a) & INT_MASK; + a = (a - c) & INT_MASK; a ^= rot(c,16); c = (c + b) & INT_MASK; + b = (b - a) & INT_MASK; b ^= rot(a,19); a = (a + c) & INT_MASK; + c = (c - b) & INT_MASK; c ^= rot(b, 4); b = (b + a) & INT_MASK; + } + + //-------------------------------- last block: affect all 32 bits of (c) + switch (length) { // all the case statements fall through + case 12: + c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 11: + c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 10: + c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 9: + c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK; + case 8: + b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 7: + b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 6: + b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 5: + b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK; + case 4: + a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 3: + a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 2: + a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 1: + a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK; + break; + case 0: + return (int)(c & INT_MASK); + } + /* + * final -- final mixing of 3 32-bit values (a,b,c) into c + * + * Pairs of (a,b,c) values differing in only a few bits will usually + * produce values of c that look totally different. This was tested for + * - pairs that differed by one bit, by two bits, in any combination + * of top bits of (a,b,c), or in any combination of bottom bits of + * (a,b,c). + * + * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + * is commonly produced by subtraction) look like a single 1-bit + * difference. + * + * - the base values were pseudorandom, all zero but one bit set, or + * all zero plus a counter that starts at zero. 
+     *
+     * These constants passed:
+     *   14 11 25 16 4 14 24
+     *   12 14 25 16 4 14 24
+     * and these came close:
+     *    4  8 15 26 3 22 24
+     *   10  8 15 26 3 22 24
+     *   11  8 15 26 3 22 24
+     *
+     * #define final(a,b,c) \
+     * {
+     *   c ^= b; c -= rot(b,14); \
+     *   a ^= c; a -= rot(c,11); \
+     *   b ^= a; b -= rot(a,25); \
+     *   c ^= b; c -= rot(b,16); \
+     *   a ^= c; a -= rot(c,4);  \
+     *   b ^= a; b -= rot(a,14); \
+     *   c ^= b; c -= rot(b,24); \
+     * }
+     *
+     */
+    c ^= b; c = (c - rot(b,14)) & INT_MASK;
+    a ^= c; a = (a - rot(c,11)) & INT_MASK;
+    b ^= a; b = (b - rot(a,25)) & INT_MASK;
+    c ^= b; c = (c - rot(b,16)) & INT_MASK;
+    a ^= c; a = (a - rot(c,4)) & INT_MASK;
+    b ^= a; b = (b - rot(a,14)) & INT_MASK;
+    c ^= b; c = (c - rot(b,24)) & INT_MASK;
+
+    return (int)(c & INT_MASK);
+  }
+
+  private static long rot(long val, int pos) {
+    return ((Integer.rotateLeft(
+        (int)(val & INT_MASK), pos)) & INT_MASK);
+  }
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ * column IN ( <list-of-values> )
+ */

[...]

 >> 32) ^ y)) & 0xFFFFFFFF);
+    x = (x + salt[0]) + (x << 12);
+    x = (x ^ salt[1]) ^ (x >> 19);
+    x = (x + salt[2]) + (x << 5);
+    x = (x + salt[3]) ^ (x << 9);
+    x = (x + salt[4]) + (x << 3);
+    x = (x ^ salt[5]) ^ (x >> 16);
+
+    // Return value modulo n but always in the positive range (0..n-1).
+    // And with the mask to zero the sign bit to make the input to mod positive
+    // so the output will definitely be positive.
+    return (x & 0x7FFFFFFF) % n;
+  }
+
+  /**
+   * basic modular hash function
+   */
+  private int h2(long x) {
+
+    // Return value modulo n but always in the positive range (0..n-1).
+    // Since n is prime, this gives good spread for numbers that are multiples
+    // of one billion, which is important since timestamps internally
+    // are stored as a number of nanoseconds, and the fractional seconds
+    // part is often 0.
+    return (((int) (x % n)) + n) % n;
+  }
+
+  /**
+   * In case of rehash, hash function h2 is changed by updating the
+   * entries in the salt array with new random values.
+   */
+  private void updateHashSalt() {
+    for (int i = 0; i != 6; i++) {
+      salt[i] = gen.nextInt(0x7FFFFFFF);
+    }
+  }
+
+  private void rehash() {
+    rehashCount++;
+    if (rehashCount > 20) {
+      throw new RuntimeException("Too many rehashes");
+    }
+    updateHashSalt();
+
+    // Save original values
+    if (prev1 == null) {
+      prev1 = t1;
+      prev2 = t2;
+    }
+    t1 = new long[n];
+    t2 = new long[n];
+    Arrays.fill(t1, blank);
+    Arrays.fill(t2, blank);
+    for (Long v : prev1) {
+      if (v != blank) {
+        long x = tryInsert(v);
+        if (x != blank) {
+          rehash();
+          return;
+        }
+      }
+    }
+    for (Long v : prev2) {
+      if (v != blank) {
+        long x = tryInsert(v);
+        if (x != blank) {
+          rehash();
+          return;
+        }
+      }
+    }
+
+    // We succeeded in adding all the values, so
+    // clear the previous values recorded.
+    prev1 = null;
+    prev2 = null;
+  }
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,180 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Evaluate IN filter on a batch for a vector of doubles.
+ */
+public class FilterDoubleColumnInList extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+  private int inputCol;
+  private double[] inListValues;
+
+  // The set object containing the IN list. This is optimized for lookup
+  // of the data type of the column.
+ private transient CuckooSetDouble inSet; + + public FilterDoubleColumnInList() { + super(); + inSet = null; + } + + /** + * After construction you must call setInListValues() to add the values to the IN set. + */ + public FilterDoubleColumnInList(int colNum) { + this.inputCol = colNum; + inSet = null; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + if (inSet == null) { + inSet = new CuckooSetDouble(inListValues.length); + inSet.load(inListValues); + } + + DoubleColumnVector inputColVector = (DoubleColumnVector) batch.cols[inputCol]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + double[] vector = inputColVector.vector; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero + // Repeating property will not change. + + if (!(inSet.lookup(vector[0]))) { + //Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j=0; j != n; j++) { + int i = sel[j]; + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + //All must be selected otherwise size would be zero + //Repeating property will not change. + if (!nullPos[0]) { + if (!inSet.lookup(vector[0])) { + //Entire batch is filtered out. + batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + } + + // Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } + } + + + @Override + public String getOutputType() { + return "boolean"; + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public Descriptor getDescriptor() { + + // This VectorExpression (IN) is a special case, so don't return a descriptor. + return null; + } + + public double[] getInListValues() { + return this.inListValues; + } + + public void setInListValues(double [] a) { + this.inListValues = a; + } +} Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java?rev=1539392&view=auto ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java (added) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java Wed Nov 6 16:43:36 2013 @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFLike; +import org.apache.hadoop.io.Text; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Evaluate IN filter on a batch for a vector of longs. + */ +public class FilterLongColumnInList extends VectorExpression { + private static final long serialVersionUID = 1L; + private int inputCol; + private long[] inListValues; + + // The set object containing the IN list. This is optimized for lookup + // of the data type of the column. + private transient CuckooSetLong inSet; + + public FilterLongColumnInList() { + super(); + inSet = null; + } + + /** + * After construction you must call setInListValues() to add the values to the IN set. + */ + public FilterLongColumnInList(int colNum) { + this.inputCol = colNum; + inSet = null; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + if (inSet == null) { + inSet = new CuckooSetLong(inListValues.length); + inSet.load(inListValues); + } + + LongColumnVector inputColVector = (LongColumnVector) batch.cols[inputCol]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + long[] vector = inputColVector.vector; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero + // Repeating property will not change. + + if (!(inSet.lookup(vector[0]))) { + //Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j=0; j != n; j++) { + int i = sel[j]; + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + //All must be selected otherwise size would be zero + //Repeating property will not change. + if (!nullPos[0]) { + if (!inSet.lookup(vector[0])) { + //Entire batch is filtered out. 
+ batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + } + + // Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (inSet.lookup(vector[i])) { + sel[newSize++] = i; + } + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } + } + + + @Override + public String getOutputType() { + return "boolean"; + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public Descriptor getDescriptor() { + + // This VectorExpression (IN) is a special case, so don't return a descriptor. + return null; + } + + public long[] getInListValues() { + return this.inListValues; + } + + public void setInListValues(long [] a) { + this.inListValues = a; + } +} Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java?rev=1539392&view=auto ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java (added) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java Wed Nov 6 16:43:36 2013 @@ -0,0 +1,187 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFLike; +import org.apache.hadoop.io.Text; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Evaluate an IN filter on a batch for a vector of strings. + * This is optimized so that no objects have to be created in + * the inner loop, and there is a hash table implemented + * with Cuckoo hashing that has fast lookup to do the IN test. + */ +public class FilterStringColumnInList extends VectorExpression { + private static final long serialVersionUID = 1L; + private int inputCol; + private byte[][] inListValues; + + // The set object containing the IN list. 
This is optimized for lookup + // of the data type of the column. + private transient CuckooSetBytes inSet; + + public FilterStringColumnInList() { + super(); + inSet = null; + } + + /** + * After construction you must call setInListValues() to add the values to the IN set. + */ + public FilterStringColumnInList(int colNum) { + this.inputCol = colNum; + inSet = null; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + if (inSet == null) { + inSet = new CuckooSetBytes(inListValues.length); + inSet.load(inListValues); + } + + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[inputCol]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] start = inputColVector.start; + int[] len = inputColVector.length; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero + // Repeating property will not change. + if (!(inSet.lookup(vector[0], start[0], len[0]))) { + + // Entire batch is filtered out. + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (inSet.lookup(vector[i], start[i], len[i])) { + sel[newSize++] = i; + } + } + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (inSet.lookup(vector[i], start[i], len[i])) { + sel[newSize++] = i; + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } else { + if (inputColVector.isRepeating) { + + // All must be selected otherwise size would be zero + // Repeating property will not change. + if (!nullPos[0]) { + if (!inSet.lookup(vector[0], start[0], len[0])) { + + // Entire batch is filtered out. + batch.size = 0; + } + } else { + batch.size = 0; + } + } else if (batch.selectedInUse) { + int newSize = 0; + for(int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + if (inSet.lookup(vector[i], start[i], len[i] )) { + sel[newSize++] = i; + } + } + } + + // Change the selected vector + batch.size = newSize; + } else { + int newSize = 0; + for(int i = 0; i != n; i++) { + if (!nullPos[i]) { + if (inSet.lookup(vector[i], start[i], len[i])) { + sel[newSize++] = i; + } + } + } + if (newSize < n) { + batch.size = newSize; + batch.selectedInUse = true; + } + } + } + } + + + @Override + public String getOutputType() { + return "boolean"; + } + + @Override + public int getOutputColumn() { + return -1; + } + + @Override + public Descriptor getDescriptor() { + + // This VectorExpression (IN) is a special case, so don't return a descriptor. 
+ return null; + } + + public byte[][] getInListValues() { + return this.inListValues; + } + + public void setInListValues(byte [][] a) { + this.inListValues = a; + } +} Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1539392&r1=1539391&r2=1539392&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Wed Nov 6 16:43:36 2013 @@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.udf.gen import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; @@ -172,6 +173,7 @@ public class Vectorizer implements Physi supportedGenericUDFs.add(GenericUDFConcat.class); supportedGenericUDFs.add(GenericUDFAbs.class); supportedGenericUDFs.add(GenericUDFBetween.class); + supportedGenericUDFs.add(GenericUDFIn.class); // For type casts supportedGenericUDFs.add(UDFToLong.class); Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff ============================================================================== --- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java (original) +++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java Wed Nov 6 16:43:36 2013 @@ -92,6 +92,7 @@ import org.apache.hadoop.hive.ql.udf.UDF import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; @@ -939,4 +940,42 @@ public class TestVectorizationContext { ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); assertTrue(ve instanceof FilterDoubleColumnNotBetween); } + + @Test + public void testInFilters() throws HiveException { + ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(String.class, "col1", "table", false); + ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc("Alpha"); + ExprNodeConstantDesc constDesc2 = new ExprNodeConstantDesc("Bravo"); + + // string IN + GenericUDFIn udf = new GenericUDFIn(); + ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc(); + exprDesc.setGenericUDF(udf); + List children1 = new ArrayList(); + children1.add(col1Expr); + children1.add(constDesc); + children1.add(constDesc2); + exprDesc.setChildren(children1); + + Map columnMap = new HashMap(); + columnMap.put("col1", 1); + columnMap.put("col2", 2); + VectorizationContext vc = new 
VectorizationContext(columnMap, 2); + VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); + assertTrue(ve instanceof FilterStringColumnInList); + + // long IN + children1.set(0, new ExprNodeColumnDesc(Long.class, "col1", "table", false)); + children1.set(1, new ExprNodeConstantDesc(10)); + children1.set(2, new ExprNodeConstantDesc(20)); + ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); + assertTrue(ve instanceof FilterLongColumnInList); + + // double IN + children1.set(0, new ExprNodeColumnDesc(Double.class, "col1", "table", false)); + children1.set(1, new ExprNodeConstantDesc(10d)); + children1.set(2, new ExprNodeConstantDesc(20d)); + ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER); + assertTrue(ve instanceof FilterDoubleColumnInList); + } } Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java?rev=1539392&view=auto ============================================================================== --- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java (added) +++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java Wed Nov 6 16:43:36 2013 @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import static org.junit.Assert.*; + +import java.util.Random; + +import org.junit.Test; + +public class TestCuckooSet { + + // maximum table size + private static int MAX_SIZE = 65437; + + @Test + public void testSetLong () { + + // Set of values to look for. Include the original blank value Long.MIN_VALUE to make sure + // the process of choosing a new blank works. + Long[] values = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L, + 22L, 23L, 24L, 25L, -26L, 27L, + 28L, 29L, 30L, 111111111111111L, -444444444444444L, Long.MIN_VALUE}; + Long[] negatives = {0L, 4L, 4000L, -2L, 19L, 222222222222222L, -333333333333333L}; + CuckooSetLong s = new CuckooSetLong(values.length); + for(Long v : values) { + s.insert(v); + } + + // test that the values we added are there + for(Long v : values) { + assertTrue(s.lookup(v)); + } + + // test that values that we know are missing are shown to be absent + for (Long v : negatives) { + assertFalse(s.lookup(v)); + } + + // Set of values to look for. 
+ Long[] values2 = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L, 22L, 23L, + 24L, 25L, -26L, 27L, + 28L, 29L, 30L, 111111111111111L, -444444444444444L}; + + // Include the original blank value Long.MIN_VALUE in the negatives to make sure we get + // the correct result that the blank value is not there. + Long[] negatives2 = {0L, 4L, 4000L, -2L, 19L, 222222222222222L, + -333333333333333L, Long.MIN_VALUE}; + s = new CuckooSetLong(values2.length); + for(Long v : values2) { + s.insert(v); + } + + // test that the values we added are there + for(Long v : values2) { + assertTrue(s.lookup(v)); + } + + // test that values that we know are missing are shown to be absent + for (Long v : negatives2) { + assertFalse(s.lookup(v)); + } + + } + + // load multiple random sets of Long values + @Test + public void testSetLongRandom() { + long[] values; + Random gen = new Random(98763537); + for(int i = 0; i < 200;) { + + // Make a random array of longs + int size = gen.nextInt() % MAX_SIZE; + if (size <= 0) { // ensure size is >= 1, otherwise try again + continue; + } + i++; + values = new long[size]; + loadRandom(values, gen); + + // load them into a SetLong + CuckooSetLong s = new CuckooSetLong(size); + loadSet(s, values); + + // look them up to make sure they are all there + for (int j = 0; j != size; j++) { + assertTrue(s.lookup(values[j])); + } + } + } + + @Test + public void testSetDouble() { + + // Set of values to look for. + Double[] values = {7021.0D, 5780.0D, 0D, -1D, 1.999e50D}; + Double[] negatives = {7000.0D, -2D, 1.9999e50D}; + CuckooSetDouble s = new CuckooSetDouble(values.length); + for(Double v : values) { + s.insert(v); + } + + // test that the values we added are there + for(Double v : values) { + assertTrue(s.lookup(v)); + } + + // test that values that we know are missing are shown to be absent + for (Double v : negatives) { + assertFalse(s.lookup(v)); + } + } + + @Test + public void testSetBytes() { + String[] strings = {"foo", "bar", "baz", "a", "", "x1341", "Z"}; + String[] negativeStrings = {"not", "in", "the", "set", "foobar"}; + byte[][] values = getByteArrays(strings); + byte[][] negatives = getByteArrays(negativeStrings); + + // load set + CuckooSetBytes s = new CuckooSetBytes(strings.length); + for(byte[] v : values) { + s.insert(v); + } + + // test that the values we added are there + for(byte[] v : values) { + assertTrue(s.lookup(v, 0, v.length)); + } + + // test that values that we know are missing are shown to be absent + for (byte[] v : negatives) { + assertFalse(s.lookup(v, 0, v.length)); + } + + // Test that we can search correctly using a buffer and pulling + // a sequence of bytes out of the middle of it. In this case it + // is the 3 letter sequence "foo". 
+ byte[] buf = getUTF8Bytes("thewordfooisinhere"); + assertTrue(s.lookup(buf, 7, 3)); + } + + @Test + public void testSetBytesLargeRandom() { + byte[][] values; + Random gen = new Random(98763537); + for(int i = 0; i < 200;) { + + // Make a random array of byte arrays + int size = gen.nextInt() % MAX_SIZE; + if (size <= 0) { // ensure size is >= 1, otherwise try again + continue; + } + i++; + values = new byte[size][]; + loadRandomBytes(values, gen); + + // load them into a set + CuckooSetBytes s = new CuckooSetBytes(size); + loadSet(s, values); + + // look them up to make sure they are all there + for (int j = 0; j != size; j++) { + assertTrue(s.lookup(values[j], 0, values[j].length)); + } + } + } + + public void loadRandomBytes(byte[][] values, Random gen) { + for (int i = 0; i != values.length; i++) { + values[i] = getUTF8Bytes(Integer.toString(gen.nextInt())); + } + } + + private byte[] getUTF8Bytes(String s) { + byte[] v = null; + try { + v = s.getBytes("UTF-8"); + } catch (Exception e) { + ; // won't happen + } + return v; + } + + // Get an array of UTF-8 byte arrays from an array of strings + private byte[][] getByteArrays(String[] strings) { + byte[][] values = new byte[strings.length][]; + for(int i = 0; i != strings.length; i++) { + try { + values[i] = strings[i].getBytes("UTF-8"); + } catch (Exception e) { + ; // can't happen + } + } + return values; + } + + private void loadSet(CuckooSetLong s, long[] values) { + for (Long v : values) { + s.insert(v); + } + } + + private void loadSet(CuckooSetBytes s, byte[][] values) { + for (byte[] v: values) { + s.insert(v); + } + } + + private void loadRandom(long[] a, Random gen) { + int size = a.length; + for(int i = 0; i != size; i++) { + a[i] = gen.nextLong(); + } + } +} Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java?rev=1539392&r1=1539391&r2=1539392&view=diff ============================================================================== --- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java (original) +++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java Wed Nov 6 16:43:36 2013 @@ -543,4 +543,189 @@ public class TestVectorFilterExpressions assertTrue(vrb.selectedInUse); assertEquals(0, vrb.selected[0]); } + + /** + * Test the IN filter VectorExpression classes. 
+ */ + + @Test + public void testFilterLongIn() { + int seed = 17; + VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 5, 2, seed); + LongColumnVector lcv0 = (LongColumnVector) vrb.cols[0]; + long[] inList = {5, 20}; + FilterLongColumnInList f = new FilterLongColumnInList(0); + f.setInListValues(inList); + VectorExpression expr1 = f; + + // Basic case + lcv0.vector[0] = 5; + lcv0.vector[1] = 20; + lcv0.vector[2] = 17; + lcv0.vector[3] = 15; + lcv0.vector[4] = 10; + + expr1.evaluate(vrb); + + assertEquals(2, vrb.size); + assertTrue(vrb.selectedInUse); + assertEquals(0, vrb.selected[0]); + assertEquals(1, vrb.selected[1]); + + // With nulls + VectorizedRowBatch vrb1 = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 5, 2, seed); + + lcv0 = (LongColumnVector) vrb1.cols[0]; + + lcv0.vector[0] = 5; + lcv0.vector[1] = 20; + lcv0.vector[2] = 17; + lcv0.vector[3] = 15; + lcv0.vector[4] = 10; + + lcv0.noNulls = false; + lcv0.isNull[0] = true; + lcv0.isNull[2] = true; + + expr1.evaluate(vrb1); + assertEquals(1, vrb1.size); + assertTrue(vrb1.selectedInUse); + assertEquals(1, vrb1.selected[0]); + + // With nulls and selected + VectorizedRowBatch vrb2 = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 7, 2, seed); + vrb2.selectedInUse = true; + vrb2.selected[0] = 1; + vrb2.selected[1] = 2; + vrb2.selected[2] = 4; + vrb2.size = 3; + + lcv0 = (LongColumnVector) vrb2.cols[0]; + + lcv0.vector[0] = 5; + lcv0.vector[1] = 20; + lcv0.vector[2] = 17; + lcv0.vector[3] = 15; + lcv0.vector[4] = 10; + lcv0.vector[5] = 19; + lcv0.vector[6] = 21; + + lcv0.noNulls = false; + lcv0.isNull[0] = true; + lcv0.isNull[2] = true; + lcv0.isNull[5] = true; + + expr1.evaluate(vrb2); + assertEquals(1, vrb2.size); + assertEquals(1, vrb2.selected[0]); + + // Repeating non null + VectorizedRowBatch vrb3 = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 7, 2, seed); + lcv0 = (LongColumnVector) vrb3.cols[0]; + + lcv0.isRepeating = true; + lcv0.vector[0] = 5; + lcv0.vector[1] = 20; + lcv0.vector[2] = 17; + lcv0.vector[3] = 15; + lcv0.vector[4] = 10; + + expr1.evaluate(vrb3); + assertEquals(7, vrb3.size); + assertFalse(vrb3.selectedInUse); + assertTrue(lcv0.isRepeating); + + // Repeating null + lcv0.noNulls = false; + lcv0.vector[0] = 5; + lcv0.isNull[0] = true; + + expr1.evaluate(vrb3); + assertEquals(0, vrb3.size); + } + + @Test + public void testFilterDoubleIn() { + int seed = 17; + VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 5, 2, seed); + DoubleColumnVector dcv0 = new DoubleColumnVector(); + vrb.cols[0] = dcv0; + double[] inList = {5.0, 20.2}; + FilterDoubleColumnInList f = new FilterDoubleColumnInList(0); + f.setInListValues(inList); + VectorExpression expr1 = f; + + // Basic sanity check. Other cases are not skipped because it is similar to the case for Long. 
+ dcv0.vector[0] = 5.0; + dcv0.vector[1] = 20.2; + dcv0.vector[2] = 17.0; + dcv0.vector[3] = 15.0; + dcv0.vector[4] = 10.0; + + expr1.evaluate(vrb); + + assertEquals(2, vrb.size); + assertTrue(vrb.selectedInUse); + assertEquals(0, vrb.selected[0]); + assertEquals(1, vrb.selected[1]); + } + + @Test + public void testFilterStringIn() { + int seed = 17; + VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch( + 3, 2, seed); + vrb.cols[0] = new BytesColumnVector(); + BytesColumnVector bcv = (BytesColumnVector) vrb.cols[0]; + + bcv.initBuffer(); + bcv.setVal(0, a, 0, 1); + bcv.setVal(1, b, 0, 1); + bcv.setVal(2, c, 0, 1); + + VectorExpression expr = new FilterStringColumnInList(0); + byte[][] inList = {b, c}; + ((FilterStringColumnInList) expr).setInListValues(inList); + + // basic test + expr.evaluate(vrb); + + assertEquals(2, vrb.size); + assertTrue(vrb.selectedInUse); + assertEquals(1, vrb.selected[0]); + assertEquals(2, vrb.selected[1]); + + // nulls + vrb.selectedInUse = false; + vrb.size = 3; + bcv.noNulls = false; + bcv.isNull[2] = true; + expr.evaluate(vrb); + assertEquals(1, vrb.size); + assertEquals(1, vrb.selected[0]); + assertTrue(vrb.selectedInUse); + + // repeating + vrb.selectedInUse = false; + vrb.size = 3; + bcv.noNulls = true; + bcv.isRepeating = true; + expr.evaluate(vrb); + assertEquals(0, vrb.size); + + // nulls and repeating + vrb.selectedInUse = false; + vrb.size = 3; + bcv.noNulls = false; + bcv.isRepeating = true; + bcv.isNull[0] = true; + bcv.setVal(0, b, 0, 1); + expr.evaluate(vrb); + assertEquals(0, vrb.size); + } }
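
The CuckooSet class bodies are only partially visible above, so as a reading aid, here is a compact, self-contained illustration of the scheme they implement. This sketch was written for this message and is not taken from the commit; the class name and constants are invented, and the committed classes re-salt h2 and rehash rather than giving up:

    import java.util.Arrays;

    // Toy cuckoo-hash set for longs: each value has exactly one candidate
    // slot in each of two tables, so membership is at most two probes --
    // the property the tight loops in Filter*ColumnInList.evaluate() rely on.
    public class MiniCuckooSetLong {
      private final long[] t1;
      private final long[] t2;
      private final int n;                               // size of each table
      private static final long BLANK = Long.MIN_VALUE;  // empty-slot sentinel

      public MiniCuckooSetLong(int expectedSize) {
        n = Math.max(7, 2 * expectedSize + 1);
        t1 = new long[n];
        t2 = new long[n];
        Arrays.fill(t1, BLANK);
        Arrays.fill(t2, BLANK);
      }

      // Constant-time membership test: probe one slot per table.
      public boolean lookup(long x) {
        return t1[h1(x)] == x || t2[h2(x)] == x;
      }

      // Classic cuckoo insert: place x in table 1, kicking any occupant to
      // its alternate slot in table 2, and so on. The committed code handles
      // a too-long eviction chain by re-salting h2 and rehashing (see the
      // rehash() methods above); the toy just gives up. It also cannot store
      // the sentinel itself, which the real classes handle by choosing a new
      // blank value (see the Long.MIN_VALUE case in TestCuckooSet).
      public void insert(long x) {
        if (x == BLANK || lookup(x)) {
          return;
        }
        for (int i = 0; i <= n; i++) {
          long y = t1[h1(x)];
          t1[h1(x)] = x;
          if (y == BLANK) {
            return;
          }
          x = t2[h2(y)];
          t2[h2(y)] = y;
          if (x == BLANK) {
            return;
          }
        }
        throw new IllegalStateException("eviction cycle: would rehash here");
      }

      private int h1(long x) {
        return (int) (((x >>> 32) ^ x) & 0x7FFFFFFF) % n;
      }

      private int h2(long x) {
        // Multiply by an arbitrary odd constant so h2 disagrees with h1;
        // the unsigned shift keeps the result non-negative before the mod.
        return (int) ((x * 0x9E3779B97F4A7C15L) >>> 33) % n;
      }
    }

Loading this toy set with {5, 20} and probing 5, 17, 20 reproduces the expectations asserted in testFilterLongIn above.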