Return-Path: X-Original-To: apmail-drill-commits-archive@www.apache.org Delivered-To: apmail-drill-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 4C688185B7 for ; Wed, 20 Apr 2016 05:32:33 +0000 (UTC) Received: (qmail 37693 invoked by uid 500); 20 Apr 2016 05:32:33 -0000 Delivered-To: apmail-drill-commits-archive@drill.apache.org Received: (qmail 37651 invoked by uid 500); 20 Apr 2016 05:32:33 -0000 Mailing-List: contact commits-help@drill.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: commits@drill.apache.org Delivered-To: mailing list commits@drill.apache.org Received: (qmail 37638 invoked by uid 99); 20 Apr 2016 05:32:33 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 20 Apr 2016 05:32:33 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 0B39CDFE80; Wed, 20 Apr 2016 05:32:33 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: sudheesh@apache.org To: commits@drill.apache.org Date: Wed, 20 Apr 2016 05:32:33 -0000 Message-Id: <2b2e05fddd924b15bf67b3abd350a828@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [1/3] drill git commit: DRILL-4237, DRILL-4478: Implement hash to use murmur3 and add correspondent unit tests Repository: drill Updated Branches: refs/heads/master 852b01aa6 -> c6a03eb17 http://git-wip-us.apache.org/repos/asf/drill/blob/c6a03eb1/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java index 0124b2f..72ab492 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java @@ -24,14 +24,15 @@ import org.apache.drill.exec.memory.BoundsChecking; import com.google.common.primitives.UnsignedLongs; -public final class XXHash { +public final class XXHash extends DrillHash{ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(XXHash.class); - static final long PRIME64_1 = UnsignedLongs.decode("11400714785074694791"); - static final long PRIME64_2 = UnsignedLongs.decode("14029467366897019727"); - static final long PRIME64_3 = UnsignedLongs.decode("1609587929392839161"); - static final long PRIME64_4 = UnsignedLongs.decode("9650029242287828579"); - static final long PRIME64_5 = UnsignedLongs.decode("2870177450012600261"); + //UnsignedLongs.decode won't give right output(keep the value in 8 bytes unchanged). + static final long PRIME64_1 = 0x9e3779b185ebca87L;//UnsignedLongs.decode("11400714785074694791"); + static final long PRIME64_2 = 0xc2b2ae3d27d4eb4fL;//UnsignedLongs.decode("14029467366897019727"); + static final long PRIME64_3 = 0x165667b19e3779f9L;//UnsignedLongs.decode("1609587929392839161"); + static final long PRIME64_4 = 0x85ebca77c2b2ae63L;//UnsignedLongs.decode("9650029242287828579"); + static final long PRIME64_5 = 0x27d4eb2f165667c5L;//UnsignedLongs.decode("2870177450012600261"); private static long hash64bytes(long start, long bEnd, long seed) { long len = bEnd - start; @@ -114,12 +115,14 @@ public final class XXHash { } if (p + 4 <= bEnd) { - h64 ^= PlatformDependent.getInt(p) * PRIME64_1; + //IMPORTANT: we are expecting a long from these 4 bytes. Which means it is always positive + long finalInt = getIntLittleEndian(p); + h64 ^= finalInt * PRIME64_1; h64 = Long.rotateLeft(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < bEnd) { - h64 ^= PlatformDependent.getByte(p) * PRIME64_5; + h64 ^= ((long)(PlatformDependent.getByte(p) & 0x00ff)) * PRIME64_5; h64 = Long.rotateLeft(h64, 11) * PRIME64_1; p++; } @@ -128,25 +131,17 @@ public final class XXHash { } private static long applyFinalHashComputation(long h64) { - h64 ^= h64 >> 33; + //IMPORTANT: using logical right shift instead of arithmetic right shift + h64 ^= h64 >>> 33; h64 *= PRIME64_2; - h64 ^= h64 >> 29; + h64 ^= h64 >>> 29; h64 *= PRIME64_3; - h64 ^= h64 >> 32; + h64 ^= h64 >>> 32; return h64; } - /* 64 bit variations */ - public static long hash64(int val, long seed){ - long h64 = seed + PRIME64_5; - h64 += 4; // add length (4 bytes) to hash value - h64 ^= val * PRIME64_1; - h64 = Long.rotateLeft(h64, 23) * PRIME64_2 + PRIME64_3; - return applyFinalHashComputation(h64); - } - - public static long hash64(long val, long seed){ + public static long hash64Internal(long val, long seed){ long h64 = seed + PRIME64_5; h64 += 8; // add length (8 bytes) to hash value long k1 = val* PRIME64_2; @@ -157,17 +152,22 @@ public final class XXHash { return applyFinalHashComputation(h64); } - public static long hash64(float val, long seed){ - return hash64(Float.floatToIntBits(val), seed); + /** + * @param val the input 64 bit hash value + * @return converted 32 bit hash value + */ + private static int convert64To32(long val) { + return (int) (val & 0x00FFFFFFFF); } + public static long hash64(double val, long seed){ - return hash64(Double.doubleToLongBits(val), seed); + return hash64Internal(Double.doubleToLongBits(val), seed); } - public static long hash64(int start, int end, DrillBuf buffer, long seed){ + public static long hash64(long start, long end, DrillBuf buffer, long seed){ if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { - buffer.checkBytes(start, end); + buffer.checkBytes((int)start, (int)end); } long s = buffer.memoryAddress() + start; @@ -176,38 +176,12 @@ public final class XXHash { return hash64bytes(s, e, seed); } - /* 32 bit variations */ - public static int hash32(int val, long seed){ - return convert64To32(hash64(val, seed)); - } - - public static int hash32(long val, long seed){ - return convert64To32(hash64(val, seed)); - } - - public static int hash32(float val, long seed){ - return convert64To32(hash64(val, seed)); - } - public static int hash32(double val, long seed){ return convert64To32(hash64(val, seed)); } - public static int hash32(int start, int end, DrillBuf buffer, long seed){ + public static int hash32(int start, int end, DrillBuf buffer, int seed){ return convert64To32(hash64(start, end, buffer, seed)); } - /** - * Convert a 64 bit hash value to a 32 bit by taking the XOR of the - * most significant 4 bytes with the least significant 4 bytes. - * @param val the input 64 bit hash value - * @return converted 32 bit hash value - */ - private static int convert64To32(long val) { - - int msb = (int) ((val >>> 32) & 0xFFFFFFFF); - int lsb = (int) (val); - return (msb ^ lsb); - } - } http://git-wip-us.apache.org/repos/asf/drill/blob/c6a03eb1/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java b/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java index 53a86bb..475d08a 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java +++ b/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java @@ -584,22 +584,6 @@ public class TestFunctionsQuery extends BaseTestQuery { } @Test - public void testHashFunctions() throws Exception { - String query = "select " + - "hash(cast(hire_date as date)) hash_date, " + - "hash(cast(employee_id as decimal(9, 2))) as hash_dec9, " + - "hash(cast(employee_id as decimal(38, 11))) as hash_dec38 " + - "from cp.`employee.json` where employee_id = 1 limit 1"; - - testBuilder() - .sqlQuery(query) - .unOrdered() - .baselineColumns("hash_date", "hash_dec9", "hash_dec38") - .baselineValues(312993367, 292570647, 337328302) - .go(); - } - - @Test public void testDecimalAddConstant() throws Exception { String query = "select (cast('-1' as decimal(37, 3)) + cast (employee_id as decimal(37, 3))) as CNT " + "from cp.`employee.json` where employee_id <= 4"; @@ -795,6 +779,7 @@ public class TestFunctionsQuery extends BaseTestQuery { .go(); } + /* * We may apply implicit casts in Hash Join while dealing with different numeric data types * For this to work we need to distribute the data based on a common key, below method @@ -810,7 +795,7 @@ public class TestFunctionsQuery extends BaseTestQuery { "hash64AsDouble(cast(employee_id as decimal(9, 0))) = hash64AsDouble(cast(employee_id as decimal(18, 0))) col5, " + "hash64AsDouble(cast(employee_id as decimal(18, 0))) = hash64AsDouble(cast(employee_id as decimal(28, 0))) col6, " + "hash64AsDouble(cast(employee_id as decimal(28, 0))) = hash64AsDouble(cast(employee_id as decimal(38, 0))) col7 " + - "from cp.`employee.json` where employee_id = 1"; + "from cp.`employee.json` where employee_id = 1"; testBuilder() .sqlQuery(query) @@ -818,11 +803,32 @@ public class TestFunctionsQuery extends BaseTestQuery { .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7") .baselineValues(true, true, true, true, true, true, true) .go(); + + java.util.Random seedGen = new java.util.Random(); + seedGen.setSeed(System.currentTimeMillis()); + int seed = seedGen.nextInt(); + + String querytemplate = "select " + + "hash64AsDouble(cast(employee_id as int), #RAND_SEED#) = hash64AsDouble(cast(employee_id as bigint), #RAND_SEED#) col1, " + + "hash64AsDouble(cast(employee_id as bigint), #RAND_SEED#) = hash64AsDouble(cast(employee_id as float), #RAND_SEED#) col2, " + + "hash64AsDouble(cast(employee_id as float), #RAND_SEED#) = hash64AsDouble(cast(employee_id as double), #RAND_SEED#) col3, " + + "hash64AsDouble(cast(employee_id as double), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) col4, " + + "hash64AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) col5, " + + "hash64AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) col6, " + + "hash64AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(38, 0)), #RAND_SEED#) col7 " + + "from cp.`employee.json` where employee_id = 1"; + + String queryWithSeed = querytemplate.replaceAll("#RAND_SEED#", String.format("%d",seed)); + testBuilder() + .sqlQuery(queryWithSeed) + .unOrdered() + .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7") + .baselineValues(true, true, true, true, true, true, true) + .go(); + } - /* - * hash32 version of the above test - */ + @Test public void testHash32() throws Exception { String query = "select " + @@ -841,6 +847,29 @@ public class TestFunctionsQuery extends BaseTestQuery { .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7") .baselineValues(true, true, true, true, true, true, true) .go(); + + java.util.Random seedGen = new java.util.Random(); + seedGen.setSeed(System.currentTimeMillis()); + int seed = seedGen.nextInt(); + + String querytemplate = "select " + + "hash32AsDouble(cast(employee_id as int), #RAND_SEED#) = hash32AsDouble(cast(employee_id as bigint), #RAND_SEED#) col1, " + + "hash32AsDouble(cast(employee_id as bigint), #RAND_SEED#) = hash32AsDouble(cast(employee_id as float), #RAND_SEED#) col2, " + + "hash32AsDouble(cast(employee_id as float), #RAND_SEED#) = hash32AsDouble(cast(employee_id as double), #RAND_SEED#) col3, " + + "hash32AsDouble(cast(employee_id as double), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) col4, " + + "hash32AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) col5, " + + "hash32AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) col6, " + + "hash32AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(38, 0)), #RAND_SEED#) col7 " + + "from cp.`employee.json` where employee_id = 1"; + + String queryWithSeed = querytemplate.replaceAll("#RAND_SEED#", String.format("%d",seed)); + testBuilder() + .sqlQuery(queryWithSeed) + .unOrdered() + .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7") + .baselineValues(true, true, true, true, true, true, true) + .go(); + } @Test