drill-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sudhe...@apache.org
Subject [1/3] drill git commit: DRILL-4237, DRILL-4478: Implement hash to use murmur3 and add correspondent unit tests
Date Wed, 20 Apr 2016 05:32:33 GMT
Repository: drill
Updated Branches:
  refs/heads/master 852b01aa6 -> c6a03eb17


http://git-wip-us.apache.org/repos/asf/drill/blob/c6a03eb1/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java
index 0124b2f..72ab492 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/XXHash.java
@@ -24,14 +24,15 @@ import org.apache.drill.exec.memory.BoundsChecking;
 
 import com.google.common.primitives.UnsignedLongs;
 
-public final class XXHash {
+public final class XXHash extends DrillHash{
   static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(XXHash.class);
 
-  static final long PRIME64_1 = UnsignedLongs.decode("11400714785074694791");
-  static final long PRIME64_2 = UnsignedLongs.decode("14029467366897019727");
-  static final long PRIME64_3 = UnsignedLongs.decode("1609587929392839161");
-  static final long PRIME64_4 = UnsignedLongs.decode("9650029242287828579");
-  static final long PRIME64_5 = UnsignedLongs.decode("2870177450012600261");
+  //UnsignedLongs.decode won't give right output(keep the value in 8 bytes unchanged).
+  static final long PRIME64_1 = 0x9e3779b185ebca87L;//UnsignedLongs.decode("11400714785074694791");
+  static final long PRIME64_2 = 0xc2b2ae3d27d4eb4fL;//UnsignedLongs.decode("14029467366897019727");
+  static final long PRIME64_3 = 0x165667b19e3779f9L;//UnsignedLongs.decode("1609587929392839161");
+  static final long PRIME64_4 = 0x85ebca77c2b2ae63L;//UnsignedLongs.decode("9650029242287828579");
+  static final long PRIME64_5 = 0x27d4eb2f165667c5L;//UnsignedLongs.decode("2870177450012600261");
 
   private static long hash64bytes(long start, long bEnd, long seed) {
     long len = bEnd - start;
@@ -114,12 +115,14 @@ public final class XXHash {
     }
 
     if (p + 4 <= bEnd) {
-      h64 ^= PlatformDependent.getInt(p) * PRIME64_1;
+      //IMPORTANT: we are expecting a long from these 4 bytes. Which means it is always positive
+      long finalInt = getIntLittleEndian(p);
+      h64 ^= finalInt * PRIME64_1;
       h64 = Long.rotateLeft(h64, 23) * PRIME64_2 + PRIME64_3;
       p += 4;
     }
     while (p < bEnd) {
-      h64 ^= PlatformDependent.getByte(p) * PRIME64_5;
+      h64 ^= ((long)(PlatformDependent.getByte(p) & 0x00ff)) * PRIME64_5;
       h64 = Long.rotateLeft(h64, 11) * PRIME64_1;
       p++;
     }
@@ -128,25 +131,17 @@ public final class XXHash {
   }
 
   private static long applyFinalHashComputation(long h64) {
-    h64 ^= h64 >> 33;
+    //IMPORTANT: using logical right shift instead of arithmetic right shift
+    h64 ^= h64 >>> 33;
     h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
+    h64 ^= h64 >>> 29;
     h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
+    h64 ^= h64 >>> 32;
     return h64;
   }
 
 
-  /* 64 bit variations */
-  public static long hash64(int val, long seed){
-    long h64 = seed + PRIME64_5;
-    h64 += 4; // add length (4 bytes) to hash value
-    h64 ^= val * PRIME64_1;
-    h64 = Long.rotateLeft(h64, 23) * PRIME64_2 + PRIME64_3;
-    return applyFinalHashComputation(h64);
-  }
-
-  public static long hash64(long val, long seed){
+  public static long hash64Internal(long val, long seed){
     long h64 = seed + PRIME64_5;
     h64 += 8; // add length (8 bytes) to hash value
     long k1 = val* PRIME64_2;
@@ -157,17 +152,22 @@ public final class XXHash {
     return applyFinalHashComputation(h64);
   }
 
-  public static long hash64(float val, long seed){
-    return hash64(Float.floatToIntBits(val), seed);
+  /**
+   * @param val the input 64 bit hash value
+   * @return converted 32 bit hash value
+   */
+  private static int convert64To32(long val) {
+    return (int) (val & 0x00FFFFFFFF);
   }
 
+
   public static long hash64(double val, long seed){
-    return hash64(Double.doubleToLongBits(val), seed);
+    return hash64Internal(Double.doubleToLongBits(val), seed);
   }
 
-  public static long hash64(int start, int end, DrillBuf buffer, long seed){
+  public static long hash64(long start, long end, DrillBuf buffer, long seed){
     if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
-      buffer.checkBytes(start, end);
+      buffer.checkBytes((int)start, (int)end);
     }
 
     long s = buffer.memoryAddress() + start;
@@ -176,38 +176,12 @@ public final class XXHash {
     return hash64bytes(s, e, seed);
   }
 
-  /* 32 bit variations */
-  public static int hash32(int val, long seed){
-    return convert64To32(hash64(val, seed));
-  }
-
-  public static int hash32(long val, long seed){
-    return convert64To32(hash64(val, seed));
-  }
-
-  public static int hash32(float val, long seed){
-    return convert64To32(hash64(val, seed));
-  }
-
   public static int hash32(double val, long seed){
     return convert64To32(hash64(val, seed));
   }
 
-  public static int hash32(int start, int end, DrillBuf buffer, long seed){
+  public static int hash32(int start, int end, DrillBuf buffer, int seed){
     return convert64To32(hash64(start, end, buffer, seed));
   }
 
-  /**
-   * Convert a 64 bit hash value to a 32 bit by taking the XOR of the
-   * most significant 4 bytes with the least significant 4 bytes.
-   * @param val the input 64 bit hash value
-   * @return converted 32 bit hash value
-   */
-  private static int convert64To32(long val) {
-
-    int msb = (int) ((val >>> 32) & 0xFFFFFFFF);
-    int lsb = (int) (val);
-    return (msb ^ lsb);
-  }
-
 }

http://git-wip-us.apache.org/repos/asf/drill/blob/c6a03eb1/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java b/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java
index 53a86bb..475d08a 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/TestFunctionsQuery.java
@@ -584,22 +584,6 @@ public class TestFunctionsQuery extends BaseTestQuery {
   }
 
   @Test
-  public void testHashFunctions() throws Exception {
-    String query = "select " +
-        "hash(cast(hire_date as date)) hash_date, " +
-        "hash(cast(employee_id as decimal(9, 2))) as hash_dec9, " +
-        "hash(cast(employee_id as decimal(38, 11))) as hash_dec38 " +
-        "from cp.`employee.json` where employee_id = 1 limit 1";
-
-    testBuilder()
-        .sqlQuery(query)
-        .unOrdered()
-        .baselineColumns("hash_date", "hash_dec9", "hash_dec38")
-        .baselineValues(312993367, 292570647, 337328302)
-        .go();
-  }
-
-  @Test
   public void testDecimalAddConstant() throws Exception {
     String query = "select (cast('-1' as decimal(37, 3)) + cast (employee_id as decimal(37, 3))) as CNT " +
         "from cp.`employee.json` where employee_id <= 4";
@@ -795,6 +779,7 @@ public class TestFunctionsQuery extends BaseTestQuery {
         .go();
   }
 
+
   /*
    * We may apply implicit casts in Hash Join while dealing with different numeric data types
    * For this to work we need to distribute the data based on a common key, below method
@@ -810,7 +795,7 @@ public class TestFunctionsQuery extends BaseTestQuery {
         "hash64AsDouble(cast(employee_id as decimal(9, 0))) = hash64AsDouble(cast(employee_id as decimal(18, 0))) col5, " +
         "hash64AsDouble(cast(employee_id as decimal(18, 0))) = hash64AsDouble(cast(employee_id as decimal(28, 0))) col6, " +
         "hash64AsDouble(cast(employee_id as decimal(28, 0))) = hash64AsDouble(cast(employee_id as decimal(38, 0))) col7 " +
-        "from cp.`employee.json` where employee_id = 1";
+        "from cp.`employee.json`  where employee_id = 1";
 
     testBuilder()
         .sqlQuery(query)
@@ -818,11 +803,32 @@ public class TestFunctionsQuery extends BaseTestQuery {
         .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7")
         .baselineValues(true, true, true, true, true, true, true)
         .go();
+
+    java.util.Random seedGen = new java.util.Random();
+    seedGen.setSeed(System.currentTimeMillis());
+    int seed = seedGen.nextInt();
+
+    String querytemplate = "select " +
+            "hash64AsDouble(cast(employee_id as int), #RAND_SEED#) = hash64AsDouble(cast(employee_id as bigint), #RAND_SEED#) col1, " +
+            "hash64AsDouble(cast(employee_id as bigint), #RAND_SEED#) = hash64AsDouble(cast(employee_id as float), #RAND_SEED#) col2, " +
+            "hash64AsDouble(cast(employee_id as float), #RAND_SEED#) = hash64AsDouble(cast(employee_id as double), #RAND_SEED#) col3, " +
+            "hash64AsDouble(cast(employee_id as double), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) col4, " +
+            "hash64AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) col5, " +
+            "hash64AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) col6, " +
+            "hash64AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) = hash64AsDouble(cast(employee_id as decimal(38, 0)), #RAND_SEED#) col7 " +
+            "from cp.`employee.json` where employee_id = 1";
+
+    String queryWithSeed = querytemplate.replaceAll("#RAND_SEED#", String.format("%d",seed));
+    testBuilder()
+            .sqlQuery(queryWithSeed)
+            .unOrdered()
+            .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7")
+            .baselineValues(true, true, true, true, true, true, true)
+            .go();
+
   }
 
-  /*
-   * hash32 version of the above test
-   */
+
   @Test
   public void testHash32() throws Exception {
     String query = "select " +
@@ -841,6 +847,29 @@ public class TestFunctionsQuery extends BaseTestQuery {
         .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7")
         .baselineValues(true, true, true, true, true, true, true)
         .go();
+
+    java.util.Random seedGen = new java.util.Random();
+    seedGen.setSeed(System.currentTimeMillis());
+    int seed = seedGen.nextInt();
+
+    String querytemplate = "select " +
+            "hash32AsDouble(cast(employee_id as int), #RAND_SEED#) = hash32AsDouble(cast(employee_id as bigint), #RAND_SEED#) col1, " +
+            "hash32AsDouble(cast(employee_id as bigint), #RAND_SEED#) = hash32AsDouble(cast(employee_id as float), #RAND_SEED#) col2, " +
+            "hash32AsDouble(cast(employee_id as float),  #RAND_SEED#) = hash32AsDouble(cast(employee_id as double), #RAND_SEED#) col3, " +
+            "hash32AsDouble(cast(employee_id as double), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) col4, " +
+            "hash32AsDouble(cast(employee_id as decimal(9, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) col5, " +
+            "hash32AsDouble(cast(employee_id as decimal(18, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) col6, " +
+            "hash32AsDouble(cast(employee_id as decimal(28, 0)), #RAND_SEED#) = hash32AsDouble(cast(employee_id as decimal(38, 0)), #RAND_SEED#) col7 " +
+            "from cp.`employee.json` where employee_id = 1";
+
+    String queryWithSeed = querytemplate.replaceAll("#RAND_SEED#", String.format("%d",seed));
+    testBuilder()
+            .sqlQuery(queryWithSeed)
+            .unOrdered()
+            .baselineColumns("col1", "col2", "col3", "col4", "col5", "col6", "col7")
+            .baselineValues(true, true, true, true, true, true, true)
+            .go();
+
   }
 
   @Test


Mime
View raw message