hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ser...@apache.org
Subject [2/2] hive git commit: HIVE-13957 : vectorized IN is inconsistent with non-vectorized (at least for decimal in (string)) (Sergey Shelukhin, reviewed by Matt McCline)
Date Tue, 14 Jun 2016 01:55:00 GMT
HIVE-13957 : vectorized IN is inconsistent with non-vectorized (at least for decimal in (string))
(Sergey Shelukhin, reviewed by Matt McCline)

Conflicts:
	ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
	ql/src/test/results/clientpositive/spark/vector_between_in.q.out
	ql/src/test/results/clientpositive/tez/vector_between_in.q.out
	ql/src/test/results/clientpositive/vector_between_in.q.out


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/78bedc8e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/78bedc8e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/78bedc8e

Branch: refs/heads/branch-1
Commit: 78bedc8e212dddae70d8635a7cdaace7275923b3
Parents: 293e22e
Author: Sergey Shelukhin <sershe@apache.org>
Authored: Mon Jun 13 18:32:12 2016 -0700
Committer: Sergey Shelukhin <sershe@apache.org>
Committed: Mon Jun 13 18:48:35 2016 -0700

----------------------------------------------------------------------
 .../ql/exec/vector/VectorizationContext.java    |  30 ++++--
 .../hive/ql/udf/generic/GenericUDFUtils.java    |  52 +++++++--
 .../clientpositive/vector_string_decimal.q      |  21 ++++
 .../spark/vector_between_in.q.out               |   2 -
 .../clientpositive/tez/vector_between_in.q.out  |   2 -
 .../clientpositive/vector_between_in.q.out      |   2 -
 .../clientpositive/vector_string_decimal.q.out  | 106 +++++++++++++++++++
 .../hive/serde2/typeinfo/HiveDecimalUtils.java  |   4 +-
 8 files changed, 192 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 9caa771..3aa182a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -500,8 +500,8 @@ public class VectorizationContext {
    * Given a udf and its children, return the common type to which the children's type should be
    * cast.
    */
-  private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf, List<ExprNodeDesc> children,
-      TypeInfo returnType) {
+  private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf,
+      List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException {
     TypeInfo commonType;
     if (genericUdf instanceof GenericUDFBaseCompare) {
 
@@ -513,9 +513,20 @@ public class VectorizationContext {
         commonType = returnType;
       }
     } else if (genericUdf instanceof GenericUDFIn) {
-
-      // Cast to the type of the first child
-      return children.get(0).getTypeInfo();
+      TypeInfo colTi = children.get(0).getTypeInfo();
+      if (colTi.getCategory() != Category.PRIMITIVE) {
+        return colTi; // Handled later, only struct will be supported.
+      }
+      TypeInfo opTi = GenericUDFUtils.deriveInType(children);
+      if (opTi == null || opTi.getCategory() != Category.PRIMITIVE) {
+        throw new HiveException("Cannot vectorize IN() - common type is " + opTi);
+      }
+      if (((PrimitiveTypeInfo)colTi).getPrimitiveCategory() !=
+          ((PrimitiveTypeInfo)opTi).getPrimitiveCategory()) {
+        throw new HiveException("Cannot vectorize IN() - casting a column is not supported. "
+            + "Column type is " + colTi + " but the common type is " + opTi);
+      }
+      return colTi;
     } else {
       // The children type should be converted to return type
       commonType = returnType;
@@ -612,6 +623,7 @@ public class VectorizationContext {
     }
     PrimitiveTypeInfo ptinfo = (PrimitiveTypeInfo) inputTypeInfo;
     int precision = getPrecisionForType(ptinfo);
+    // TODO: precision and scale would be practically invalid for string conversion (38,38)
     int scale = HiveDecimalUtils.getScaleForType(ptinfo);
     return new DecimalTypeInfo(precision, scale);
   }
@@ -1444,8 +1456,8 @@ public class VectorizationContext {
   /**
    * Create a filter or boolean-valued expression for column IN ( <list-of-constants> )
    */
-  private VectorExpression getInExpression(List<ExprNodeDesc> childExpr, Mode mode, TypeInfo returnType)
-      throws HiveException {
+  private VectorExpression getInExpression(List<ExprNodeDesc> childExpr,
+      VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
     ExprNodeDesc colExpr = childExpr.get(0);
     List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size());
 
@@ -1453,7 +1465,7 @@ public class VectorizationContext {
     colType = VectorizationContext.mapTypeNameSynonyms(colType);
     TypeInfo colTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colType);
     Category category = colTypeInfo.getCategory();
-    if (category == Category.STRUCT){
+    if (category == Category.STRUCT) {
       return getStructInExpression(childExpr, colExpr, colTypeInfo, inChildren, mode, returnType);
     } else if (category != Category.PRIMITIVE) {
       return null;
@@ -1481,6 +1493,8 @@ public class VectorizationContext {
 
     // determine class
     Class<?> cl = null;
+    // TODO: the below assumes that all the arguments to IN are of the same type;
+    //       non-vectorized validates that explicitly during UDF init.
     if (isIntFamily(colType)) {
       cl = (mode == Mode.FILTER ? FilterLongColumnInList.class : LongColumnInList.class);
       long[] inVals = new long[childrenForInList.size()];

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
index 222e0e0..13a765c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
@@ -23,14 +23,17 @@ import java.lang.reflect.Method;
 import java.lang.reflect.ParameterizedType;
 import java.lang.reflect.Type;
 import java.util.HashMap;
+import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
 import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.IdentityConverter;
@@ -169,17 +172,7 @@ public final class GenericUDFUtils {
         return false;
       }
 
-      /**
-       * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted
-       * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects
-       * that.
-       */
-      if (commonTypeInfo instanceof DecimalTypeInfo) {
-        if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) oiTypeInfo)) ||
-            (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) rTypeInfo))) {
-          commonTypeInfo = TypeInfoFactory.doubleTypeInfo;
-        }
-      }
+      commonTypeInfo = updateCommonTypeForDecimal(commonTypeInfo, oiTypeInfo, rTypeInfo);
 
       returnObjectInspector = TypeInfoUtils
           .getStandardWritableObjectInspectorFromTypeInfo(commonTypeInfo);
@@ -240,6 +233,43 @@ public final class GenericUDFUtils {
 
   }
 
+  protected static TypeInfo updateCommonTypeForDecimal(
+      TypeInfo commonTypeInfo, TypeInfo ti, TypeInfo returnType) {
+    /**
+     * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted
+     * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects
+     * that.
+     */
+    if (commonTypeInfo instanceof DecimalTypeInfo) {
+      if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)ti)) ||
+          (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)returnType))) {
+        return TypeInfoFactory.doubleTypeInfo;
+      }
+    }
+    return commonTypeInfo;
+  }
+
+  // Based on update() above.
+  public static TypeInfo deriveInType(List<ExprNodeDesc> children) {
+    TypeInfo returnType = null;
+    for (ExprNodeDesc node : children) {
+      TypeInfo ti = node.getTypeInfo();
+      if (ti.getCategory() == Category.PRIMITIVE
+        && ((PrimitiveTypeInfo)ti).getPrimitiveCategory() == PrimitiveCategory.VOID) {
+        continue;
+      }
+      if (returnType == null) {
+        returnType = ti;
+        continue;
+      }
+      if (returnType == ti) continue;
+      TypeInfo commonTypeInfo = FunctionRegistry.getCommonClass(returnType, ti);
+      if (commonTypeInfo == null) return null;
+      returnType = updateCommonTypeForDecimal(commonTypeInfo, ti, returnType);
+    }
+    return returnType;
+  }
+
   /**
    * Convert parameters for the method if needed.
    */

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/test/queries/clientpositive/vector_string_decimal.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_string_decimal.q b/ql/src/test/queries/clientpositive/vector_string_decimal.q
new file mode 100644
index 0000000..e69cd77
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_string_decimal.q
@@ -0,0 +1,21 @@
+set hive.vectorized.execution.enabled=false;
+set hive.fetch.task.conversion=none;
+
+drop table orc_decimal;
+drop table staging;
+create table orc_decimal (id decimal(18,0)) stored as orc;
+
+create table staging (id decimal(18,0));
+
+insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0);
+
+insert overwrite table orc_decimal select id from staging;
+
+set hive.vectorized.execution.enabled=true;
+
+explain
+select * from orc_decimal where id in ('100000000', '200000000');
+select * from orc_decimal where id in ('100000000', '200000000');
+
+drop table orc_decimal;
+drop table staging;

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
index 75520f4..31dcb5f 100644
--- a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
@@ -149,7 +149,6 @@ STAGE PLANS:
                         key expressions: _col0 (type: decimal(20,10))
                         sort order: +
                         Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-            Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
               Select Operator
@@ -205,7 +204,6 @@ STAGE PLANS:
                           sort order: 
                           Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                           value expressions: _col0 (type: bigint)
-            Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
index 61e215e..e016682 100644
--- a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
@@ -149,7 +149,6 @@ STAGE PLANS:
                         key expressions: _col0 (type: decimal(20,10))
                         sort order: +
                         Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-            Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
               Select Operator
@@ -205,7 +204,6 @@ STAGE PLANS:
                           sort order: 
                           Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                           value expressions: _col0 (type: bigint)
-            Execution mode: vectorized
         Reducer 2 
             Reduce Operator Tree:
               Group By Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/test/results/clientpositive/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_between_in.q.out b/ql/src/test/results/clientpositive/vector_between_in.q.out
index a9b9a4b..fdb0756 100644
--- a/ql/src/test/results/clientpositive/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/vector_between_in.q.out
@@ -130,7 +130,6 @@ STAGE PLANS:
                   key expressions: _col0 (type: decimal(20,10))
                   sort order: +
                   Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-      Execution mode: vectorized
       Reduce Operator Tree:
         Select Operator
           expressions: KEY.reducesinkkey0 (type: decimal(20,10))
@@ -179,7 +178,6 @@ STAGE PLANS:
                     sort order: 
                     Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                     value expressions: _col0 (type: bigint)
-      Execution mode: vectorized
       Reduce Operator Tree:
         Group By Operator
           aggregations: count(VALUE._col0)

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/ql/src/test/results/clientpositive/vector_string_decimal.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_string_decimal.q.out b/ql/src/test/results/clientpositive/vector_string_decimal.q.out
new file mode 100644
index 0000000..e0a3563
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vector_string_decimal.q.out
@@ -0,0 +1,106 @@
+PREHOOK: query: drop table orc_decimal
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table orc_decimal
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_decimal
+PREHOOK: query: create table staging (id decimal(18,0))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@staging
+POSTHOOK: query: create table staging (id decimal(18,0))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@staging
+PREHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@staging
+POSTHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@staging
+POSTHOOK: Lineage: staging.id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert overwrite table orc_decimal select id from staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@staging
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: insert overwrite table orc_decimal select id from staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@staging
+POSTHOOK: Output: default@orc_decimal
+POSTHOOK: Lineage: orc_decimal.id SIMPLE [(staging)staging.FieldSchema(name:id, type:decimal(18,0), comment:null), ]
+PREHOOK: query: explain
+select * from orc_decimal where id in ('100000000', '200000000')
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from orc_decimal where id in ('100000000', '200000000')
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: orc_decimal
+            Statistics: Num rows: 4 Data size: 448 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (id) IN ('100000000', '200000000') (type: boolean)
+              Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: id (type: decimal(18,0))
+                outputColumnNames: _col0
+                Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from orc_decimal where id in ('100000000', '200000000')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_decimal
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_decimal where id in ('100000000', '200000000')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_decimal
+#### A masked pattern was here ####
+100000000
+200000000
+PREHOOK: query: drop table orc_decimal
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@orc_decimal
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: drop table orc_decimal
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@orc_decimal
+POSTHOOK: Output: default@orc_decimal
+PREHOOK: query: drop table staging
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@staging
+PREHOOK: Output: default@staging
+POSTHOOK: query: drop table staging
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@staging
+POSTHOOK: Output: default@staging

http://git-wip-us.apache.org/repos/asf/hive/blob/78bedc8e/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
index aa9e37a..00eedb6 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
@@ -108,7 +108,7 @@ public class HiveDecimalUtils {
     case VOID:
       return 1;
     default:
-      return HiveDecimal.MAX_PRECISION;
+      return HiveDecimal.SYSTEM_DEFAULT_PRECISION;
     }
   }
 
@@ -131,7 +131,7 @@ public class HiveDecimalUtils {
     case VOID:
       return 0;
     default:
-      return HiveDecimal.MAX_SCALE;
+      return HiveDecimal.SYSTEM_DEFAULT_SCALE;
     }
   }
 


Mime
View raw message