hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jcama...@apache.org
Subject hive git commit: HIVE-13942: Correctness of CASE folding in the presence of NULL values (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Date Tue, 07 Jun 2016 10:04:15 GMT
Repository: hive
Updated Branches:
  refs/heads/branch-2.1 2b5ae7458 -> e66519933


HIVE-13942: Correctness of CASE folding in the presence of NULL values (Jesus Camacho Rodriguez,
reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e6651993
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e6651993
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e6651993

Branch: refs/heads/branch-2.1
Commit: e66519933856c3b13a9d99f20309ee039a244372
Parents: 2b5ae74
Author: Jesus Camacho Rodriguez <jcamacho@apache.org>
Authored: Tue Jun 7 11:04:29 2016 +0100
Committer: Jesus Camacho Rodriguez <jcamacho@apache.org>
Committed: Tue Jun 7 11:04:29 2016 +0100

----------------------------------------------------------------------
 .../results/positive/hbase_ppd_key_range.q.out  | 34 ++++---
 .../test/results/positive/hbase_pushdown.q.out  |  2 +-
 .../ql/optimizer/calcite/HiveRelBuilder.java    | 94 ++++++++++++++++++++
 .../ql/optimizer/calcite/HiveRelFactories.java  |  3 +-
 .../hive/ql/optimizer/calcite/HiveRexUtil.java  | 50 ++++++++---
 .../test/results/clientpositive/fold_case.q.out | 20 ++---
 .../test/results/clientpositive/fold_when.q.out | 16 ++--
 7 files changed, 176 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/hbase-handler/src/test/results/positive/hbase_ppd_key_range.q.out
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/results/positive/hbase_ppd_key_range.q.out b/hbase-handler/src/test/results/positive/hbase_ppd_key_range.q.out
index f92371d..332c5e6 100644
--- a/hbase-handler/src/test/results/positive/hbase_ppd_key_range.q.out
+++ b/hbase-handler/src/test/results/positive/hbase_ppd_key_range.q.out
@@ -399,22 +399,36 @@ explain select * from hbase_pushdown
 where (case when key<'90' then 2 else 4 end) > 3
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
-  Stage-0 is a root stage
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: hbase_pushdown
+            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+            Filter Operator
+              predicate: (not NVL((key < '90'),false)) (type: boolean)
+              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+              Select Operator
+                expressions: key (type: string), value (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
   Stage: Stage-0
     Fetch Operator
       limit: -1
       Processor Tree:
-        TableScan
-          alias: hbase_pushdown
-          filterExpr: (key >= '90') (type: boolean)
-          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
-          Select Operator
-            expressions: key (type: string), value (type: string)
-            outputColumnNames: _col0, _col1
-            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
-            ListSink
+        ListSink
 
 PREHOOK: query: -- with a predicate which is under an OR, so it should
 -- be ignored by pushdown

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/hbase-handler/src/test/results/positive/hbase_pushdown.q.out
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/results/positive/hbase_pushdown.q.out b/hbase-handler/src/test/results/positive/hbase_pushdown.q.out
index d957a7c..39c03eb 100644
--- a/hbase-handler/src/test/results/positive/hbase_pushdown.q.out
+++ b/hbase-handler/src/test/results/positive/hbase_pushdown.q.out
@@ -297,7 +297,7 @@ STAGE PLANS:
             alias: hbase_pushdown
             Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
             Filter Operator
-              predicate: (key <> 90) (type: boolean)
+              predicate: (not NVL((key = 90),false)) (type: boolean)
               Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
               Select Operator
                 expressions: key (type: int), value (type: string)

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java
new file mode 100644
index 0000000..1c64d64
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite;
+
+import org.apache.calcite.plan.Context;
+import org.apache.calcite.plan.Contexts;
+import org.apache.calcite.plan.RelOptCluster;
+import org.apache.calcite.plan.RelOptSchema;
+import org.apache.calcite.rel.RelCollations;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexUtil;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.server.CalciteServerStatement;
+import org.apache.calcite.tools.FrameworkConfig;
+import org.apache.calcite.tools.Frameworks;
+import org.apache.calcite.tools.RelBuilder;
+import org.apache.calcite.tools.RelBuilderFactory;
+
+
+/**
+ * Builder for relational expressions in Hive.
+ *
+ * <p>{@code RelBuilder} does not make possible anything that you could not
+ * also accomplish by calling the factory methods of the particular relational
+ * expression. But it makes common tasks more straightforward and concise.
+ *
+ * <p>It is not thread-safe.
+ */
+public class HiveRelBuilder extends RelBuilder {
+
+  private HiveRelBuilder(Context context, RelOptCluster cluster, RelOptSchema relOptSchema) {
+    super(context, cluster, relOptSchema);
+  }
+
+  /** Creates a RelBuilder. */
+  public static RelBuilder create(FrameworkConfig config) {
+    final RelOptCluster[] clusters = {null};
+    final RelOptSchema[] relOptSchemas = {null};
+    Frameworks.withPrepare(
+        new Frameworks.PrepareAction<Void>(config) {
+          public Void apply(RelOptCluster cluster, RelOptSchema relOptSchema,
+              SchemaPlus rootSchema, CalciteServerStatement statement) {
+            clusters[0] = cluster;
+            relOptSchemas[0] = relOptSchema;
+            return null;
+          }
+        });
+    return new HiveRelBuilder(config.getContext(), clusters[0], relOptSchemas[0]);
+  }
+
+  /** Creates a {@link RelBuilderFactory}, a partially-created RelBuilder.
+   * Just add a {@link RelOptCluster} and a {@link RelOptSchema} */
+  public static RelBuilderFactory proto(final Context context) {
+    return new RelBuilderFactory() {
+      public RelBuilder create(RelOptCluster cluster, RelOptSchema schema) {
+        return new HiveRelBuilder(context, cluster, schema);
+      }
+    };
+  }
+
+  /** Creates a {@link RelBuilderFactory} that uses a given set of factories. */
+  public static RelBuilderFactory proto(Object... factories) {
+    return proto(Contexts.of(factories));
+  }
+
+  @Override
+  public RelBuilder filter(Iterable<? extends RexNode> predicates) {
+    final RexNode x = HiveRexUtil.simplify(cluster.getRexBuilder(),
+            RexUtil.composeConjunction(cluster.getRexBuilder(), predicates, false));
+    if (!x.isAlwaysTrue()) {
+      final RelNode input = build();
+      final RelNode filter = HiveRelFactories.HIVE_FILTER_FACTORY.createFilter(input, x);
+      return this.push(filter);
+    }
+    return this;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java
index 971b446..cf93ed8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java
@@ -41,7 +41,6 @@ import org.apache.calcite.rel.type.RelDataType;
 import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.rex.RexUtil;
 import org.apache.calcite.sql.SqlKind;
-import org.apache.calcite.tools.RelBuilder;
 import org.apache.calcite.tools.RelBuilderFactory;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
@@ -78,7 +77,7 @@ public class HiveRelFactories {
           new HiveSetOpFactoryImpl();
 
   public static final RelBuilderFactory HIVE_BUILDER =
-      RelBuilder.proto(
+      HiveRelBuilder.proto(
           Contexts.of(HIVE_PROJECT_FACTORY,
               HIVE_FILTER_FACTORY,
               HIVE_JOIN_FACTORY,

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRexUtil.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRexUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRexUtil.java
index 73a67a8..d466378 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRexUtil.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRexUtil.java
@@ -134,25 +134,18 @@ public class HiveRexUtil {
     final List<RexNode> operands = call.getOperands();
     final List<RexNode> newOperands = new ArrayList<>();
     final Set<String> values = new HashSet<>();
-    boolean constainsNullableCase = false;
     for (int i = 0; i < operands.size(); i++) {
       RexNode operand = operands.get(i);
       if (RexUtil.isCasePredicate(call, i)) {
         if (operand.isAlwaysTrue()) {
           // Predicate is always TRUE. Make value the ELSE and quit.
           newOperands.add(operands.get(i + 1));
-          if (operand.getType().isNullable()) {
-            constainsNullableCase = true;
-          }
           break;
         } else if (operand.isAlwaysFalse() || RexUtil.isNull(operand)) {
           // Predicate is always FALSE or NULL. Skip predicate and value.
           ++i;
           continue;
         }
-        if (operand.getType().isNullable()) {
-          constainsNullableCase = true;
-        }
       } else {
         if (unknownAsFalse && RexUtil.isNull(operand)) {
           values.add(rexBuilder.makeLiteral(false).toString());
@@ -167,19 +160,52 @@ public class HiveRexUtil {
       return rexBuilder.makeCast(call.getType(), newOperands.get(newOperands.size() - 1));
     }
   trueFalse:
-    if (call.getType().getSqlTypeName() == SqlTypeName.BOOLEAN &&
-            (!constainsNullableCase || unknownAsFalse)) {
+    if (call.getType().getSqlTypeName() == SqlTypeName.BOOLEAN) {
       // Optimize CASE where every branch returns constant true or constant
-      // false:
+      // false.
+      final List<Pair<RexNode, RexNode>> pairs =
+          casePairs(rexBuilder, newOperands);
+      // 1) Possible simplification if unknown is treated as false:
+      //   CASE
+      //   WHEN p1 THEN TRUE
+      //   WHEN p2 THEN TRUE
+      //   ELSE FALSE
+      //   END
+      // can be rewritten to: (p1 or p2)
+      if (unknownAsFalse) {
+        final List<RexNode> terms = new ArrayList<>();
+        int pos = 0;
+        for (; pos < pairs.size(); pos++) {
+          // True block
+          Pair<RexNode, RexNode> pair = pairs.get(pos);
+          if (!pair.getValue().isAlwaysTrue()) {
+            break;
+          }
+          terms.add(pair.getKey());
+        }
+        for (; pos < pairs.size(); pos++) {
+          // False block
+          Pair<RexNode, RexNode> pair = pairs.get(pos);
+          if (!pair.getValue().isAlwaysFalse() && !RexUtil.isNull(pair.getValue())) {
+            break;
+          }
+        }
+        if (pos == pairs.size()) {
+          return RexUtil.composeDisjunction(rexBuilder, terms, false);
+        }
+      }
+      // 2) Another simplification
       //   CASE
       //   WHEN p1 THEN TRUE
       //   WHEN p2 THEN FALSE
       //   WHEN p3 THEN TRUE
       //   ELSE FALSE
       //   END
-      final List<Pair<RexNode, RexNode>> pairs =
-          casePairs(rexBuilder, newOperands);
+      // if p1...pn cannot be nullable
       for (Ord<Pair<RexNode, RexNode>> pair : Ord.zip(pairs)) {
+        if (pair.e.getKey().getType().isNullable()) {
+          break trueFalse;
+        }
         if (!pair.e.getValue().isAlwaysTrue()
             && !pair.e.getValue().isAlwaysFalse()
             && (!unknownAsFalse || !RexUtil.isNull(pair.e.getValue()))) {

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/ql/src/test/results/clientpositive/fold_case.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/fold_case.q.out b/ql/src/test/results/clientpositive/fold_case.q.out
index f57da79..ec99197 100644
--- a/ql/src/test/results/clientpositive/fold_case.q.out
+++ b/ql/src/test/results/clientpositive/fold_case.q.out
@@ -67,10 +67,10 @@ STAGE PLANS:
             alias: src
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (key <> '238') (type: boolean)
-              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              predicate: (not NVL((key = '238'),false)) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
               Select Operator
-                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                 Group By Operator
                   aggregations: count(1)
                   mode: hash
@@ -370,15 +370,15 @@ STAGE PLANS:
             alias: src
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (key <> '238') (type: boolean)
-              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              predicate: (not NVL((key = '238'),false)) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: CASE WHEN ((key = '238')) THEN (null) ELSE (false) END (type: boolean)
                 outputColumnNames: _col0
-                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -448,10 +448,10 @@ STAGE PLANS:
             alias: src
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (key <> '238') (type: boolean)
-              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              predicate: CASE WHEN ((key = '238')) THEN (null) ELSE (true) END (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
               Select Operator
-                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                 Group By Operator
                   aggregations: count(1)
                   mode: hash

http://git-wip-us.apache.org/repos/asf/hive/blob/e6651993/ql/src/test/results/clientpositive/fold_when.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/fold_when.q.out b/ql/src/test/results/clientpositive/fold_when.q.out
index 4f3eb14..d56de7f 100644
--- a/ql/src/test/results/clientpositive/fold_when.q.out
+++ b/ql/src/test/results/clientpositive/fold_when.q.out
@@ -328,15 +328,15 @@ STAGE PLANS:
             alias: src
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (key <> '238') (type: boolean)
-              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              predicate: (not NVL((key = '238'),false)) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: key (type: string)
                 outputColumnNames: _col0
-                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -435,15 +435,15 @@ STAGE PLANS:
             alias: src
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (key <> '11') (type: boolean)
-              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              predicate: (not NVL((key = '11'),false)) (type: boolean)
+              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: key (type: string)
                 outputColumnNames: _col0
-                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat


Mime
View raw message