hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pxi...@apache.org
Subject [2/2] hive git commit: HIVE-10901: Optimize multi column distinct queries (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Date Thu, 17 Nov 2016 18:48:26 GMT
HIVE-10901: Optimize multi column distinct queries (Pengcheng Xiong, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/67c022f0
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/67c022f0
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/67c022f0

Branch: refs/heads/master
Commit: 67c022f05f5405abd996446fefc90e7ae34c960d
Parents: 0025b9d
Author: Pengcheng Xiong <pxiong@apache.org>
Authored: Thu Nov 17 10:48:06 2016 -0800
Committer: Pengcheng Xiong <pxiong@apache.org>
Committed: Thu Nov 17 10:48:06 2016 -0800

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../rules/HiveAggregateProjectMergeRule.java    |  14 ++
 .../rules/HiveExpandDistinctAggregatesRule.java | 206 ++++++++++++++++++-
 .../clientpositive/multi_count_distinct.q       |  38 ++++
 .../llap/vector_grouping_sets.q.out             |   8 +-
 .../results/clientpositive/perf/query70.q.out   |  52 ++---
 .../tez/multi_count_distinct.q.out              | 193 +++++++++++++++++
 .../clientpositive/vector_grouping_sets.q.out   |   8 +-
 8 files changed, 483 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f30152b..e4910e4 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -51,6 +51,7 @@ minitez.query.files=explainuser_3.q,\
   explainanalyze_5.q,\
   hybridgrace_hashjoin_1.q,\
   hybridgrace_hashjoin_2.q,\
+  multi_count_distinct.q,\
   tez_union_with_udf.q
 
 

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
index c243266..7c3b4b0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
@@ -31,6 +31,7 @@ import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
 
 import com.google.common.collect.ImmutableList;
@@ -59,6 +60,19 @@ public class HiveAggregateProjectMergeRule extends RelOptRule {
   }
 
   @Override
+  public boolean matches(RelOptRuleCall call) {
+    final Aggregate aggregate = call.rel(0);
+    // Rule cannot be applied if there are GroupingId because it will change the
+    // value as the position will be changed.
+    for (AggregateCall aggCall : aggregate.getAggCallList()) {
+      if (aggCall.getAggregation().equals(HiveGroupingID.INSTANCE)) {
+        return false;
+      }
+    }
+    return super.matches(call);
+  }
+
+  @Override
   public void onMatch(RelOptRuleCall call) {
     final HiveAggregate aggregate = call.rel(0);
     final HiveProject project = call.rel(1);

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
index 7d4411a..417eda3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
@@ -16,13 +16,17 @@
  */
 package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 
+import java.math.BigDecimal;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.calcite.plan.RelOptCluster;
 import org.apache.calcite.plan.RelOptRule;
 import org.apache.calcite.plan.RelOptRuleCall;
 import org.apache.calcite.rel.RelNode;
@@ -31,16 +35,30 @@ import org.apache.calcite.rel.core.AggregateCall;
 import org.apache.calcite.rel.core.RelFactories;
 import org.apache.calcite.rel.metadata.RelColumnOrigin;
 import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rel.type.RelDataType;
 import org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.calcite.rex.RexBuilder;
 import org.apache.calcite.rex.RexInputRef;
 import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.SqlTypeName;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.calcite.util.Pair;
 import org.apache.calcite.util.Util;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Function;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 
@@ -73,6 +91,8 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
           HiveRelFactories.HIVE_PROJECT_FACTORY);
 
   private static RelFactories.ProjectFactory projFactory;
+  
+  protected static final Logger LOG = LoggerFactory.getLogger(HiveExpandDistinctAggregatesRule.class);
 
   //~ Constructors -----------------------------------------------------------
 
@@ -82,19 +102,25 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule
{
     projFactory = projectFactory;
   }
 
+  RelOptCluster cluster = null;
+  RexBuilder rexBuilder = null;
+
   //~ Methods ----------------------------------------------------------------
 
   @Override
   public void onMatch(RelOptRuleCall call) {
     final Aggregate aggregate = call.rel(0);
-    if (!aggregate.containsDistinctCall()) {
+    int numCountDistinct = getNumCountDistinctCall(aggregate);
+    if (numCountDistinct == 0) {
       return;
     }
 
-    // Find all of the agg expressions. We use a LinkedHashSet to ensure
-    // determinism.
+    // Find all of the agg expressions. We use a List (for all count(distinct))
+    // as well as a Set (for all others) to ensure determinism.
     int nonDistinctCount = 0;
+    List<List<Integer>> argListList = new ArrayList<List<Integer>>();
     Set<List<Integer>> argListSets = new LinkedHashSet<List<Integer>>();
+    Set<Integer> positions = new HashSet<>();
     for (AggregateCall aggCall : aggregate.getAggCallList()) {
       if (!aggCall.isDistinct()) {
         ++nonDistinctCount;
@@ -103,11 +129,35 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule
{
       ArrayList<Integer> argList = new ArrayList<Integer>();
       for (Integer arg : aggCall.getArgList()) {
         argList.add(arg);
+        positions.add(arg);
       }
+      // Aggr checks for sorted argList.
+      argListList.add(argList);
       argListSets.add(argList);
     }
     Util.permAssert(argListSets.size() > 0, "containsDistinctCall lied");
 
+    if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size()
+        && aggregate.getGroupSet().isEmpty()) {
+      // now positions contains all the distinct positions, i.e., $5, $4, $6
+      // we need to first sort them as group by set
+      // and then get their position later, i.e., $4->1, $5->2, $6->3
+      cluster = aggregate.getCluster();
+      rexBuilder = cluster.getRexBuilder();
+      RelNode converted = null;
+      List<Integer> sourceOfForCountDistinct = new ArrayList<>();
+      sourceOfForCountDistinct.addAll(positions);
+      Collections.sort(sourceOfForCountDistinct);
+      try {
+        converted = convert(aggregate, argListList, sourceOfForCountDistinct);
+      } catch (CalciteSemanticException e) {
+        LOG.debug(e.toString());
+        throw new RuntimeException(e);
+      }
+      call.transformTo(converted);
+      return;
+    }
+
     // If all of the agg expressions are distinct and have the same
     // arguments then we can use a more efficient form.
     if ((nonDistinctCount == 0) && (argListSets.size() == 1)) {
@@ -133,6 +183,156 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule
{
   }
 
   /**
+   * Converts an aggregate relational expression that contains only
+   * count(distinct) to grouping sets with count. For example select
+   * count(distinct department_id), count(distinct gender), count(distinct
+   * education_level) from employee; can be transformed to select count(case i
+   * when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null
+   * end) as c1, count(case i when 4 then 1 else null end) as c2 from (select
+   * grouping__id as i, department_id, gender, education_level from employee
+   * group by department_id, gender, education_level grouping sets
+   * (department_id, gender, education_level))subq;
+   * @throws CalciteSemanticException 
+   */
+  private RelNode convert(Aggregate aggregate, List<List<Integer>> argList, List<Integer>
sourceOfForCountDistinct) throws CalciteSemanticException {
+    // we use this map to map the position of argList to the position of grouping set
+    Map<Integer, Integer> map = new HashMap<>();
+    List<List<Integer>> cleanArgList = new ArrayList<>();
+    final Aggregate groupingSets = createGroupingSets(aggregate, argList, cleanArgList, map,
sourceOfForCountDistinct);
+    return createCount(groupingSets, argList, cleanArgList, map, sourceOfForCountDistinct);
+  }
+
+  private long getGroupingIdValue(List<Integer> list, List<Integer> sourceOfForCountDistinct)
{
+    long ind = 0;
+    for (int i : list) {
+      ind |= 1 << sourceOfForCountDistinct.indexOf(i);
+    }
+    return ind;
+  }
+
+  /**
+   * @param aggr: the original aggregate
+   * @param argList: the original argList in aggregate
+   * @param cleanArgList: the new argList without duplicates
+   * @param map: the mapping from the original argList to the new argList
+   * @param sourceOfForCountDistinct: the sorted positions of groupset
+   * @return
+   * @throws CalciteSemanticException
+   */
+  private RelNode createCount(Aggregate aggr, List<List<Integer>> argList,
+      List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+      List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+    List<RexNode> originalInputRefs = Lists.transform(aggr.getRowType().getFieldList(),
+        new Function<RelDataTypeField, RexNode>() {
+          @Override
+          public RexNode apply(RelDataTypeField input) {
+            return new RexInputRef(input.getIndex(), input.getType());
+          }
+        });
+    final List<RexNode> gbChildProjLst = Lists.newArrayList();
+    for (List<Integer> list : cleanArgList) {
+      RexNode equal = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS,
+          originalInputRefs.get(originalInputRefs.size() - 1),
+          rexBuilder.makeExactLiteral(new BigDecimal(getGroupingIdValue(list, sourceOfForCountDistinct))));
+      RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.CASE, equal,
+          rexBuilder.makeExactLiteral(BigDecimal.ONE), rexBuilder.constantNull());
+      gbChildProjLst.add(condition);
+    }
+
+    // create the project before GB
+    RelNode gbInputRel = HiveProject.create(aggr, gbChildProjLst, null);
+
+    // create the aggregate
+    List<AggregateCall> aggregateCalls = Lists.newArrayList();
+    RelDataType aggFnRetType = TypeConverter.convert(TypeInfoFactory.longTypeInfo,
+        cluster.getTypeFactory());
+    for (int i = 0; i < cleanArgList.size(); i++) {
+      AggregateCall aggregateCall = HiveCalciteUtil.createSingleArgAggCall("count", cluster,
+          TypeInfoFactory.longTypeInfo, i, aggFnRetType);
+      aggregateCalls.add(aggregateCall);
+    }
+    Aggregate aggregate = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
gbInputRel,
+        false, ImmutableBitSet.of(), null, aggregateCalls);
+
+    // create the project after GB. For those repeated values, e.g., select
+    // count(distinct x, y), count(distinct y, x), we find the correct mapping.
+    if (map.isEmpty()) {
+      return aggregate;
+    } else {
+      List<RexNode> originalAggrRefs = Lists.transform(aggregate.getRowType().getFieldList(),
+          new Function<RelDataTypeField, RexNode>() {
+            @Override
+            public RexNode apply(RelDataTypeField input) {
+              return new RexInputRef(input.getIndex(), input.getType());
+            }
+          });
+      final List<RexNode> projLst = Lists.newArrayList();
+      int index = 0;
+      for (int i = 0; i < argList.size(); i++) {
+        if (map.containsKey(i)) {
+          projLst.add(originalAggrRefs.get(map.get(i)));
+        } else {
+          projLst.add(originalAggrRefs.get(index++));
+        }
+      }
+      return HiveProject.create(aggregate, projLst, null);
+    }
+  }
+
+  /**
+   * @param aggregate: the original aggregate
+   * @param argList: the original argList in aggregate
+   * @param cleanArgList: the new argList without duplicates
+   * @param map: the mapping from the original argList to the new argList
+   * @param sourceOfForCountDistinct: the sorted positions of groupset
+   * @return
+   */
+  private Aggregate createGroupingSets(Aggregate aggregate, List<List<Integer>>
argList,
+      List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+      List<Integer> sourceOfForCountDistinct) {
+    final ImmutableBitSet groupSet = ImmutableBitSet.of(sourceOfForCountDistinct);
+    final List<ImmutableBitSet> origGroupSets = new ArrayList<>();
+
+    for (int i = 0; i < argList.size(); i++) {
+      List<Integer> list = argList.get(i);
+      ImmutableBitSet bitSet = ImmutableBitSet.of(list);
+      int prev = origGroupSets.indexOf(bitSet);
+      if (prev == -1) {
+        origGroupSets.add(bitSet);
+        cleanArgList.add(list);
+      } else {
+        map.put(i, prev);
+      }
+    }
+    // Calcite expects the grouping sets sorted and without duplicates
+    Collections.sort(origGroupSets, ImmutableBitSet.COMPARATOR);
+
+    List<AggregateCall> aggregateCalls = new ArrayList<AggregateCall>();
+    // Create GroupingID column
+    AggregateCall aggCall = AggregateCall.create(HiveGroupingID.INSTANCE, false,
+        new ImmutableList.Builder<Integer>().build(), -1, this.cluster.getTypeFactory()
+            .createSqlType(SqlTypeName.INTEGER), HiveGroupingID.INSTANCE.getName());
+    aggregateCalls.add(aggCall);
+    return new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
+        aggregate.getInput(), true, groupSet, origGroupSets, aggregateCalls);
+  }
+
+  /**
+   * Returns the number of count DISTINCT
+   *
+   * @return the number of count DISTINCT
+   */
+  private int getNumCountDistinctCall(Aggregate hiveAggregate) {
+    int cnt = 0;
+    for (AggregateCall aggCall : hiveAggregate.getAggCallList()) {
+      if (aggCall.isDistinct() && (aggCall.getAggregation().getName().equalsIgnoreCase("count")))
{
+        cnt++;
+      }
+    }
+    return cnt;
+  }
+
+  /**
    * Converts an aggregate relational expression that contains just one
    * distinct aggregate function (or perhaps several over the same arguments)
    * and no non-distinct aggregate functions.

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/queries/clientpositive/multi_count_distinct.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/multi_count_distinct.q b/ql/src/test/queries/clientpositive/multi_count_distinct.q
new file mode 100644
index 0000000..855cb64
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/multi_count_distinct.q
@@ -0,0 +1,38 @@
+set hive.mapred.mode=nonstrict;
+
+drop table employee;
+
+create table employee (department_id int, gender varchar(10), education_level int);
+ 
+insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4,
'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2);
+
+explain select count(distinct department_id), count(distinct gender), count(distinct education_level)
from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level)
from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level),
count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level),

+count(distinct education_level, department_id) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct
education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level)
from employee;
+
+explain select count(distinct gender), count(distinct department_id), count(distinct gender),
count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct
education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee;
+
+select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null
end) as c1, 
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id,
gender, 
+education_level from employee group by department_id, gender, education_level grouping sets

+(department_id, gender, education_level))subq;
+
+select grouping__id as i, department_id, gender, education_level from employee 
+group by department_id, gender, education_level grouping sets 
+(department_id, gender, education_level, (education_level, department_id));
+
+
+
+

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
index 2cd758c..127478e 100644
--- a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
@@ -241,10 +241,10 @@ STAGE PLANS:
                   Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column
stats: NONE
                   Select Operator
                     expressions: s_store_id (type: string)
-                    outputColumnNames: s_store_id
+                    outputColumnNames: _col0
                     Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column
stats: NONE
                     Group By Operator
-                      keys: s_store_id (type: string), '0' (type: string)
+                      keys: _col0 (type: string), '0' (type: string)
                       mode: hash
                       outputColumnNames: _col0, _col1
                       Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column
stats: NONE
@@ -329,10 +329,10 @@ STAGE PLANS:
                   Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column
stats: NONE
                   Select Operator
                     expressions: s_store_id (type: string)
-                    outputColumnNames: s_store_id
+                    outputColumnNames: _col0
                     Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column
stats: NONE
                     Group By Operator
-                      keys: s_store_id (type: string), '0' (type: string)
+                      keys: _col0 (type: string), '0' (type: string)
                       mode: hash
                       outputColumnNames: _col0, _col1
                       Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column
stats: NONE

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/perf/query70.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/perf/query70.q.out b/ql/src/test/results/clientpositive/perf/query70.q.out
index 5a82cd1..611af74 100644
--- a/ql/src/test/results/clientpositive/perf/query70.q.out
+++ b/ql/src/test/results/clientpositive/perf/query70.q.out
@@ -21,46 +21,46 @@ Stage-0
     limit:100
     Stage-1
       Reducer 6
-      File Output Operator [FS_62]
-        Limit [LIM_61] (rows=100 width=88)
+      File Output Operator [FS_63]
+        Limit [LIM_62] (rows=100 width=88)
           Number of rows:100
-          Select Operator [SEL_60] (rows=1045432122 width=88)
+          Select Operator [SEL_61] (rows=1045432122 width=88)
             Output:["_col0","_col1","_col2","_col3","_col4"]
           <-Reducer 5 [SIMPLE_EDGE]
-            SHUFFLE [RS_59]
-              Select Operator [SEL_57] (rows=1045432122 width=88)
+            SHUFFLE [RS_60]
+              Select Operator [SEL_58] (rows=1045432122 width=88)
                 Output:["_col0","_col1","_col2","_col3","_col4"]
-                PTF Operator [PTF_56] (rows=1045432122 width=88)
+                PTF Operator [PTF_57] (rows=1045432122 width=88)
                   Function definitions:[{},{"name:":"windowingtablefunction","order by:":"_col4
ASC NULLS FIRST","partition by:":"_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null)
END"}]
-                  Select Operator [SEL_55] (rows=1045432122 width=88)
+                  Select Operator [SEL_56] (rows=1045432122 width=88)
                     Output:["_col0","_col1","_col4","_col5"]
                   <-Reducer 4 [SIMPLE_EDGE]
-                    SHUFFLE [RS_54]
+                    SHUFFLE [RS_55]
                       PartitionCols:_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null)
END
-                      Select Operator [SEL_53] (rows=1045432122 width=88)
+                      Select Operator [SEL_54] (rows=1045432122 width=88)
                         Output:["_col0","_col1","_col4","_col5"]
-                        Group By Operator [GBY_52] (rows=1045432122 width=88)
+                        Group By Operator [GBY_53] (rows=1045432122 width=88)
                           Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0,
KEY._col1, KEY._col2
                         <-Reducer 3 [SIMPLE_EDGE]
-                          SHUFFLE [RS_51]
+                          SHUFFLE [RS_52]
                             PartitionCols:_col0, _col1, _col2
-                            Group By Operator [GBY_50] (rows=2090864244 width=88)
-                              Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col6,
_col7, '0'
+                            Group By Operator [GBY_51] (rows=2090864244 width=88)
+                              Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col0,
_col1, '0'
                               Select Operator [SEL_49] (rows=696954748 width=88)
-                                Output:["_col6","_col7","_col2"]
-                                Merge Join Operator [MERGEJOIN_91] (rows=696954748 width=88)
+                                Output:["_col0","_col1","_col2"]
+                                Merge Join Operator [MERGEJOIN_92] (rows=696954748 width=88)
                                   Conds:RS_46._col1=RS_47._col0(Inner),Output:["_col2","_col6","_col7"]
                                 <-Reducer 2 [SIMPLE_EDGE]
                                   SHUFFLE [RS_46]
                                     PartitionCols:_col1
-                                    Merge Join Operator [MERGEJOIN_87] (rows=633595212 width=88)
+                                    Merge Join Operator [MERGEJOIN_88] (rows=633595212 width=88)
                                       Conds:RS_43._col0=RS_44._col0(Inner),Output:["_col1","_col2"]
                                     <-Map 1 [SIMPLE_EDGE]
                                       SHUFFLE [RS_43]
                                         PartitionCols:_col0
                                         Select Operator [SEL_2] (rows=575995635 width=88)
                                           Output:["_col0","_col1","_col2"]
-                                          Filter Operator [FIL_80] (rows=575995635 width=88)
+                                          Filter Operator [FIL_81] (rows=575995635 width=88)
                                             predicate:(ss_sold_date_sk is not null and ss_store_sk
is not null)
                                             TableScan [TS_0] (rows=575995635 width=88)
                                               default@store_sales,ss,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -69,21 +69,21 @@ Stage-0
                                         PartitionCols:_col0
                                         Select Operator [SEL_5] (rows=8116 width=1119)
                                           Output:["_col0"]
-                                          Filter Operator [FIL_81] (rows=8116 width=1119)
+                                          Filter Operator [FIL_82] (rows=8116 width=1119)
                                             predicate:(d_month_seq BETWEEN 1193 AND 1204
and d_date_sk is not null)
                                             TableScan [TS_3] (rows=73049 width=1119)
                                               default@date_dim,d1,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]
                                 <-Reducer 9 [SIMPLE_EDGE]
                                   SHUFFLE [RS_47]
                                     PartitionCols:_col0
-                                    Merge Join Operator [MERGEJOIN_90] (rows=127775039 width=88)
+                                    Merge Join Operator [MERGEJOIN_91] (rows=127775039 width=88)
                                       Conds:RS_39._col2=RS_40._col0(Left Semi),Output:["_col0","_col1","_col2"]
                                     <-Map 8 [SIMPLE_EDGE]
                                       SHUFFLE [RS_39]
                                         PartitionCols:_col2
                                         Select Operator [SEL_8] (rows=1704 width=1910)
                                           Output:["_col0","_col1","_col2"]
-                                          Filter Operator [FIL_82] (rows=1704 width=1910)
+                                          Filter Operator [FIL_83] (rows=1704 width=1910)
                                             predicate:(s_store_sk is not null and s_state
is not null)
                                             TableScan [TS_6] (rows=1704 width=1910)
                                               default@store,s,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_county","s_state"]
@@ -94,7 +94,7 @@ Stage-0
                                           Output:["_col0"],keys:_col0
                                           Select Operator [SEL_32] (rows=116159124 width=88)
                                             Output:["_col0"]
-                                            Filter Operator [FIL_83] (rows=116159124 width=88)
+                                            Filter Operator [FIL_84] (rows=116159124 width=88)
                                               predicate:(rank_window_0 <= 5)
                                               PTF Operator [PTF_31] (rows=348477374 width=88)
                                                 Function definitions:[{},{"name:":"windowingtablefunction","order
by:":"_col1 DESC NULLS LAST","partition by:":"_col0"}]
@@ -112,28 +112,28 @@ Stage-0
                                                           Output:["_col0","_col1"],aggregations:["sum(_col2)"],keys:_col6
                                                           Select Operator [SEL_24] (rows=696954748
width=88)
                                                             Output:["_col6","_col2"]
-                                                            Merge Join Operator [MERGEJOIN_89]
(rows=696954748 width=88)
+                                                            Merge Join Operator [MERGEJOIN_90]
(rows=696954748 width=88)
                                                               Conds:RS_21._col1=RS_22._col0(Inner),Output:["_col2","_col6"]
                                                             <-Map 16 [SIMPLE_EDGE]
                                                               SHUFFLE [RS_22]
                                                                 PartitionCols:_col0
                                                                 Select Operator [SEL_17]
(rows=1704 width=1910)
                                                                   Output:["_col0","_col1"]
-                                                                  Filter Operator [FIL_86]
(rows=1704 width=1910)
+                                                                  Filter Operator [FIL_87]
(rows=1704 width=1910)
                                                                     predicate:(s_store_sk
is not null and s_state is not null)
                                                                     TableScan [TS_15] (rows=1704
width=1910)
                                                                       default@store,store,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_state"]
                                                             <-Reducer 11 [SIMPLE_EDGE]
                                                               SHUFFLE [RS_21]
                                                                 PartitionCols:_col1
-                                                                Merge Join Operator [MERGEJOIN_88]
(rows=633595212 width=88)
+                                                                Merge Join Operator [MERGEJOIN_89]
(rows=633595212 width=88)
                                                                   Conds:RS_18._col0=RS_19._col0(Inner),Output:["_col1","_col2"]
                                                                 <-Map 10 [SIMPLE_EDGE]
                                                                   SHUFFLE [RS_18]
                                                                     PartitionCols:_col0
                                                                     Select Operator [SEL_11]
(rows=575995635 width=88)
                                                                       Output:["_col0","_col1","_col2"]
-                                                                      Filter Operator [FIL_84]
(rows=575995635 width=88)
+                                                                      Filter Operator [FIL_85]
(rows=575995635 width=88)
                                                                         predicate:(ss_store_sk
is not null and ss_sold_date_sk is not null)
                                                                         TableScan [TS_9]
(rows=575995635 width=88)
                                                                           default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -142,7 +142,7 @@ Stage-0
                                                                     PartitionCols:_col0
                                                                     Select Operator [SEL_14]
(rows=8116 width=1119)
                                                                       Output:["_col0"]
-                                                                      Filter Operator [FIL_85]
(rows=8116 width=1119)
+                                                                      Filter Operator [FIL_86]
(rows=8116 width=1119)
                                                                         predicate:(d_month_seq
BETWEEN 1193 AND 1204 and d_date_sk is not null)
                                                                         TableScan [TS_12]
(rows=73049 width=1119)
                                                                           default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
new file mode 100644
index 0000000..b623f1d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
@@ -0,0 +1,193 @@
+PREHOOK: query: drop table employee
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table employee
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table employee (department_id int, gender varchar(10), education_level
int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@employee
+POSTHOOK: query: create table employee (department_id int, gender varchar(10), education_level
int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@employee
+PREHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F',
3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@employee
+POSTHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F',
3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@employee
+POSTHOOK: Lineage: employee.department_id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1,
type:string, comment:), ]
+POSTHOOK: Lineage: employee.education_level EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3,
type:string, comment:), ]
+POSTHOOK: Lineage: employee.gender EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2,
type:string, comment:), ]
+PREHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct
education_level) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct
education_level) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 3
+      File Output Operator [FS_12]
+        Group By Operator [GBY_10] (rows=1 width=24)
+          Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"]
+        <-Reducer 2 [SIMPLE_EDGE]
+          SHUFFLE [RS_9]
+            Group By Operator [GBY_8] (rows=1 width=24)
+              Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","count(_col1)","count(_col2)"]
+              Select Operator [SEL_6] (rows=13 width=5)
+                Output:["_col0","_col1","_col2"]
+                Group By Operator [GBY_5] (rows=13 width=5)
+                  Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2,
KEY._col3
+                <-Map 1 [SIMPLE_EDGE]
+                  SHUFFLE [RS_4]
+                    PartitionCols:_col0, _col1, _col2, _col3
+                    Group By Operator [GBY_3] (rows=27 width=5)
+                      Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2,
'0'
+                      Select Operator [SEL_1] (rows=9 width=5)
+                        Output:["_col0","_col1","_col2"]
+                        TableScan [TS_0] (rows=9 width=5)
+                          default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["department_id","gender","education_level"]
+
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3	3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level), 
+count(distinct education_level, department_id) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct
education_level), 
+count(distinct education_level, department_id) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3	7
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level)
from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level)
from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2	4	2	3	7	7
+PREHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+  Fetch Operator
+    limit:-1
+    Stage-1
+      Reducer 3
+      File Output Operator [FS_12]
+        Select Operator [SEL_11] (rows=1 width=40)
+          Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"]
+          Group By Operator [GBY_10] (rows=1 width=40)
+            Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","count(VALUE._col4)"]
+          <-Reducer 2 [SIMPLE_EDGE]
+            SHUFFLE [RS_9]
+              Group By Operator [GBY_8] (rows=1 width=40)
+                Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(_col0)","count(_col1)","count(_col2)","count(_col3)","count(_col4)"]
+                Select Operator [SEL_6] (rows=22 width=5)
+                  Output:["_col0","_col1","_col2","_col3","_col4"]
+                  Group By Operator [GBY_5] (rows=22 width=5)
+                    Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2,
KEY._col3
+                  <-Map 1 [SIMPLE_EDGE]
+                    SHUFFLE [RS_4]
+                      PartitionCols:_col0, _col1, _col2, _col3
+                      Group By Operator [GBY_3] (rows=45 width=5)
+                        Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2,
'0'
+                        Select Operator [SEL_1] (rows=9 width=5)
+                          Output:["_col0","_col1","_col2"]
+                          TableScan [TS_0] (rows=9 width=5)
+                            default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["gender","department_id","education_level"]
+
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct
gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level),
count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2	4	2	3	7	7	7
+PREHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when
2 then 1 else null end) as c1, 
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id,
gender, 
+education_level from employee group by department_id, gender, education_level grouping sets

+(department_id, gender, education_level))subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when
2 then 1 else null end) as c1, 
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id,
gender, 
+education_level from employee group by department_id, gender, education_level grouping sets

+(department_id, gender, education_level))subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	2	3
+PREHOOK: query: select grouping__id as i, department_id, gender, education_level from employee

+group by department_id, gender, education_level grouping sets 
+(department_id, gender, education_level, (education_level, department_id))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select grouping__id as i, department_id, gender, education_level from employee

+group by department_id, gender, education_level grouping sets 
+(department_id, gender, education_level, (education_level, department_id))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4	NULL	NULL	1
+4	NULL	NULL	2
+4	NULL	NULL	3
+2	NULL	F	NULL
+2	NULL	M	NULL
+1	1	NULL	NULL
+5	1	NULL	1
+5	1	NULL	2
+5	1	NULL	3
+1	2	NULL	NULL
+5	2	NULL	1
+5	2	NULL	3
+1	3	NULL	NULL
+5	3	NULL	2
+1	4	NULL	NULL
+5	4	NULL	1

http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
index 9e35049..aa9ee87 100644
--- a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
@@ -225,10 +225,10 @@ STAGE PLANS:
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats:
NONE
             Select Operator
               expressions: s_store_id (type: string)
-              outputColumnNames: s_store_id
+              outputColumnNames: _col0
               Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats:
NONE
               Group By Operator
-                keys: s_store_id (type: string), '0' (type: string)
+                keys: _col0 (type: string), '0' (type: string)
                 mode: hash
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats:
NONE
@@ -303,10 +303,10 @@ STAGE PLANS:
             Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats:
NONE
             Select Operator
               expressions: s_store_id (type: string)
-              outputColumnNames: s_store_id
+              outputColumnNames: _col0
               Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats:
NONE
               Group By Operator
-                keys: s_store_id (type: string), '0' (type: string)
+                keys: _col0 (type: string), '0' (type: string)
                 mode: hash
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats:
NONE


Mime
View raw message