hive-commits mailing list archives

From gunt...@apache.org
Subject [2/2] hive git commit: HIVE-10062: HiveOnTez: Union followed by Multi-GB followed by Multi-insert loses data (Pengcheng Xiong via Gunther Hagleitner)
Date Thu, 23 Apr 2015 20:54:58 GMT
HIVE-10062: HiveOnTez: Union followed by Multi-GB followed by Multi-insert loses data (Pengcheng Xiong via Gunther Hagleitner)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7612b0f5
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7612b0f5
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7612b0f5

Branch: refs/heads/master
Commit: 7612b0f5974940ef638287b7e244742d7dd52eea
Parents: 84494ad
Author: Gunther Hagleitner <gunther@apache.org>
Authored: Thu Apr 23 13:48:31 2015 -0700
Committer: Gunther Hagleitner <gunther@apache.org>
Committed: Thu Apr 23 13:48:31 2015 -0700

----------------------------------------------------------------------
 .../hive/common/jsonexplain/tez/Vertex.java     |    2 +-
 .../test/resources/testconfiguration.properties |    3 +-
 .../hadoop/hive/ql/parse/GenTezProcContext.java |    5 +-
 .../hadoop/hive/ql/parse/GenTezUtils.java       |    5 +-
 .../apache/hadoop/hive/ql/parse/GenTezWork.java |   40 +-
 .../hadoop/hive/ql/parse/GenTezWorkWalker.java  |    6 +
 .../test/queries/clientpositive/explainuser_2.q |   22 +
 .../clientpositive/tez_union_multiinsert.q      |  120 +
 .../clientpositive/tez/explainuser_2.q.out      |  287 ++
 .../tez/tez_union_multiinsert.q.out             | 4293 ++++++++++++++++++
 10 files changed, 4764 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/common/src/java/org/apache/hadoop/hive/common/jsonexplain/tez/Vertex.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/common/jsonexplain/tez/Vertex.java b/common/src/java/org/apache/hadoop/hive/common/jsonexplain/tez/Vertex.java
index b45c782..9b3405e 100644
--- a/common/src/java/org/apache/hadoop/hive/common/jsonexplain/tez/Vertex.java
+++ b/common/src/java/org/apache/hadoop/hive/common/jsonexplain/tez/Vertex.java
@@ -191,7 +191,7 @@ public class Vertex {
       out.println(TezJsonParser.prefixString(indentFlag) + this.name);
     }
     // print operators
-    if (hasMultiReduceOp) {
+    if (hasMultiReduceOp && !callingVertex.union) {
       // find the right op
       Op choose = null;
       for (Op op : this.rootOps) {
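The one-line Vertex.java change above affects only the user-level explain printer: a vertex reached through a union must print all of its root operators rather than collapsing to the single reduce operator that matches the calling vertex. A toy model of the guard's effect (simplified booleans standing in for Hive's Vertex fields, not the committed code):

// Toy model of the Vertex.java guard: a vertex printed as part of a union
// keeps all of its root operators visible instead of collapsing to the one
// operator chosen for the calling vertex.
public class ExplainGuardModel {
  static String render(boolean hasMultiReduceOp, boolean callingVertexIsUnion) {
    if (hasMultiReduceOp && !callingVertexIsUnion) {
      return "print only the reduce op chosen for the caller";
    }
    return "print every root operator of this vertex";
  }

  public static void main(String[] args) {
    System.out.println(render(true, false)); // plain multi-reduce vertex
    System.out.println(render(true, true));  // vertex reached through a union
  }
}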

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 441c078..311193b 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -320,7 +320,8 @@ minitez.query.files=bucket_map_join_tez1.q,\
   tez_smb_1.q,\
   vectorized_dynamic_partition_pruning.q,\
   tez_multi_union.q,\
-  tez_join.q
+  tez_join.q,\
+  tez_union_multiinsert.q
 
 encrypted.query.files=encryption_join_unencrypted_tbl.q,\
   encryption_insert_partition_static.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java
index 90616ad..adc31ae 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java
@@ -51,6 +51,7 @@ import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
 import org.apache.hadoop.hive.ql.plan.TezWork;
+import org.apache.hadoop.hive.ql.plan.UnionWork;
 
 /**
  * GenTezProcContext. GenTezProcContext maintains information
@@ -124,7 +125,8 @@ public class GenTezProcContext implements NodeProcessorCtx{
 
   // used to hook up unions
   public final Map<Operator<?>, BaseWork> unionWorkMap;
-  public final List<UnionOperator> currentUnionOperators;
+  public final Map<Operator<?>, UnionWork> rootUnionWorkMap;
+  public List<UnionOperator> currentUnionOperators;
   public final Set<BaseWork> workWithUnionOperators;
   public final Set<ReduceSinkOperator> clonedReduceSinks;
 
@@ -171,6 +173,7 @@ public class GenTezProcContext implements NodeProcessorCtx{
     this.dependencyTask = (DependencyCollectionTask)
         TaskFactory.get(new DependencyCollectionWork(), conf);
     this.unionWorkMap = new LinkedHashMap<Operator<?>, BaseWork>();
+    this.rootUnionWorkMap = new LinkedHashMap<Operator<?>, UnionWork>();
     this.currentUnionOperators = new LinkedList<UnionOperator>();
     this.workWithUnionOperators = new LinkedHashSet<BaseWork>();
     this.clonedReduceSinks = new LinkedHashSet<ReduceSinkOperator>();

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index 4dcdf91..241e9d7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -87,9 +87,10 @@ public class GenTezUtils {
     sequenceNumber = 0;
   }
 
-  public UnionWork createUnionWork(GenTezProcContext context, Operator<?> operator, TezWork tezWork) {
+  public UnionWork createUnionWork(GenTezProcContext context, Operator<?> root, Operator<?> leaf, TezWork tezWork) {
     UnionWork unionWork = new UnionWork("Union "+ (++sequenceNumber));
-    context.unionWorkMap.put(operator, unionWork);
+    context.rootUnionWorkMap.put(root, unionWork);
+    context.unionWorkMap.put(leaf, unionWork);
     tezWork.add(unionWork);
     return unionWork;
   }
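The essence of the GenTezUtils change is that a new UnionWork is now registered twice: under the union's root operator (rootUnionWorkMap) and under the leaf that triggered its creation (unionWorkMap). A later walk down a sibling branch of a multi-insert can then find the existing union by root instead of minting a second one. A minimal standalone sketch of that bookkeeping, with plain strings standing in for Hive's Operator and UnionWork types (illustrative only, not the committed code):

import java.util.LinkedHashMap;
import java.util.Map;

// Toy model of the dual registration in GenTezUtils.createUnionWork: the
// same union work is reachable via its root and via the leaf that first
// triggered its creation.
public class UnionRegistryModel {
  final Map<String, String> unionWorkMap = new LinkedHashMap<>();     // leaf -> union work
  final Map<String, String> rootUnionWorkMap = new LinkedHashMap<>(); // root -> union work
  int sequenceNumber = 0;

  String createUnionWork(String root, String leaf) {
    String unionWork = "Union " + (++sequenceNumber);
    rootUnionWorkMap.put(root, unionWork);
    unionWorkMap.put(leaf, unionWork);
    return unionWork;
  }

  public static void main(String[] args) {
    UnionRegistryModel r = new UnionRegistryModel();
    String u = r.createUnionWork("TS_0", "FS_24");
    // A second walk that reaches the same root reuses the union work:
    System.out.println(u.equals(r.rootUnionWorkMap.get("TS_0"))); // true
  }
}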

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWork.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWork.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWork.java
index 0990894..6db8220 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWork.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWork.java
@@ -274,8 +274,8 @@ public class GenTezWork implements NodeProcessor {
     }
 
     if (!context.currentUnionOperators.isEmpty()) {
-      // if there are union all operators we need to add the work to the set
-      // of union operators.
+      // if there are union all operators, the walk has collected them along the
+      // current path; see context.currentUnionOperators in GenTezWorkWalker for details
 
       UnionWork unionWork;
       if (context.unionWorkMap.containsKey(operator)) {
@@ -284,22 +284,25 @@ public class GenTezWork implements NodeProcessor {
         // since we've passed this operator before.
         assert operator.getChildOperators().isEmpty();
         unionWork = (UnionWork) context.unionWorkMap.get(operator);
-
+        // finally connect the union work with work
+        connectUnionWorkWithWork(unionWork, work, tezWork, context);
       } else {
-        // first time through. we need to create a union work object and add this
-        // work to it. Subsequent work should reference the union and not the actual
-        // work.
-        unionWork = utils.createUnionWork(context, operator, tezWork);
+        // we've not seen this terminal before. we need to check
+        // rootUnionWorkMap which contains the information of mapping the root
+        // operator of a union work to a union work
+        unionWork = context.rootUnionWorkMap.get(root);
+        if (unionWork == null) {
+          // if unionWork is null, it means it is the first time. we need to
+          // create a union work object and add this work to it. Subsequent 
+          // work should reference the union and not the actual work.
+          unionWork = utils.createUnionWork(context, root, operator, tezWork);
+          // finally connect the union work with work
+          connectUnionWorkWithWork(unionWork, work, tezWork, context);
+        }
       }
-
-      // finally hook everything up
-      LOG.debug("Connecting union work ("+unionWork+") with work ("+work+")");
-      TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.CONTAINS);
-      tezWork.connect(unionWork, work, edgeProp);
-      unionWork.addUnionOperators(context.currentUnionOperators);
       context.currentUnionOperators.clear();
-      context.workWithUnionOperators.add(work);
       work = unionWork;
+
     }
 
     // We're scanning a tree from roots to leaf (this is not technically
@@ -419,4 +422,13 @@ public class GenTezWork implements NodeProcessor {
     int pos = stack.indexOf(currentMergeJoinOperator);
     return (Operator<? extends OperatorDesc>) stack.get(pos - 1);
   }
+  
+  private void connectUnionWorkWithWork(UnionWork unionWork, BaseWork work, TezWork tezWork,
+      GenTezProcContext context) {
+    LOG.debug("Connecting union work (" + unionWork + ") with work (" + work + ")");
+    TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.CONTAINS);
+    tezWork.connect(unionWork, work, edgeProp);
+    unionWork.addUnionOperators(context.currentUnionOperators);
+    context.workWithUnionOperators.add(work);
+  }
 }
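Put together, the new GenTezWork logic resolves a union in three steps: reuse by leaf operator and connect, else reuse by root operator silently, else create, register, and connect. Before this patch every pass created and connected a fresh UnionWork, so with a multi-GB multi-insert on top of a union the later branches fed a disconnected union and their rows were lost. A toy model of the resolution order, again with strings in place of Operator/BaseWork (an illustrative sketch, not the committed code):

import java.util.HashMap;
import java.util.Map;

// Toy model of the new resolution order in GenTezWork.process: reuse by
// leaf (and connect), else reuse by root (silently), else create, register
// in both maps, and connect.
public class UnionResolverModel {
  final Map<String, String> unionWorkMap = new HashMap<>();      // leaf operator -> union
  final Map<String, String> rootUnionWorkMap = new HashMap<>();  // root operator -> union
  int seq = 0;

  String resolve(String root, String leaf) {
    String union = unionWorkMap.get(leaf);
    if (union != null) {
      connect(union, leaf);            // seen this leaf before: just hook up the work
      return union;
    }
    union = rootUnionWorkMap.get(root);
    if (union == null) {               // genuine first visit: create and connect
      union = "Union " + (++seq);
      rootUnionWorkMap.put(root, union);
      unionWorkMap.put(leaf, union);
      connect(union, leaf);
    }                                  // root already mapped: reuse without reconnecting
    return union;
  }

  void connect(String union, String work) {
    System.out.println("Connecting " + union + " with " + work);
  }

  public static void main(String[] args) {
    UnionResolverModel r = new UnionResolverModel();
    r.resolve("TS_0", "RS_15");            // first multi-insert branch creates Union 1
    String u = r.resolve("TS_0", "RS_21"); // second branch finds it by root
    System.out.println(u);                 // Union 1 -- one union vertex, no lost branch
  }
}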

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWorkWalker.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWorkWalker.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWorkWalker.java
index 08fd61e..2d8c8b2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWorkWalker.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezWorkWalker.java
@@ -18,11 +18,13 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.UnionOperator;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
 import org.apache.hadoop.hive.ql.lib.Dispatcher;
 import org.apache.hadoop.hive.ql.lib.Node;
@@ -52,6 +54,7 @@ public class GenTezWorkWalker extends DefaultGraphWalker {
     ctx.currentRootOperator = (Operator<? extends OperatorDesc>) nd;
     ctx.preceedingWork = null;
     ctx.parentOfRoot = null;
+    ctx.currentUnionOperators = new ArrayList<>();
   }
 
   /**
@@ -89,6 +92,7 @@ public class GenTezWorkWalker extends DefaultGraphWalker {
     // save some positional state
     Operator<? extends OperatorDesc> currentRoot = ctx.currentRootOperator;
     Operator<? extends OperatorDesc> parentOfRoot = ctx.parentOfRoot;
+    List<UnionOperator> currentUnionOperators = ctx.currentUnionOperators;
     BaseWork preceedingWork = ctx.preceedingWork;
 
     if (skip == null || !skip) {
@@ -99,6 +103,8 @@ public class GenTezWorkWalker extends DefaultGraphWalker {
         ctx.currentRootOperator = currentRoot;
         ctx.parentOfRoot = parentOfRoot;
         ctx.preceedingWork = preceedingWork;
+        ctx.currentUnionOperators = new ArrayList<>();
+        ctx.currentUnionOperators.addAll(currentUnionOperators);
 
         walk(ch);
       }
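The walker change treats currentUnionOperators as positional state, like currentRootOperator and preceedingWork: it is reset per start node, snapshotted before descending, and restored as a fresh copy for each child, so one multi-insert branch can no longer clear the union list out from under its siblings. A minimal standalone model of that save-and-restore pattern (toy types, not the committed code):

import java.util.ArrayList;
import java.util.List;

// Toy model of the GenTezWorkWalker change: the list of union operators
// seen on the current path is snapshotted before each child and a fresh
// copy is restored per child, so sibling branches never share one list.
public class WalkerStateModel {
  List<String> currentUnionOperators = new ArrayList<>();

  void walk(String node, List<String> children) {
    if (node.startsWith("UNION")) {
      currentUnionOperators.add(node);
    }
    List<String> saved = currentUnionOperators;          // save positional state
    for (String ch : children) {
      currentUnionOperators = new ArrayList<>(saved);    // restore a copy per child
      System.out.println("visit " + ch + " with unions " + currentUnionOperators);
    }
  }

  public static void main(String[] args) {
    WalkerStateModel w = new WalkerStateModel();
    w.walk("UNION_8", List.of("GBY_14", "GBY_20"));      // two multi-insert branches
  }
}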

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/test/queries/clientpositive/explainuser_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/explainuser_2.q b/ql/src/test/queries/clientpositive/explainuser_2.q
index 03264ca..8e8ac92 100644
--- a/ql/src/test/queries/clientpositive/explainuser_2.q
+++ b/ql/src/test/queries/clientpositive/explainuser_2.q
@@ -305,3 +305,25 @@ TRANSFORM(a.key, a.value) USING 'cat' AS (tkey, tvalue)
 FROM src a join src b
 on a.key = b.key;
 
+explain
+FROM (
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub
+                         UNION all
+      select key, value from src s0
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+explain
+FROM (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/test/queries/clientpositive/tez_union_multiinsert.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/tez_union_multiinsert.q b/ql/src/test/queries/clientpositive/tez_union_multiinsert.q
new file mode 100644
index 0000000..f7c11a3
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tez_union_multiinsert.q
@@ -0,0 +1,120 @@
+-- SORT_QUERY_RESULTS
+
+CREATE TABLE DEST1(key STRING, value STRING) STORED AS TEXTFILE;
+
+CREATE TABLE DEST2(key STRING, val1 STRING, val2 STRING) STORED AS TEXTFILE;
+
+explain
+FROM (
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub
+                         UNION all
+      select key, value from src s0
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+FROM (
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub
+                         UNION all
+      select key, value from src s0
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+select * from DEST1;
+select * from DEST2;
+
+explain
+FROM (
+      select key, value from src s0
+                         UNION all
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+FROM (
+      select key, value from src s0
+                         UNION all
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+select * from DEST1;
+select * from DEST2;
+
+
+explain
+FROM (
+      select key, value from src s0
+                         UNION all
+      select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+FROM (
+      select key, value from src s0
+                         UNION all
+      select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+select * from DEST1;
+select * from DEST2;
+
+explain 
+FROM (select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+FROM (select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+select * from DEST1;
+select * from DEST2;
+
+explain
+FROM (select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION distinct 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+FROM (select 'tst1' as key, cast(count(1) as string) as value from src s1
+                         UNION distinct 
+      select s2.key as key, s2.value as value from src s2) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value;
+
+select * from DEST1;
+select * from DEST2;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/7612b0f5/ql/src/test/results/clientpositive/tez/explainuser_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/explainuser_2.q.out b/ql/src/test/results/clientpositive/tez/explainuser_2.q.out
index ea6b558..f7026a8 100644
--- a/ql/src/test/results/clientpositive/tez/explainuser_2.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainuser_2.q.out
@@ -4666,3 +4666,290 @@ Stage-0
                         TableScan [TS_1]
                            alias:b
                            Statistics:Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+PREHOOK: query: explain
+FROM (
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub
+                         UNION all
+      select key, value from src s0
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+FROM (
+      select key, value from (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2) unionsub
+                         UNION all
+      select key, value from src s0
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value
+POSTHOOK: type: QUERY
+Plan not optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE), Union 3 (CONTAINS)
+Reducer 5 <- Union 3 (SIMPLE_EDGE)
+Reducer 4 <- Union 3 (SIMPLE_EDGE)
+Map 7 <- Union 3 (CONTAINS)
+Map 6 <- Union 3 (CONTAINS)
+
+Stage-5
+   Stats-Aggr Operator
+      Stage-1
+         Move Operator
+            table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest2","input
format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+            Stage-3
+               Dependency Collection{}
+                  Stage-2
+                     Reducer 5
+                     File Output Operator [FS_24]
+                        compressed:false
+                        Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column
stats: PARTIAL
+                        table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest2","input
format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+                        Group By Operator [GBY_22]
+                        |  aggregations:["count(DISTINCT KEY._col2:0._col0)"]
+                        |  keys:KEY._col0 (type: string), KEY._col1 (type: string)
+                        |  outputColumnNames:["_col0","_col1","_col2"]
+                        |  Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column
stats: PARTIAL
+                        |<-Union 3 [SIMPLE_EDGE]
+                           |<-Reducer 2 [CONTAINS]
+                           |  Reduce Output Operator [RS_15]
+                           |     key expressions:_col0 (type: string), _col1 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string)
+                           |     sort order:++
+                           |     Group By Operator [GBY_14]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2"]
+                           |        Select Operator [SEL_9]
+                           |           outputColumnNames:["_col0","_col1"]
+                           |           Select Operator [SEL_5]
+                           |              outputColumnNames:["_col0","_col1"]
+                           |              Group By Operator [GBY_4]
+                           |              |  aggregations:["count(VALUE._col0)"]
+                           |              |  outputColumnNames:["_col0"]
+                           |              |<-Map 1 [SIMPLE_EDGE]
+                           |                 Reduce Output Operator [RS_3]
+                           |                    sort order:
+                           |                    Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                    value expressions:_col0 (type: bigint)
+                           |                    Group By Operator [GBY_2]
+                           |                       aggregations:["count(1)"]
+                           |                       outputColumnNames:["_col0"]
+                           |                       Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                       Select Operator [SEL_1]
+                           |                          Statistics:Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                          TableScan [TS_0]
+                           |                             alias:s1
+                           |                             Statistics:Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE
+                           |  Reduce Output Operator [RS_21]
+                           |     key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string), _col1 (type: string)
+                           |     sort order:+++
+                           |     Group By Operator [GBY_20]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2","_col3"]
+                           |         Please refer to the previous Select Operator [SEL_9]
+                           |<-Map 7 [CONTAINS]
+                           |  Reduce Output Operator [RS_15]
+                           |     key expressions:_col0 (type: string), _col1 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string)
+                           |     sort order:++
+                           |     Group By Operator [GBY_14]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2"]
+                           |        Select Operator [SEL_11]
+                           |           outputColumnNames:["_col0","_col1"]
+                           |           TableScan [TS_10]
+                           |              alias:s0
+                           |  Reduce Output Operator [RS_21]
+                           |     key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string), _col1 (type: string)
+                           |     sort order:+++
+                           |     Group By Operator [GBY_20]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2","_col3"]
+                           |         Please refer to the previous Select Operator [SEL_11]
+                           |<-Map 6 [CONTAINS]
+                              Reduce Output Operator [RS_15]
+                                 key expressions:_col0 (type: string), _col1 (type: string)
+                                 Map-reduce partition columns:_col0 (type: string)
+                                 sort order:++
+                                 Group By Operator [GBY_14]
+                                    aggregations:["count(DISTINCT substr(_col1, 5))"]
+                                    keys:_col0 (type: string), substr(_col1, 5) (type: string)
+                                    outputColumnNames:["_col0","_col1","_col2"]
+                                    Select Operator [SEL_9]
+                                       outputColumnNames:["_col0","_col1"]
+                                       Select Operator [SEL_7]
+                                          outputColumnNames:["_col0","_col1"]
+                                          TableScan [TS_6]
+                                             alias:s2
+                              Reduce Output Operator [RS_21]
+                                 key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string)
+                                 Map-reduce partition columns:_col0 (type: string), _col1 (type: string)
+                                 sort order:+++
+                                 Group By Operator [GBY_20]
+                                    aggregations:["count(DISTINCT substr(_col1, 5))"]
+                                    keys:_col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
+                                    outputColumnNames:["_col0","_col1","_col2","_col3"]
+                                     Please refer to the previous Select Operator [SEL_9]
+                     Reducer 4
+                     File Output Operator [FS_18]
+                        compressed:false
+                        Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                        table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest1","input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+                        Group By Operator [GBY_16]
+                        |  aggregations:["count(DISTINCT KEY._col1:0._col0)"]
+                        |  keys:KEY._col0 (type: string)
+                        |  outputColumnNames:["_col0","_col1"]
+                        |  Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                        |<- Please refer to the previous Union 3 [SIMPLE_EDGE]
+Stage-4
+   Stats-Aggr Operator
+      Stage-0
+         Move Operator
+            table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest1","input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+             Please refer to the previous Stage-3
+PREHOOK: query: explain
+FROM (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+FROM (
+      select 'tst1' as key, cast(count(1) as string) as value, 'tst1' as value2 from src s1
+                         UNION all 
+      select s2.key as key, s2.value as value, 'tst1' as value2 from src s2
+                             ) unionsrc
+INSERT OVERWRITE TABLE DEST1 SELECT unionsrc.key, COUNT(DISTINCT SUBSTR(unionsrc.value,5)) GROUP BY unionsrc.key
+INSERT OVERWRITE TABLE DEST2 SELECT unionsrc.key, unionsrc.value, COUNT(DISTINCT SUBSTR(unionsrc.value,5))
+GROUP BY unionsrc.key, unionsrc.value
+POSTHOOK: type: QUERY
+Plan not optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE), Union 3 (CONTAINS)
+Reducer 5 <- Union 3 (SIMPLE_EDGE)
+Reducer 4 <- Union 3 (SIMPLE_EDGE)
+Map 6 <- Union 3 (CONTAINS)
+
+Stage-5
+   Stats-Aggr Operator
+      Stage-1
+         Move Operator
+            table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest2","input
format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+            Stage-3
+               Dependency Collection{}
+                  Stage-2
+                     Reducer 5
+                     File Output Operator [FS_20]
+                        compressed:false
+                        Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column
stats: PARTIAL
+                        table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest2","input
format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+                        Group By Operator [GBY_18]
+                        |  aggregations:["count(DISTINCT KEY._col2:0._col0)"]
+                        |  keys:KEY._col0 (type: string), KEY._col1 (type: string)
+                        |  outputColumnNames:["_col0","_col1","_col2"]
+                        |  Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column
stats: PARTIAL
+                        |<-Union 3 [SIMPLE_EDGE]
+                           |<-Reducer 2 [CONTAINS]
+                           |  Reduce Output Operator [RS_11]
+                           |     key expressions:_col0 (type: string), _col1 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string)
+                           |     sort order:++
+                           |     Group By Operator [GBY_10]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2"]
+                           |        Select Operator [SEL_9]
+                           |           outputColumnNames:["_col0","_col1"]
+                           |           Select Operator [SEL_5]
+                           |              outputColumnNames:["_col0","_col1"]
+                           |              Group By Operator [GBY_4]
+                           |              |  aggregations:["count(VALUE._col0)"]
+                           |              |  outputColumnNames:["_col0"]
+                           |              |<-Map 1 [SIMPLE_EDGE]
+                           |                 Reduce Output Operator [RS_3]
+                           |                    sort order:
+                           |                    Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                    value expressions:_col0 (type: bigint)
+                           |                    Group By Operator [GBY_2]
+                           |                       aggregations:["count(1)"]
+                           |                       outputColumnNames:["_col0"]
+                           |                       Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                       Select Operator [SEL_1]
+                           |                          Statistics:Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE
+                           |                          TableScan [TS_0]
+                           |                             alias:s1
+                           |                             Statistics:Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE
+                           |  Reduce Output Operator [RS_17]
+                           |     key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string)
+                           |     Map-reduce partition columns:_col0 (type: string), _col1 (type: string)
+                           |     sort order:+++
+                           |     Group By Operator [GBY_16]
+                           |        aggregations:["count(DISTINCT substr(_col1, 5))"]
+                           |        keys:_col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
+                           |        outputColumnNames:["_col0","_col1","_col2","_col3"]
+                           |        Select Operator [SEL_15]
+                           |           outputColumnNames:["_col0","_col1"]
+                           |            Please refer to the previous Select Operator [SEL_5]
+                           |<-Map 6 [CONTAINS]
+                              Reduce Output Operator [RS_11]
+                                 key expressions:_col0 (type: string), _col1 (type: string)
+                                 Map-reduce partition columns:_col0 (type: string)
+                                 sort order:++
+                                 Group By Operator [GBY_10]
+                                    aggregations:["count(DISTINCT substr(_col1, 5))"]
+                                    keys:_col0 (type: string), substr(_col1, 5) (type: string)
+                                    outputColumnNames:["_col0","_col1","_col2"]
+                                    Select Operator [SEL_9]
+                                       outputColumnNames:["_col0","_col1"]
+                                       Select Operator [SEL_7]
+                                          outputColumnNames:["_col0","_col1"]
+                                          TableScan [TS_6]
+                                             alias:s2
+                              Reduce Output Operator [RS_17]
+                                 key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string)
+                                 Map-reduce partition columns:_col0 (type: string), _col1 (type: string)
+                                 sort order:+++
+                                 Group By Operator [GBY_16]
+                                    aggregations:["count(DISTINCT substr(_col1, 5))"]
+                                    keys:_col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
+                                    outputColumnNames:["_col0","_col1","_col2","_col3"]
+                                    Select Operator [SEL_15]
+                                       outputColumnNames:["_col0","_col1"]
+                                        Please refer to the previous Select Operator [SEL_7]
+                     Reducer 4
+                     File Output Operator [FS_14]
+                        compressed:false
+                        Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                        table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest1","input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+                        Group By Operator [GBY_12]
+                        |  aggregations:["count(DISTINCT KEY._col1:0._col0)"]
+                        |  keys:KEY._col0 (type: string)
+                        |  outputColumnNames:["_col0","_col1"]
+                        |  Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                        |<- Please refer to the previous Union 3 [SIMPLE_EDGE]
+Stage-4
+   Stats-Aggr Operator
+      Stage-0
+         Move Operator
+            table:{"serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe","name:":"default.dest1","input
format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"}
+             Please refer to the previous Stage-3

