hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From xu...@apache.org
Subject svn commit: r1645338 [1/9] - in /hive/branches/spark: data/conf/spark/ itests/src/test/resources/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/test/results/clientpositive/spark/
Date Sat, 13 Dec 2014 17:44:42 GMT
Author: xuefu
Date: Sat Dec 13 17:44:41 2014
New Revision: 1645338

URL: http://svn.apache.org/r1645338
Log:
HIVE-8911: Enable mapjoin hints [Spark Branch] (Chao via Xuefu)

Added:
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
Modified:
    hive/branches/spark/data/conf/spark/hive-site.xml
    hive/branches/spark/itests/src/test/resources/testconfiguration.properties
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_2.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin1.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin10.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin11.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin12.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin13.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin2.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin3.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin4.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin5.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin7.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin8.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin9.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative2.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/bucketmapjoin_negative3.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join25.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join26.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join27.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join30.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join36.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join37.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join38.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join39.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join40.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/join_map_ppr.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin1.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_distinct.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_filter_on_outerjoin.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/mapjoin_test_outer.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/semijoin.q.out
    hive/branches/spark/ql/src/test/results/clientpositive/spark/skewjoin.q.out

Modified: hive/branches/spark/data/conf/spark/hive-site.xml
URL: http://svn.apache.org/viewvc/hive/branches/spark/data/conf/spark/hive-site.xml?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/data/conf/spark/hive-site.xml (original)
+++ hive/branches/spark/data/conf/spark/hive-site.xml Sat Dec 13 17:44:41 2014
@@ -162,7 +162,7 @@
 
 <property>
   <name>hive.ignore.mapjoin.hint</name>
-  <value>true</value>
+  <value>false</value>
   <description>Whether Hive ignores the mapjoin hint</description>
 </property>
 

Modified: hive/branches/spark/itests/src/test/resources/testconfiguration.properties
URL: http://svn.apache.org/viewvc/hive/branches/spark/itests/src/test/resources/testconfiguration.properties?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/itests/src/test/resources/testconfiguration.properties (original)
+++ hive/branches/spark/itests/src/test/resources/testconfiguration.properties Sat Dec 13
17:44:41 2014
@@ -506,7 +506,6 @@ spark.query.files=add_part_multiple.q, \
   auto_sortmerge_join_8.q, \
   auto_sortmerge_join_9.q, \
   auto_sortmerge_join_10.q, \
-  auto_sortmerge_join_11.q, \
   auto_sortmerge_join_12.q, \
   auto_sortmerge_join_13.q, \
   auto_sortmerge_join_14.q, \
@@ -524,7 +523,6 @@ spark.query.files=add_part_multiple.q, \
   bucketmapjoin3.q, \
   bucketmapjoin4.q, \
   bucketmapjoin5.q, \
-  bucketmapjoin6.q, \
   bucketmapjoin7.q, \
   bucketmapjoin8.q, \
   bucketmapjoin9.q, \
@@ -671,13 +669,11 @@ spark.query.files=add_part_multiple.q, \
   join_cond_pushdown_unqual3.q, \
   join_cond_pushdown_unqual4.q, \
   join_empty.q \
-  join_filters.q, \
   join_filters_overlap.q, \
   join_hive_626.q, \
   join_map_ppr.q, \
   join_merge_multi_expressions.q, \
   join_merging.q, \
-  join_nulls.q, \
   join_rc.q, \
   join_reorder.q, \
   join_reorder2.q, \
@@ -808,21 +804,6 @@ spark.query.files=add_part_multiple.q, \
   skewjoin_noskew.q, \
   skewjoin_union_remove_1.q, \
   skewjoin_union_remove_2.q, \
-  smb_mapjoin9.q, \
-  smb_mapjoin_1.q, \
-  smb_mapjoin_2.q, \
-  smb_mapjoin_3.q, \
-  smb_mapjoin_4.q, \
-  smb_mapjoin_5.q, \
-  smb_mapjoin_6.q, \
-  smb_mapjoin_7.q, \
-  smb_mapjoin_8.q, \
-  smb_mapjoin_10.q, \
-  smb_mapjoin_13.q, \
-  smb_mapjoin_14.q, \
-  smb_mapjoin_15.q, \
-  smb_mapjoin_16.q, \
-  smb_mapjoin_17.q, \
   smb_mapjoin_18.q, \
   smb_mapjoin_19.q, \
   smb_mapjoin_20.q, \
@@ -830,14 +811,6 @@ spark.query.files=add_part_multiple.q, \
   smb_mapjoin_22.q, \
   smb_mapjoin_25.q, \
   sort.q, \
-  sort_merge_join_desc_1.q, \
-  sort_merge_join_desc_2.q, \
-  sort_merge_join_desc_3.q, \
-  sort_merge_join_desc_4.q, \
-  sort_merge_join_desc_5.q, \
-  sort_merge_join_desc_6.q, \
-  sort_merge_join_desc_7.q, \
-  sort_merge_join_desc_8.q, \
   spark_test.q, \
   stats_counter.q, \
   stats_counter_partitioned.q, \
@@ -951,7 +924,6 @@ spark.query.files=add_part_multiple.q, \
   vectorization_part_project.q, \
   vectorization_pushdown.q, \
   vectorization_short_regress.q, \
-  vectorized_bucketmapjoin1.q, \
   vectorized_case.q, \
   vectorized_mapjoin.q, \
   vectorized_math_funcs.q, \

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
(original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AbstractSMBJoinProc.java
Sat Dec 13 17:44:41 2014
@@ -521,7 +521,7 @@ abstract public class AbstractSMBJoinPro
     JoinOperator joinOp,
     SortBucketJoinProcCtx joinContext,
     ParseContext parseContext) throws SemanticException {
-    MapJoinOperator mapJoinOp = MapJoinProcessor.convertMapJoin(
+    MapJoinOperator mapJoinOp = new MapJoinProcessor().convertMapJoin(
       parseContext.getConf(),
       parseContext.getOpParseCtx(),
       joinOp,

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
(original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
Sat Dec 13 17:44:41 2014
@@ -233,7 +233,7 @@ public class MapJoinProcessor implements
         newWork.getMapWork().getOpParseCtxMap();
     QBJoinTree newJoinTree = newWork.getMapWork().getJoinTree();
     // generate the map join operator; already checked the map join
-    MapJoinOperator newMapJoinOp = MapJoinProcessor.convertMapJoin(conf, opParseCtxMap, op,
+    MapJoinOperator newMapJoinOp = new MapJoinProcessor().convertMapJoin(conf, opParseCtxMap,
op,
         newJoinTree, mapJoinPos, true, false);
     genLocalWorkForMapJoin(newWork, newMapJoinOp, mapJoinPos);
   }
@@ -302,8 +302,9 @@ public class MapJoinProcessor implements
    *          position of the source to be read as part of map-reduce framework. All other
sources
    *          are cached in memory
    * @param noCheckOuterJoin
+   * @param validateMapJoinTree
    */
-  public static MapJoinOperator convertMapJoin(HiveConf conf,
+  public MapJoinOperator convertMapJoin(HiveConf conf,
     LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap,
     JoinOperator op, QBJoinTree joinTree, int mapJoinPos, boolean noCheckOuterJoin,
     boolean validateMapJoinTree)
@@ -598,7 +599,7 @@ public class MapJoinProcessor implements
     return mapJoinPos;
   }
 
-  private void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException
{
+  protected void genSelectPlan(ParseContext pctx, MapJoinOperator input) throws SemanticException
{
     List<Operator<? extends OperatorDesc>> childOps = input.getChildOperators();
     input.setChildOperators(null);
 

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Sat
Dec 13 17:44:41 2014
@@ -101,7 +101,9 @@ public class Optimizer {
       transformations.add(new RewriteGBUsingIndex());
     }
     transformations.add(new SamplePruner());
-    transformations.add(new MapJoinProcessor());
+
+    MapJoinProcessor mapJoinProcessor = isSparkExecEngine ? new SparkMapJoinProcessor() :
new MapJoinProcessor();
+    transformations.add(mapJoinProcessor);
 
     if ((HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) &&
!isTezExecEngine && !isSparkExecEngine) {
       transformations.add(new BucketMapJoinOptimizer());

Added: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java?rev=1645338&view=auto
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
(added)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SparkMapJoinProcessor.java
Sat Dec 13 17:44:41 2014
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+
+public class SparkMapJoinProcessor extends MapJoinProcessor {
+
+  /**
+   * Convert a regular join to a map-side join.
+   *
+   * @param conf
+   * @param opParseCtxMap
+   * @param op join operator
+   * @param joinTree qb join tree
+   * @param bigTablePos position of the source to be read as part of
+   *                   map-reduce framework. All other sources are cached in memory
+   * @param noCheckOuterJoin
+   * @param validateMapJoinTree
+   */
+  @Override
+  public MapJoinOperator convertMapJoin(HiveConf conf,
+                                        LinkedHashMap<Operator<? extends OperatorDesc>,
OpParseContext> opParseCtxMap,
+                                        JoinOperator op, QBJoinTree joinTree, int bigTablePos,
+                                        boolean noCheckOuterJoin,
+                                        boolean validateMapJoinTree) throws SemanticException
{
+
+    // outer join cannot be performed on a table which is being cached
+    JoinCondDesc[] condns = op.getConf().getConds();
+
+    if (!noCheckOuterJoin) {
+      if (checkMapJoin(bigTablePos, condns) < 0) {
+        throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
+      }
+    }
+
+    // create the map-join operator
+    MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, opParseCtxMap,
+        op, joinTree, bigTablePos, noCheckOuterJoin);
+
+    // 1. remove RS as parent for the big table branch
+    // 2. remove old join op from child set of all the RSs
+    List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
+    for (int i = 0; i < parentOps.size(); i++) {
+      Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
+      parentOp.getChildOperators().remove(op);
+      if (i == bigTablePos) {
+        List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
+        Preconditions.checkArgument(grandParentOps.size() == 1,
+            "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
+        Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
+        grandParentOp.replaceChild(parentOp, mapJoinOp);
+        mapJoinOp.replaceParent(parentOp, grandParentOp);
+      }
+    }
+
+    return mapJoinOp;
+  }
+}

Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out?rev=1645338&r1=1645337&r2=1645338&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/bucket_map_join_1.q.out Sat
Dec 13 17:44:41 2014
@@ -104,59 +104,60 @@ TOK_QUERY
 
 
 STAGE DEPENDENCIES:
-  Stage-1 is a root stage
+  Stage-2 is a root stage
+  Stage-1 depends on stages: Stage-2
   Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
-  Stage: Stage-1
+  Stage: Stage-2
     Spark
-      Edges:
-        Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 1), Map 4 (PARTITION-LEVEL SORT, 1)
-        Reducer 3 <- Reducer 2 (GROUP, 1)
 #### A masked pattern was here ####
       Vertices:
-        Map 1 
+        Map 3 
             Map Operator Tree:
                 TableScan
-                  alias: a
-                  Statistics: Num rows: 0 Data size: 20 Basic stats: PARTIAL Column stats:
NONE
+                  alias: b
+                  Statistics: Num rows: 0 Data size: 21 Basic stats: PARTIAL Column stats:
NONE
                   GatherStats: false
                   Filter Operator
                     isSamplingPred: false
                     predicate: (key is not null and value is not null) (type: boolean)
                     Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats:
NONE
-                    Reduce Output Operator
-                      key expressions: key (type: string), value (type: string)
-                      sort order: ++
-                      Map-reduce partition columns: key (type: string), value (type: string)
-                      Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats:
NONE
-                      tag: 0
-                      auto parallelism: false
+                    Spark HashTable Sink Operator
+                      condition expressions:
+                        0 
+                        1 
+                      keys:
+                        0 key (type: string), value (type: string)
+                        1 key (type: string), value (type: string)
+                      Position of Big Table: 0
+            Local Work:
+              Map Reduce Local Work
             Path -> Alias:
 #### A masked pattern was here ####
             Path -> Partition:
 #### A masked pattern was here ####
                 Partition
-                  base file name: table1
+                  base file name: table2
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                   properties:
                     COLUMN_STATS_ACCURATE true
                     SORTBUCKETCOLSPREFIX TRUE
                     bucket_count 1
-                    bucket_field_name key
+                    bucket_field_name value
                     columns key,value
                     columns.comments 
Mime
View raw message