hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From br...@apache.org
Subject svn commit: r1618095 - in /hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark: GenSparkUtils.java GenSparkWork.java
Date Fri, 15 Aug 2014 05:00:26 GMT
Author: brock
Date: Fri Aug 15 05:00:26 2014
New Revision: 1618095

URL: http://svn.apache.org/r1618095
Log:
HIVE-7659 - Unnecessary sort in query plan (Rui Li via Brock) [Spark Branch]

Modified:
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java
    hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkWork.java

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java?rev=1618095&r1=1618094&r2=1618095&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java	(original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java	Fri Aug 15 05:00:26 2014
@@ -109,7 +109,7 @@ public class GenSparkUtils {
       edgeProp.setShuffleGroup();
     }
     String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();
-    if (!sortOrder.isEmpty()) {
+    if (!sortOrder.isEmpty() && isSortNecessary(reduceSink)) {
       edgeProp.setShuffleSort();
     }
 
@@ -297,4 +297,26 @@ public class GenSparkUtils {
       }
     }
   }
+
+  /**
+   * Test if the sort order in the RS is necessary.
+   * Unnecessary sort is mainly introduced when GBY is created. Therefore, if the sorting
+   * keys, partitioning keys and grouping keys are the same, we ignore the sort and use
+   * GroupByShuffler to shuffle the data. In this case a group-by transformation should be
+   * sufficient to produce the correct results, i.e. data is properly grouped by the keys
+   * but keys are not guaranteed to be sorted.
+   */
+  public static boolean isSortNecessary(ReduceSinkOperator reduceSinkOperator) {
+    List<Operator<? extends OperatorDesc>> children = reduceSinkOperator.getChildOperators();
+    if (children != null && children.size() == 1 &&
+        children.get(0) instanceof GroupByOperator) {
+      GroupByOperator child = (GroupByOperator) children.get(0);
+      if (reduceSinkOperator.getConf().getKeyCols().equals(
+          reduceSinkOperator.getConf().getPartitionCols()) &&
+          reduceSinkOperator.getConf().getKeyCols().size() == child.getConf().getKeys().size()) {
+        return false;
+      }
+    }
+    return true;
+  }
 }

Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkWork.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkWork.java?rev=1618095&r1=1618094&r2=1618095&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkWork.java	(original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkWork.java	Fri Aug 15 05:00:26 2014
@@ -277,7 +277,7 @@ public class GenSparkWork implements Nod
           edgeProp.setShuffleGroup();
         }
         String sortOrder = Strings.nullToEmpty(rs.getConf().getOrder()).trim();
-        if (!sortOrder.isEmpty()) {
+        if (!sortOrder.isEmpty() && GenSparkUtils.isSortNecessary(rs)) {
           edgeProp.setShuffleSort();
         }
         sparkWork.connect(work, rWork, edgeProp);



Mime
View raw message