hive-commits mailing list archives

From jpull...@apache.org
Subject svn commit: r1674191 - in /hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite: cost/ reloperators/ stats/
Date Fri, 17 Apr 2015 02:33:46 GMT
Author: jpullokk
Date: Fri Apr 17 02:33:46 2015
New Revision: 1674191

URL: http://svn.apache.org/r1674191
Log:
HIVE-10369: Extended Cost Model Changes: splitCount, scan cost (Laljo John Pullokkaran)

Modified:
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveAlgorithmsUtil.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java
    hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java

Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveAlgorithmsUtil.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveAlgorithmsUtil.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveAlgorithmsUtil.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveAlgorithmsUtil.java Fri Apr 17 02:33:46 2015
@@ -65,9 +65,8 @@ public class HiveAlgorithmsUtil {
     return new HiveCost(hr.getRows(), 0, 0);
   }
 
-  public HiveCost computeCost(HiveTableScan t) {
-    double cardinality = t.getRows();
-    return new HiveCost(cardinality, 0, hdfsWrite * cardinality * 0);
+  public HiveCost computeScanCost(double cardinality, double avgTupleSize) {
+    return new HiveCost(cardinality, 0, hdfsRead * cardinality * avgTupleSize);
   }
 
   public double computeSortMergeCPUCost(

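For context on the hunk above: the old computeCost(HiveTableScan) effectively charged zero IO (the hdfsWrite term was multiplied by 0), while the new computeScanCost charges IO in proportion to the bytes scanned, i.e. hdfsRead * cardinality * avgTupleSize. Below is a minimal, standalone sketch of that formula; the per-byte constant is an illustrative assumption rather than the value HiveAlgorithmsUtil actually uses, and ScanCostSketch is a hypothetical class, not part of the patch.

    // ScanCostSketch is hypothetical and standalone; the real method lives in HiveAlgorithmsUtil
    // and takes its read-cost constant from the cost model configuration.
    public class ScanCostSketch {
      private static final double HDFS_READ_COST_PER_BYTE = 1.5; // assumed illustrative value

      // CPU component stays zero; IO component = bytes scanned * per-byte read cost.
      static double[] computeScanCost(double cardinality, double avgTupleSize) {
        double io = HDFS_READ_COST_PER_BYTE * cardinality * avgTupleSize;
        return new double[] { cardinality, 0d, io }; // rows, cpu, io, mirroring HiveCost's fields
      }

      public static void main(String[] args) {
        double[] cost = computeScanCost(1_000_000d, 100d); // 1M rows averaging 100 bytes each
        System.out.printf("rows=%.0f cpu=%.0f io=%.0f%n", cost[0], cost[1], cost[2]);
      }
    }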
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveCostModel.java Fri Apr 17 02:33:46 2015
@@ -27,6 +27,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 
 import com.google.common.collect.ImmutableList;
 
@@ -48,6 +49,8 @@ public abstract class HiveCostModel {
 
   public abstract RelOptCost getAggregateCost(HiveAggregate aggregate);
 
+  public abstract RelOptCost getScanCost(HiveTableScan ts);
+
   public RelOptCost getJoinCost(HiveJoin join) {
     // Select algorithm with min cost
     JoinAlgorithm joinAlgorithm = null;

Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveDefaultCostModel.java Fri Apr 17 02:33:46 2015
@@ -23,6 +23,7 @@ import org.apache.calcite.rel.RelDistrib
 import org.apache.calcite.rel.metadata.RelMetadataQuery;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Sets;
@@ -53,10 +54,14 @@ public class HiveDefaultCostModel extend
   }
 
   @Override
-  public RelOptCost getAggregateCost(HiveAggregate aggregate) {
+  public RelOptCost getScanCost(HiveTableScan ts) {
     return HiveCost.FACTORY.makeZeroCost();
   }
 
+  @Override
+  public RelOptCost getAggregateCost(HiveAggregate aggregate) {
+    return HiveCost.FACTORY.makeZeroCost();
+  }
 
   /**
    * Default join algorithm. Cost is based on cardinality.

Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveOnTezCostModel.java Fri Apr 17 02:33:46 2015
@@ -33,6 +33,7 @@ import org.apache.hadoop.hive.conf.HiveC
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin.MapJoinStreamingRelation;
 
 import com.google.common.collect.ImmutableList;
@@ -71,6 +72,11 @@ public class HiveOnTezCostModel extends
   }
 
   @Override
+  public RelOptCost getScanCost(HiveTableScan ts) {
+    return algoUtils.computeScanCost(ts.getRows(), RelMetadataQuery.getAverageRowSize(ts));
+  }
+
+  @Override
   public RelOptCost getAggregateCost(HiveAggregate aggregate) {
     if (aggregate.isBucketedInput()) {
       return HiveCost.FACTORY.makeZeroCost();
@@ -166,12 +172,18 @@ public class HiveOnTezCostModel extends
 
     @Override
     public Double getCumulativeMemoryWithinPhaseSplit(HiveJoin join) {
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezCommonJoinAlgorithm.INSTANCE);
+
       final Double memoryWithinPhase =
           RelMetadataQuery.cumulativeMemoryWithinPhase(join);
       final Integer splitCount = RelMetadataQuery.splitCount(join);
+      join.setJoinAlgorithm(oldAlgo);
+
       if (memoryWithinPhase == null || splitCount == null) {
         return null;
       }
+      
       return memoryWithinPhase / splitCount;
     }
 
@@ -252,8 +264,11 @@ public class HiveOnTezCostModel extends
               add(new Pair<Double,Double>(leftRCount,leftRAverageSize)).
               add(new Pair<Double,Double>(rightRCount,rightRAverageSize)).
               build();
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezMapJoinAlgorithm.INSTANCE);
       final int parallelism = RelMetadataQuery.splitCount(join) == null
               ? 1 : RelMetadataQuery.splitCount(join);
+      join.setJoinAlgorithm(oldAlgo);
       final double ioCost = algoUtils.computeMapJoinIOCost(relationInfos, streaming, parallelism);
       // 4. Result
       return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost);
@@ -346,7 +361,13 @@ public class HiveOnTezCostModel extends
 
       // Requirements: for Bucket, bucketed by their keys on both sides and fitting in memory
       // Obtain number of buckets
+      //TODO: In case of non-bucketed tables, splits would be computed based on data size/max part size
+      // What we need is a way to get buckets not splits
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezBucketJoinAlgorithm.INSTANCE);
       Integer buckets = RelMetadataQuery.splitCount(smallInput);
+      join.setJoinAlgorithm(oldAlgo);
+
       if (buckets == null) {
         return false;
       }
@@ -406,8 +427,13 @@ public class HiveOnTezCostModel extends
               add(new Pair<Double,Double>(leftRCount,leftRAverageSize)).
               add(new Pair<Double,Double>(rightRCount,rightRAverageSize)).
               build();
+      //TODO: No Of buckets is not same as no of splits
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezBucketJoinAlgorithm.INSTANCE);
       final int parallelism = RelMetadataQuery.splitCount(join) == null
               ? 1 : RelMetadataQuery.splitCount(join);
+      join.setJoinAlgorithm(oldAlgo);
+
+      final double ioCost = algoUtils.computeBucketMapJoinIOCost(relationInfos, streaming, parallelism);
       // 4. Result
       return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost);
@@ -550,8 +576,14 @@ public class HiveOnTezCostModel extends
               add(new Pair<Double,Double>(leftRCount,leftRAverageSize)).
               add(new Pair<Double,Double>(rightRCount,rightRAverageSize)).
               build();
-      final int parallelism = RelMetadataQuery.splitCount(join) == null
-              ? 1 : RelMetadataQuery.splitCount(join);
+
+      // TODO: Split count is not the same as no of buckets
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezSMBJoinAlgorithm.INSTANCE);
+      final int parallelism = RelMetadataQuery.splitCount(join) == null ? 1 : RelMetadataQuery
+          .splitCount(join);
+      join.setJoinAlgorithm(oldAlgo);
+
       final double ioCost = algoUtils.computeSMBMapJoinIOCost(relationInfos, streaming, parallelism);
       // 4. Result
       return HiveCost.FACTORY.makeCost(rCount, cpuCost, ioCost);
@@ -575,9 +607,14 @@ public class HiveOnTezCostModel extends
 
     @Override
     public Double getCumulativeMemoryWithinPhaseSplit(HiveJoin join) {
-      final Double memoryWithinPhase =
-          RelMetadataQuery.cumulativeMemoryWithinPhase(join);
+      // TODO: Split count is not same as no of buckets
+      JoinAlgorithm oldAlgo = join.getJoinAlgorithm();
+      join.setJoinAlgorithm(TezSMBJoinAlgorithm.INSTANCE);
+
+      final Double memoryWithinPhase = RelMetadataQuery.cumulativeMemoryWithinPhase(join);
       final Integer splitCount = RelMetadataQuery.splitCount(join);
+      join.setJoinAlgorithm(oldAlgo);
+
       if (memoryWithinPhase == null || splitCount == null) {
         return null;
       }

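Several hunks in HiveOnTezCostModel above share one idiom: save the join's current algorithm, temporarily set the algorithm being costed (common, map, bucket or SMB join) so that RelMetadataQuery.splitCount and cumulativeMemoryWithinPhase answer for that algorithm, then restore the original. The sketch below is a hypothetical distillation of that idiom; Join and the functional parameter stand in for HiveJoin and RelMetadataQuery, and the try/finally restore is an editorial addition (the committed code restores the algorithm inline).

    // Hypothetical distillation of the save/set/restore idiom; Join is a stand-in, not HiveJoin.
    final class AlgorithmScopedMetadata {
      interface Join {
        Object getJoinAlgorithm();
        void setJoinAlgorithm(Object algorithm);
      }

      // Ask a metadata question as if 'candidate' were the join's algorithm, then restore.
      static Integer splitCountUnder(Join join, Object candidate,
          java.util.function.Function<Join, Integer> splitCountFn) {
        Object oldAlgo = join.getJoinAlgorithm();
        join.setJoinAlgorithm(candidate);
        try {
          return splitCountFn.apply(join);
        } finally {
          join.setJoinAlgorithm(oldAlgo); // later cost calls see the join's original algorithm
        }
      }
    }

In the hunks above, the metadata question is the static RelMetadataQuery.splitCount or cumulativeMemoryWithinPhase call, and the candidate algorithm is the corresponding Tez*JoinAlgorithm.INSTANCE.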
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/cost/HiveRelMdCost.java Fri Apr 17 02:33:46 2015
@@ -26,6 +26,7 @@ import org.apache.calcite.rel.metadata.R
 import org.apache.calcite.util.BuiltInMethod;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 
 import com.google.common.collect.ImmutableList;
 
@@ -56,6 +57,10 @@ public class HiveRelMdCost {
     return hiveCostModel.getJoinCost(join);
   }
 
+  public RelOptCost getNonCumulativeCost(HiveTableScan ts) {
+    return hiveCostModel.getScanCost(ts);
+  }
+
   // Default case
   public RelOptCost getNonCumulativeCost(RelNode rel) {
     return hiveCostModel.getDefaultCost();

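The new getNonCumulativeCost(HiveTableScan) overload above lets the metadata layer route table scans to the cost model's getScanCost while every other operator still falls through to the default-cost overload. As I understand Calcite's reflective metadata providers, dispatch is based on the RelNode's runtime class rather than Java's compile-time overload resolution; the hypothetical sketch below (stand-in types, not the Calcite API) only illustrates why a separate, more specific overload is the hook point.

    // Stand-in types; not the Calcite/Hive classes. Shows why a more specific overload matters.
    class DispatchSketch {
      interface Node {}
      static class TableScanNode implements Node {}

      static String getNonCumulativeCost(TableScanNode scan) {
        return "scan cost";      // analogous to hiveCostModel.getScanCost(ts)
      }

      static String getNonCumulativeCost(Node rel) {
        return "default cost";   // analogous to hiveCostModel.getDefaultCost()
      }

      public static void main(String[] args) {
        System.out.println(getNonCumulativeCost(new TableScanNode())); // scan cost
        Node base = new TableScanNode();
        // Java picks the Node overload here at compile time; Calcite's reflective provider
        // instead looks at the runtime class, so scans reach the specialized handler.
        System.out.println(getNonCumulativeCost(base)); // default cost
      }
    }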
Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java Fri Apr 17 02:33:46 2015
@@ -112,6 +112,10 @@ public class HiveJoin extends Join imple
     this.joinAlgorithm = joinAlgorithm;
   }
 
+  public JoinAlgorithm getJoinAlgorithm() {
+    return this.joinAlgorithm;
+  }
+
   public ImmutableList<RelCollation> getCollation() {
     return joinAlgorithm.getCollation(this);
   }

Modified: hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java
URL: http://svn.apache.org/viewvc/hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java?rev=1674191&r1=1674190&r2=1674191&view=diff
==============================================================================
--- hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java (original)
+++ hive/branches/cbo/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdParallelism.java Fri Apr 17 02:33:46 2015
@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hive.ql.optimizer.calcite.stats;
 
+import java.util.List;
+
 import org.apache.calcite.rel.RelNode;
 import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMdParallelism;
@@ -61,8 +63,21 @@ public class HiveRelMdParallelism extend
   }
 
   public Integer splitCount(HiveTableScan scan) {
+    Integer splitCount;
+
     RelOptHiveTable table = (RelOptHiveTable) scan.getTable();
-    return table.getHiveTableMD().getNumBuckets();
+    List<String> bucketCols = table.getHiveTableMD().getBucketCols();
+    if (bucketCols != null && !bucketCols.isEmpty()) {
+      splitCount = table.getHiveTableMD().getNumBuckets();
+    } else {
+      splitCount = splitCountRepartition(scan);
+      if (splitCount == null) {
+        throw new RuntimeException("Could not get split count for table: "
+            + scan.getTable().getQualifiedName());
+      }
+    }
+
+    return splitCount;
   }
 
   public Integer splitCount(RelNode rel) {

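The reworked splitCount(HiveTableScan) above prefers the table's declared bucket count and falls back to splitCountRepartition for non-bucketed tables, failing fast when no estimate is available. The sketch below is a hypothetical, self-contained rendering of that decision; the size-based fallback shown is an assumed stand-in for splitCountRepartition, which is not part of this diff.

    // Hypothetical rendering of the decision above; the size-based fallback is an assumed
    // stand-in for splitCountRepartition, not Hive's actual estimator.
    class SplitCountSketch {
      static Integer splitCount(java.util.List<String> bucketCols, Integer numBuckets,
          Double dataSizeBytes, double maxSplitSizeBytes) {
        if (bucketCols != null && !bucketCols.isEmpty()) {
          return numBuckets;                       // bucketed table: one split per bucket
        }
        if (dataSizeBytes == null) {               // mirrors the null check on the fallback
          throw new RuntimeException("Could not get split count for table");
        }
        return (int) Math.max(1, Math.ceil(dataSizeBytes / maxSplitSizeBytes));
      }

      public static void main(String[] args) {
        System.out.println(splitCount(java.util.List.of("id"), 32, null, 256e6)); // 32
        System.out.println(splitCount(java.util.List.of(), null, 1e9, 256e6));    // 4
      }
    }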

