hive-commits mailing list archives

From jd...@apache.org
Subject [3/3] hive git commit: HIVE-16154: Determine when dynamic runtime filtering should be disabled (Jason Dere, reviewed by Gunther Hagleitner)
Date Thu, 23 Mar 2017 21:31:13 GMT
HIVE-16154: Determine when dynamic runtime filtering should be disabled (Jason Dere, reviewed by Gunther Hagleitner)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/736d2e86
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/736d2e86
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/736d2e86

Branch: refs/heads/master
Commit: 736d2e861f396532bdc6c2a4f1410aeae8c36ec6
Parents: 0466fca
Author: Jason Dere <jdere@hortonworks.com>
Authored: Thu Mar 23 14:30:16 2017 -0700
Committer: Jason Dere <jdere@hortonworks.com>
Committed: Thu Mar 23 14:30:16 2017 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |   2 +
 .../DynamicPartitionPruningOptimization.java    |   1 +
 .../hadoop/hive/ql/parse/RuntimeValuesInfo.java |  10 +
 .../hadoop/hive/ql/parse/TezCompiler.java       | 218 +++++++
 .../hadoop/hive/ql/plan/ExprNodeDescUtils.java  |  71 +++
 .../apache/hadoop/hive/ql/stats/StatsUtils.java |  47 +-
 .../clientpositive/dynamic_semijoin_reduction.q |   4 +-
 .../dynamic_semijoin_reduction_2.q              |   1 +
 .../dynamic_semijoin_reduction_3.q              |   2 +
 .../vectorized_dynamic_semijoin_reduction2.q    |   2 +
 .../llap/dynamic_partition_pruning.q.out        |  39 +-
 .../llap/dynamic_semijoin_reduction.q.out       | 587 +++++++++----------
 .../results/clientpositive/llap/mergejoin.q.out | 438 +++-----------
 .../vectorized_dynamic_partition_pruning.q.out  |  46 +-
 14 files changed, 716 insertions(+), 752 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 1bc3a6e..6e16200 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2857,6 +2857,8 @@ public class HiveConf extends Configuration {
             "Bloom filter should be a multiple of this factor with nDV"),
    TEZ_BIGTABLE_MIN_SIZE_SEMIJOIN_REDUCTION("hive.tez.bigtable.minsize.semijoin.reduction", 1000000L,
            "Big table for runtime filtering should be of at least this size"),
+    TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD("hive.tez.dynamic.semijoin.reduction.threshold", (float) 0.50,
+            "Only perform semijoin optimization if the estimated benefit is at or above this fraction of the target table"),
     TEZ_SMB_NUMBER_WAVES(
         "hive.tez.smb.number.waves",
         (float) 0.5,
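
A minimal, self-contained sketch of what the new threshold means, using made-up
numbers (the ThresholdDemo class and every figure here are hypothetical; the
committed decision logic is in TezCompiler.removeSemijoinOptimizationByBenefit
further down):

    // Hypothetical illustration only - not part of the patch.
    public class ThresholdDemo {
      public static void main(String[] args) {
        double threshold = 0.50;      // hive.tez.dynamic.semijoin.reduction.threshold
        long tsRows = 1_000_000L;     // rows in the table scan being filtered
        double benefit = 800_000;     // estimated rows the bloom filter removes
        double cost = 50_000;         // rows scanned on the source side to build it
        double netBenefit = (benefit - cost) / tsRows;  // fraction of target table saved
        // The semijoin reduction is kept only when the net benefit reaches the threshold.
        System.out.println(netBenefit >= threshold
            ? "keep semijoin reduction" : "remove semijoin reduction");
      }
    }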

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index 727f7bc..b9f5912 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -663,6 +663,7 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
     runtimeValuesInfo.setTableDesc(rsFinalTableDesc);
     runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs);
     runtimeValuesInfo.setColExprs(rsValueCols);
+    runtimeValuesInfo.setTsColExpr(ctx.parent.getChildren().get(0));
     parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
 
     return true;

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java
index 5865f1a..0fe8a27 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java
@@ -33,6 +33,8 @@ public class RuntimeValuesInfo implements Serializable {
   private TableDesc tableDesc;
   private List<String> dynamicValueIDs;
   private List<ExprNodeDesc> colExprs;
+  // Column expression of the table being filtered by the semijoin optimization.
+  private ExprNodeDesc tsColExpr;
 
   // get-set methods
   public TableDesc getTableDesc() {
@@ -58,5 +60,13 @@ public class RuntimeValuesInfo implements Serializable {
   public void setColExprs(List<ExprNodeDesc> colExprs) {
     this.colExprs = colExprs;
   }
+
+  public ExprNodeDesc getTsColExpr() {
+    return tsColExpr;
+  }
+
+  public void setTsColExpr(ExprNodeDesc tsColExpr) {
+    this.tsColExpr = tsColExpr;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index 468e18e..62bd652 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hive.ql.parse;
 
+import com.google.common.base.Preconditions;
 import java.io.Serializable;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -58,6 +59,8 @@ import org.apache.hadoop.hive.ql.optimizer.physical.Vectorizer;
 import org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
 
 /**
  * TezCompiler translates the operator plan into TezTasks.
@@ -101,6 +104,9 @@ public class TezCompiler extends TaskCompiler {
     runStatsDependentOptimizations(procCtx, inputs, outputs);
    perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run the optimizations that use stats for optimization");
 
+    // Removing semijoin optimization when it may not be beneficial
+    removeSemijoinOptimizationByBenefit(procCtx);
+
     perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER);
     // Remove any parallel edge between semijoin and mapjoin.
     removeSemijoinsParallelToMapJoin(procCtx);
@@ -932,4 +938,216 @@ public class TezCompiler extends TaskCompiler {
     }
   }
 
+  private static boolean canUseNDV(ColStatistics colStats) {
+    return (colStats != null) && (colStats.getCountDistint() >= 0);
+  }
+
+  private static double getBloomFilterCost(
+      SelectOperator sel,
+      FilterOperator fil) {
+    double cost = -1;
+    Statistics selStats = sel.getStatistics();
+    if (selStats != null) {
+      cost = selStats.getNumRows();
+
+      // Some other things that could be added here to model cost:
+      // Cost of computing/sending partial BloomFilter results? BloomFilterSize * # mappers
+      // For reduce-side join, add the cost of the semijoin table scan/dependent tablescans?
+    }
+    return cost;
+  }
+
+  private static long getCombinedKeyDomainCardinality(
+      ColStatistics selColStat,
+      ColStatistics selColSourceStat,
+      ColStatistics tsColStat) {
+    long keyDomainCardinality = -1;
+    if (!canUseNDV(selColStat) || !canUseNDV(tsColStat)) {
+      return -1;
+    }
+
+    long selColSourceNdv = canUseNDV(selColSourceStat) ? selColSourceStat.getCountDistint() : -1;
+    boolean semiJoinKeyIsPK = StatsUtils.inferForeignKey(selColStat, tsColStat);
+    if (semiJoinKeyIsPK) {
+      // PK/FK relationship: NDV of selColSourceStat is a superset of what is in tsColStat
+      if (selColSourceNdv >= 0) {
+        // Most accurate domain cardinality would be source column NDV if available.
+        keyDomainCardinality = selColSourceNdv;
+      }
+    } else {
+      if (selColSourceNdv >= 0) {
+        // If the semijoin keys and ts keys are completely unrelated, the cardinality of both sets
+        // could be obtained by adding both cardinalities. Would there be an average case?
+        keyDomainCardinality = selColSourceNdv + tsColStat.getCountDistint();
+
+        // Don't exceed the range if we have one.
+        if (StatsUtils.hasDiscreteRange(selColStat)
+            && StatsUtils.hasDiscreteRange(tsColStat)) {
+          long range = 0;
+          // Try using the cardinality from the value range.
+          ColStatistics.Range combinedRange = StatsUtils.combineRange(selColStat.getRange(), tsColStat.getRange());
+          if (combinedRange != null) {
+            range = StatsUtils.getRangeDelta(combinedRange);
+          } else {
+            range = StatsUtils.getRangeDelta(selColStat.getRange())
+                + StatsUtils.getRangeDelta(tsColStat.getRange());
+          }
+          keyDomainCardinality = Math.min(keyDomainCardinality, range);
+        }
+      }
+      // Otherwise, leave keyDomainCardinality at -1.
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Computing key domain cardinality, keyDomainCardinality=" + keyDomainCardinality
+          + ", semiJoinKeyIsPK=" + semiJoinKeyIsPK
+          + ", selColStat=" + selColStat
+          + ", selColSourceStat=" + selColSourceStat
+          + ", tsColStat=" + tsColStat);
+    }
+
+    return keyDomainCardinality;
+  }
+
+  private static double getBloomFilterBenefit(
+      SelectOperator sel, ExprNodeDesc selExpr,
+      FilterOperator fil, ExprNodeDesc tsExpr) {
+    double benefit = -1;
+    Statistics selStats = sel.getStatistics();
+    Statistics filStats = fil.getStatistics();
+    if (selStats == null || filStats == null) {
+      LOG.debug("No stats available to compute BloomFilter benefit");
+      return benefit;
+    }
+
+    // For cardinality values, use numRows as the default; try to use ColStats if available
+    long selKeyCardinality = selStats.getNumRows();
+    long tsKeyCardinality = filStats.getNumRows();
+    long tsRows = filStats.getNumRows();
+    long tsRowSize = filStats.getAvgRowSize();
+    long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;
+
+    ExprNodeColumnDesc selCol = ExprNodeDescUtils.getColumnExpr(selExpr);
+    ExprNodeColumnDesc tsCol = ExprNodeDescUtils.getColumnExpr(tsExpr);
+    if (selCol != null && tsCol != null) {
+      // Check if there are column stats available for these columns
+      ColStatistics selColStat = selStats.getColumnStatisticsFromColName(selCol.getColumn());
+      ColStatistics filColStat = filStats.getColumnStatisticsFromColName(tsCol.getColumn());
+      if (canUseNDV(selColStat)) {
+        selKeyCardinality = selColStat.getCountDistint();
+      }
+      if (canUseNDV(filColStat)) {
+        tsKeyCardinality = filColStat.getCountDistint();
+      }
+      // Get colstats for the original table column for selCol if possible; this would have
+      // more accurate information about the original NDV of the column before any filtering.
+      ColStatistics selColSourceStat = null;
+      if (selColStat != null) {
+        ExprNodeDescUtils.ColumnOrigin selColSource = ExprNodeDescUtils.findColumnOrigin(selCol, sel);
+        if (selColSource != null && selColSource.op.getStatistics() != null) {
+          selColSourceStat = selColSource.op.getStatistics().getColumnStatisticsFromColName(
+              selColSource.col.getColumn());
+        }
+      }
+      long domainCardinalityFromColStats = getCombinedKeyDomainCardinality(
+          selColStat, selColSourceStat, filColStat);
+      if (domainCardinalityFromColStats >= 0) {
+        keyDomainCardinality = domainCardinalityFromColStats;
+      }
+    }
+
+    // Selectivity: key cardinality of semijoin / domain cardinality
+    // Benefit (rows filtered from ts): (1 - selectivity) * # ts rows
+    double selectivity = selKeyCardinality / (double) keyDomainCardinality;
+    selectivity = Math.min(selectivity, 1);
+    benefit = tsRows * (1 - selectivity);
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("BloomFilter benefit for " + selCol + " to " + tsCol
+          + ", selKeyCardinality=" + selKeyCardinality
+          + ", tsKeyCardinality=" + tsKeyCardinality
+          + ", tsRows=" + tsRows
+          + ", keyDomainCardinality=" + keyDomainCardinality);
+      LOG.debug("SemiJoin key selectivity=" + selectivity
+          + ", benefit=" + benefit);
+    }
+
+    return benefit;
+  }
+
+  private static double computeBloomFilterNetBenefit(
+      SelectOperator sel, ExprNodeDesc selExpr,
+      FilterOperator fil, ExprNodeDesc tsExpr) {
+    double netBenefit = -1;
+    double benefit = getBloomFilterBenefit(sel, selExpr, fil, tsExpr);
+    Statistics filStats = fil.getStatistics();
+    if (benefit > 0 && filStats != null) {
+      double cost = getBloomFilterCost(sel, fil);
+      if (cost > 0) {
+        long filDataSize = filStats.getNumRows();
+        netBenefit = (benefit - cost) / filDataSize;
+        LOG.debug("BloomFilter benefit=" + benefit
+            + ", cost=" + cost
+            + ", tsDataSize=" + filDataSize
+            + ", netBenefit=" + (benefit - cost));
+      }
+    }
+    LOG.debug("netBenefit=" + netBenefit);
+    return netBenefit;
+  }
+
+  private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
+      throws SemanticException {
+    if(!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) {
+      // Not needed without semi-join reduction
+      return;
+    }
+
+    List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<ReduceSinkOperator>();
+    Map<ReduceSinkOperator, TableScanOperator> map = procCtx.parseContext.getRsOpToTsOpMap();
+    double semijoinReductionThreshold = procCtx.conf.getFloatVar(
+        HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
+    for (ReduceSinkOperator rs : map.keySet()) {
+      // rs is the semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
+      // Get to the SelectOperator ancestor
+      SelectOperator sel = null;
+      for (Operator<?> currOp = rs; currOp.getParentOperators().size() > 0; currOp = currOp.getParentOperators().get(0)) {
+        if (currOp instanceof SelectOperator) {
+          sel = (SelectOperator) currOp;
+          break;
+        }
+      }
+      if (sel == null) {
+        throw new SemanticException("Unexpected error - could not find SEL ancestor from semijoin branch of " + rs);
+      }
+
+      // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
+      TableScanOperator ts = map.get(rs);
+      RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
+      ExprNodeDesc tsExpr = rti.getTsColExpr();
+      // In the SEL operator of the semijoin branch, there should be only one column in the operator
+      ExprNodeDesc selExpr = sel.getConf().getColList().get(0);
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Computing BloomFilter cost/benefit for " + OperatorUtils.getOpNamePretty(rs)
+            + " - " + OperatorUtils.getOpNamePretty(ts) + " (" + tsExpr + ")");
+      }
+
+      double reductionFactor = computeBloomFilterNetBenefit(sel, selExpr,
+              (FilterOperator)ts.getChildOperators().get(0), tsExpr);
+      if (reductionFactor < semijoinReductionThreshold) {
+        // This semijoin optimization should be removed. Do it after we're done iterating
+        semijoinRsToRemove.add(rs);
+      }
+    }
+
+    for (ReduceSinkOperator rs : semijoinRsToRemove) {
+      TableScanOperator ts = map.get(rs);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs)
+            + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");
+      }
+      GenTezUtils.removeBranch(rs);
+      GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, ts);
+    }
+  }
 }
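
To make the cost/benefit arithmetic above concrete, here is a standalone rework of
the formulas from getBloomFilterBenefit() with hypothetical statistics (the
BenefitDemo class and the numbers are illustrative; the patch pulls the real
values from operator ColStatistics):

    // Hypothetical walk-through of the selectivity/benefit formulas.
    public class BenefitDemo {
      public static void main(String[] args) {
        long selKeyCardinality = 20;     // NDV of the semijoin key on the source side
        long tsKeyCardinality = 500;     // NDV of the key on the filtered table scan
        long tsRows = 10_000;            // rows in the filtered table scan
        // Fallback domain estimate when the key sets may be unrelated:
        long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;  // 520
        // Selectivity: fraction of the key domain covered by the semijoin keys.
        double selectivity =
            Math.min(selKeyCardinality / (double) keyDomainCardinality, 1.0);  // ~0.038
        // Benefit: rows expected to be filtered out of the table scan.
        double benefit = tsRows * (1 - selectivity);  // ~9615 rows
        System.out.println("selectivity=" + selectivity + ", benefit=" + benefit);
      }
    }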

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
index fac60c1..bfc1eca 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
 import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
 import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
@@ -786,4 +787,74 @@ public class ExprNodeDescUtils {
     }
     return true;
   }
+
+  public static class ColumnOrigin {
+    public ExprNodeColumnDesc col;
+    public Operator<?> op;
+
+    public ColumnOrigin(ExprNodeColumnDesc col, Operator<?> op) {
+      super();
+      this.col = col;
+      this.op = op;
+    }
+  }
+
+  private static ExprNodeDesc findParentExpr(ExprNodeColumnDesc col, Operator<?> op) {
+    if (op instanceof ReduceSinkOperator) {
+      return col;
+    }
+
+    ExprNodeDesc parentExpr = col;
+    Map<String, ExprNodeDesc> mapping = op.getColumnExprMap();
+    if (mapping != null) {
+      parentExpr = mapping.get(col.getColumn());
+    }
+    return parentExpr;
+  }
+
+  public static ColumnOrigin findColumnOrigin(ExprNodeDesc expr, Operator<?> op) {
+    if (expr == null || op == null) {
+      // bad input
+      return null;
+    }
+
+    ExprNodeColumnDesc col = ExprNodeDescUtils.getColumnExpr(expr);
+    if (col == null) {
+      // not a column
+      return null;
+    }
+
+    Operator<?> parentOp = null;
+    int numParents = op.getNumParent();
+    if (numParents == 0) {
+      return new ColumnOrigin(col, op);
+    }
+
+    ExprNodeDesc parentExpr = findParentExpr(col, op);
+    if (parentExpr == null) {
+      // couldn't find proper parent column expr
+      return null;
+    }
+
+    if (numParents == 1) {
+      parentOp = op.getParentOperators().get(0);
+    } else {
+      // Multiple parents - find the right one based on the table alias in the parentExpr
+      ExprNodeColumnDesc parentCol = ExprNodeDescUtils.getColumnExpr(parentExpr);
+      if (parentCol != null) {
+        for (Operator<?> currParent : op.getParentOperators()) {
+          if (currParent.getSchema().getTableNames().contains(parentCol.getTabAlias())) {
+            parentOp = currParent;
+            break;
+          }
+        }
+      }
+    }
+
+    if (parentOp == null) {
+      return null;
+    }
+
+    return findColumnOrigin(parentExpr, parentOp);
+  }
 }
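
findColumnOrigin() above traces a column reference back up the operator DAG,
remapping it through each operator's column/expression map until it reaches a
source (or gives up when the mapping is lost). A simplified, self-contained
analog of that walk, with a toy Node type standing in for Hive operators
(everything below is hypothetical and covers only the single-parent case):

    import java.util.List;
    import java.util.Map;

    // Toy stand-in for an operator DAG node; not a Hive class.
    class Node {
      final String name;
      final Map<String, String> columnMap;  // output column -> input column (null = identity)
      final List<Node> parents;
      Node(String name, Map<String, String> columnMap, List<Node> parents) {
        this.name = name; this.columnMap = columnMap; this.parents = parents;
      }
    }

    public class OriginDemo {
      // Recursively trace col back to the node that produced it.
      static String findOrigin(String col, Node op) {
        if (op.parents.isEmpty()) {
          return op.name + ":" + col;                 // reached a source operator
        }
        String parentCol = (op.columnMap != null) ? op.columnMap.get(col) : col;
        if (parentCol == null) {
          return null;                                // mapping lost - give up
        }
        return findOrigin(parentCol, op.parents.get(0));
      }

      public static void main(String[] args) {
        Node scan = new Node("ts_customer", null, List.of());
        Node select = new Node("sel", Map.of("_col0", "cust_id"), List.of(scan));
        System.out.println(findOrigin("_col0", select));  // prints ts_customer:cust_id
      }
    }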

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 413aacf..bda2050 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -75,6 +75,7 @@ import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantStructObjectInspector;
@@ -100,6 +101,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObje
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
 import org.apache.hadoop.io.BytesWritable;
@@ -393,7 +396,7 @@ public class StatsUtils {
     return scaledSelectivity;
   }
 
-  private static long getRangeDelta(ColStatistics.Range range) {
+  public static long getRangeDelta(ColStatistics.Range range) {
     if (range.minValue != null && range.maxValue != null) {
       return (range.maxValue.longValue() - range.minValue.longValue());
     }
@@ -1684,4 +1687,46 @@ public class StatsUtils {
     }
     return numBitVectors;
   }
+
+  public static boolean hasDiscreteRange(ColStatistics colStat) {
+    if (colStat.getRange() != null) {
+      TypeInfo colType = TypeInfoUtils.getTypeInfoFromTypeString(colStat.getColumnType());
+      if (colType.getCategory() == Category.PRIMITIVE) {
+        PrimitiveTypeInfo pti = (PrimitiveTypeInfo) colType;
+        switch (pti.getPrimitiveCategory()) {
+          case BOOLEAN:
+          case BYTE:
+          case SHORT:
+          case INT:
+          case LONG:
+            return true;
+          default:
+            break;
+        }
+      }
+    }
+    return false;
+  }
+
+  public static Range combineRange(Range range1, Range range2) {
+    if (   range1.minValue != null && range1.maxValue != null
+        && range2.minValue != null && range2.maxValue != null) {
+      long min1 = range1.minValue.longValue();
+      long max1 = range1.maxValue.longValue();
+      long min2 = range2.minValue.longValue();
+      long max2 = range2.maxValue.longValue();
+
+      if (   (min1 < min2 && max1 < max2)
+          || (min1 > min2 && max1 > max2)) {
+        // Ranges are disjoint or staggered (neither contains the other) - do not combine.
+        return null;
+      } else {
+        // There is an overlap of ranges - create combined range.
+        return new ColStatistics.Range(
+            Math.min(min1, min2),
+            Math.max(max1, max2));
+      }
+    }
+    return null;
+  }
 }
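
combineRange() above merges two min/max ranges only when one contains the other;
ranges that are disjoint or merely staggered return null, and the caller falls
back to summing the two range deltas. A small self-contained demonstration with
hypothetical values, using long[] in place of ColStatistics.Range:

    // Hypothetical illustration of combineRange()'s containment check.
    public class RangeDemo {
      // Same condition as the patch: combine unless the ranges are shifted
      // against each other (neither one containing the other).
      static long[] combine(long min1, long max1, long min2, long max2) {
        if ((min1 < min2 && max1 < max2) || (min1 > min2 && max1 > max2)) {
          return null;  // not combinable
        }
        return new long[] { Math.min(min1, min2), Math.max(max1, max2) };
      }

      public static void main(String[] args) {
        // [20, 80] sits inside [0, 100], so the ranges combine to [0, 100].
        System.out.println(java.util.Arrays.toString(combine(0, 100, 20, 80)));
        // [0, 10] and [50, 60] are disjoint, so the result is null.
        System.out.println(java.util.Arrays.toString(combine(0, 10, 50, 60)));
      }
    }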

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q
index f04a923..6338ac3 100644
--- a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q
+++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q
@@ -10,6 +10,7 @@ set hive.optimize.index.filter=true;
 set hive.stats.autogather=true;
 set hive.tez.bigtable.minsize.semijoin.reduction=1;
 set hive.tez.min.bloom.filter.entries=1;
+set hive.stats.fetch.column.stats=true;
 
 -- Create Tables
 create table alltypesorc_int ( cint int, cstring string ) stored as ORC;
@@ -27,7 +28,8 @@ alter table srcpart_small add partition (ds = "2008-04-09");
 insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc;
 insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08";
 insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
-insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
+insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" limit 20;
+
 set hive.tez.dynamic.semijoin.reduction=false;
 
 analyze table alltypesorc_int compute statistics for columns;

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_2.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_2.q
index 88386a6..55f6e8a 100644
--- a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_2.q
+++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_2.q
@@ -9,6 +9,7 @@ set hive.optimize.metadataonly=false;
 set hive.optimize.index.filter=true;
 set hive.tez.bigtable.minsize.semijoin.reduction=1;
 set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
 
 CREATE TABLE `table_1`(
   `bigint_col_7` bigint,

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_3.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_3.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_3.q
index d5fe136..18408e4 100644
--- a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_3.q
+++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction_3.q
@@ -14,6 +14,8 @@ set hive.optimize.metadataonly=false;
 set hive.optimize.index.filter=true;
 set hive.tez.bigtable.minsize.semijoin.reduction=1;
 set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
+
 
 -- Try with merge statements
 create table acidTbl(a int, b int) clustered by (a) into 2 buckets stored as orc TBLPROPERTIES ('transactional'='true');

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction2.q b/ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction2.q
index 4bdff42..c4784bd 100644
--- a/ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction2.q
+++ b/ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction2.q
@@ -12,6 +12,8 @@ set hive.tez.min.bloom.filter.entries=1;
 
 set hive.vectorized.adaptor.usage.mode=none;
 set hive.vectorized.execution.enabled=true;
+set hive.stats.fetch.column.stats=true;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
 
 -- Create Tables
 create table dsrv2_big stored as orc as

http://git-wip-us.apache.org/repos/asf/hive/blob/736d2e86/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out b/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out
index e514e2e..35dde96 100644
--- a/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out
+++ b/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out
@@ -3135,11 +3135,9 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 7 <- Reducer 5 (BROADCAST_EDGE)
-        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
-        Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
         Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)
-        Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -3163,7 +3161,7 @@ STAGE PLANS:
                         value expressions: _col1 (type: string)
             Execution mode: llap
             LLAP IO: unknown
-        Map 6 
+        Map 5 
             Map Operator Tree:
                 TableScan
                   alias: srcpart_date
@@ -3183,14 +3181,14 @@ STAGE PLANS:
                        Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE
             Execution mode: llap
             LLAP IO: no inputs
-        Map 7 
+        Map 6 
             Map Operator Tree:
                 TableScan
                   alias: srcpart_hour
-                  filterExpr: ((UDFToDouble(hr) = 13.0) and (hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter)))) (type: boolean)
+                  filterExpr: (UDFToDouble(hr) = 13.0) (type: boolean)
                   Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: ((UDFToDouble(hr) = 13.0) and (hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter)))) (type: boolean)
+                    predicate: (UDFToDouble(hr) = 13.0) (type: boolean)
                     Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: hr (type: string)
@@ -3219,19 +3217,6 @@ STAGE PLANS:
                   sort order: +
                   Map-reduce partition columns: _col1 (type: string)
                   Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE
-                Select Operator
-                  expressions: _col1 (type: string)
-                  outputColumnNames: _col0
-                  Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE
-                  Group By Operator
-                    aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=2)
-                    mode: hash
-                    outputColumnNames: _col0, _col1, _col2
-                    Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE
-                    Reduce Output Operator
-                      sort order: 
-                      Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE
-                      value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
         Reducer 3 
             Execution mode: llap
             Reduce Operator Tree:
@@ -3266,18 +3251,6 @@ STAGE PLANS:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                       serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-        Reducer 5 
-            Execution mode: llap
-            Reduce Operator Tree:
-              Group By Operator
-                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=2)
-                mode: final
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE
-                Reduce Output Operator
-                  sort order: 
-                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE
-                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
 
   Stage: Stage-0
     Fetch Operator

