hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ser...@apache.org
Subject svn commit: r1617399 - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ ql/src/java/org/apac...
Date Tue, 12 Aug 2014 02:13:02 GMT
Author: sershe
Date: Tue Aug 12 02:13:01 2014
New Revision: 1617399

URL: http://svn.apache.org/r1617399
Log:
HIVE-7616 : pre-size mapjoin hashtable based on statistics (Sergey Shelukhin, reviewed by
Gunther Hagleitner, Prasanth J, Mostafa Mokhtar, Gopal V)

Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
    hive/trunk/ql/src/test/queries/clientpositive/mapjoin_mapjoin.q
    hive/trunk/ql/src/test/results/clientpositive/mapjoin_mapjoin.q.out
    hive/trunk/ql/src/test/results/clientpositive/tez/mapjoin_mapjoin.q.out

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue Aug 12 02:13:01
2014
@@ -897,7 +897,12 @@ public class HiveConf extends Configurat
         "This controls how many partitions can be scanned for each partitioned table.\n"
+
         "The default value \"-1\" means no limit."),
 
-    HIVEHASHTABLETHRESHOLD("hive.hashtable.initialCapacity", 100000, ""),
+    HIVEHASHTABLEKEYCOUNTADJUSTMENT("hive.hashtable.key.count.adjustment", 1.0f,
+        "Adjustment to mapjoin hashtable size derived from table and column statistics; the
estimate" +
+        " of the number of keys is divided by this value. If the value is 0, statistics are
not used" +
+        "and hive.hashtable.initialCapacity is used instead."),
+    HIVEHASHTABLETHRESHOLD("hive.hashtable.initialCapacity", 100000, "Initial capacity of
" +
+        "mapjoin hashtable if statistics are absent, or if hive.hashtable.stats.key.estimate.adjustment
is set to 0"),
     HIVEHASHTABLELOADFACTOR("hive.hashtable.loadfactor", (float) 0.75, ""),
     HIVEHASHTABLEFOLLOWBYGBYMAXMEMORYUSAGE("hive.mapjoin.followby.gby.localtask.max.memory.usage",
(float) 0.55,
         "This number means how much memory the local task can take to hold the key/value
into an in-memory hash table \n" +

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java Tue Aug
12 02:13:01 2014
@@ -181,7 +181,7 @@ public class HashTableSinkOperator exten
         if (pos == posBigTableAlias) {
           continue;
         }
-        mapJoinTables[pos] = new HashMapWrapper(hconf);
+        mapJoinTables[pos] = new HashMapWrapper(hconf, -1);
         TableDesc valueTableDesc = conf.getValueTblFilteredDescs().get(pos);
         SerDe valueSerDe = (SerDe) ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(),
null);
         SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/BytesBytesMultiHashMap.java
Tue Aug 12 02:13:01 2014
@@ -145,8 +145,7 @@ public final class BytesBytesMultiHashMa
   private long[] refs;
   private int startingHashBitCount, hashBitCount;
 
-  private int metricPutConflict = 0, metricSameBitsDiffKey = 0,
-      metricSameBitsSameKey = 0, metricDiffBits = 0;
+  private int metricPutConflict = 0, metricExpands = 0, metricExpandsUs = 0;
 
   /** We have 39 bits to store list pointer from the first record; this is size limit */
   final static long MAX_WB_SIZE = ((long)1) << 38;
@@ -430,16 +429,13 @@ public final class BytesBytesMultiHashMa
    */
   private boolean isSameKey(byte[] key, int length, long ref, int hashCode) {
     if (!compareHashBits(ref, hashCode)) {
-      ++metricDiffBits;
       return false;  // Hash bits don't match.
     }
     writeBuffers.setReadPoint(getFirstRecordLengthsOffset(ref));
     int valueLength = (int)writeBuffers.readVLong(), keyLength = (int)writeBuffers.readVLong();
     long keyOffset = Ref.getOffset(ref) - (valueLength + keyLength);
     // See the comment in the other isSameKey
-    boolean result = writeBuffers.isEqual(key, length, keyOffset, keyLength);
-    if (result) { ++metricSameBitsSameKey; } else { ++metricSameBitsDiffKey; }
-    return result;
+    return writeBuffers.isEqual(key, length, keyOffset, keyLength);
   }
 
   private boolean compareHashBits(long ref, int hashCode) {
@@ -461,6 +457,7 @@ public final class BytesBytesMultiHashMa
   }
 
   private void expandAndRehash() {
+    long expandTime = System.nanoTime();
     final long[] oldRefs = refs;
     long capacity = refs.length << 1;
     validateCapacity(capacity);
@@ -492,6 +489,9 @@ public final class BytesBytesMultiHashMa
     this.largestNumberOfSteps = maxSteps;
     this.hashBitCount = newHashBitCount;
     this.resizeThreshold = (int)(capacity * loadFactor);
+    metricExpandsUs += (System.nanoTime() - expandTime);
+    ++metricExpands;
+
   }
 
   /**
@@ -703,11 +703,9 @@ public final class BytesBytesMultiHashMa
   }
 
   public void debugDumpMetrics() {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("Map metrics: keys " + this.keysAssigned + ", write conflict " + metricPutConflict
-          + ", write max dist " + largestNumberOfSteps + ", read neq " + metricDiffBits
-          + ", read eq-eq " + metricSameBitsSameKey + ", read eq-neq " + metricSameBitsDiffKey);
-    }
+    LOG.info("Map metrics: keys allocated " + this.refs.length +", keys assigned " + keysAssigned
+        + ", write conflict " + metricPutConflict  + ", write max dist " + largestNumberOfSteps
+        + ", expanded " + metricExpands + " times in " + metricExpandsUs + "us");
   }
 
   private void debugDumpKeyProbe(long keyOffset, int keyLength, int hashCode, int finalSlot)
{

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/HashMapWrapper.java
Tue Aug 12 02:13:01 2014
@@ -71,25 +71,39 @@ public class HashMapWrapper extends Abst
   }
 
   public HashMapWrapper() {
-    this(HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD.defaultIntVal,
-        HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR.defaultFloatVal, false, false);
+    this(HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT.defaultFloatVal,
+        HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD.defaultIntVal,
+        HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR.defaultFloatVal, false, false, -1);
   }
 
-  public HashMapWrapper(Configuration hconf) {
-    this(HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD),
+  public HashMapWrapper(Configuration hconf, long keyCount) {
+    this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT),
+        HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD),
         HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR),
         HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINLAZYHASHTABLE),
-        HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDKEYS));
+        HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDKEYS), keyCount);
   }
 
-  private HashMapWrapper(
-      int threshold, float loadFactor, boolean useLazyRows, boolean useOptimizedKeys) {
+  private HashMapWrapper(float keyCountAdj, int threshold, float loadFactor,
+      boolean useLazyRows, boolean useOptimizedKeys, long keyCount) {
     super(createConstructorMetaData(threshold, loadFactor));
+    threshold = calculateTableSize(keyCountAdj, threshold, loadFactor, keyCount);
     mHash = new HashMap<MapJoinKey, MapJoinRowContainer>(threshold, loadFactor);
     this.useLazyRows = useLazyRows;
     this.useOptimizedKeys = useOptimizedKeys;
   }
 
+  public static int calculateTableSize(
+      float keyCountAdj, int threshold, float loadFactor, long keyCount) {
+    if (keyCount >= 0 && keyCountAdj != 0) {
+      // We have statistics for the table. Size appropriately.
+      threshold = (int)Math.ceil(keyCount / (keyCountAdj * loadFactor));
+    }
+    LOG.info("Key count from statistics is " + keyCount + "; setting map size to " + threshold);
+    return threshold;
+  }
+
+
   @Override
   public MapJoinRowContainer get(MapJoinKey key) {
     return mHash.get(key);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/persistence/MapJoinBytesTableContainer.java
Tue Aug 12 02:13:01 2014
@@ -59,15 +59,17 @@ public class MapJoinBytesTableContainer 
 
   private List<Object> EMPTY_LIST = new ArrayList<Object>(0);
 
-  public MapJoinBytesTableContainer(Configuration hconf, MapJoinObjectSerDeContext valCtx)
-      throws SerDeException {
-    this(HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD),
+  public MapJoinBytesTableContainer(Configuration hconf,
+      MapJoinObjectSerDeContext valCtx, long keyCount) throws SerDeException {
+    this(HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEKEYCOUNTADJUSTMENT),
+        HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD),
         HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR),
-        HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE), valCtx);
+        HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLEWBSIZE), valCtx, keyCount);
   }
 
-  private MapJoinBytesTableContainer(int threshold, float loadFactor, int wbSize,
-      MapJoinObjectSerDeContext valCtx) throws SerDeException {
+  private MapJoinBytesTableContainer(float keyCountAdj, int threshold, float loadFactor,
+      int wbSize, MapJoinObjectSerDeContext valCtx, long keyCount) throws SerDeException
{
+    threshold = HashMapWrapper.calculateTableSize(keyCountAdj, threshold, loadFactor, keyCount);
     hashMap = new BytesBytesMultiHashMap(threshold, loadFactor, wbSize);
   }
 

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/HashTableLoader.java Tue Aug
12 02:13:01 2014
@@ -88,6 +88,7 @@ public class HashTableLoader implements 
 
     TezContext tezContext = (TezContext) MapredContext.get();
     Map<Integer, String> parentToInput = desc.getParentToInput();
+    Map<Integer, Long> parentKeyCounts = desc.getParentKeyCounts();
 
     boolean useOptimizedTables = HiveConf.getBoolVar(
         hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
@@ -117,8 +118,11 @@ public class HashTableLoader implements 
           }
         }
         isFirstKey = false;
+        Long keyCountObj = parentKeyCounts.get(pos);
+        long keyCount = (keyCountObj == null) ? -1 : keyCountObj.longValue();
         MapJoinTableContainer tableContainer = useOptimizedTables
-            ? new MapJoinBytesTableContainer(hconf, valCtx) : new HashMapWrapper(hconf);
+            ? new MapJoinBytesTableContainer(hconf, valCtx, keyCount)
+            : new HashMapWrapper(hconf, keyCount);
 
         while (kvReader.next()) {
           lastKey = tableContainer.putRow(keyCtx, (Writable)kvReader.getCurrentKey(),

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
Tue Aug 12 02:13:01 2014
@@ -26,6 +26,7 @@ import java.util.Stack;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
 import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
@@ -33,21 +34,28 @@ import org.apache.hadoop.hive.ql.exec.Op
 import org.apache.hadoop.hive.ql.exec.OperatorFactory;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.lib.Node;
 import org.apache.hadoop.hive.ql.lib.NodeProcessor;
 import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
 import org.apache.hadoop.hive.ql.parse.GenTezProcContext;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.HashTableDummyDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.OpTraits;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.PlanUtils;
 import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
 import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
 import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType;
 import org.apache.hadoop.hive.ql.plan.TezWork;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
 
 public class ReduceSinkMapJoinProc implements NodeProcessor {
 
@@ -111,18 +119,59 @@ public class ReduceSinkMapJoinProc imple
     if (pos == -1) {
       throw new SemanticException("Cannot find position of parent in mapjoin");
     }
-    LOG.debug("Mapjoin "+mapJoinOp+", pos: "+pos+" --> "+parentWork.getName());
-    mapJoinOp.getConf().getParentToInput().put(pos, parentWork.getName());
+    MapJoinDesc joinConf = mapJoinOp.getConf();
+    long keyCount = Long.MAX_VALUE, rowCount = Long.MAX_VALUE, bucketCount = 1;
+    Statistics stats = parentRS.getStatistics();
+    if (stats != null) {
+      keyCount = rowCount = stats.getNumRows();
+      if (keyCount <= 0) {
+        keyCount = rowCount = Long.MAX_VALUE;
+      }
+      ArrayList<String> keyCols = parentRS.getConf().getOutputKeyColumnNames();
+      if (keyCols != null && !keyCols.isEmpty()) {
+        // See if we can arrive at a smaller number using distinct stats from key columns.
+        long maxKeyCount = 1;
+        String prefix = Utilities.ReduceField.KEY.toString();
+        for (String keyCol : keyCols) {
+          ExprNodeDesc realCol = parentRS.getColumnExprMap().get(prefix + "." + keyCol);
+          ColStatistics cs = StatsUtils.getColStatisticsFromExpression(null, stats, realCol);
+          if (cs == null || cs.getCountDistint() <= 0) {
+            maxKeyCount = Long.MAX_VALUE;
+            break;
+          }
+          maxKeyCount *= cs.getCountDistint();
+          if (maxKeyCount >= keyCount) {
+            break;
+          }
+        }
+        keyCount = Math.min(maxKeyCount, keyCount);
+      }
+      if (joinConf.isBucketMapJoin()) {
+        OpTraits opTraits = mapJoinOp.getOpTraits();
+        bucketCount = (opTraits == null) ? -1 : opTraits.getNumBuckets();
+        if (bucketCount > 0) {
+          // We cannot obtain a better estimate without CustomPartitionVertex providing it
+          // to us somehow; in which case using statistics would be completely unnecessary.
+          keyCount /= bucketCount;
+        }
+      }
+    }
+    LOG.info("Mapjoin " + mapJoinOp + ", pos: " + pos + " --> " + parentWork.getName()
+ " ("
+      + keyCount + " keys estimated from " + rowCount + " rows, " + bucketCount + " buckets)");
+    joinConf.getParentToInput().put(pos, parentWork.getName());
+    if (keyCount != Long.MAX_VALUE) {
+      joinConf.getParentKeyCounts().put(pos, keyCount);
+    }
 
     int numBuckets = -1;
     EdgeType edgeType = EdgeType.BROADCAST_EDGE;
-    if (mapJoinOp.getConf().isBucketMapJoin()) {
+    if (joinConf.isBucketMapJoin()) {
 
       // disable auto parallelism for bucket map joins
       parentRS.getConf().setAutoParallel(false);
 
-      numBuckets = (Integer) mapJoinOp.getConf().getBigTableBucketNumMapping().values().toArray()[0];
-      if (mapJoinOp.getConf().getCustomBucketMapJoin()) {
+      numBuckets = (Integer) joinConf.getBigTableBucketNumMapping().values().toArray()[0];
+      if (joinConf.getCustomBucketMapJoin()) {
         edgeType = EdgeType.CUSTOM_EDGE;
       } else {
         edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java Tue Aug 12 02:13:01
2014
@@ -49,8 +49,10 @@ public class MapJoinDesc extends JoinDes
   private transient String bigTableAlias;
 
   // for tez. used to remember which position maps to which logical input
+  // TODO: should these rather be arrays?
   private Map<Integer, String> parentToInput = new HashMap<Integer, String>();
-  
+  private Map<Integer, Long> parentKeyCounts = new HashMap<Integer, Long>();
+
   // for tez. used to remember which type of a Bucket Map Join this is.
   private boolean customBucketMapJoin;
 
@@ -86,6 +88,7 @@ public class MapJoinDesc extends JoinDes
     this.bigTablePartSpecToFileMapping = clone.bigTablePartSpecToFileMapping;
     this.dumpFilePrefix = clone.dumpFilePrefix;
     this.parentToInput = clone.parentToInput;
+    this.parentKeyCounts = clone.parentKeyCounts;
     this.customBucketMapJoin = clone.customBucketMapJoin;
   }
 
@@ -127,6 +130,28 @@ public class MapJoinDesc extends JoinDes
     this.parentToInput = parentToInput;
   }
 
+  public Map<Integer, Long> getParentKeyCounts() {
+    return parentKeyCounts;
+  }
+
+  @Explain(displayName = "Estimated key counts", normalExplain = false)
+  public String getKeyCountsExplainDesc() {
+    StringBuilder result = null;
+    for (Map.Entry<Integer, Long> entry : parentKeyCounts.entrySet()) {
+      if (result == null) {
+        result = new StringBuilder();
+      } else {
+        result.append(", ");
+      }
+      result.append(parentToInput.get(entry.getKey())).append(" => ").append(entry.getValue());
+    }
+    return result == null ? null : result.toString();
+  }
+
+  public void setParentKeyCount(Map<Integer, Long> parentKeyCounts) {
+    this.parentKeyCounts = parentKeyCounts;
+  }
+
   public Map<Byte, int[]> getValueIndices() {
     return valueIndices;
   }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java Tue Aug 12 02:13:01
2014
@@ -237,12 +237,14 @@ public class Statistics implements Seria
   }
 
   public ColStatistics getColumnStatisticsFromColName(String colName) {
+    if (columnStats == null) {
+      return null;
+    }
     for (ColStatistics cs : columnStats.values()) {
       if (cs.getColumnName().equalsIgnoreCase(colName)) {
         return cs;
       }
     }
-
     return null;
   }
 

Modified: hive/trunk/ql/src/test/queries/clientpositive/mapjoin_mapjoin.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/mapjoin_mapjoin.q?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/mapjoin_mapjoin.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/mapjoin_mapjoin.q Tue Aug 12 02:13:01 2014
@@ -4,7 +4,7 @@ set hive.auto.convert.join.noconditional
 
 -- Since the inputs are small, it should be automatically converted to mapjoin
 
-explain select srcpart.key from srcpart join src on (srcpart.value=src.value) join src1 on
(srcpart.key=src1.key);
+explain extended select srcpart.key from srcpart join src on (srcpart.value=src.value) join
src1 on (srcpart.key=src1.key);
 
 explain
 select srcpart.key from srcpart join src on (srcpart.value=src.value) join src1 on (srcpart.key=src1.key)
where srcpart.value > 'val_450';

Modified: hive/trunk/ql/src/test/results/clientpositive/mapjoin_mapjoin.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/mapjoin_mapjoin.q.out?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
Files hive/trunk/ql/src/test/results/clientpositive/mapjoin_mapjoin.q.out (original) and hive/trunk/ql/src/test/results/clientpositive/mapjoin_mapjoin.q.out
Tue Aug 12 02:13:01 2014 differ

Modified: hive/trunk/ql/src/test/results/clientpositive/tez/mapjoin_mapjoin.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/tez/mapjoin_mapjoin.q.out?rev=1617399&r1=1617398&r2=1617399&view=diff
==============================================================================
Files hive/trunk/ql/src/test/results/clientpositive/tez/mapjoin_mapjoin.q.out (original) and
hive/trunk/ql/src/test/results/clientpositive/tez/mapjoin_mapjoin.q.out Tue Aug 12 02:13:01
2014 differ



Mime
View raw message