hive-commits mailing list archives

From prasanthj@apache.org
Subject hive git commit: HIVE-15069: Optimize MetaStoreDirectSql::aggrColStatsForPartitions during query compilation (Rajesh Balamohan reviewed by Sergey Shelukhin)
Date Tue, 15 Nov 2016 05:01:35 GMT
Repository: hive
Updated Branches:
  refs/heads/master 652ed7a79 -> 6536e30f2


HIVE-15069: Optimize MetaStoreDirectSql::aggrColStatsForPartitions during query compilation (Rajesh Balamohan reviewed by Sergey Shelukhin)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6536e30f
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6536e30f
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6536e30f

Branch: refs/heads/master
Commit: 6536e30f2ee9707e16d4cd8d7c8321c4073a30b9
Parents: 652ed7a
Author: Prasanth Jayachandran <prasanthj@apache.org>
Authored: Mon Nov 14 21:00:50 2016 -0800
Committer: Prasanth Jayachandran <prasanthj@apache.org>
Committed: Mon Nov 14 21:00:50 2016 -0800

----------------------------------------------------------------------
 .../hive/metastore/MetaStoreDirectSql.java      | 58 ++++++++++----------
 .../ql/optimizer/calcite/RelOptHiveTable.java   | 26 ++++++---
 .../hive/ql/optimizer/physical/Vectorizer.java  |  4 +-
 3 files changed, 51 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/6536e30f/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
index 561f3e3..dadc6f6 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java
@@ -1188,44 +1188,46 @@ class MetaStoreDirectSql {
       LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
      return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0); // Nothing to aggregate
     }
-    long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
+    long partsFound = 0;
     List<ColumnStatisticsObj> colStatsList;
     // Try to read from the cache first
-    if (isAggregateStatsCacheEnabled) {
+    if (isAggregateStatsCacheEnabled
+        && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
       AggrColStats colStatsAggrCached;
       List<ColumnStatisticsObj> colStatsAggrFromDB;
       int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
       float fpp = aggrStatsCache.getFalsePositiveProbability();
-      int partitionsRequested = partNames.size();
-      if (partitionsRequested > maxPartsPerCacheNode) {
-        colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames,
-            partsFound, useDensityFunctionForNDVEstimation);
-      } else {
-        colStatsList = new ArrayList<ColumnStatisticsObj>();
-        // Bloom filter for the new node that we will eventually add to the cache
-        BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
-        for (String colName : colNames) {
-          // Check the cache first
-          colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
-          if (colStatsAggrCached != null) {
-            colStatsList.add(colStatsAggrCached.getColStats());
-          } else {
-            List<String> colNamesForDB = new ArrayList<String>();
-            colNamesForDB.add(colName);
-            // Read aggregated stats for one column
-            colStatsAggrFromDB =
-                columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
-                    partsFound, useDensityFunctionForNDVEstimation);
-            if (!colStatsAggrFromDB.isEmpty()) {
-              ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
-              colStatsList.add(colStatsAggr);
-              // Update the cache to add this new aggregate node
-              aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
-            }
+      colStatsList = new ArrayList<ColumnStatisticsObj>();
+      // Bloom filter for the new node that we will eventually add to the cache
+      BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
+      boolean computePartsFound = true;
+      for (String colName : colNames) {
+        // Check the cache first
+        colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
+        if (colStatsAggrCached != null) {
+          colStatsList.add(colStatsAggrCached.getColStats());
+          partsFound = colStatsAggrCached.getNumPartsCached();
+        } else {
+          if (computePartsFound) {
+            partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
+            computePartsFound = false;
+          }
+          List<String> colNamesForDB = new ArrayList<String>();
+          colNamesForDB.add(colName);
+          // Read aggregated stats for one column
+          colStatsAggrFromDB =
+              columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
+                  partsFound, useDensityFunctionForNDVEstimation);
+          if (!colStatsAggrFromDB.isEmpty()) {
+            ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
+            colStatsList.add(colStatsAggr);
+            // Update the cache to add this new aggregate node
+            aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
           }
         }
       }
     } else {
+      partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
       colStatsList =
           columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
               useDensityFunctionForNDVEstimation);
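
The MetaStoreDirectSql change above defers the expensive partsFoundForPartitions()
metastore query: instead of running it unconditionally up front, the patched loop runs
it at most once, and only when at least one requested column misses the aggregate
stats cache. Requests whose columns are all cached now skip that query entirely, and
requests for more partitions than a cache node can hold fall through to the plain DB
path in the outer else branch. A minimal, self-contained sketch of that
lazy-on-first-miss pattern follows; the class and method names (LazyAggregator,
expensiveCount, readFromDb) are illustrative stand-ins, not Hive APIs.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch: an expensive lookup is computed at most once, and only if some
// key actually misses the cache (mirrors the computePartsFound flag above).
class LazyAggregator {
  private final Map<String, String> cache = new HashMap<String, String>();

  // Stand-in for partsFoundForPartitions(): an expensive backend query.
  private long expensiveCount() {
    return 42L; // placeholder result
  }

  // Stand-in for the per-column DB read that needs the count.
  private String readFromDb(String key, long count) {
    return key + ":" + count;
  }

  List<String> aggregate(List<String> keys) {
    List<String> results = new ArrayList<String>();
    long count = 0;
    boolean computeCount = true;
    for (String key : keys) {
      String cached = cache.get(key);
      if (cached != null) {
        results.add(cached);        // cache hit: expensive query not needed
      } else {
        if (computeCount) {
          count = expensiveCount(); // runs at most once, on the first miss
          computeCount = false;
        }
        String fromDb = readFromDb(key, count);
        results.add(fromDb);
        cache.put(key, fromDb);     // populate for subsequent callers
      }
    }
    return results;
  }
}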

http://git-wip-us.apache.org/repos/asf/hive/blob/6536e30f/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
index 73ca9bf..4ebbb13 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/RelOptHiveTable.java
@@ -412,20 +412,32 @@ public class RelOptHiveTable extends RelOptAbstractTable {
 
   public List<ColStatistics> getColStat(List<Integer> projIndxLst, boolean allowNullColumnForMissingStats) {
     List<ColStatistics> colStatsBldr = Lists.newArrayList();
-
+    Set<Integer> projIndxSet = new HashSet<Integer>(projIndxLst);
     if (projIndxLst != null) {
-      updateColStats(new HashSet<Integer>(projIndxLst), allowNullColumnForMissingStats);
       for (Integer i : projIndxLst) {
-        colStatsBldr.add(hiveColStatsMap.get(i));
+        if (hiveColStatsMap.get(i) != null) {
+          colStatsBldr.add(hiveColStatsMap.get(i));
+          projIndxSet.remove(i);
+        }
+      }
+      if (!projIndxSet.isEmpty()) {
+        updateColStats(projIndxSet, allowNullColumnForMissingStats);
+        for (Integer i : projIndxSet) {
+          colStatsBldr.add(hiveColStatsMap.get(i));
+        }
       }
     } else {
       List<Integer> pILst = new ArrayList<Integer>();
       for (Integer i = 0; i < noOfNonVirtualCols; i++) {
-        pILst.add(i);
+        if (hiveColStatsMap.get(i) == null) {
+          pILst.add(i);
+        }
       }
-      updateColStats(new HashSet<Integer>(pILst), allowNullColumnForMissingStats);
-      for (Integer pi : pILst) {
-        colStatsBldr.add(hiveColStatsMap.get(pi));
+      if (!pILst.isEmpty()) {
+        updateColStats(new HashSet<Integer>(pILst), allowNullColumnForMissingStats);
+        for (Integer pi : pILst) {
+          colStatsBldr.add(hiveColStatsMap.get(pi));
+        }
       }
     }
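
In RelOptHiveTable.getColStat() above, the old code refreshed statistics for every
requested column index on every call; the patched version consults hiveColStatsMap
first and invokes updateColStats() only for the indices still missing, so repeated
calls during query compilation reuse previously fetched stats. A compact sketch of
that fetch-only-the-missing-keys pattern, with illustrative names (ColStatCache,
loadStats) rather than Hive's:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Sketch: serve requested keys from the map where possible and bulk-load
// only the gaps, mirroring the projIndxSet bookkeeping above.
class ColStatCache {
  private final Map<Integer, String> statsMap = new HashMap<Integer, String>();

  // Stand-in for updateColStats(): bulk-loads stats for the missing indices.
  private void loadStats(Set<Integer> missing) {
    for (Integer i : missing) {
      statsMap.put(i, "stats-" + i);
    }
  }

  List<String> getStats(List<Integer> requested) {
    List<String> result = new ArrayList<String>();
    Set<Integer> missing = new HashSet<Integer>(requested);
    for (Integer i : requested) {
      String cached = statsMap.get(i);
      if (cached != null) {
        result.add(cached);  // already cached: no refresh needed
        missing.remove(i);
      }
    }
    if (!missing.isEmpty()) {
      loadStats(missing);    // one bulk call covers only the remainder
      for (Integer i : missing) {
        result.add(statsMap.get(i));
      }
    }
    return result;
  }
}

As in the patch, results for cached indices are appended before freshly loaded ones,
so the output order can differ from the requested order when the map is only
partially populated.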
 

http://git-wip-us.apache.org/repos/asf/hive/blob/6536e30f/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 3a179a3..37baaf6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -749,8 +749,8 @@ public class Vectorizer implements PhysicalPlanResolver {
           return false;
         }
         VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
-        if (LOG.isInfoEnabled()) {
-          LOG.info("Vectorizer path: " + path + ", " + vectorPartDesc.toString() +
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Vectorizer path: " + path + ", " + vectorPartDesc.toString() +
               ", aliases " + aliases);
         }
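
The Vectorizer change is smaller: a per-path log line is demoted from INFO to DEBUG,
so it no longer fires for every partition path during plan validation. The
isDebugEnabled() guard matters here because the message concatenates several strings;
when the level is disabled, the guard skips that allocation work entirely. A brief
sketch of the pattern, assuming an SLF4J logger (which the surrounding Hive code
uses); the class name is illustrative:

import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Sketch: guard string-building log statements so disabled levels cost
// almost nothing on hot paths such as per-partition validation loops.
class GuardedLogging {
  private static final Logger LOG = LoggerFactory.getLogger(GuardedLogging.class);

  void report(String path, Object vectorPartDesc, List<String> aliases) {
    if (LOG.isDebugEnabled()) {
      // The concatenation below is only evaluated when DEBUG is enabled.
      LOG.debug("Vectorizer path: " + path + ", " + vectorPartDesc
          + ", aliases " + aliases);
    }
  }
}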
 

