hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From prasan...@apache.org
Subject svn commit: r1617541 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java test/results/clientpositive/annotate_stats_join.q.out test/results/clientpositive/union20.q.out
Date Tue, 12 Aug 2014 17:46:21 GMT
Author: prasanthj
Date: Tue Aug 12 17:46:21 2014
New Revision: 1617541

URL: http://svn.apache.org/r1617541
Log:
HIVE-7679: JOIN operator should update the column stats when number of rows changes (Prasanth J, reviewed by Gunther Hagleitner)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
    hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
    hive/trunk/ql/src/test/results/clientpositive/union20.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1617541&r1=1617540&r2=1617541&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Tue Aug 12 17:46:21 2014
@@ -18,11 +18,8 @@
 
 package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
 
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.Stack;
-
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -69,8 +66,10 @@ import org.apache.hadoop.hive.ql.udf.gen
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
 import org.apache.hadoop.hive.serde.serdeConstants;
 
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
 
 public class StatsRulesProcFactory {
 
@@ -921,8 +920,7 @@ public class StatsRulesProcFactory {
                 + " #Rows of parents: " + rowCountParents.toString() + ". Denominator: " + denom);
           }
 
-          stats.setNumRows(newRowCount);
-          stats.setDataSize(StatsUtils.getDataSizeFromColumnStats(newRowCount, outColStats));
+          updateStatsForJoinType(stats, newRowCount, true, jop.getConf());
           jop.setStatistics(stats);
 
           if (LOG.isDebugEnabled()) {
@@ -968,6 +966,39 @@ public class StatsRulesProcFactory {
       return null;
     }
 
+    private void updateStatsForJoinType(Statistics stats, long newNumRows,
+        boolean useColStats, JoinDesc conf) {
+      long oldRowCount = stats.getNumRows();
+      double ratio = (double) newNumRows / (double) oldRowCount;
+      stats.setNumRows(newNumRows);
+
+      if (useColStats) {
+        List<ColStatistics> colStats = stats.getColumnStats();
+        for (ColStatistics cs : colStats) {
+          long oldDV = cs.getCountDistint();
+          long newDV = oldDV;
+
+          // if ratio is greater than 1, then number of rows increases. This can happen
+          // when some operators like GROUPBY duplicates the input rows in which case
+          // number of distincts should not change. Update the distinct count only when
+          // the output number of rows is less than input number of rows.
+          if (ratio <= 1.0) {
+            newDV = (long) Math.ceil(ratio * oldDV);
+          }
+          // Assumes inner join
+          // TODO: HIVE-5579 will handle different join types
+          cs.setNumNulls(0);
+          cs.setCountDistint(newDV);
+        }
+        stats.setColumnStats(colStats);
+        long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
+        stats.setDataSize(newDataSize);
+      } else {
+        long newDataSize = (long) (ratio * stats.getDataSize());
+        stats.setDataSize(newDataSize);
+      }
+    }
+
     private long computeNewRowCount(List<Long> rowCountParents, long denom) {
       double factor = 0.0d;
       long result = 1;

Modified: hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out?rev=1617541&r1=1617540&r2=1617541&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out Tue Aug 12 17:46:21 2014
@@ -551,14 +551,14 @@ STAGE PLANS:
             1 {KEY.reducesinkkey0} {VALUE._col0}
             2 {VALUE._col0} {KEY.reducesinkkey0} {VALUE._col1} {VALUE._col2}
           outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col9, _col10, _col11, _col12
-          Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col5 (type: int), _col6 (type: string), _col9 (type: string), _col10 (type: int), _col11 (type: bigint), _col12 (type: int)
             outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-            Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 47 Data size: 13900 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 47 Data size: 13912 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -631,14 +631,14 @@ STAGE PLANS:
             1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
             2 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0} {VALUE._col1}
           outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col9, _col10, _col11, _col12
-          Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
             expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col5 (type: int), _col6 (type: string), _col9 (type: string), _col10 (type: int), _col11 (type: bigint), _col12 (type: int)
             outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
-            Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 1 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 1 Data size: 296 Basic stats: COMPLETE Column stats: COMPLETE
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

Modified: hive/trunk/ql/src/test/results/clientpositive/union20.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/union20.q.out?rev=1617541&r1=1617540&r2=1617541&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/union20.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/union20.q.out Tue Aug 12 17:46:21 2014
@@ -130,7 +130,7 @@ STAGE PLANS:
             0 {KEY.reducesinkkey0} {VALUE._col0}
             1 {KEY.reducesinkkey0} {VALUE._col0}
           outputColumnNames: _col0, _col1, _col2, _col3
-          Statistics: Num rows: 36 Data size: 19584 Basic stats: COMPLETE Column stats: PARTIAL
+          Statistics: Num rows: 36 Data size: 9792 Basic stats: COMPLETE Column stats: PARTIAL
           Select Operator
             expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
             outputColumnNames: _col0, _col1, _col2, _col3



Mime
View raw message