hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jcama...@apache.org
Subject hive git commit: HIVE-11266: count(*) wrong result based on table statistics for external tables (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Date Mon, 09 Oct 2017 20:17:51 GMT
Repository: hive
Updated Branches:
  refs/heads/master 6ef65f49d -> a2dff9e13


HIVE-11266: count(*) wrong result based on table statistics for external tables (Jesus Camacho
Rodriguez, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a2dff9e1
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a2dff9e1
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a2dff9e1

Branch: refs/heads/master
Commit: a2dff9e13acc62ecc0388b3b2e221f26c9184dbb
Parents: 6ef65f4
Author: Jesus Camacho Rodriguez <jcamacho@apache.org>
Authored: Mon Oct 9 10:38:51 2017 -0700
Committer: Jesus Camacho Rodriguez <jcamacho@apache.org>
Committed: Mon Oct 9 13:17:48 2017 -0700

----------------------------------------------------------------------
 .../hive/ql/optimizer/StatsOptimizer.java       |  5 ++
 .../queries/clientpositive/stats_noscan_2.q     |  3 +
 .../clientpositive/spark/stats_noscan_2.q.out   | 77 ++++++++++++++++++++
 .../results/clientpositive/stats_noscan_2.q.out | 71 ++++++++++++++++++
 4 files changed, 156 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 7b2a1a9..a3a35ff 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -31,6 +31,7 @@ import java.util.Stack;
 
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
@@ -272,6 +273,10 @@ public class StatsOptimizer extends Transform {
           return null;
         }
         Table tbl = tsOp.getConf().getTableMetadata();
+        if (MetaStoreUtils.isExternalTable(tbl.getTTable())) {
+          Logger.info("Table " + tbl.getTableName() + " is external. Skip StatsOptimizer.");
+          return null;
+        }
         if (AcidUtils.isAcidTable(tbl)) {
           Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer.");
           return null;

http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/queries/clientpositive/stats_noscan_2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/stats_noscan_2.q b/ql/src/test/queries/clientpositive/stats_noscan_2.q
index 8639ab7..d473081 100644
--- a/ql/src/test/queries/clientpositive/stats_noscan_2.q
+++ b/ql/src/test/queries/clientpositive/stats_noscan_2.q
@@ -16,6 +16,7 @@ create table texternal(key string, val string) partitioned by (insertdate
string
 dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/texternal/2008-01-01;
 alter table texternal add partition (insertdate='2008-01-01') location 'pfile://${system:test.tmp.dir}/texternal/2008-01-01';
 from src insert overwrite table texternal partition (insertdate='2008-01-01') select *;
+explain select count(*) from texternal where insertdate='2008-01-01';
 select count(*) from texternal where insertdate='2008-01-01';
 -- create external table
 CREATE EXTERNAL TABLE anaylyze_external (key string, val string) partitioned by (insertdate
string) LOCATION "pfile://${system:test.tmp.dir}/texternal"; 
@@ -27,6 +28,8 @@ describe formatted anaylyze_external PARTITION (insertdate='2008-01-01');
 analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics;
 describe formatted anaylyze_external PARTITION (insertdate='2008-01-01');
 dfs -rmr ${system:test.tmp.dir}/texternal;
+explain select count(*) from anaylyze_external where insertdate='2008-01-01';
+select count(*) from anaylyze_external where insertdate='2008-01-01';
 drop table anaylyze_external;
 
 

http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out b/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out
index 14e131c..1f5bd0c 100644
--- a/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out
+++ b/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out
@@ -139,6 +139,20 @@ POSTHOOK: Input: default@src
 POSTHOOK: Output: default@texternal@insertdate=2008-01-01
 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).key SIMPLE [(src)src.FieldSchema(name:key,
type:string, comment:default), ]
 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).val SIMPLE [(src)src.FieldSchema(name:value,
type:string, comment:default), ]
+PREHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
 PREHOOK: query: select count(*) from texternal where insertdate='2008-01-01'
 PREHOOK: type: QUERY
 PREHOOK: Input: default@texternal
@@ -275,6 +289,69 @@ Sort Columns:       	[]
 Storage Desc Params:	 	 
 	serialization.format	1                   
 #### A masked pattern was here ####
+PREHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: anaylyze_external
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column
stats: NONE
+                  Select Operator
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column
stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats:
NONE
+                      Reduce Output Operator
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column
stats: NONE
+                        value expressions: _col0 (type: bigint)
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats:
NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats:
NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@anaylyze_external
+PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@anaylyze_external
+POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01
+#### A masked pattern was here ####
+0
 PREHOOK: query: drop table anaylyze_external
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@anaylyze_external

http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/results/clientpositive/stats_noscan_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/stats_noscan_2.q.out b/ql/src/test/results/clientpositive/stats_noscan_2.q.out
index 14e131c..ef397c3 100644
--- a/ql/src/test/results/clientpositive/stats_noscan_2.q.out
+++ b/ql/src/test/results/clientpositive/stats_noscan_2.q.out
@@ -139,6 +139,20 @@ POSTHOOK: Input: default@src
 POSTHOOK: Output: default@texternal@insertdate=2008-01-01
 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).key SIMPLE [(src)src.FieldSchema(name:key,
type:string, comment:default), ]
 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).val SIMPLE [(src)src.FieldSchema(name:value,
type:string, comment:default), ]
+PREHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
 PREHOOK: query: select count(*) from texternal where insertdate='2008-01-01'
 PREHOOK: type: QUERY
 PREHOOK: Input: default@texternal
@@ -275,6 +289,63 @@ Sort Columns:       	[]
 Storage Desc Params:	 	 
 	serialization.format	1                   
 #### A masked pattern was here ####
+PREHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: anaylyze_external
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats:
NONE
+            Select Operator
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats:
NONE
+              Group By Operator
+                aggregations: count()
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats:
NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats:
NONE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@anaylyze_external
+PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@anaylyze_external
+POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01
+#### A masked pattern was here ####
+0
 PREHOOK: query: drop table anaylyze_external
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@anaylyze_external


Mime
View raw message