Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 325F6200D24 for ; Tue, 10 Oct 2017 02:51:46 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 30F89160BE7; Tue, 10 Oct 2017 00:51:46 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 2A8A41609E0 for ; Tue, 10 Oct 2017 02:51:45 +0200 (CEST) Received: (qmail 81745 invoked by uid 500); 10 Oct 2017 00:51:41 -0000 Mailing-List: contact commits-help@hive.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hive-dev@hive.apache.org Delivered-To: mailing list commits@hive.apache.org Received: (qmail 77806 invoked by uid 99); 10 Oct 2017 00:51:38 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 10 Oct 2017 00:51:38 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id A0286F5DA4; Tue, 10 Oct 2017 00:51:36 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: sershe@apache.org To: commits@hive.apache.org Date: Tue, 10 Oct 2017 00:52:27 -0000 Message-Id: <26290547658d4afe8cb90fe6ee5cb455@git.apache.org> In-Reply-To: <97334c38fc1c41faafc6b13d11b4c2f2@git.apache.org> References: <97334c38fc1c41faafc6b13d11b4c2f2@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [53/61] [abbrv] hive git commit: HIVE-11266: count(*) wrong result based on table statistics for external tables (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) archived-at: Tue, 10 Oct 2017 00:51:46 -0000 HIVE-11266: count(*) wrong result based on table statistics for external tables (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a2dff9e1 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a2dff9e1 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a2dff9e1 Branch: refs/heads/hive-14535 Commit: a2dff9e13acc62ecc0388b3b2e221f26c9184dbb Parents: 6ef65f4 Author: Jesus Camacho Rodriguez Authored: Mon Oct 9 10:38:51 2017 -0700 Committer: Jesus Camacho Rodriguez Committed: Mon Oct 9 13:17:48 2017 -0700 ---------------------------------------------------------------------- .../hive/ql/optimizer/StatsOptimizer.java | 5 ++ .../queries/clientpositive/stats_noscan_2.q | 3 + .../clientpositive/spark/stats_noscan_2.q.out | 77 ++++++++++++++++++++ .../results/clientpositive/stats_noscan_2.q.out | 71 ++++++++++++++++++ 4 files changed, 156 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 7b2a1a9..a3a35ff 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -31,6 +31,7 @@ import java.util.Stack; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; @@ -272,6 +273,10 @@ public class StatsOptimizer extends Transform { return null; } Table tbl = tsOp.getConf().getTableMetadata(); + if (MetaStoreUtils.isExternalTable(tbl.getTTable())) { + Logger.info("Table " + tbl.getTableName() + " is external. Skip StatsOptimizer."); + return null; + } if (AcidUtils.isAcidTable(tbl)) { Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer."); return null; http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/queries/clientpositive/stats_noscan_2.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/stats_noscan_2.q b/ql/src/test/queries/clientpositive/stats_noscan_2.q index 8639ab7..d473081 100644 --- a/ql/src/test/queries/clientpositive/stats_noscan_2.q +++ b/ql/src/test/queries/clientpositive/stats_noscan_2.q @@ -16,6 +16,7 @@ create table texternal(key string, val string) partitioned by (insertdate string dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/texternal/2008-01-01; alter table texternal add partition (insertdate='2008-01-01') location 'pfile://${system:test.tmp.dir}/texternal/2008-01-01'; from src insert overwrite table texternal partition (insertdate='2008-01-01') select *; +explain select count(*) from texternal where insertdate='2008-01-01'; select count(*) from texternal where insertdate='2008-01-01'; -- create external table CREATE EXTERNAL TABLE anaylyze_external (key string, val string) partitioned by (insertdate string) LOCATION "pfile://${system:test.tmp.dir}/texternal"; @@ -27,6 +28,8 @@ describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); analyze table anaylyze_external PARTITION (insertdate='2008-01-01') compute statistics; describe formatted anaylyze_external PARTITION (insertdate='2008-01-01'); dfs -rmr ${system:test.tmp.dir}/texternal; +explain select count(*) from anaylyze_external where insertdate='2008-01-01'; +select count(*) from anaylyze_external where insertdate='2008-01-01'; drop table anaylyze_external; http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out b/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out index 14e131c..1f5bd0c 100644 --- a/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out +++ b/ql/src/test/results/clientpositive/spark/stats_noscan_2.q.out @@ -139,6 +139,20 @@ POSTHOOK: Input: default@src POSTHOOK: Output: default@texternal@insertdate=2008-01-01 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01' +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + PREHOOK: query: select count(*) from texternal where insertdate='2008-01-01' PREHOOK: type: QUERY PREHOOK: Input: default@texternal @@ -275,6 +289,69 @@ Sort Columns: [] Storage Desc Params: serialization.format 1 #### A masked pattern was here #### +PREHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01' +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: anaylyze_external + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01' +PREHOOK: type: QUERY +PREHOOK: Input: default@anaylyze_external +PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@anaylyze_external +POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 +#### A masked pattern was here #### +0 PREHOOK: query: drop table anaylyze_external PREHOOK: type: DROPTABLE PREHOOK: Input: default@anaylyze_external http://git-wip-us.apache.org/repos/asf/hive/blob/a2dff9e1/ql/src/test/results/clientpositive/stats_noscan_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/stats_noscan_2.q.out b/ql/src/test/results/clientpositive/stats_noscan_2.q.out index 14e131c..ef397c3 100644 --- a/ql/src/test/results/clientpositive/stats_noscan_2.q.out +++ b/ql/src/test/results/clientpositive/stats_noscan_2.q.out @@ -139,6 +139,20 @@ POSTHOOK: Input: default@src POSTHOOK: Output: default@texternal@insertdate=2008-01-01 POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] POSTHOOK: Lineage: texternal PARTITION(insertdate=2008-01-01).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01' +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from texternal where insertdate='2008-01-01' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + PREHOOK: query: select count(*) from texternal where insertdate='2008-01-01' PREHOOK: type: QUERY PREHOOK: Input: default@texternal @@ -275,6 +289,63 @@ Sort Columns: [] Storage Desc Params: serialization.format 1 #### A masked pattern was here #### +PREHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01' +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from anaylyze_external where insertdate='2008-01-01' +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: anaylyze_external + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01' +PREHOOK: type: QUERY +PREHOOK: Input: default@anaylyze_external +PREHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from anaylyze_external where insertdate='2008-01-01' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@anaylyze_external +POSTHOOK: Input: default@anaylyze_external@insertdate=2008-01-01 +#### A masked pattern was here #### +0 PREHOOK: query: drop table anaylyze_external PREHOOK: type: DROPTABLE PREHOOK: Input: default@anaylyze_external