impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ab...@apache.org
Subject incubator-impala git commit: IMPALA-1657: Rework detection and reporting of corrupt table stats.
Date Wed, 31 Aug 2016 01:02:29 GMT
Repository: incubator-impala
Updated Branches:
  refs/heads/master 6bbb7fb3d -> 12496c7fb


IMPALA-1657: Rework detection and reporting of corrupt table stats.

1. Minor fixes for cardinality estimation of unpartitioned tables.
2. Reworks handling of corrupt table stats as follows:
   The stats of a table or partition are reported as corrupt if the
   numRows < -1, or if numRows == 0 but the table size is positive.
3. Removes the Preconditions check reported in IMPALA-1657 in favor
   or issuing a corrupt table stats warning.
4. Fixes a few tests to set numRows together with
   STATS_GENERATED_VIA_STATS_TASK so that the numRows is definitely
   set in the HMS.

Change-Id: I1d3305791d96e1c23a901af7b7c109af9352bb44
Reviewed-on: http://gerrit.cloudera.org:8080/4166
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12496c7f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12496c7f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12496c7f

Branch: refs/heads/master
Commit: 12496c7fbf62e364f7c23d819185237a5b1b9f59
Parents: 6bbb7fb
Author: Alex Behm <alex.behm@cloudera.com>
Authored: Mon Aug 29 14:42:17 2016 -0700
Committer: Internal Jenkins <cloudera-hudson@gerrit.cloudera.org>
Committed: Wed Aug 31 00:58:03 2016 +0000

----------------------------------------------------------------------
 .../cloudera/impala/planner/HdfsScanNode.java   | 30 ++++++---
 .../com/cloudera/impala/planner/Planner.java    |  5 +-
 .../queries/QueryTest/corrupt-stats.test        | 64 +++++++++++++++++---
 tests/metadata/test_compute_stats.py            |  4 +-
 4 files changed, 81 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
index 31ec600..9b83902 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
@@ -356,26 +356,35 @@ public class HdfsScanNode extends ScanNode {
   }
 
   /**
-   * Also computes totalBytes_
+   * Also computes totalBytes_, totalFiles_, numPartitionsMissingStats_,
+   * and sets hasCorruptTableStats_.
    */
   @Override
   public void computeStats(Analyzer analyzer) {
     super.computeStats(analyzer);
     LOG.debug("collecting partitions for table " + tbl_.getName());
     numPartitionsMissingStats_ = 0;
-    if (tbl_.getPartitions().isEmpty()) {
+    totalFiles_ = 0;
+    totalBytes_ = 0;
+    if (tbl_.getNumClusteringCols() == 0) {
       cardinality_ = tbl_.getNumRows();
-      if ((cardinality_ < -1 || cardinality_ == 0) && tbl_.getTotalHdfsBytes()
> 0) {
+      if (cardinality_ < -1 || (cardinality_ == 0 && tbl_.getTotalHdfsBytes()
> 0)) {
         hasCorruptTableStats_ = true;
       }
+      if (partitions_.isEmpty()) {
+        // Nothing to scan. Definitely a cardinality of 0 even if we have no stats.
+        cardinality_ = 0;
+      } else {
+        Preconditions.checkState(partitions_.size() == 1);
+        totalFiles_ += partitions_.get(0).getFileDescriptors().size();
+        totalBytes_ += partitions_.get(0).getSize();
+      }
     } else {
       cardinality_ = 0;
-      totalFiles_ = 0;
-      totalBytes_ = 0;
       boolean hasValidPartitionCardinality = false;
       for (HdfsPartition p: partitions_) {
         // Check for corrupt table stats
-        if ((p.getNumRows() == 0 || p.getNumRows() < -1) && p.getSize() > 0)
 {
+        if (p.getNumRows() < -1  || (p.getNumRows() == 0 && p.getSize() > 0))
 {
           hasCorruptTableStats_ = true;
         }
         // ignore partitions with missing stats in the hope they don't matter
@@ -402,8 +411,13 @@ public class HdfsScanNode extends ScanNode {
       }
     }
     inputCardinality_ = cardinality_;
-    Preconditions.checkState(cardinality_ >= 0 || cardinality_ == -1,
-        "Internal error: invalid scan node cardinality: " + cardinality_);
+
+    // Sanity check scan node cardinality.
+    if (cardinality_ < -1) {
+      hasCorruptTableStats_ = true;
+      cardinality_ = -1;
+    }
+
     if (cardinality_ > 0) {
       LOG.debug("cardinality_=" + Long.toString(cardinality_) +
                 " sel=" + Double.toString(computeSelectivity()));

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/fe/src/main/java/com/cloudera/impala/planner/Planner.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/Planner.java b/fe/src/main/java/com/cloudera/impala/planner/Planner.java
index 64a6c05..de1c088 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/Planner.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/Planner.java
@@ -209,8 +209,9 @@ public class Planner {
       for (TTableName tableName: request.query_ctx.getTables_with_corrupt_stats()) {
         tableNames.add(tableName.db_name + "." + tableName.table_name);
       }
-      str.append("WARNING: The following tables have potentially corrupt table\n" +
-          "statistics. Drop and re-compute statistics to resolve this problem.\n" +
+      str.append(
+          "WARNING: The following tables have potentially corrupt table statistics.\n" +
+          "Drop and re-compute statistics to resolve this problem.\n" +
           Joiner.on(", ").join(tableNames) + "\n");
       hasHeader = true;
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test b/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test
index 7c5b561..da88f3b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/corrupt-stats.test
@@ -34,7 +34,7 @@ ORG, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL
S
 STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING
 ====
 ---- QUERY
-alter table corrupted partition(org=1) set tblproperties('numRows'='0');
+alter table corrupted partition(org=1) set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true');
 ====
 ---- QUERY
 invalidate metadata corrupted;
@@ -53,8 +53,8 @@ STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING
 ---- QUERY
 explain select count(*) from corrupted where org = 1;
 ---- RESULTS: VERIFY_IS_SUBSET
-'WARNING: The following tables have potentially corrupt table'
-'statistics. Drop and re-compute statistics to resolve this problem.'
+'WARNING: The following tables have potentially corrupt table statistics.'
+'Drop and re-compute statistics to resolve this problem.'
 '$DATABASE.corrupted'
 ''
 '03:AGGREGATE [FINALIZE]'
@@ -71,8 +71,8 @@ explain select count(*) from corrupted where org = 1;
 STRING
 ====
 ---- QUERY
-alter table corrupted partition(org=1) set tblproperties('numRows'='3');
-alter table corrupted set tblproperties('numRows'='0');
+alter table corrupted partition(org=1) set tblproperties('numRows'='3', 'STATS_GENERATED_VIA_STATS_TASK'='true');
+alter table corrupted set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true');
 ====
 ---- QUERY
 show table stats corrupted;
@@ -97,7 +97,7 @@ explain select count(*) from corrupted;
 STRING
 ====
 ---- QUERY
-alter table corrupted set tblproperties('numRows'='6');
+alter table corrupted set tblproperties('numRows'='6', 'STATS_GENERATED_VIA_STATS_TASK'='true');
 ====
 ---- QUERY
 show table stats corrupted;
@@ -122,6 +122,28 @@ explain select count(*) from corrupted;
 STRING
 ====
 ---- QUERY
+# IMPALA-3930: Set numRows of a partition to a negative value and check warning,
+alter table corrupted partition(org=2) set tblproperties('numRows'='-1234', 'STATS_GENERATED_VIA_STATS_TASK'='true');
+explain select count(*) from corrupted where org = 2;
+---- RESULTS: VERIFY_IS_SUBSET
+'WARNING: The following tables have potentially corrupt table statistics.'
+'Drop and re-compute statistics to resolve this problem.'
+'$DATABASE.corrupted'
+''
+'03:AGGREGATE [FINALIZE]'
+'|  output: count:merge(*)'
+'|'
+'02:EXCHANGE [UNPARTITIONED]'
+'|'
+'01:AGGREGATE'
+'|  output: count(*)'
+'|'
+'00:SCAN HDFS [$DATABASE.corrupted]'
+'   partitions=1/2 files=1 size=24B'
+---- TYPES
+STRING
+====
+---- QUERY
 create table corrupted_no_part (id int);
 insert into corrupted_no_part values (1),(2),(3);
 compute stats corrupted_no_part;
@@ -148,22 +170,44 @@ explain select count(*) from corrupted_no_part;
 STRING
 ====
 ---- QUERY
-alter table corrupted_no_part set tblproperties('numRows'='0');
+alter table corrupted_no_part set tblproperties('numRows'='0', 'STATS_GENERATED_VIA_STATS_TASK'='true');
 ====
 ---- QUERY
 show table stats corrupted_no_part;
 ---- LABELS
 #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
 ---- RESULTS
--1,1,'6B','NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/$DATABASE.db/corrupted_no_part'
+0,1,'6B','NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/$DATABASE.db/corrupted_no_part'
 ---- TYPES
 BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING, STRING
 ====
 ---- QUERY
--- After setting num rows to 0, the HMS will set it to -1 and avoids bad behavior.
 explain select count(*) from corrupted_no_part;
 ---- RESULTS: VERIFY_IS_SUBSET
-'WARNING: The following tables are missing relevant table and/or column statistics.'
+'WARNING: The following tables have potentially corrupt table statistics.'
+'Drop and re-compute statistics to resolve this problem.'
+'$DATABASE.corrupted_no_part'
+''
+'03:AGGREGATE [FINALIZE]'
+'|  output: count:merge(*)'
+'|'
+'02:EXCHANGE [UNPARTITIONED]'
+'|'
+'01:AGGREGATE'
+'|  output: count(*)'
+'|'
+'00:SCAN HDFS [$DATABASE.corrupted_no_part]'
+'   partitions=1/1 files=1 size=6B'
+---- TYPES
+STRING
+====
+---- QUERY
+# IMPALA-3930: Set numRows of the table to a negative value and check warning,
+alter table corrupted_no_part set tblproperties('numRows'='-1234', 'STATS_GENERATED_VIA_STATS_TASK'='true');
+explain select count(*) from corrupted_no_part;
+---- RESULTS: VERIFY_IS_SUBSET
+'WARNING: The following tables have potentially corrupt table statistics.'
+'Drop and re-compute statistics to resolve this problem.'
 '$DATABASE.corrupted_no_part'
 ''
 '03:AGGREGATE [FINALIZE]'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12496c7f/tests/metadata/test_compute_stats.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_compute_stats.py b/tests/metadata/test_compute_stats.py
index 348d7f7..6f6d31a 100644
--- a/tests/metadata/test_compute_stats.py
+++ b/tests/metadata/test_compute_stats.py
@@ -151,8 +151,8 @@ class TestCorruptTableStats(ImpalaTestSuite):
     cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
 
   def test_corrupt_stats(self, vector, unique_database):
-    """IMPALA-1983: Test that in the presence of corrupt table statistics a warning is
-    issued and the small query optimization is disabled."""
+    """IMPALA-1983/IMPALA-1657: Test that in the presence of corrupt table statistics a
+    warning is issued and the small query optimization is disabled."""
     if self.exploration_strategy() != 'exhaustive': pytest.skip("Only run in exhaustive")
     self.run_test_case('QueryTest/corrupt-stats', vector, unique_database)
 


Mime
View raw message