impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From taras...@apache.org
Subject [1/2] incubator-impala git commit: IMPALA-5036: Parquet count star optimization
Date Thu, 06 Jul 2017 17:44:31 GMT
Repository: incubator-impala
Updated Branches:
  refs/heads/master d7d6c0367 -> 57d7c614b


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test
deleted file mode 100644
index d03b4c9..0000000
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test
+++ /dev/null
@@ -1,460 +0,0 @@
-====
----- QUERY
-select id, bool_col from functional_parquet.alltypessmall where int_col < 0
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-set explain_level=2;
-explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
----- RESULTS: VERIFY_IS_SUBSET
-'   parquet statistics predicates: int_col < 0'
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where smallint_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where int_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where bigint_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where float_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where double_col < 0
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-# Test with inverted predicate
-select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where smallint_col > 9
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select id, bool_col from functional_parquet.alltypessmall where int_col > 9
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where bigint_col > 90
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where float_col > 9.9
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where double_col > 99
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
----- RESULTS
-12
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
----- RESULTS
-8
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-set explain_level=2;
-explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
----- RESULTS: VERIFY_IS_SUBSET
-'   parquet statistics predicates: tinyint_col = 10'
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
----- RESULTS
-51
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-# Mix with partitioning columns
-select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-# Test that without expr rewrite and thus without constant folding, constant exprs still
-# can be used to prune row groups.
-set enable_expr_rewrites=0;
-select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-# Test that without expr rewrite and thus without constant folding, constant exprs still
-# can be used to prune row groups.
-set enable_expr_rewrites=0;
-select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
----- RESULTS
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-# Test name based column resolution
-create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
-alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
-set parquet_fallback_schema_resolution=NAME;
-# If this picks up the stats from int_col, then it will filter all row groups and return
-# an incorrect result.
-select count(*) from name_resolve where id > 10;
----- RESULTS
-89
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-# Query that has an implicit cast to a larger integer type
-select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
-select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-# Explicit casts between numerical types can violate the transitivity of "min()", so they
-# are not supported when evaluating parquet::Statistics.
-select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
----- RESULTS
-3878
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 24
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
----- RESULTS
-9
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 2
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-# Test the conversion of constant IN lists to min/max predicats
-select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 24
-aggregation(SUM, NumStatsFilteredRowGroups): 24
-====
----- QUERY
-select count(*) from functional_parquet.alltypes where id IN (1,25,49);
----- RESULTS
-3
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 24
-aggregation(SUM, NumStatsFilteredRowGroups): 23
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where string_col < "0"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where string_col <= "/"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where string_col < "1"
----- RESULTS
-12
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where string_col >= "9"
----- RESULTS
-8
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where string_col > ":"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
----- RESULTS
-1
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 3
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
----- RESULTS
-1
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 3
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 4
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
----- RESULTS
-1
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 3
-====
----- QUERY
-select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
----- RESULTS
-1
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 4
-aggregation(SUM, NumStatsFilteredRowGroups): 3
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d1 < 1234
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
----- RESULTS
-1
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
----- RESULTS
-5
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 1
-====
----- QUERY
-# Test that stats are disabled for CHAR type columns.
-create table chars (id int, c char(4)) stored as parquet;
-insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
-select count(*) from chars;
----- RESULTS
-2
-====
----- QUERY
-select count(*) from chars where c <= "aaaa"
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 1
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====
----- QUERY
-# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics
-# query option.
-set parquet_read_statistics=0;
-select count(*) from functional_parquet.alltypes where id < 0;
----- RESULTS
-0
----- RUNTIME_PROFILE
-aggregation(SUM, NumRowGroups): 24
-aggregation(SUM, NumStatsFilteredRowGroups): 0
-====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/tests/query_test/test_aggregation.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py
index a257bc0..289a867 100644
--- a/tests/query_test/test_aggregation.py
+++ b/tests/query_test/test_aggregation.py
@@ -271,6 +271,15 @@ class TestAggregationQueries(ImpalaTestSuite):
       # Verify codegen was enabled for all four stages of the aggregation.
       assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
 
+  def test_parquet_count_star_optimization(self, vector):
+    if (vector.get_value('table_format').file_format != 'text' or
+        vector.get_value('table_format').compression_codec != 'none'):
+      # No need to run this test on all file formats
+      pytest.skip()
+    self.run_test_case('QueryTest/parquet-stats-agg', vector)
+    vector.get_value('exec_option')['batch_size'] = 1
+    self.run_test_case('QueryTest/parquet-stats-agg', vector)
+
 class TestWideAggregationQueries(ImpalaTestSuite):
   """Test that aggregations with many grouping columns work"""
   @classmethod

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/57d7c614/tests/query_test/test_parquet_stats.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_parquet_stats.py b/tests/query_test/test_parquet_stats.py
index 93fc06d..00afd09 100644
--- a/tests/query_test/test_parquet_stats.py
+++ b/tests/query_test/test_parquet_stats.py
@@ -46,7 +46,7 @@ class TestParquetStats(ImpalaTestSuite):
     # The test makes assumptions about the number of row groups that are processed and
     # skipped inside a fragment, so we ensure that the tests run in a single fragment.
     vector.get_value('exec_option')['num_nodes'] = 1
-    self.run_test_case('QueryTest/parquet_stats', vector, use_db=unique_database)
+    self.run_test_case('QueryTest/parquet-stats', vector, use_db=unique_database)
 
   def test_deprecated_stats(self, vector, unique_database):
     """Test that reading parquet files with statistics with deprecated 'min'/'max' fields
@@ -69,4 +69,3 @@ class TestParquetStats(ImpalaTestSuite):
     # skipped inside a fragment, so we ensure that the tests run in a single fragment.
     vector.get_value('exec_option')['num_nodes'] = 1
     self.run_test_case('QueryTest/parquet-deprecated-stats', vector, unique_database)
-


Mime
View raw message