hive-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Mostafa Mokhtar (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (HIVE-7982) Regression in explain with CBO enabled due to issuing query per K,V for the stats
Date Thu, 04 Sep 2014 14:02:52 GMT

     [ https://issues.apache.org/jira/browse/HIVE-7982?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]

Mostafa Mokhtar updated HIVE-7982:
----------------------------------
    Description: 

Now explain for Q17 is back in the 12 second range, I checked the queries issues to MySQL
and they are very different than before 

on August 15 explain was completing in under 5 seconds and we issued the following queries
: 
{code}
select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"),
max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES")
from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME"
= 'store_returns' and "COLUMN_NAME" in ('sr_item_sk','sr_customer_sk','sr_ticket_number')
AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01')
group by "COLUMN_NAME", "COLUMN_TYPE";

select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"),
max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES")
from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME"
= 'store_returns' and "COLUMN_NAME" in ('sr_returned_date_sk') AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01')
group by "COLUMN_NAME", "COLUMN_TYPE"
{code}


Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very inefficient
because 
1) They no longer do the aggregation on MySQL and get a row per partition 
2) There is a query per stats K,V pair so the number of queries is up by 9x

{code}
		select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME)  from PART_COL_STATS where DB_NAME
= 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'  and COLUMN_NAME in ('sr_item_sk','sr_customer_sk','sr_ticket_number')
and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group
by COLUMN_NAME, COLUMN_TYPE
		select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES), sum(NUM_FALSES) from PART_COL_STATS
where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'  and COLUMN_NAME
in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
group by COLUMN_NAME
		select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_LOW_VALUE'
		select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_HIGH_VALUE'
		select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_LOW_VALUE'
		select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_HIGH_VALUE'
		select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_LOW_VALUE'
		select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_HIGH_VALUE'
		select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'NUM_DISTINCTS'
		select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'AVG_COL_LEN'
		select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'MAX_COL_LEN'
		select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
order by 'LONG_LOW_VALUE'
		select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
order by 'LONG_HIGH_VALUE'
 {code}

  was:
With CBO we need the correct set of indexes to provide an efficient Read/Write access.
These indexes improve performance of Explain plan and Analyzed table by 60% and 300%.

{code}
MySQL 
 CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME) USING
BTREE;

MsSQL
CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME);

Oracle 
CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS (DB_NAME,TABLE_NAME,COLUMN_NAME);

Postgres
CREATE INDEX "PART_COL_STATS_N50" ON "PART_COL_STATS" USING btree ("DB_NAME","TABLE_NAME","COLUMN_NAME");
{code}



> Regression in explain with CBO enabled due to issuing query per K,V for the stats
> ---------------------------------------------------------------------------------
>
>                 Key: HIVE-7982
>                 URL: https://issues.apache.org/jira/browse/HIVE-7982
>             Project: Hive
>          Issue Type: Bug
>          Components: CBO
>    Affects Versions: 0.14.0
>            Reporter: Mostafa Mokhtar
>            Assignee: Ashutosh Chauhan
>             Fix For: 0.14.0
>
>
> Now explain for Q17 is back in the 12 second range, I checked the queries issues to MySQL
and they are very different than before 
> on August 15 explain was completing in under 5 seconds and we issued the following queries
: 
> {code}
> select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"),
max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES")
from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME"
= 'store_returns' and "COLUMN_NAME" in ('sr_item_sk','sr_customer_sk','sr_ticket_number')
AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01')
group by "COLUMN_NAME", "COLUMN_TYPE";
> select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"), max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"),
max("DOUBLE_HIGH_VALUE"), min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES")
from "PART_COL_STATS" where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME"
= 'store_returns' and "COLUMN_NAME" in ('sr_returned_date_sk') AND "PARTITION_NAME" in ('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01')
group by "COLUMN_NAME", "COLUMN_TYPE"
> {code}
> Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very inefficient
because 
> 1) They no longer do the aggregation on MySQL and get a row per partition 
> 2) There is a query per stats K,V pair so the number of queries is up by 9x
> {code}
> 		select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME)  from PART_COL_STATS where DB_NAME
= 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'  and COLUMN_NAME in ('sr_item_sk','sr_customer_sk','sr_ticket_number')
and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group
by COLUMN_NAME, COLUMN_TYPE
> 		select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES), sum(NUM_FALSES) from PART_COL_STATS
where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'  and COLUMN_NAME
in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
group by COLUMN_NAME
> 		select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_LOW_VALUE'
> 		select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'LONG_HIGH_VALUE'
> 		select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_LOW_VALUE'
> 		select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'DOUBLE_HIGH_VALUE'
> 		select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_LOW_VALUE'
> 		select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'BIG_DECIMAL_HIGH_VALUE'
> 		select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'NUM_DISTINCTS'
> 		select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'AVG_COL_LEN'
> 		select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME
in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by 'MAX_COL_LEN'
> 		select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
order by 'LONG_LOW_VALUE'
> 		select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200'
and TABLE_NAME = 'store_returns' and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
order by 'LONG_HIGH_VALUE'
>  {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message