hive-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Mostafa Mokhtar (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (HIVE-8526) Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect CE
Date Mon, 20 Oct 2014 23:21:33 GMT

     [ https://issues.apache.org/jira/browse/HIVE-8526?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]

Mostafa Mokhtar updated HIVE-8526:
----------------------------------
    Description: 
The join order has Item joined last where it should be joined first

Query 
{code}
select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
    JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
 where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
                             )
            )
        and d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

Plan
{code}
2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.710158597922807E7, cumulative cost = {7.169080587598123E10
rows, 3.420317295845614E7 cpu, 0.0 io}, id = 579
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.710158597922807E7,
cumulative cost = {6.827294821015483E10 rows, 1.710158697922807E7 cpu, 0.0 io}, id = 577
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.710158597922807E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 575
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.710158597922807E7, cumulative
cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 573
        HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 6.0197670310147226E7, cumulative
cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 571
          HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 569
            HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 567
              SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 565
                HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], ws_sales_price=[$2],
ws_sold_date_sk=[$3], c_customer_sk=[$9], c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12],
ca_zip=[$13], d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): rowcount
= 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id
= 669
                  HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id = 667
                    HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): rowcount = 2.1594638446E10,
cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 0.0 io}, id = 664
                      HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): rowcount = 2.1594638446E10,
cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 0.0 io}, id = 601
                        HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 497
                          HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 341
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
                          HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 340
                      HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount =
73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 342
                    HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 598
                      HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount
= 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 500
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]):
rowcount = 8.0E7, cumulative cost = {0}, id = 343
                      HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 547
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 339
                HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative cost
= {0.0 rows, 0.0 cpu, 0.0 io}, id = 563
                  HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 561
                    HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29)]):
rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 559
                      HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount
= 462000.0, cumulative cost = {0}, id = 340
{code}


Then I rewrote the query trying to force CBO to generate the correct join order
{code}
with items as (select i_item_sk from 
item  where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
                             )
            )
)

select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
 where
 d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

But the correct join order wasn't generated because CE for item x item + filter has a selectivity
of 1.
{code}
2014-10-20 18:46:27,120 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.6595391288544238E7, cumulative cost = {2.8364280421639153E10
rows, 3.3190782577088475E7 cpu, 0.0 io}, id = 1291
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.505357243157397E10 rows, 1.6595391288544238E7 cpu, 0.0 io}, id = 1289
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1287
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1285
        HiveProjectRel($f0=[$9], $f1=[$8], $f2=[$2]): rowcount = 6.019767031014723E7, cumulative
cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1283
          HiveProjectRel(ws_item_sk=[$5], ws_bill_customer_sk=[$6], ws_sales_price=[$7], ws_sold_date_sk=[$8],
i_item_sk=[$12], c_customer_sk=[$0], c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3],
ca_zip=[$4], d_date_sk=[$9], d_year=[$10], d_qoy=[$11]): rowcount = 6.019767031014723E7, cumulative
cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1380
            HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount = 6.019767031014723E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1378
              HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 1309
                HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount = 8.0E7,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1269
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]): rowcount
= 8.0E7, cumulative cost = {0}, id = 1035
                HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1273
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 1032
              HiveJoinRel(condition=[=($0, $7)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.16336624308125E10 rows, 0.0 cpu, 0.0 io}, id = 1376
                HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.159463857644464E10 rows, 0.0 cpu, 0.0 io}, id = 1316
                  HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 1205
                    HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 1033
                  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1279
                    HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1277
                      HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 1034
                HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative cost = {1.0
rows, 1.0 cpu, 0.0 io}, id = 1265
                  HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, cumulative cost
= {1.0 rows, 1.0 cpu, 0.0 io}, id = 1263
                    SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 462000.0,
cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1261
                      HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1253
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
                      HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1259
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1257
                          HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 23,
29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1255
                            HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
{code}


This query generates the correct join order 
{code}
 with items as (select i_item_sk from 
item  where
         item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
                             )
            
),
  ws as (
 select ws_bill_customer_sk,ws_sales_price,ws_sold_date_sk
from  web_sales
    JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
 )
 select  ca_zip, ca_county, sum(ws_sales_price)
 from ws 
    JOIN customer ON ws.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk 
    JOIN date_dim ON ws.ws_sold_date_sk = date_dim.d_date_sk
 where d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

Plan 
{code}
2014-10-20 19:13:15,989 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.6595391288544238E7, cumulative cost = {4.99203570142713E10
rows, 3.3190783577088475E7 cpu, 0.0 io}, id = 4367
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.6609649024206116E10 rows, 1.6595392288544238E7 cpu, 0.0 io}, id = 4365
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4363
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4361
        HiveProjectRel($f0=[$7], $f1=[$6], $f2=[$1]): rowcount = 6.019767031014723E7, cumulative
cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4359
          HiveProjectRel(ws_bill_customer_sk=[$5], ws_sales_price=[$6], ws_sold_date_sk=[$7],
c_customer_sk=[$0], c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3], ca_zip=[$4],
d_date_sk=[$8], d_year=[$9], d_qoy=[$10]): rowcount = 6.019767031014723E7, cumulative cost
= {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4426
            HiveJoinRel(condition=[=($5, $0)], joinType=[inner]): rowcount = 6.019767031014723E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4424
              HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 4392
                HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount = 8.0E7,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4345
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]): rowcount
= 8.0E7, cumulative cost = {0}, id = 4101
                HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4349
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 4099
              HiveJoinRel(condition=[=($2, $3)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {4.318973902344464E10 rows, 1.0 cpu, 0.0 io}, id = 4395
                HiveProjectRel(ws_bill_customer_sk=[$1], ws_sales_price=[$2], ws_sold_date_sk=[$3]):
rowcount = 2.1594638446E10, cumulative cost = {2.1595100447E10 rows, 1.0 cpu, 0.0 io}, id
= 4343
                  HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], ws_sales_price=[$2],
ws_sold_date_sk=[$3], i_item_sk=[$4]): rowcount = 2.1594638446E10, cumulative cost = {2.1595100447E10
rows, 1.0 cpu, 0.0 io}, id = 4388
                    HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): rowcount = 2.1594638446E10,
cumulative cost = {2.1595100447E10 rows, 1.0 cpu, 0.0 io}, id = 4383
                      HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 4277
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 4096
                      HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative cost
= {1.0 rows, 1.0 cpu, 0.0 io}, id = 4339
                        HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, cumulative
cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 4337
                          SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount =
462000.0, cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 4335
                            HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4327
                              HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 4088
                            HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4333
                              HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4331
                                HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19,
23, 29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id
= 4329
                                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 4088
                HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4355
                  HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4353
                    HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 4100

{code}

  was:
The join order has Item joined last where it should be joined first

Query 
{code}
select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
    JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
 where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
                             )
            )
        and d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

Plan
{code}
2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.710158597922807E7, cumulative cost = {7.169080587598123E10
rows, 3.420317295845614E7 cpu, 0.0 io}, id = 579
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.710158597922807E7,
cumulative cost = {6.827294821015483E10 rows, 1.710158697922807E7 cpu, 0.0 io}, id = 577
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.710158597922807E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 575
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.710158597922807E7, cumulative
cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 573
        HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 6.0197670310147226E7, cumulative
cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 571
          HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 569
            HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 567
              SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 565
                HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], ws_sales_price=[$2],
ws_sold_date_sk=[$3], c_customer_sk=[$9], c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12],
ca_zip=[$13], d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): rowcount
= 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id
= 669
                  HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id = 667
                    HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): rowcount = 2.1594638446E10,
cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 0.0 io}, id = 664
                      HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): rowcount = 2.1594638446E10,
cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 0.0 io}, id = 601
                        HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 497
                          HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 341
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
                          HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 340
                      HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount =
73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 342
                    HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 598
                      HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount
= 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 500
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]):
rowcount = 8.0E7, cumulative cost = {0}, id = 343
                      HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 547
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 339
                HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative cost
= {0.0 rows, 0.0 cpu, 0.0 io}, id = 563
                  HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 561
                    HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29)]):
rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 559
                      HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount
= 462000.0, cumulative cost = {0}, id = 340
{code}


Then I rewrote the query trying to force CBO to generate the correct join order
{code}
with items as (select i_item_sk from 
item  where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
                             )
            )
)

select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
 where
 d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

But the correct join order wasn't generated because CE for item x item + filter has a selectivity
of 1.
{code}
2014-10-20 18:46:27,120 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.6595391288544238E7, cumulative cost = {2.8364280421639153E10
rows, 3.3190782577088475E7 cpu, 0.0 io}, id = 1291
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.505357243157397E10 rows, 1.6595391288544238E7 cpu, 0.0 io}, id = 1289
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1287
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1285
        HiveProjectRel($f0=[$9], $f1=[$8], $f2=[$2]): rowcount = 6.019767031014723E7, cumulative
cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1283
          HiveProjectRel(ws_item_sk=[$5], ws_bill_customer_sk=[$6], ws_sales_price=[$7], ws_sold_date_sk=[$8],
i_item_sk=[$12], c_customer_sk=[$0], c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3],
ca_zip=[$4], d_date_sk=[$9], d_year=[$10], d_qoy=[$11]): rowcount = 6.019767031014723E7, cumulative
cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1380
            HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount = 6.019767031014723E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1378
              HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 1309
                HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount = 8.0E7,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1269
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]): rowcount
= 8.0E7, cumulative cost = {0}, id = 1035
                HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1273
                  HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 1032
              HiveJoinRel(condition=[=($0, $7)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.16336624308125E10 rows, 0.0 cpu, 0.0 io}, id = 1376
                HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.159463857644464E10 rows, 0.0 cpu, 0.0 io}, id = 1316
                  HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 1205
                    HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 1033
                  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1279
                    HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): rowcount = 130.44464285714287,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1277
                      HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 1034
                HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative cost = {1.0
rows, 1.0 cpu, 0.0 io}, id = 1265
                  HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, cumulative cost
= {1.0 rows, 1.0 cpu, 0.0 io}, id = 1263
                    SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 462000.0,
cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1261
                      HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1253
                        HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
                      HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1259
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1257
                          HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 23,
29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1255
                            HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
{code}


> Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect
CE
> ---------------------------------------------------------------------------------------
>
>                 Key: HIVE-8526
>                 URL: https://issues.apache.org/jira/browse/HIVE-8526
>             Project: Hive
>          Issue Type: Bug
>          Components: CBO
>    Affects Versions: 0.14.0
>            Reporter: Mostafa Mokhtar
>            Assignee: Harish Butani
>             Fix For: 0.14.0
>
>
> The join order has Item joined last where it should be joined first
> Query 
> {code}
> select  ca_zip, ca_county, sum(ws_sales_price)
>  from
>     web_sales
>     JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
>     JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk

>     JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
>     JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
>  where
>         ( item.i_item_id in (select i_item_id
>                              from item i2
>                              where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
>                              )
>             )
>         and d_qoy = 2 and d_year = 2000
>  group by ca_zip, ca_county
>  order by ca_zip, ca_county
>  limit 100
> {code}
> Plan
> {code}
> 2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.710158597922807E7, cumulative cost = {7.169080587598123E10
rows, 3.420317295845614E7 cpu, 0.0 io}, id = 579
>   HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.710158597922807E7,
cumulative cost = {6.827294821015483E10 rows, 1.710158697922807E7 cpu, 0.0 io}, id = 577
>     HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.710158597922807E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 575
>       HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.710158597922807E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 573
>         HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 571
>           HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): rowcount
= 6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id
= 569
>             HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 6.0197670310147226E7,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 567
>               SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 io}, id = 565
>                 HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], ws_sales_price=[$2],
ws_sold_date_sk=[$3], c_customer_sk=[$9], c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12],
ca_zip=[$13], d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): rowcount
= 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id
= 669
>                   HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): rowcount = 3.371069537368245E10,
cumulative cost = {6.485509054332843E10 rows, 0.0 cpu, 0.0 io}, id = 667
>                     HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): rowcount =
2.1594638446E10, cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 0.0 io}, id = 664
>                       HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): rowcount
= 2.1594638446E10, cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 0.0 io}, id = 601
>                         HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 497
>                           HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 341
>                         HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
>                           HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 340
>                       HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount
= 73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551
>                         HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 342
>                     HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount =
7.064015632843196E7, cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 598
>                       HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount
= 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 500
>                         HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]):
rowcount = 8.0E7, cumulative cost = {0}, id = 343
>                       HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]):
rowcount = 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 547
>                         HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 339
>                 HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 563
>                   HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 561
>                     HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29)]):
rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 559
>                       HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 340
> {code}
> Then I rewrote the query trying to force CBO to generate the correct join order
> {code}
> with items as (select i_item_sk from 
> item  where
>         ( item.i_item_id in (select i_item_id
>                              from item i2
>                              where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
>                              )
>             )
> )
> select  ca_zip, ca_county, sum(ws_sales_price)
>  from
>     web_sales
>     JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
>     JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
>     JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk

>     JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
>  where
>  d_qoy = 2 and d_year = 2000
>  group by ca_zip, ca_county
>  order by ca_zip, ca_county
>  limit 100
> {code}
> But the correct join order wasn't generated because CE for item x item + filter has a
selectivity of 1.
> {code}
> 2014-10-20 18:46:27,120 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.6595391288544238E7, cumulative cost = {2.8364280421639153E10
rows, 3.3190782577088475E7 cpu, 0.0 io}, id = 1291
>   HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.505357243157397E10 rows, 1.6595391288544238E7 cpu, 0.0 io}, id = 1289
>     HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1287
>       HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.6595391288544238E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1285
>         HiveProjectRel($f0=[$9], $f1=[$8], $f2=[$2]): rowcount = 6.019767031014723E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1283
>           HiveProjectRel(ws_item_sk=[$5], ws_bill_customer_sk=[$6], ws_sales_price=[$7],
ws_sold_date_sk=[$8], i_item_sk=[$12], c_customer_sk=[$0], c_current_addr_sk=[$1], ca_address_sk=[$2],
ca_county=[$3], ca_zip=[$4], d_date_sk=[$9], d_year=[$10], d_qoy=[$11]): rowcount = 6.019767031014723E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1380
>             HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount = 6.019767031014723E7,
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1378
>               HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 1309
>                 HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount
= 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1269
>                   HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]):
rowcount = 8.0E7, cumulative cost = {0}, id = 1035
>                 HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1273
>                   HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 1032
>               HiveJoinRel(condition=[=($0, $7)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.16336624308125E10 rows, 0.0 cpu, 0.0 io}, id = 1376
>                 HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {2.159463857644464E10 rows, 0.0 cpu, 0.0 io}, id = 1316
>                   HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 1205
>                     HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 1033
>                   HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount
= 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1279
>                     HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): rowcount
= 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1277
>                       HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 1034
>                 HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative cost
= {1.0 rows, 1.0 cpu, 0.0 io}, id = 1265
>                   HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, cumulative
cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1263
>                     SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount =
462000.0, cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1261
>                       HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 462000.0,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1253
>                         HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
>                       HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, cumulative
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1259
>                         HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1257
>                           HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19,
23, 29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id
= 1255
>                             HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 1024
> {code}
> This query generates the correct join order 
> {code}
>  with items as (select i_item_sk from 
> item  where
>          item.i_item_id in (select i_item_id
>                              from item i2
>                              where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
>                              )
>             
> ),
>   ws as (
>  select ws_bill_customer_sk,ws_sales_price,ws_sold_date_sk
> from  web_sales
>     JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
>  )
>  select  ca_zip, ca_county, sum(ws_sales_price)
>  from ws 
>     JOIN customer ON ws.ws_bill_customer_sk = customer.c_customer_sk
>     JOIN customer_address ON customer.c_current_addr_sk = customer_address.ca_address_sk

>     JOIN date_dim ON ws.ws_sold_date_sk = date_dim.d_date_sk
>  where d_qoy = 2 and d_year = 2000
>  group by ca_zip, ca_county
>  order by ca_zip, ca_county
>  limit 100
> {code}
> Plan 
> {code}
> 2014-10-20 19:13:15,989 DEBUG [main]: parse.SemanticAnalyzer (SemanticAnalyzer.java:apply(12330))
- HiveSortRel(fetch=[100]): rowcount = 1.6595391288544238E7, cumulative cost = {4.99203570142713E10
rows, 3.3190783577088475E7 cpu, 0.0 io}, id = 4367
>   HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.6609649024206116E10 rows, 1.6595392288544238E7 cpu, 0.0 io}, id = 4365
>     HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4363
>       HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 1.6595391288544238E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4361
>         HiveProjectRel($f0=[$7], $f1=[$6], $f2=[$1]): rowcount = 6.019767031014723E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4359
>           HiveProjectRel(ws_bill_customer_sk=[$5], ws_sales_price=[$6], ws_sold_date_sk=[$7],
c_customer_sk=[$0], c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3], ca_zip=[$4],
d_date_sk=[$8], d_year=[$9], d_qoy=[$10]): rowcount = 6.019767031014723E7, cumulative cost
= {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4426
>             HiveJoinRel(condition=[=($5, $0)], joinType=[inner]): rowcount = 6.019767031014723E7,
cumulative cost = {4.329894103414093E10 rows, 1.0 cpu, 0.0 io}, id = 4424
>               HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 7.064015632843196E7,
cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 4392
>                 HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): rowcount
= 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4345
>                   HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]):
rowcount = 8.0E7, cumulative cost = {0}, id = 4101
>                 HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], ca_zip=[$9]): rowcount
= 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4349
>                   HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]):
rowcount = 4.0E7, cumulative cost = {0}, id = 4099
>               HiveJoinRel(condition=[=($2, $3)], joinType=[inner]): rowcount = 3.856185436785714E7,
cumulative cost = {4.318973902344464E10 rows, 1.0 cpu, 0.0 io}, id = 4395
>                 HiveProjectRel(ws_bill_customer_sk=[$1], ws_sales_price=[$2], ws_sold_date_sk=[$3]):
rowcount = 2.1594638446E10, cumulative cost = {2.1595100447E10 rows, 1.0 cpu, 0.0 io}, id
= 4343
>                   HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], ws_sales_price=[$2],
ws_sold_date_sk=[$3], i_item_sk=[$4]): rowcount = 2.1594638446E10, cumulative cost = {2.1595100447E10
rows, 1.0 cpu, 0.0 io}, id = 4388
>                     HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): rowcount =
2.1594638446E10, cumulative cost = {2.1595100447E10 rows, 1.0 cpu, 0.0 io}, id = 4383
>                       HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], ws_sales_price=[$20],
ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu,
0.0 io}, id = 4277
>                         HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]):
rowcount = 2.1594638446E10, cumulative cost = {0}, id = 4096
>                       HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative
cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 4339
>                         HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, cumulative
cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 4337
>                           SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount
= 462000.0, cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 4335
>                             HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount
= 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4327
>                               HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 4088
>                             HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814,
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4333
>                               HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount
= 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4331
>                                 HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17,
19, 23, 29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io},
id = 4329
>                                   HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]):
rowcount = 462000.0, cumulative cost = {0}, id = 4088
>                 HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): rowcount =
130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4355
>                   HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): rowcount =
130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4353
>                     HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]):
rowcount = 73049.0, cumulative cost = {0}, id = 4100
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message