hive-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Sergey Shelukhin (JIRA)" <j...@apache.org>
Subject [jira] [Created] (HIVE-16965) SMB join may produce incorrect results
Date Mon, 26 Jun 2017 23:01:00 GMT
Sergey Shelukhin created HIVE-16965:
---------------------------------------

             Summary: SMB join may produce incorrect results
                 Key: HIVE-16965
                 URL: https://issues.apache.org/jira/browse/HIVE-16965
             Project: Hive
          Issue Type: Bug
            Reporter: Sergey Shelukhin


Running the following on MiniTez
{noformat}
set hive.mapred.mode=nonstrict;
SET hive.vectorized.execution.enabled=true;
SET hive.exec.orc.default.buffer.size=32768;
SET hive.exec.orc.default.row.index.stride=1000;
SET hive.optimize.index.filter=true;
set hive.fetch.task.conversion=none;
set hive.exec.dynamic.partition.mode=nonstrict;

DROP TABLE orc_a;
DROP TABLE orc_b;

CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q smallint)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;
CREATE TABLE orc_b (id bigint, cfloat float)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;

insert into table orc_a partition (y=2000, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;
insert into table orc_a partition (y=2001, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;

insert into table orc_b 
select cbigint, cfloat from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc limit 200;

set hive.cbo.enable=false;

select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

set hive.enforce.sortmergebucketmapjoin=false;
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask.size=10;

explain
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

DROP TABLE orc_a;
DROP TABLE orc_b;
{noformat}

Produces different results for two selects. The SMB one looks incorrect. cc [~djaiswal] [~hagleitn]



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message