hadoop-hive-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Min Zhou (JIRA)" <j...@apache.org>
Subject [jira] Commented: (HIVE-503) improvement on distinct: distinguish distinct aggregate function from distinct
Date Mon, 25 May 2009 05:34:45 GMT

    [ https://issues.apache.org/jira/browse/HIVE-503?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12712641#action_12712641
] 

Min Zhou commented on HIVE-503:
-------------------------------

consider about a real world example

Approach 1: 
{code:sql}
set hive.groupby.skewindata=false;

create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, memeber_uv int, guest_uv
int);
INSERT OVERWRITE TABLE rslt_tbg_pvuv 
SELECT
  tbg,
  cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv,
  cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv,
  count(distinct CASE WHEN is_member=0 THEN id END) AS member_uv,
  count(distinct CASE WHEN is_member=1 THEN id END) AS guest_uv
FROM 
  (SELECT
    regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) as tbg,
    if(uid <> '-', 0, 1) as is_member,
    if(uid <> '-', uid, mid) as id
  FROM
    web_log
  WHERE
    (  url rlike 'http://.*TBG=1.3.1.1'
    OR url rlike 'http://.*TBG=1.3.2.2'
    OR url rlike 'http://.*TBG=1.3.3.3'
    OR url rlike 'http://.*TBG=1.3.4.4'
    OR url rlike 'http://.*TBG=1.3.5.5'
    OR url rlike 'http://.*TBG=1.3.6.6'
    OR url rlike 'http://.*TBG=1.3.7.7' )
    AND refer like '/1.gif?%'
    AND logdate='20090510')
GROUP BY tbg;
{code}

Approach 2:
{code:sql}
set hive.groupby.skewindata=false;

create table tmp_tbg_pvuv_1 (tbg STRING, is_member tinyint, id STRING);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_1
SELECT
  regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) as tbg,
  if(uid <> '-', 0, 1) as is_member,
  if(uid <> '-', uid, mid) as id
FROM
  web_log
WHERE
  (  url rlike 'http://.*TBG=1.3.1.1'
  OR url rlike 'http://.*TBG=1.3.2.2'
  OR url rlike 'http://.*TBG=1.3.3.3'
  OR url rlike 'http://.*TBG=1.3.4.4'
  OR url rlike 'http://.*TBG=1.3.5.5'
  OR url rlike 'http://.*TBG=1.3.6.6'
  OR url rlike 'http://.*TBG=1.3.7.7' )
 AND refer like '/1.gif?%'
 AND logdate='20090510';

create table tmp_tbg_pvuv_2 (tbg STRING, member_pv int, guest_pv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_2
SELECT
  tbg,
  cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv,
  cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table tmp_tbg_pvuv_3 (tbg STRING, member_uv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_3
SELECT
  tbg,
  count(distinct CASE WHEN is_member=0 THEN id ELSE 'NULL' END) - 1 AS member_uv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table tmp_tbg_pvuv_4 (tbg STRING, guest_uv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_4
SELECT
  tbg,
  count(distinct CASE WHEN is_member=1 THEN id ELSE 'NULL' END) - 1 AS guest_uv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, memeber_uv int, guest_uv
int);
INSERT OVERWRITE TABLE rslt_tbg_pvuv
SELECT
  t2.tbg,
  member_pv,
  guest_pv,
  member_uv,
  guest_uv
FROM
    tmp_tbg_pvuv_2 t2 
    join tmp_tbg_pvuv_3 t3 on t2.tbg=t3.tbg
    join tmp_tbg_pvuv_4 t4 on t2.tbg=t4.tbg
{code}

Do you insist the second approach is faster than the first one?


> improvement on distinct: distinguish distinct aggregate function from distinct
> ------------------------------------------------------------------------------
>
>                 Key: HIVE-503
>                 URL: https://issues.apache.org/jira/browse/HIVE-503
>             Project: Hadoop Hive
>          Issue Type: Improvement
>            Reporter: Min Zhou
>
> h4.distinct
> # OK
> {code:sql}
> select 
>    distinct col
> from 
>   tbl
> {code}
> # FAILED
> {code:sql}
> select 
>    distinct  col1,
>    distinct  col2
> from 
>   tbl
> {code}
> h4.distinct aggregate function
> # OK
> {code:sql}
> select 
>    count(distinct col % 10)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>    count(distinct col1% 10)
>    count(distinct col1% 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>    count(distinct col1 % 10)
>    count(distinct col2 % 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>   sum(distinct col1 % 10),
>   count(distinct col2 % 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>   max(distinct substr(col1, 1, 10)),
>   count(distinct col2 % 9)
> from 
>   tbl
> {code}
> The keyword "distinct" ofen produce more than one results, so it's impossible removing
two different columns' duplicates in only one mapreduce job, so it failed.
> But the term "distinct aggregate function" with a form like aggregate_function(distinct
....),  is in connection with the term "all aggregate function",  it essentially is an aggregate
function. Only one result each aggregate function will produce,  it's very possible one mapreduce
job could deal with two or more different aggregate expression simultaneously.

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


Mime
View raw message