hive-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Igor Kravzov <igork.ine...@gmail.com>
Subject Re: External partitoned table based on yyyy/mm/dd HDFS structure
Date Thu, 02 Jun 2016 14:55:08 GMT
Thanks Dudu for the great explanation.
I was doing some reading and thinking instead of complicated hierarchical
structure to have flat one.
Like

/user/igor/data/date=2016-06-02
create external table t (i int) partitioned by (yyyymmdd date) location
'/user/igor/data/';
 or
 /user/igor/date=20160602
create external table t (i int) partitioned by (yyyymmdd int) location
'/user/igor/data/';
Will it work?

Also I will need to schedule msck repair table t; if I want partitions
automatically picked up. Hive does not have this feature. Correct?

What is the optimal directory size for a partition? Is about 2GB OK?


On Wed, Jun 1, 2016 at 4:38 PM, Markovitz, Dudu <dmarkovitz@paypal.com>
wrote:

> The short answer:
>
> In this naming convention it will require to specifically define each
> partition.
>
> If the naming convention was yyyy=2016/mm=11/dd=28 instead of 2016/11/28 it
> would have been straight forward.
>
>
>
> Dudu
>
>
>
> + The long answer:
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- bash
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> mkdir t
>
> mkdir t/2015
>
> mkdir t/2015/01
>
> mkdir t/2015/01/22
>
> mkdir t/2015/01/23
>
> mkdir t/2015/02
>
> mkdir t/2015/02/17
>
> mkdir t/2015/03
>
> mkdir t/2015/03/04
>
> mkdir t/2015/03/05
>
> mkdir t/2015/03/06
>
> mkdir t/2016
>
> mkdir t/2016/10
>
> mkdir t/2016/10/01
>
> mkdir t/2016/10/02
>
> mkdir t/2016/10/03
>
> mkdir t/2016/11
>
> mkdir t/2016/11/27
>
> mkdir t/2016/11/28
>
>
>
>
>
> echo -e "1\n2\n3"               > t/2015/01/22/data.txt
>
> echo -e "4"                     > t/2015/01/23/data.txt
>
> echo -e "5\n6"                  > t/2015/02/17/data.txt
>
> echo -e "7\n8\n9"               > t/2015/03/04/data.txt
>
> echo -e "10"                    > t/2015/03/05/data.txt
>
> echo -e "11\n12"                > t/2015/03/06/data.txt
>
> echo -e "13\n14"                > t/2016/10/01/data.txt
>
> echo -e "15\n16\n17\n18\n19"    > t/2016/10/02/data.txt
>
> echo -e "20\n21"                > t/2016/10/03/data.txt
>
> echo -e "22"                    > t/2016/11/27/data.txt
>
> echo -e "23\n24\n25"            > t/2016/11/28/data.txt
>
>
>
> hdfs dfs -put t /user/dmarkovitz/t
>
>
>
>
>
> t
>
> ├── 2015
>
> │   ├── 01
>
> │   │   ├── 22
>
> │   │   │   └── data.txt
>
> │   │   └── 23
>
> │   │       └── data.txt
>
> │   ├── 02
>
> │   │   └── 17
>
> │   │       └── data.txt
>
> │   └── 03
>
> │       ├── 04
>
> │       │   └── data.txt
>
> │       ├── 05
>
> │       │   └── data.txt
>
> │       └── 06
>
> │           └── data.txt
>
> └── 2016
>
>     ├── 10
>
>     │   ├── 01
>
>     │   │   └── data.txt
>
>     │   ├── 02
>
>     │   │   └── data.txt
>
>     │   └── 03
>
>     │       └── data.txt
>
>     └── 11
>
>         ├── 27
>
>         │   └── data.txt
>
>         └── 28
>
>             └── data.txt
>
>
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- hive
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> set hive.mapred.supports.subdirectories=true;
>
> set mapred.input.dir.recursive=true;
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- t1:     no partitions
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> create external table t1 (i int) location '/user/dmarkovitz/t';
>
>
>
> select i,input__file__name from t1;
>
>
>
> i    input__file__name
>
> 1    hdfs://horton/user/dmarkovitz/t/2015/01/22/data.txt
>
> 2    hdfs://horton/user/dmarkovitz/t/2015/01/22/data.txt
>
> 3    hdfs://horton/user/dmarkovitz/t/2015/01/22/data.txt
>
> 4    hdfs://horton/user/dmarkovitz/t/2015/01/23/data.txt
>
> 5    hdfs://horton/user/dmarkovitz/t/2015/02/17/data.txt
>
> 6    hdfs://horton/user/dmarkovitz/t/2015/02/17/data.txt
>
> 7    hdfs://horton/user/dmarkovitz/t/2015/03/04/data.txt
>
> 8    hdfs://horton/user/dmarkovitz/t/2015/03/04/data.txt
>
> 9    hdfs://horton/user/dmarkovitz/t/2015/03/04/data.txt
>
> 10   hdfs://horton/user/dmarkovitz/t/2015/03/05/data.txt
>
> 11   hdfs://horton/user/dmarkovitz/t/2015/03/06/data.txt
>
> 12   hdfs://horton/user/dmarkovitz/t/2015/03/06/data.txt
>
> 13   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt
>
> 14   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt
>
> 15   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt
>
> 16   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt
>
> 17   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt
>
> 18   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt
>
> 19   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt
>
> 20   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt
>
> 21   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt
>
> 22   hdfs://horton/user/dmarkovitz/t/2016/11/27/data.txt
>
> 23   hdfs://horton/user/dmarkovitz/t/2016/11/28/data.txt
>
> 24   hdfs://horton/user/dmarkovitz/t/2016/11/28/data.txt
>
> 25   hdfs://horton/user/dmarkovitz/t/2016/11/28/data.txt
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- t2:     3 partition columns (yyyy smallint,mm tinyint,dd tinyint)
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> create external table t2 (i int) partitioned by (yyyy smallint,mm
> tinyint,dd tinyint) location '/user/dmarkovitz/t';
>
>
>
> alter table t2 add if not exists partition (yyyy=2015,mm=01,dd=22)
> location '/user/dmarkovitz/t/2015/01/22';
>
> alter table t2 add if not exists partition (yyyy=2015,mm=01,dd=23)
> location '/user/dmarkovitz/t/2015/01/23';
>
> alter table t2 add if not exists partition (yyyy=2015,mm=02,dd=17)
> location '/user/dmarkovitz/t/2015/02/17';
>
> alter table t2 add if not exists partition (yyyy=2015,mm=03,dd=04)
> location '/user/dmarkovitz/t/2015/03/04';
>
> alter table t2 add if not exists partition (yyyy=2015,mm=03,dd=05)
> location '/user/dmarkovitz/t/2015/03/05';
>
> alter table t2 add if not exists partition (yyyy=2015,mm=03,dd=06)
> location '/user/dmarkovitz/t/2015/03/06';
>
> alter table t2 add if not exists partition (yyyy=2016,mm=10,dd=01)
> location '/user/dmarkovitz/t/2016/10/01';
>
> alter table t2 add if not exists partition (yyyy=2016,mm=10,dd=02)
> location '/user/dmarkovitz/t/2016/10/02';
>
> alter table t2 add if not exists partition (yyyy=2016,mm=10,dd=03)
> location '/user/dmarkovitz/t/2016/10/03';
>
> alter table t2 add if not exists partition (yyyy=2016,mm=11,dd=27)
> location '/user/dmarkovitz/t/2016/11/27';
>
> alter table t2 add if not exists partition (yyyy=2016,mm=11,dd=28)
> location '/user/dmarkovitz/t/2016/11/28';
>
>
>
>
>
> select i,input__file__name,yyyy,mm,dd from t2 where yyyy=2016 and mm=10;
>
>
>
> i    input__file__name    yyyy mm   dd
>
> 13   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt  2016 10   1
>
> 14   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt  2016 10   1
>
> 15   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016 10   2
>
> 16   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016 10   2
>
> 17   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016 10   2
>
> 18   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016 10   2
>
> 19   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016 10   2
>
> 20   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt  2016 10   3
>
> 21   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt  2016 10   3
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- t3:     1 partition column (yyyymmdd date)
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> create external table t3 (i int) partitioned by (yyyymmdd date) location
> '/user/dmarkovitz/t';
>
>
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-01-22')
> location '/user/dmarkovitz/t/2015/01/22';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-01-23')
> location '/user/dmarkovitz/t/2015/01/23';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-02-17')
> location '/user/dmarkovitz/t/2015/02/17';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-03-04')
> location '/user/dmarkovitz/t/2015/03/04';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-03-05')
> location '/user/dmarkovitz/t/2015/03/05';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2015-03-06')
> location '/user/dmarkovitz/t/2015/03/06';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2016-10-01')
> location '/user/dmarkovitz/t/2016/10/01';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2016-10-02')
> location '/user/dmarkovitz/t/2016/10/02';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2016-10-03')
> location '/user/dmarkovitz/t/2016/10/03';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2016-11-27')
> location '/user/dmarkovitz/t/2016/11/27';
>
> alter table t3 add if not exists partition (yyyymmdd=date '2016-11-28')
> location '/user/dmarkovitz/t/2016/11/28';
>
>
>
>
>
> select i,input__file__name,yyyymmdd from t3 where yyyymmdd between date
> '2016-10-01' and date '2016-10-30';
>
>
>
> i    input__file__name    yyyymmdd
>
> 13   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt  2016-10-01
>
> 14   hdfs://horton/user/dmarkovitz/t/2016/10/01/data.txt  2016-10-01
>
> 15   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016-10-02
>
> 16   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016-10-02
>
> 17   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016-10-02
>
> 18   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016-10-02
>
> 19   hdfs://horton/user/dmarkovitz/t/2016/10/02/data.txt  2016-10-02
>
> 20   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt  2016-10-03
>
> 21   hdfs://horton/user/dmarkovitz/t/2016/10/03/data.txt  2016-10-03
>
>
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
> -- Automatic partitions handling
>
>
> ------------------------------------------------------------------------------------------------------------------------
>
>
>
> There is a specific directories naming convention that is supported by
> Hive partitioning which is {partition column}={partition value}
>
>
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/01/22
> /user/dmarkovitz/t/2015/01/dd=22
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/01/23
> /user/dmarkovitz/t/2015/01/dd=23
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/02/17
> /user/dmarkovitz/t/2015/02/dd=17
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/03/04
> /user/dmarkovitz/t/2015/03/dd=04
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/03/05
> /user/dmarkovitz/t/2015/03/dd=05
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/03/06
> /user/dmarkovitz/t/2015/03/dd=06
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/10/01
> /user/dmarkovitz/t/2016/10/dd=01
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/10/02
> /user/dmarkovitz/t/2016/10/dd=02
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/10/03
> /user/dmarkovitz/t/2016/10/dd=03
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/11/27
> /user/dmarkovitz/t/2016/11/dd=27
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/11/28
> /user/dmarkovitz/t/2016/11/dd=28
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/01     /user/dmarkovitz/t/2015/mm=01
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/02     /user/dmarkovitz/t/2015/mm=02
>
> hdfs dfs -mv /user/dmarkovitz/t/2015/03     /user/dmarkovitz/t/2015/mm=03
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/10     /user/dmarkovitz/t/2016/mm=10
>
> hdfs dfs -mv /user/dmarkovitz/t/2016/11     /user/dmarkovitz/t/2016/mm=11
>
> hdfs dfs -mv /user/dmarkovitz/t/2015        /user/dmarkovitz/t/yyyy=2015
>
> hdfs dfs -mv /user/dmarkovitz/t/2016        /user/dmarkovitz/t/yyyy=2016
>
>
>
> t
>
> ├── yyyy=2015
>
> │   ├── mm=01
>
> │   │   ├── dd=22
>
> │   │   │   └── data.txt
>
> │   │   └── dd=23
>
> │   │       └── data.txt
>
> │   ├── mm=02
>
> │   │   └── dd=17
>
> │   │       └── data.txt
>
> │   └── mm=03
>
> │       ├── dd=04
>
> │       │   └── data.txt
>
> │       ├── dd=05
>
> │       │   └── data.txt
>
> │       └── dd=06
>
> │           └── data.txt
>
> └── yyyy=2016
>
>     ├── mm=10
>
>     │   ├── dd=01
>
>     │   │   └── data.txt
>
>     │   ├── dd=02
>
>     │   │   └── data.txt
>
>     │   └── dd=03
>
>     │       └── data.txt
>
>     └── mm=11
>
>         ├── dd=27
>
>         │   └── data.txt
>
>         └── dd=28
>
>             └── data.txt
>
>
>
>
>
> create external table t (i int) partitioned by (yyyy smallint,mm
> tinyint,dd tinyint) location '/user/dmarkovitz/t';
>
> msck repair table t;
>
>
>
> select i,input__file__name,yyyy,mm,dd from t where yyyy >= 2016 and mm >=
> 10;
>
>
>
> i    input__file__name    yyyy mm   dd
>
> 13   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=01/data.txt  2016
> 10     1
>
> 14   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=01/data.txt  2016
> 10     1
>
> 15   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=02/data.txt  2016
> 10     2
>
> 16   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=02/data.txt  2016
> 10     2
>
> 17   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=02/data.txt  2016
> 10     2
>
> 18   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=02/data.txt  2016
> 10     2
>
> 19   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=02/data.txt  2016
> 10     2
>
> 20   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=03/data.txt  2016
> 10     3
>
> 21   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=10/dd=03/data.txt  2016
> 10     3
>
> 22   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=11/dd=27/data.txt  2016
> 11     27
>
> 23   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=11/dd=28/data.txt  2016
> 11     28
>
> 24   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=11/dd=28/data.txt  2016
> 11     28
>
> 25   hdfs://horton/user/dmarkovitz/t/yyyy=2016/mm=11/dd=28/data.txt  2016
> 11     28
>
>
>
>
>
> *From:* Igor Kravzov [mailto:igork.inexso@gmail.com]
> *Sent:* Wednesday, June 01, 2016 6:03 PM
> *To:* user@hive.apache.org
> *Subject:* External partitoned table based on yyyy/mm/dd HDFS structure
>
>
>
> Is it possible to create?
>
> What would be the table definition in this case?
>

Mime
View raw message