spark-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Yuming Wang (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (SPARK-24538) Decimal type support push down to the data sources
Date Wed, 13 Jun 2018 01:51:00 GMT

     [ https://issues.apache.org/jira/browse/SPARK-24538?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-24538:
--------------------------------
    Description: 
The latest Parquet supports decimal type statistics, so we can support push down of decimal types to the data sources:
{noformat}
LM-SHC-16502798:parquet-mr yumwang$ java -jar ./parquet-tools/target/parquet-tools-1.10.10-column-index-SNAPSHOT.jar
meta /tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet

file:         file:/tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet

creator:      parquet-mr version 1.10.0 (build 031a6654009e3b82020012a18434c582bd74c73a)

extra:        org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"d1","type":"decimal(9,0)","nullable":true,"metadata":{}},{"name":"d2","type":"decimal(9,2)","nullable":true,"metadata":{}},{"name":"d3","type":"decimal(18,0)","nullable":true,"metadata":{}},{"name":"d4","type":"decimal(18,4)","nullable":true,"metadata":{}},{"name":"d5","type":"decimal(38,0)","nullable":true,"metadata":{}},{"name":"d6","type":"decimal(38,18)","nullable":true,"metadata":{}}]}



file schema:  spark_schema

--------------------------------------------------------------------------------

id:           REQUIRED INT64 R:0 D:0

d1:           OPTIONAL INT32 O:DECIMAL R:0 D:1

d2:           OPTIONAL INT32 O:DECIMAL R:0 D:1

d3:           OPTIONAL INT64 O:DECIMAL R:0 D:1

d4:           OPTIONAL INT64 O:DECIMAL R:0 D:1

d5:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1

d6:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1



row group 1:  RC:241867 TS:15480513 OFFSET:4

--------------------------------------------------------------------------------

id:            INT64 SNAPPY DO:0 FPO:4 SZ:968154/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d1:            INT32 SNAPPY DO:0 FPO:968158 SZ:967555/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d2:            INT32 SNAPPY DO:0 FPO:1935713 SZ:967558/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0.00, max: 241866.00, num_nulls: 0]

d3:            INT64 SNAPPY DO:0 FPO:2903271 SZ:968866/1935047/2.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d4:            INT64 SNAPPY DO:0 FPO:3872137 SZ:1247007/1935047/1.55 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0.0000, max: 241866.0000, num_nulls: 0]

d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:5119144 SZ:1266850/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0, max: 241866, num_nulls: 0]

d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:6385994 SZ:2198910/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0E-18, max: 241866.000000000000000000, num_nulls:
0]



row group 2:  RC:241867 TS:15480513 OFFSET:8584904

--------------------------------------------------------------------------------

id:            INT64 SNAPPY DO:0 FPO:8584904 SZ:968131/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d1:            INT32 SNAPPY DO:0 FPO:9553035 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d2:            INT32 SNAPPY DO:0 FPO:10520598 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867.00, max: 483733.00, num_nulls: 0]

d3:            INT64 SNAPPY DO:0 FPO:11488161 SZ:968110/1935047/2.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d4:            INT64 SNAPPY DO:0 FPO:12456271 SZ:1247071/1935047/1.55 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867.0000, max: 483733.0000, num_nulls: 0]

d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:13703342 SZ:1270587/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867, max: 483733, num_nulls: 0]

d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:14973929 SZ:2197306/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867.000000000000000000, max: 483733.000000000000000000,
num_nulls: 0]{noformat}

  was:
Latest parquet support decimal type statistics. then we can push down:
{noformat}
LM-SHC-16502798:parquet-mr yumwang$ java -jar ./parquet-tools/target/parquet-tools-1.10.10-column-index-SNAPSHOT.jar
meta /tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet

file:         file:/tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet

creator:      parquet-mr version 1.10.0 (build 031a6654009e3b82020012a18434c582bd74c73a)

extra:        org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"d1","type":"decimal(9,0)","nullable":true,"metadata":{}},{"name":"d2","type":"decimal(9,2)","nullable":true,"metadata":{}},{"name":"d3","type":"decimal(18,0)","nullable":true,"metadata":{}},{"name":"d4","type":"decimal(18,4)","nullable":true,"metadata":{}},{"name":"d5","type":"decimal(38,0)","nullable":true,"metadata":{}},{"name":"d6","type":"decimal(38,18)","nullable":true,"metadata":{}}]}



file schema:  spark_schema

--------------------------------------------------------------------------------

id:           REQUIRED INT64 R:0 D:0

d1:           OPTIONAL INT32 O:DECIMAL R:0 D:1

d2:           OPTIONAL INT32 O:DECIMAL R:0 D:1

d3:           OPTIONAL INT64 O:DECIMAL R:0 D:1

d4:           OPTIONAL INT64 O:DECIMAL R:0 D:1

d5:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1

d6:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1



row group 1:  RC:241867 TS:15480513 OFFSET:4

--------------------------------------------------------------------------------

id:            INT64 SNAPPY DO:0 FPO:4 SZ:968154/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d1:            INT32 SNAPPY DO:0 FPO:968158 SZ:967555/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d2:            INT32 SNAPPY DO:0 FPO:1935713 SZ:967558/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0.00, max: 241866.00, num_nulls: 0]

d3:            INT64 SNAPPY DO:0 FPO:2903271 SZ:968866/1935047/2.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]

d4:            INT64 SNAPPY DO:0 FPO:3872137 SZ:1247007/1935047/1.55 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0.0000, max: 241866.0000, num_nulls: 0]

d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:5119144 SZ:1266850/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0, max: 241866, num_nulls: 0]

d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:6385994 SZ:2198910/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0E-18, max: 241866.000000000000000000, num_nulls:
0]



row group 2:  RC:241867 TS:15480513 OFFSET:8584904

--------------------------------------------------------------------------------

id:            INT64 SNAPPY DO:0 FPO:8584904 SZ:968131/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d1:            INT32 SNAPPY DO:0 FPO:9553035 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d2:            INT32 SNAPPY DO:0 FPO:10520598 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867.00, max: 483733.00, num_nulls: 0]

d3:            INT64 SNAPPY DO:0 FPO:11488161 SZ:968110/1935047/2.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]

d4:            INT64 SNAPPY DO:0 FPO:12456271 SZ:1247071/1935047/1.55 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867.0000, max: 483733.0000, num_nulls: 0]

d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:13703342 SZ:1270587/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867, max: 483733, num_nulls: 0]

d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:14973929 SZ:2197306/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867.000000000000000000, max: 483733.000000000000000000,
num_nulls: 0]{noformat}


> Decimal type support push down to the data sources
> --------------------------------------------------
>
>                 Key: SPARK-24538
>                 URL: https://issues.apache.org/jira/browse/SPARK-24538
>             Project: Spark
>          Issue Type: New Feature
>          Components: SQL
>    Affects Versions: 2.4.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> The latest Parquet supports decimal type statistics, so we can support push down of decimal types to the data sources:
> {noformat}
> LM-SHC-16502798:parquet-mr yumwang$ java -jar ./parquet-tools/target/parquet-tools-1.10.10-column-index-SNAPSHOT.jar
meta /tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet
> file:         file:/tmp/spark/parquet/decimal/part-00000-3880e69a-6dd1-4c2b-946c-e7dae047f65c-c000.snappy.parquet
> creator:      parquet-mr version 1.10.0 (build 031a6654009e3b82020012a18434c582bd74c73a)
> extra:        org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"d1","type":"decimal(9,0)","nullable":true,"metadata":{}},{"name":"d2","type":"decimal(9,2)","nullable":true,"metadata":{}},{"name":"d3","type":"decimal(18,0)","nullable":true,"metadata":{}},{"name":"d4","type":"decimal(18,4)","nullable":true,"metadata":{}},{"name":"d5","type":"decimal(38,0)","nullable":true,"metadata":{}},{"name":"d6","type":"decimal(38,18)","nullable":true,"metadata":{}}]}
> file schema:  spark_schema
> --------------------------------------------------------------------------------
> id:           REQUIRED INT64 R:0 D:0
> d1:           OPTIONAL INT32 O:DECIMAL R:0 D:1
> d2:           OPTIONAL INT32 O:DECIMAL R:0 D:1
> d3:           OPTIONAL INT64 O:DECIMAL R:0 D:1
> d4:           OPTIONAL INT64 O:DECIMAL R:0 D:1
> d5:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1
> d6:           OPTIONAL FIXED_LEN_BYTE_ARRAY O:DECIMAL R:0 D:1
> row group 1:  RC:241867 TS:15480513 OFFSET:4
> --------------------------------------------------------------------------------
> id:            INT64 SNAPPY DO:0 FPO:4 SZ:968154/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]
> d1:            INT32 SNAPPY DO:0 FPO:968158 SZ:967555/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]
> d2:            INT32 SNAPPY DO:0 FPO:1935713 SZ:967558/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0.00, max: 241866.00, num_nulls: 0]
> d3:            INT64 SNAPPY DO:0 FPO:2903271 SZ:968866/1935047/2.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 0, max: 241866, num_nulls: 0]
> d4:            INT64 SNAPPY DO:0 FPO:3872137 SZ:1247007/1935047/1.55 VC:241867
ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0.0000, max: 241866.0000, num_nulls: 0]
> d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:5119144 SZ:1266850/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0, max: 241866, num_nulls: 0]
> d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:6385994 SZ:2198910/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 0E-18, max: 241866.000000000000000000, num_nulls:
0]
> row group 2:  RC:241867 TS:15480513 OFFSET:8584904
> --------------------------------------------------------------------------------
> id:            INT64 SNAPPY DO:0 FPO:8584904 SZ:968131/1935071/2.00 VC:241867 ENC:BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]
> d1:            INT32 SNAPPY DO:0 FPO:9553035 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867, max: 483733, num_nulls: 0]
> d2:            INT32 SNAPPY DO:0 FPO:10520598 SZ:967563/967515/1.00 VC:241867 ENC:RLE,BIT_PACKED,PLAIN
ST:[min: 241867.00, max: 483733.00, num_nulls: 0]
> d3:            INT64 SNAPPY DO:0 FPO:11488161 SZ:968110/1935047/2.00 VC:241867
ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867, max: 483733, num_nulls: 0]
> d4:            INT64 SNAPPY DO:0 FPO:12456271 SZ:1247071/1935047/1.55 VC:241867
ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867.0000, max: 483733.0000, num_nulls: 0]
> d5:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:13703342 SZ:1270587/3870159/3.05
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867, max: 483733, num_nulls: 0]
> d6:            FIXED_LEN_BYTE_ARRAY SNAPPY DO:0 FPO:14973929 SZ:2197306/3870159/1.76
VC:241867 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 241867.000000000000000000, max: 483733.000000000000000000,
num_nulls: 0]{noformat}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org


Mime
View raw message