asterixdb-notifications mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Wenhai (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (ASTERIXDB-2116) Very different efficiency behavior on Nested Schema.
Date Fri, 29 Sep 2017 08:56:01 GMT

     [ https://issues.apache.org/jira/browse/ASTERIXDB-2116?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]

Wenhai updated ASTERIXDB-2116:
------------------------------
    Description: 
When we apply several modified TPCH queries to the nested schema of AsterixDB, we get strange
results regarding very similar queries. One can check the associated figures with the arrows
denoting the runtime of the following queries.
Schema:
{\noformat}
use dataverse tpch;

drop dataset NestedPartsupp if exists;
drop dataset NestedPart if exists;

drop type NestedPartType if exists;
drop type NestedPartType_ps if exists;
drop type NestedPartsuppType if exists;
drop type NestedPartsuppType_l if exists;
drop type NestedLineItemType if exists;

create type NestedLineItemType as closed {
  l_orderkey: int32,
  l_partkey: int32,
  l_suppkey: int32,
  l_linenumber: int32,
  l_quantity: int32,
  l_extendedprice: double,
  l_discount: double,
  l_tax: double,
  l_returnflag: string,
  l_linestatus: string,
  l_shipdate: string,
  l_commitdate: string,
  l_receiptdate: string,
  l_shipinstruct: string,
  l_shipmode: string,
  l_comment: string
}

create type NestedPartsuppType as closed {
  ps_partkey: int32,
  ps_suppkey: int32,
  ps_availqty: int32,
  ps_supplycost: double,
  ps_comment: string,
  l: [NestedLineItemType]
}

create type NestedPartType as open {
  p_partkey: int32,
  p_name: string,
  p_mfgr: string,
  p_brand: string,
  p_type: string,
  p_size: int32,
  p_container: string,
  p_retailprice: double,
  p_comment: string,
  ps: [NestedPartsuppType]?
}

create dataset NestedPart(NestedPartType) primary key p_partkey;

{\noformat}
Queries:
{\noformat}
MQ06
use dataverse tpch;

let $revenue := sum(
  for $t in dataset NestedPart
  for $s in $t.PartsuppList
  for $e in $s.LineitemList
  where $e.l_shipdate >= '1992-01-01'
    and $e.l_shipdate < '1994-01-01'
    and $e.l_discount >= 0.0
    and $e.l_discount <= 0.04
    and $e.l_quantity < 40
  return $e.l_extendedprice * $e.l_discount
)
return {
  "revenue": $revenue
}

MQ14
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where $l.l_shipdate >= '1993-05-01'
and $l.l_shipdate < '1994-01-01'
return {
     "pt": $p.p_type,
     "ep": $l.l_extendedprice,
     "dc": $l.l_discount
  }
)

MQ19
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where $p.p_brand <= 'Brand#50'
and (starts-with($p.p_container, "SM") or starts-with($p.p_container, "MED") or starts-with($p.p_container,
"LG") or starts-with($p.p_container, "WRAP"))
and $p.p_size <= 50
and $l.l_quantity >= 0
and $l.l_quantity <= 50
and (starts-with($l.l_shipmode, "TRUCK") or starts-with($l.l_shipmode, "AIR") or starts-with($l.l_shipmode,
"SHIP") or starts-with($l.l_shipmode, "FOB"))
and $l.l_shipinstruct = 'DELIVER IN PERSON'
return {
     "ep": $l.l_extendedprice,
     "dc": $l.l_discount
  }
)

MQ20
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where (contains($p.p_name, "green") or contains($p.p_name, "lemon") or contains($p.p_name,
"red"))
and $l.l_shipdate >= "1992-01-01"
and $l.l_shipdate < "1998-01-01"
return {
"sk":$ps.ps_suppkey,
"qt":$ps.ps_availqty
}
)
{\noformat}
nestedeff.jpg|thumbnail!

  was:
When we apply several modified TPCH queries to the nested schema of AsterixDB, we get strange
results regarding very similar queries. One can check the associated figures with the arrows
denoting the runtime of the following queries.
Schema:
{\noformat}
use dataverse tpch;

drop dataset NestedPartsupp if exists;
drop dataset NestedPart if exists;

drop type NestedPartType if exists;
drop type NestedPartType_ps if exists;
drop type NestedPartsuppType if exists;
drop type NestedPartsuppType_l if exists;
drop type NestedLineItemType if exists;

create type NestedLineItemType as closed {
  l_orderkey: int32,
  l_partkey: int32,
  l_suppkey: int32,
  l_linenumber: int32,
  l_quantity: int32,
  l_extendedprice: double,
  l_discount: double,
  l_tax: double,
  l_returnflag: string,
  l_linestatus: string,
  l_shipdate: string,
  l_commitdate: string,
  l_receiptdate: string,
  l_shipinstruct: string,
  l_shipmode: string,
  l_comment: string
}

create type NestedPartsuppType as closed {
  ps_partkey: int32,
  ps_suppkey: int32,
  ps_availqty: int32,
  ps_supplycost: double,
  ps_comment: string,
  l: [NestedLineItemType]
}

create type NestedPartType as open {
  p_partkey: int32,
  p_name: string,
  p_mfgr: string,
  p_brand: string,
  p_type: string,
  p_size: int32,
  p_container: string,
  p_retailprice: double,
  p_comment: string,
  ps: [NestedPartsuppType]?
}

create dataset NestedPart(NestedPartType) primary key p_partkey;

{\noformat}
Queries:
{\noformat}
MQ06
use dataverse tpch;

let $revenue := sum(
  for $t in dataset NestedPart
  for $s in $t.PartsuppList
  for $e in $s.LineitemList
  where $e.l_shipdate >= '1992-01-01'
    and $e.l_shipdate < '1994-01-01'
    and $e.l_discount >= 0.0
    and $e.l_discount <= 0.04
    and $e.l_quantity < 40
  return $e.l_extendedprice * $e.l_discount
)
return {
  "revenue": $revenue
}

MQ14
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where $l.l_shipdate >= '1993-05-01'
and $l.l_shipdate < '1994-01-01'
return {
     "pt": $p.p_type,
     "ep": $l.l_extendedprice,
     "dc": $l.l_discount
  }
)

MQ19
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where $p.p_brand <= 'Brand#50'
and (starts-with($p.p_container, "SM") or starts-with($p.p_container, "MED") or starts-with($p.p_container,
"LG") or starts-with($p.p_container, "WRAP"))
and $p.p_size <= 50
and $l.l_quantity >= 0
and $l.l_quantity <= 50
and (starts-with($l.l_shipmode, "TRUCK") or starts-with($l.l_shipmode, "AIR") or starts-with($l.l_shipmode,
"SHIP") or starts-with($l.l_shipmode, "FOB"))
and $l.l_shipinstruct = 'DELIVER IN PERSON'
return {
     "ep": $l.l_extendedprice,
     "dc": $l.l_discount
  }
)

MQ20
use dataverse tpch;
count(
  for $p in dataset NestedPart
  for $ps in $p.PartsuppList
  for $l in $ps.LineitemList
where (contains($p.p_name, "green") or contains($p.p_name, "lemon") or contains($p.p_name,
"red"))
and $l.l_shipdate >= "1992-01-01"
and $l.l_shipdate < "1998-01-01"
return {
"sk":$ps.ps_suppkey,
"qt":$ps.ps_availqty
}
)
{\noformat}
!attachment-name.jpg|thumbnail!


> Very different efficiency behavior on Nested Schema.
> ----------------------------------------------------
>
>                 Key: ASTERIXDB-2116
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-2116
>             Project: Apache AsterixDB
>          Issue Type: Improvement
>          Components: STO - Storage
>         Environment: 8 nodes with 1/4 NCs, 128GB memory in each node, Centos
>            Reporter: Wenhai
>            Assignee: Pouria
>         Attachments: nestedeff.jpg
>
>
> When we apply several modified TPCH queries to the nested schema of AsterixDB, we get
strange results regarding very similar queries. One can check the associated figures with
the arrows denoting the runtime of the following queries.
> Schema:
> {\noformat}
> use dataverse tpch;
> drop dataset NestedPartsupp if exists;
> drop dataset NestedPart if exists;
> drop type NestedPartType if exists;
> drop type NestedPartType_ps if exists;
> drop type NestedPartsuppType if exists;
> drop type NestedPartsuppType_l if exists;
> drop type NestedLineItemType if exists;
> create type NestedLineItemType as closed {
>   l_orderkey: int32,
>   l_partkey: int32,
>   l_suppkey: int32,
>   l_linenumber: int32,
>   l_quantity: int32,
>   l_extendedprice: double,
>   l_discount: double,
>   l_tax: double,
>   l_returnflag: string,
>   l_linestatus: string,
>   l_shipdate: string,
>   l_commitdate: string,
>   l_receiptdate: string,
>   l_shipinstruct: string,
>   l_shipmode: string,
>   l_comment: string
> }
> create type NestedPartsuppType as closed {
>   ps_partkey: int32,
>   ps_suppkey: int32,
>   ps_availqty: int32,
>   ps_supplycost: double,
>   ps_comment: string,
>   l: [NestedLineItemType]
> }
> create type NestedPartType as open {
>   p_partkey: int32,
>   p_name: string,
>   p_mfgr: string,
>   p_brand: string,
>   p_type: string,
>   p_size: int32,
>   p_container: string,
>   p_retailprice: double,
>   p_comment: string,
>   ps: [NestedPartsuppType]?
> }
> create dataset NestedPart(NestedPartType) primary key p_partkey;
> {\noformat}
> Queries:
> {\noformat}
> MQ06
> use dataverse tpch;
> let $revenue := sum(
>   for $t in dataset NestedPart
>   for $s in $t.PartsuppList
>   for $e in $s.LineitemList
>   where $e.l_shipdate >= '1992-01-01'
>     and $e.l_shipdate < '1994-01-01'
>     and $e.l_discount >= 0.0
>     and $e.l_discount <= 0.04
>     and $e.l_quantity < 40
>   return $e.l_extendedprice * $e.l_discount
> )
> return {
>   "revenue": $revenue
> }
> MQ14
> use dataverse tpch;
> count(
>   for $p in dataset NestedPart
>   for $ps in $p.PartsuppList
>   for $l in $ps.LineitemList
> where $l.l_shipdate >= '1993-05-01'
> and $l.l_shipdate < '1994-01-01'
> return {
>      "pt": $p.p_type,
>      "ep": $l.l_extendedprice,
>      "dc": $l.l_discount
>   }
> )
> MQ19
> use dataverse tpch;
> count(
>   for $p in dataset NestedPart
>   for $ps in $p.PartsuppList
>   for $l in $ps.LineitemList
> where $p.p_brand <= 'Brand#50'
> and (starts-with($p.p_container, "SM") or starts-with($p.p_container, "MED") or starts-with($p.p_container,
"LG") or starts-with($p.p_container, "WRAP"))
> and $p.p_size <= 50
> and $l.l_quantity >= 0
> and $l.l_quantity <= 50
> and (starts-with($l.l_shipmode, "TRUCK") or starts-with($l.l_shipmode, "AIR") or starts-with($l.l_shipmode,
"SHIP") or starts-with($l.l_shipmode, "FOB"))
> and $l.l_shipinstruct = 'DELIVER IN PERSON'
> return {
>      "ep": $l.l_extendedprice,
>      "dc": $l.l_discount
>   }
> )
> MQ20
> use dataverse tpch;
> count(
>   for $p in dataset NestedPart
>   for $ps in $p.PartsuppList
>   for $l in $ps.LineitemList
> where (contains($p.p_name, "green") or contains($p.p_name, "lemon") or contains($p.p_name,
"red"))
> and $l.l_shipdate >= "1992-01-01"
> and $l.l_shipdate < "1998-01-01"
> return {
> "sk":$ps.ps_suppkey,
> "qt":$ps.ps_availqty
> }
> )
> {\noformat}
> nestedeff.jpg|thumbnail!



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message