Return-Path: X-Original-To: apmail-drill-issues-archive@minotaur.apache.org Delivered-To: apmail-drill-issues-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 3276210116 for ; Sun, 4 Jan 2015 23:57:43 +0000 (UTC) Received: (qmail 9544 invoked by uid 500); 4 Jan 2015 23:57:38 -0000 Delivered-To: apmail-drill-issues-archive@drill.apache.org Received: (qmail 9515 invoked by uid 500); 4 Jan 2015 23:57:38 -0000 Mailing-List: contact issues-help@drill.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@drill.apache.org Delivered-To: mailing list issues@drill.apache.org Received: (qmail 9505 invoked by uid 99); 4 Jan 2015 23:57:38 -0000 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 04 Jan 2015 23:57:38 +0000 Date: Sun, 4 Jan 2015 23:57:38 +0000 (UTC) From: "Jacques Nadeau (JIRA)" To: issues@drill.apache.org Message-ID: In-Reply-To: References: Subject: [jira] [Updated] (DRILL-1861) Optimizer selects hash aggregate even when we have a sorted dataset MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394 [ https://issues.apache.org/jira/browse/DRILL-1861?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Jacques Nadeau updated DRILL-1861: ---------------------------------- Fix Version/s: 0.9.0 Assignee: Jinfeng Ni > Optimizer selects hash aggregate even when we have a sorted dataset > ------------------------------------------------------------------- > > Key: DRILL-1861 > URL: https://issues.apache.org/jira/browse/DRILL-1861 > Project: Apache Drill > Issue Type: Improvement > Components: Execution - Operators, Query Planning & Optimization > Reporter: Rahul Challapalli > Assignee: Jinfeng Ni > Fix For: 0.9.0 > > Attachments: wide-strings.parquet > > > git.commit.id.abbrev=142e577 > Query : > {code} > select max(length(sub.str_var)) from ( select str_var, tinyint_var from `wide-strings` order by tinyint_var) sub group by sub.tinyint_var; > {code} > Plan : > {code} > +------------+------------+ > | text | json | > +------------+------------+ > | 00-00 Screen > 00-01 Project(EXPR$0=[$1]) > 00-02 HashAgg(group=[{0}], EXPR$0=[MAX($1)]) > 00-03 Project(tinyint_var=[$1], $f1=[LENGTH($0)]) > 00-04 SelectionVectorRemover > 00-05 Sort(sort0=[$1], dir0=[ASC]) > 00-06 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings]], selectionRoot=/drill/testdata/data-shapes/wide-columns/flat/wide-strings, numFiles=1, columns=[`str_var`, `tinyint_var`]]]) > | { > "head" : { > "version" : 1, > "generator" : { > "type" : "ExplainHandler", > "info" : "" > }, > "type" : "APACHE_DRILL_PHYSICAL", > "options" : [ ], > "queue" : 0, > "resultMode" : "EXEC" > }, > "graph" : [ { > "pop" : "parquet-scan", > "@id" : 6, > "entries" : [ { > "path" : "maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings" > } ], > "storage" : { > "type" : "file", > "enabled" : true, > "connection" : "maprfs:///", > "workspaces" : { > "root" : { > "location" : "/", > "writable" : false, > "defaultInputFormat" : null > }, > "tmp" : { > "location" : "/tmp", > "writable" : true, > "defaultInputFormat" : "csv" > }, > "drillTestDir" : { > "location" : "/drill/testdata/", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirAmplab" : { > "location" : "/drill/testdata/amplab", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirInformationSchema" : { > "location" : "/drill/testdata/information-schema", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirUdfs" : { > "location" : "/drill/testdata/udfs/", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirP1" : { > "location" : "/drill/testdata/p1tests", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirTpch10Parquet" : { > "location" : "/drill/testdata/tpch10", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "Join" : { > "location" : "/drill/testdata/join", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "NoExtJson" : { > "location" : "/drill/testdata/no-extension/json", > "writable" : true, > "defaultInputFormat" : "json" > }, > "NoExtParquet" : { > "location" : "/drill/testdata/no-extension/parquet", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "NoExtParquetNull" : { > "location" : "/drill/testdata/no-extension/parquet", > "writable" : true, > "defaultInputFormat" : null > }, > "NoExtText" : { > "location" : "/drill/testdata/no-extension/text", > "writable" : true, > "defaultInputFormat" : "psv" > }, > "drillTestDirExchanges" : { > "location" : "/drill/testdata/exchanges_test", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "TpcHMulti" : { > "location" : "/drill/testdata/tpch-multi", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "TpcHMulti100" : { > "location" : "/drill/testdata/SF100", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "TpcHMulti1" : { > "location" : "/drill/testdata/tpch_SF1", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirExplicit" : { > "location" : "/drill/testdata/explicit_cast", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirImplicit" : { > "location" : "/drill/testdata/implicit_cast", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirImplicit1" : { > "location" : "/drill/testdata/implicit_cast", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirTPCDS" : { > "location" : "/user/root/tpcds/parquet", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "TPCDS" : { > "location" : "/drill/testdata/tpcds", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillMondrian" : { > "location" : "/user/root/mondrian", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirDatetime" : { > "location" : "/drill/testdata/datetime/datasources", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirViews" : { > "location" : "/drill/testdata/views/", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirNumerical" : { > "location" : "/drill/testdata/numerical/", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "drillTestDirJson" : { > "location" : "/drill/testdata/json_storage/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTestNewWS" : { > "location" : "/drill/testdata/newWS/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpch01Text" : { > "location" : "/drill/testdata/Tpch0.01/text/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpch01Json" : { > "location" : "/drill/testdata/Tpch0.01/json/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpch01Parquet" : { > "location" : "/drill/testdata/Tpch0.01/parquet/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirConvert" : { > "location" : "/drill/testdata/convert", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpch100Text" : { > "location" : "/drill/testdata/tpch100/text/", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpch100Parquet" : { > "location" : "/drill/testdata/tpch100/parquet", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirAggregate1parquet" : { > "location" : "/drill/testdata/tpcds/parquet/s1", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirAggregate1csv" : { > "location" : "/drill/testdata/tpcds/csv/s1", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirAggregate1json" : { > "location" : "/drill/testdata/tpcds/json/s1", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirMondrian" : { > "location" : "/drill/testdata/mondrian", > "writable" : true, > "defaultInputFormat" : null > }, > "drillTestDirTpcdsImpalaSF1" : { > "location" : "/drill/testdata/tpcds-impala-sf1", > "writable" : true, > "defaultInputFormat" : null > }, > "sandbox" : { > "location" : "/sandbox", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "sandbox-logs" : { > "location" : "/sandbox/flat", > "writable" : true, > "defaultInputFormat" : "parquet" > }, > "sandbox-json" : { > "location" : "/sandbox/json", > "writable" : true, > "defaultInputFormat" : "parquet" > } > }, > "formats" : { > "psv" : { > "type" : "text", > "extensions" : [ "tbl" ], > "delimiter" : "|" > }, > "dsv" : { > "type" : "text", > "extensions" : [ "dat" ], > "delimiter" : "|" > }, > "csv" : { > "type" : "text", > "extensions" : [ "csv" ], > "delimiter" : "," > }, > "tsv" : { > "type" : "text", > "extensions" : [ "tsv" ], > "delimiter" : "\t" > }, > "parquet" : { > "type" : "parquet" > }, > "json" : { > "type" : "json" > } > } > }, > "format" : { > "type" : "parquet" > }, > "columns" : [ "`str_var`", "`tinyint_var`" ], > "selectionRoot" : "/drill/testdata/data-shapes/wide-columns/flat/wide-strings", > "cost" : 50.0 > }, { > "pop" : "external-sort", > "@id" : 5, > "child" : 6, > "orderings" : [ { > "order" : "ASC", > "expr" : "`tinyint_var`", > "nullDirection" : "UNSPECIFIED" > } ], > "reverse" : false, > "initialAllocation" : 20000000, > "maxAllocation" : 10000000000, > "cost" : 50.0 > }, { > "pop" : "selection-vector-remover", > "@id" : 4, > "child" : 5, > "initialAllocation" : 1000000, > "maxAllocation" : 10000000000, > "cost" : 50.0 > }, { > "pop" : "project", > "@id" : 3, > "exprs" : [ { > "ref" : "`tinyint_var`", > "expr" : "`tinyint_var`" > }, { > "ref" : "`$f1`", > "expr" : "length(`str_var`) " > } ], > "child" : 4, > "initialAllocation" : 1000000, > "maxAllocation" : 10000000000, > "cost" : 50.0 > }, { > "pop" : "hash-aggregate", > "@id" : 2, > "child" : 3, > "cardinality" : 1.0, > "initialAllocation" : 1000000, > "maxAllocation" : 10000000000, > "groupByExprs" : [ { > "ref" : "`tinyint_var`", > "expr" : "`tinyint_var`" > } ], > "aggrExprs" : [ { > "ref" : "`EXPR$0`", > "expr" : "max(`$f1`) " > } ], > "cost" : 25.0 > }, { > "pop" : "project", > "@id" : 1, > "ex | > +------------+------------+ > 1 row selected (0.087 seconds) > {code} > The sub-query actually does an order by. So when we do a group by, it makes more sense cost-wise to use the streaming aggregate instead of hash-aggregate. > I attached the parquet file used. Let me know if you have any questions. -- This message was sent by Atlassian JIRA (v6.3.4#6332)