impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tarmstr...@apache.org
Subject [44/50] incubator-impala git commit: IMPALA-3148. Fix selectivity computation for pushed Kudu predicates
Date Tue, 12 Apr 2016 21:19:18 GMT
IMPALA-3148. Fix selectivity computation for pushed Kudu predicates

This follows up on a TODO from the Kudu merge and also fixes a bug:
IMPALA-976 changed the computation of selectivities for a combined
list of conjuncts to better handle expressions with no selectivity
estimate. The Kudu implementation was forked from before this change
and thus did not have an equivalent change.

This refactors the algorithm to a new static method and calls it from
both PlanNode and KuduScanNode so that the selectivity estimate
behavior is the same regardless of whether Kudu can evaluate the
predicate server-side.

Todd tested this on TPCH 3TB and verified that the plans are reasonable
now where they used to be nonsense.

Change-Id: Id507077b577ed5804fc80517f33ea185f2bff41a
Reviewed-on: http://gerrit.cloudera.org:8080/2628
Reviewed-by: Casey Ching <casey@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4bdd0b97
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4bdd0b97
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4bdd0b97

Branch: refs/heads/master
Commit: 4bdd0b976dc80f64ebd1f2c37ad03b4ec661cf02
Parents: 86fd262
Author: Todd Lipcon <todd@cloudera.com>
Authored: Thu Mar 24 22:53:33 2016 -0700
Committer: Tim Armstrong <tarmstrong@cloudera.com>
Committed: Tue Apr 12 14:03:44 2016 -0700

----------------------------------------------------------------------
 .../cloudera/impala/planner/KuduScanNode.java   | 18 ++---
 .../com/cloudera/impala/planner/PlanNode.java   | 10 ++-
 .../queries/PlannerTest/kudu-selectivity.test   | 71 +++++++++++++++++++-
 3 files changed, 80 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java
index 558c282..ee5c318 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/KuduScanNode.java
@@ -29,6 +29,7 @@ import com.cloudera.impala.common.ImpalaRuntimeException;
 import com.google.common.base.Charsets;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
 import org.kududb.client.KuduClient;
 import org.kududb.client.LocatedTablet;
 import org.slf4j.Logger;
@@ -151,20 +152,11 @@ public class KuduScanNode extends ScanNode {
     }
   }
 
-  /**
-   * TODO(kudu-merge): IMPALA-3148 - Update selectivity computation.
-   */
   @Override
   protected double computeSelectivity() {
-    double baseSelectivity = super.computeSelectivity();
-    // The kuduConjuncts_ are not part of the selectivity calculation of the PlanNode
-    // superclass. Adjust the selectivity to account for predicates that are
-    // pushed down to Kudu.
-    for (Expr e : kuduConjuncts_) {
-      if (baseSelectivity < 0) continue;
-      baseSelectivity *= e.getSelectivity();
-    }
-    return baseSelectivity;
+    List<Expr> allConjuncts = Lists.newArrayList(
+        Iterables.concat(conjuncts_, kuduConjuncts_));
+    return computeCombinedSelectivity(allConjuncts);
   }
 
   @Override
@@ -201,7 +193,7 @@ public class KuduScanNode extends ScanNode {
         }
         if (!kuduConjuncts_.isEmpty()) {
           result.append(detailPrefix + "kudu predicates: " + getExplainString(
-              kuduConjuncts_));
+              kuduConjuncts_) + "\n");
         }
       }
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java b/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java
index 1169190..643ca01 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/PlanNode.java
@@ -491,13 +491,13 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
    * The second issue is addressed by an exponential backoff when multiplying each
    * additional selectivity into the final result.
    */
-  protected double computeSelectivity() {
+  static protected double computeCombinedSelectivity(List<Expr> conjuncts) {
     // Collect all estimated selectivities.
     List<Double> selectivities = Lists.newArrayList();
-    for (Expr e: conjuncts_) {
+    for (Expr e: conjuncts) {
       if (e.hasSelectivity()) selectivities.add(e.getSelectivity());
     }
-    if (selectivities.size() != conjuncts_.size()) {
+    if (selectivities.size() != conjuncts.size()) {
       // Some conjuncts have no estimated selectivity. Use a single default
       // representative selectivity for all those conjuncts.
       selectivities.add(Expr.DEFAULT_SELECTIVITY);
@@ -515,6 +515,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
     return Math.max(0.0, Math.min(1.0, result));
   }
 
+  protected double computeSelectivity() {
+    return computeCombinedSelectivity(conjuncts_);
+  }
+
   // Convert this plan node into msg (excluding children), which requires setting
   // the node type and the node-specific field.
   protected abstract void toThrift(TPlanNode msg);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4bdd0b97/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
index 0062fd5..fe1cb29 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
@@ -2,7 +2,30 @@ select * from functional_kudu.zipcode_incomes where id = '8600000US00601'
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED]
   00:SCAN KUDU [functional_kudu.zipcode_incomes]
-     kudu predicates: id = '8600000US00601'     hosts=3 per-host-mem=unavailable
+     kudu predicates: id = '8600000US00601'
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=1
+---- DISTRIBUTEDPLAN
+F01:PLAN FRAGMENT [UNPARTITIONED]
+  01:EXCHANGE [UNPARTITIONED]
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=1
+
+F00:PLAN FRAGMENT [RANDOM]
+  DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     kudu predicates: id = '8600000US00601'
+     hosts=3 per-host-mem=0B
+     tuple-ids=0 row-size=124B cardinality=1
+====
+# The cardinality from "zip = '2'" should dominate.
+select * from functional_kudu.zipcode_incomes where id != '1' and zip = '2'
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     predicates: id != '1'
+     kudu predicates: zip = '2'
+     hosts=3 per-host-mem=unavailable
      tuple-ids=0 row-size=124B cardinality=1
 ---- DISTRIBUTEDPLAN
 F01:PLAN FRAGMENT [UNPARTITIONED]
@@ -13,6 +36,48 @@ F01:PLAN FRAGMENT [UNPARTITIONED]
 F00:PLAN FRAGMENT [RANDOM]
   DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED]
   00:SCAN KUDU [functional_kudu.zipcode_incomes]
-     kudu predicates: id = '8600000US00601'     hosts=3 per-host-mem=0B
+     predicates: id != '1'
+     kudu predicates: zip = '2'
+     hosts=3 per-host-mem=0B
      tuple-ids=0 row-size=124B cardinality=1
-====
\ No newline at end of file
+====
+select * from functional_kudu.zipcode_incomes where id > '1' and zip > '2'
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     predicates: id > '1', zip > '2'
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=3317
+---- DISTRIBUTEDPLAN
+F01:PLAN FRAGMENT [UNPARTITIONED]
+  01:EXCHANGE [UNPARTITIONED]
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=3317
+
+F00:PLAN FRAGMENT [RANDOM]
+  DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     predicates: id > '1', zip > '2'
+     hosts=3 per-host-mem=0B
+     tuple-ids=0 row-size=124B cardinality=3317
+====
+select * from functional_kudu.zipcode_incomes where id = '1' or id = '2'
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     predicates: id = '1' OR id = '2'
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=2
+---- DISTRIBUTEDPLAN
+F01:PLAN FRAGMENT [UNPARTITIONED]
+  01:EXCHANGE [UNPARTITIONED]
+     hosts=3 per-host-mem=unavailable
+     tuple-ids=0 row-size=124B cardinality=2
+
+F00:PLAN FRAGMENT [RANDOM]
+  DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, UNPARTITIONED]
+  00:SCAN KUDU [functional_kudu.zipcode_incomes]
+     predicates: id = '1' OR id = '2'
+     hosts=3 per-host-mem=0B
+     tuple-ids=0 row-size=124B cardinality=2
+====


Mime
View raw message