quickstep-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hbdeshm...@apache.org
Subject [1/3] incubator-quickstep git commit: Improve StarSchemaSimpleCostModel to provide closer estimation of number of groups for hash aggregation. [Forced Update!]
Date Fri, 07 Oct 2016 02:49:07 GMT
Repository: incubator-quickstep
Updated Branches:
  refs/heads/fix-fasthash-destructor a0405edbe -> f48f9dc87 (forced update)


Improve StarSchemaSimpleCostModel to provide closer estimation of number of groups for hash
aggregation.


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/ad1f8c40
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/ad1f8c40
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/ad1f8c40

Branch: refs/heads/fix-fasthash-destructor
Commit: ad1f8c406954778d332f02b680e08e0118727436
Parents: 4126c4f
Author: Jianqiao Zhu <jianqiao@cs.wisc.edu>
Authored: Mon Oct 3 23:58:30 2016 -0500
Committer: Jianqiao Zhu <jianqiao@cs.wisc.edu>
Committed: Tue Oct 4 17:06:04 2016 -0500

----------------------------------------------------------------------
 query_optimizer/CMakeLists.txt                  |   2 +-
 query_optimizer/ExecutionGenerator.cpp          |   8 +-
 query_optimizer/ExecutionGenerator.hpp          |   1 -
 query_optimizer/cost_model/CMakeLists.txt       |   3 +
 query_optimizer/cost_model/CostModel.hpp        |  11 +
 .../cost_model/StarSchemaSimpleCostModel.cpp    | 243 ++++++++++++++-----
 .../cost_model/StarSchemaSimpleCostModel.hpp    |  70 ++++--
 query_optimizer/expressions/ExpressionUtil.hpp  |  19 ++
 .../StarSchemaHashJoinOrderOptimization.hpp     |   2 +-
 9 files changed, 280 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/query_optimizer/CMakeLists.txt b/query_optimizer/CMakeLists.txt
index 32f7885..988ffd8 100644
--- a/query_optimizer/CMakeLists.txt
+++ b/query_optimizer/CMakeLists.txt
@@ -78,7 +78,7 @@ target_link_libraries(quickstep_queryoptimizer_ExecutionGenerator
                       quickstep_queryoptimizer_QueryHandle
                       quickstep_queryoptimizer_QueryPlan
                       quickstep_queryoptimizer_costmodel_CostModel
-                      quickstep_queryoptimizer_costmodel_SimpleCostModel
+                      quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostModel
                       quickstep_queryoptimizer_expressions_AggregateFunction
                       quickstep_queryoptimizer_expressions_Alias
                       quickstep_queryoptimizer_expressions_AttributeReference

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/ExecutionGenerator.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/ExecutionGenerator.cpp b/query_optimizer/ExecutionGenerator.cpp
index 968314e..9347c9c 100644
--- a/query_optimizer/ExecutionGenerator.cpp
+++ b/query_optimizer/ExecutionGenerator.cpp
@@ -58,7 +58,7 @@
 #include "query_optimizer/OptimizerContext.hpp"
 #include "query_optimizer/QueryHandle.hpp"
 #include "query_optimizer/QueryPlan.hpp"
-#include "query_optimizer/cost_model/SimpleCostModel.hpp"
+#include "query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp"
 #include "query_optimizer/expressions/AggregateFunction.hpp"
 #include "query_optimizer/expressions/Alias.hpp"
 #include "query_optimizer/expressions/AttributeReference.hpp"
@@ -167,7 +167,7 @@ void ExecutionGenerator::generatePlan(const P::PhysicalPtr &physical_plan)
{
       << "The physical plan must be rooted by a TopLevelPlan";
 
   cost_model_.reset(
-      new cost::SimpleCostModel(top_level_physical_plan_->shared_subplans()));
+      new cost::StarSchemaSimpleCostModel(top_level_physical_plan_->shared_subplans()));
 
   const CatalogRelation *result_relation = nullptr;
 
@@ -1411,7 +1411,9 @@ void ExecutionGenerator::convertAggregate(
     aggr_state_proto->mutable_predicate()->CopyFrom(predicate->getProto());
   }
 
-  aggr_state_proto->set_estimated_num_entries(cost_model_->estimateCardinality(physical_plan));
+  const std::size_t estimated_num_groups =
+      cost_model_->estimateNumGroupsForAggregate(physical_plan);
+  aggr_state_proto->set_estimated_num_entries(std::max(16uL, estimated_num_groups));
 
   const QueryPlan::DAGNodeIndex aggregation_operator_index =
       execution_plan_->addRelationalOperator(

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/ExecutionGenerator.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/ExecutionGenerator.hpp b/query_optimizer/ExecutionGenerator.hpp
index 6017aa5..2aaf5ab 100644
--- a/query_optimizer/ExecutionGenerator.hpp
+++ b/query_optimizer/ExecutionGenerator.hpp
@@ -37,7 +37,6 @@
 #include "query_optimizer/QueryHandle.hpp"
 #include "query_optimizer/QueryPlan.hpp"
 #include "query_optimizer/cost_model/CostModel.hpp"
-#include "query_optimizer/cost_model/SimpleCostModel.hpp"
 #include "query_optimizer/expressions/ExprId.hpp"
 #include "query_optimizer/expressions/NamedExpression.hpp"
 #include "query_optimizer/expressions/Predicate.hpp"

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/CMakeLists.txt b/query_optimizer/cost_model/CMakeLists.txt
index abbc3da..ba99de3 100644
--- a/query_optimizer/cost_model/CMakeLists.txt
+++ b/query_optimizer/cost_model/CMakeLists.txt
@@ -24,6 +24,7 @@ add_library(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostModel
 
 # Link dependencies:
 target_link_libraries(quickstep_queryoptimizer_costmodel_CostModel
+                      quickstep_queryoptimizer_physical_Aggregate
                       quickstep_queryoptimizer_physical_Physical
                       quickstep_utility_Macros)
 target_link_libraries(quickstep_queryoptimizer_costmodel_SimpleCostModel
@@ -50,6 +51,7 @@ target_link_libraries(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostMod
                       quickstep_queryoptimizer_expressions_ComparisonExpression
                       quickstep_queryoptimizer_expressions_ExprId
                       quickstep_queryoptimizer_expressions_ExpressionType
+                      quickstep_queryoptimizer_expressions_ExpressionUtil
                       quickstep_queryoptimizer_expressions_LogicalAnd
                       quickstep_queryoptimizer_expressions_LogicalOr
                       quickstep_queryoptimizer_expressions_PatternMatcher
@@ -57,6 +59,7 @@ target_link_libraries(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostMod
                       quickstep_queryoptimizer_physical_Aggregate
                       quickstep_queryoptimizer_physical_HashJoin
                       quickstep_queryoptimizer_physical_NestedLoopsJoin
+                      quickstep_queryoptimizer_physical_PatternMatcher
                       quickstep_queryoptimizer_physical_Physical
                       quickstep_queryoptimizer_physical_PhysicalType
                       quickstep_queryoptimizer_physical_Selection

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/CostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/CostModel.hpp b/query_optimizer/cost_model/CostModel.hpp
index ca49733..20bc208 100644
--- a/query_optimizer/cost_model/CostModel.hpp
+++ b/query_optimizer/cost_model/CostModel.hpp
@@ -22,6 +22,7 @@
 
 #include <cstddef>
 
+#include "query_optimizer/physical/Aggregate.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "utility/Macros.hpp"
 
@@ -53,6 +54,16 @@ class CostModel {
   virtual std::size_t estimateCardinality(
       const physical::PhysicalPtr &physical_plan) = 0;
 
+  /**
+   * @brief Estimate the number of groups of an aggregation.
+   *
+   * @param aggregate The physical plan of the aggregation.
+   * @return The estimated number of groups.
+   */
+  virtual std::size_t estimateNumGroupsForAggregate(const physical::AggregatePtr &aggregate)
{
+    return estimateCardinality(aggregate);
+  }
+
  protected:
   /**
    * @brief Constructor.

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
index 911a765..8d254fa 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
@@ -29,6 +29,7 @@
 #include "query_optimizer/expressions/ComparisonExpression.hpp"
 #include "query_optimizer/expressions/ExprId.hpp"
 #include "query_optimizer/expressions/ExpressionType.hpp"
+#include "query_optimizer/expressions/ExpressionUtil.hpp"
 #include "query_optimizer/expressions/LogicalAnd.hpp"
 #include "query_optimizer/expressions/LogicalOr.hpp"
 #include "query_optimizer/expressions/Predicate.hpp"
@@ -36,6 +37,7 @@
 #include "query_optimizer/physical/Aggregate.hpp"
 #include "query_optimizer/physical/NestedLoopsJoin.hpp"
 #include "query_optimizer/physical/HashJoin.hpp"
+#include "query_optimizer/physical/PatternMatcher.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "query_optimizer/physical/PhysicalType.hpp"
 #include "query_optimizer/physical/Selection.hpp"
@@ -85,8 +87,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinality(
           shared_subplans_[shared_subplan_reference->subplan_id()]);
     }
     case P::PhysicalType::kSort:
-      return estimateCardinality(
-          std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+      return estimateCardinalityForSort(
+          std::static_pointer_cast<const P::Sort>(physical_plan));
     case P::PhysicalType::kWindowAggregate:
       return estimateCardinalityForWindowAggregate(
           std::static_pointer_cast<const P::WindowAggregate>(physical_plan));
@@ -111,9 +113,17 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableReference(
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection(
     const P::SelectionPtr &physical_plan) {
-  double selectivity = estimateSelectivityForSelection(physical_plan);
-  return std::max(static_cast<std::size_t>(estimateCardinality(physical_plan->input())
* selectivity),
-                  static_cast<std::size_t>(1));
+  double selectivity =
+      estimateSelectivityForPredicate(physical_plan->filter_predicate(), physical_plan);
+  return static_cast<std::size_t>(
+      estimateCardinality(physical_plan->input()) * selectivity);
+}
+
+std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSort(
+    const physical::SortPtr &physical_plan) {
+  std::size_t cardinality = estimateCardinality(
+      std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+  return std::min(cardinality, static_cast<std::size_t>(physical_plan->limit()));
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableGenerator(
@@ -127,8 +137,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForHashJoin(
   std::size_t right_cardinality = estimateCardinality(physical_plan->right());
   double left_selectivity = estimateSelectivity(physical_plan->left());
   double right_selectivity = estimateSelectivity(physical_plan->right());
-  return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity) +
1,
-                  static_cast<std::size_t>(right_cardinality * left_selectivity) +
1);
+  return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity + 0.5),
+                  static_cast<std::size_t>(right_cardinality * left_selectivity + 0.5));
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin(
@@ -139,11 +149,10 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin(
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForAggregate(
     const P::AggregatePtr &physical_plan) {
-  if (physical_plan->grouping_expressions().empty()) {
-    return 1;
-  }
-  return std::max(static_cast<std::size_t>(1),
-                  estimateCardinality(physical_plan->input()) / 10);
+  double filter_selectivity =
+      estimateSelectivityForPredicate(physical_plan->filter_predicate(), physical_plan);
+  return static_cast<std::size_t>(
+      estimateNumGroupsForAggregate(physical_plan) * filter_selectivity);
 }
 
 std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
@@ -151,24 +160,115 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
   return estimateCardinality(physical_plan->input());
 }
 
+
+std::size_t StarSchemaSimpleCostModel::estimateNumGroupsForAggregate(
+    const physical::AggregatePtr &aggregate) {
+  if (aggregate->grouping_expressions().empty()) {
+    return 1uL;
+  }
+
+  std::size_t estimated_child_cardinality = estimateCardinality(aggregate->input());
+  std::size_t estimated_num_groups = 1;
+  std::size_t max_attr_num_distinct_values = 0;
+  for (const auto &expr : aggregate->grouping_expressions()) {
+    E::AttributeReferencePtr attr;
+    if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) {
+      std::size_t attr_num_distinct_values =
+          estimateNumDistinctValues(attr->id(), aggregate->input());
+      estimated_num_groups *= std::max(1uL, attr_num_distinct_values);
+      max_attr_num_distinct_values =
+          std::max(max_attr_num_distinct_values, attr_num_distinct_values);
+    } else {
+      // TODO(jianqiao): implement estimateNumDistinctValues() for expressions.
+      estimated_num_groups *= 64uL;
+    }
+  }
+  estimated_num_groups = std::max(
+      std::min(estimated_num_groups, estimated_child_cardinality / 10),
+      max_attr_num_distinct_values);
+  return estimated_num_groups;
+}
+
+
+std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues(
+    const expressions::ExprId attribute_id,
+    const physical::PhysicalPtr &physical_plan) {
+  DCHECK(E::ContainsExprId(physical_plan->getOutputAttributes(), attribute_id));
+
+  P::TableReferencePtr table_reference;
+  if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, &table_reference))
{
+    return getNumDistinctValues(attribute_id, table_reference);
+  }
+
+  double filter_selectivity = estimateSelectivityForFilterPredicate(physical_plan);
+  switch (physical_plan->getPhysicalType()) {
+    case P::PhysicalType::kSelection:  // Fall through
+    case P::PhysicalType::kAggregate: {
+      const P::PhysicalPtr &child = physical_plan->children()[0];
+      if (E::ContainsExprId(child->getOutputAttributes(), attribute_id)) {
+        std::size_t child_num_distinct_values =
+            estimateNumDistinctValues(attribute_id, child);
+        return static_cast<std::size_t>(
+            child_num_distinct_values * filter_selectivity + 0.5);
+      }
+      break;
+    }
+    case P::PhysicalType::kHashJoin: {
+      const P::HashJoinPtr &hash_join =
+          std::static_pointer_cast<const P::HashJoin>(physical_plan);
+      if (E::ContainsExprId(hash_join->left()->getOutputAttributes(), attribute_id))
{
+        std::size_t left_child_num_distinct_values =
+            estimateNumDistinctValues(attribute_id, hash_join->left());
+        double right_child_selectivity =
+            estimateSelectivity(hash_join->right());
+        return static_cast<std::size_t>(
+            left_child_num_distinct_values * right_child_selectivity * filter_selectivity
+ 0.5);
+      }
+      if (E::ContainsExprId(hash_join->right()->getOutputAttributes(), attribute_id))
{
+        std::size_t right_child_num_distinct_values =
+            estimateNumDistinctValues(attribute_id, hash_join->right());
+        double left_child_selectivity =
+            estimateSelectivity(hash_join->left());
+        return static_cast<std::size_t>(
+            right_child_num_distinct_values * left_child_selectivity * filter_selectivity
+ 0.5);
+      }
+    }
+    default:
+      break;
+  }
+
+  return 16uL;
+}
+
 double StarSchemaSimpleCostModel::estimateSelectivity(
     const physical::PhysicalPtr &physical_plan) {
   switch (physical_plan->getPhysicalType()) {
     case P::PhysicalType::kSelection: {
-      return estimateSelectivityForSelection(
-          std::static_pointer_cast<const P::Selection>(physical_plan));
+      const P::SelectionPtr &selection =
+          std::static_pointer_cast<const P::Selection>(physical_plan);
+      double filter_selectivity =
+          estimateSelectivityForPredicate(selection->filter_predicate(), selection);
+      double child_selectivity = estimateSelectivity(selection->input());
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kHashJoin: {
       const P::HashJoinPtr &hash_join =
           std::static_pointer_cast<const P::HashJoin>(physical_plan);
-      return std::min(estimateSelectivity(hash_join->left()),
-                      estimateSelectivity(hash_join->right()));
+      double filter_selectivity =
+          estimateSelectivityForPredicate(hash_join->residual_predicate(), hash_join);
+      double child_selectivity =
+          estimateSelectivity(hash_join->left()) * estimateSelectivity(hash_join->right());
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kNestedLoopsJoin: {
       const P::NestedLoopsJoinPtr &nested_loop_join =
           std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan);
-      return std::min(estimateSelectivity(nested_loop_join->left()),
-                      estimateSelectivity(nested_loop_join->right()));
+      double filter_selectivity =
+          estimateSelectivityForPredicate(nested_loop_join->join_predicate(), nested_loop_join);
+      double child_selectivity = std::min(
+          estimateSelectivity(nested_loop_join->left()),
+          estimateSelectivity(nested_loop_join->right()));
+      return filter_selectivity * child_selectivity;
     }
     case P::PhysicalType::kSharedSubplanReference: {
       const P::SharedSubplanReferencePtr shared_subplan_reference =
@@ -177,36 +277,51 @@ double StarSchemaSimpleCostModel::estimateSelectivity(
           shared_subplans_[shared_subplan_reference->subplan_id()]);
     }
     default:
-      return 1.0;
+      break;
   }
+
+  if (physical_plan->getNumChildren() == 1) {
+    return estimateSelectivity(physical_plan->children()[0]);
+  }
+
+  return 1.0;
 }
 
-double StarSchemaSimpleCostModel::estimateSelectivityForSelection(
-    const physical::SelectionPtr &physical_plan) {
-  const E::PredicatePtr &filter_predicate = physical_plan->filter_predicate();
-
-  // If the subplan is a table reference, gather the number of distinct values
-  // statistics for each column (attribute).
-  std::unordered_map<E::ExprId, std::size_t> num_distinct_values_map;
-  if (physical_plan->input()->getPhysicalType() == P::PhysicalType::kTableReference)
{
-    const P::TableReferencePtr &table_reference =
-        std::static_pointer_cast<const P::TableReference>(physical_plan->input());
-    const CatalogRelation &relation = *table_reference->relation();
-    const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list();
-    for (std::size_t i = 0; i < attributes.size(); ++i) {
-      std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i);
-      if (num_distinct_values > 0) {
-        num_distinct_values_map[attributes[i]->id()] = num_distinct_values;
-      }
-    }
+double StarSchemaSimpleCostModel::estimateSelectivityForFilterPredicate(
+    const physical::PhysicalPtr &physical_plan) {
+  E::PredicatePtr filter_predicate = nullptr;
+  switch (physical_plan->getPhysicalType()) {
+    case P::PhysicalType::kSelection:
+      filter_predicate =
+          std::static_pointer_cast<const P::Selection>(physical_plan)->filter_predicate();
+      break;
+    case P::PhysicalType::kAggregate:
+      filter_predicate =
+          std::static_pointer_cast<const P::Aggregate>(physical_plan)->filter_predicate();
+      break;
+    case P::PhysicalType::kHashJoin:
+      filter_predicate =
+          std::static_pointer_cast<const P::HashJoin>(physical_plan)->residual_predicate();
+      break;
+    case P::PhysicalType::kNestedLoopsJoin:
+      filter_predicate =
+          std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan)->join_predicate();
+      break;
+    default:
+      break;
   }
 
-  return estimateSelectivityForPredicate(num_distinct_values_map, filter_predicate);
+  if (filter_predicate == nullptr) {
+    return 1.0;
+  } else {
+    return estimateSelectivityForPredicate(filter_predicate, physical_plan);
+  }
 }
 
+
 double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
-    const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map,
-    const expressions::PredicatePtr &filter_predicate) {
+    const expressions::PredicatePtr &filter_predicate,
+    const P::PhysicalPtr &physical_plan) {
   if (filter_predicate == nullptr) {
     return 1.0;
   }
@@ -215,10 +330,8 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
     case E::ExpressionType::kComparisonExpression: {
       // Case 1 - Number of distinct values statistics available
       //   Case 1.1 - Equality comparison: 1.0 / num_distinct_values
-      //   Case 1.2 - Otherwise: 5.0 / num_distinct_values
-      // Case 2 - Number of distinct values statistics not available
-      //   Case 2.1 - Equality comparison: 0.1
-      //   Case 2.2 - Otherwise: 0.5
+      //   Case 1.2 - Otherwise: 0.1
+      // Case 2 - Otherwise: 0.5
       const E::ComparisonExpressionPtr &comparison_expression =
           std::static_pointer_cast<const E::ComparisonExpression>(filter_predicate);
       E::AttributeReferencePtr attr;
@@ -226,25 +339,26 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
            E::SomeScalarLiteral::Matches(comparison_expression->right())) ||
           (E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(),
&attr) &&
            E::SomeScalarLiteral::Matches(comparison_expression->left()))) {
-        const auto it = num_distinct_values_map.find(attr->id());
-        if (it != num_distinct_values_map.end() && it->second > 0) {
-          double unit_selectivity = 1.0 / it->second;
-          return comparison_expression->isEqualityComparisonPredicate()
-                     ? unit_selectivity
-                     : std::min(0.5, unit_selectivity * 5.0);
+        for (const auto &child : physical_plan->children()) {
+          if (E::ContainsExprId(child->getOutputAttributes(), attr->id())) {
+            const std::size_t child_num_distinct_values = estimateNumDistinctValues(attr->id(),
child);
+            if (comparison_expression->isEqualityComparisonPredicate()) {
+              return 1.0 / child_num_distinct_values;
+            } else {
+              return 1.0 / std::max(std::min(child_num_distinct_values / 100.0, 10.0), 2.0);
+            }
+          }
         }
+        return 0.1;
       }
-
-      return comparison_expression->isEqualityComparisonPredicate() ? 0.1 : 0.5;
+      return 0.5;
     }
     case E::ExpressionType::kLogicalAnd: {
       const E::LogicalAndPtr &logical_and =
           std::static_pointer_cast<const E::LogicalAnd>(filter_predicate);
       double selectivity = 1.0;
       for (const auto &predicate : logical_and->operands()) {
-        selectivity =
-            std::min(selectivity,
-                     estimateSelectivityForPredicate(num_distinct_values_map, predicate));
+        selectivity = selectivity * estimateSelectivityForPredicate(predicate, physical_plan);
       }
       return selectivity;
     }
@@ -253,7 +367,7 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
           std::static_pointer_cast<const E::LogicalOr>(filter_predicate);
       double selectivity = 0;
       for (const auto &predicate : logical_or->operands()) {
-        selectivity += estimateSelectivityForPredicate(num_distinct_values_map, predicate);
+        selectivity += estimateSelectivityForPredicate(predicate, physical_plan);
       }
       return std::min(selectivity, 1.0);
     }
@@ -263,6 +377,23 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
   return 1.0;
 }
 
+std::size_t StarSchemaSimpleCostModel::getNumDistinctValues(
+    const E::ExprId attribute_id,
+    const P::TableReferencePtr &table_reference) {
+  const CatalogRelation &relation = *table_reference->relation();
+  const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list();
+  for (std::size_t i = 0; i < attributes.size(); ++i) {
+    if (attributes[i]->id() == attribute_id) {
+      std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i);
+      if (num_distinct_values > 0) {
+        return num_distinct_values;
+      }
+      break;
+    }
+  }
+  return estimateCardinalityForTableReference(table_reference);
+}
+
 }  // namespace cost
 }  // namespace optimizer
 }  // namespace quickstep

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
index 4314b92..6f6aa29 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
@@ -21,7 +21,6 @@
 #define QUERY_OPTIMIZER_COST_MODEL_STAR_SCHEMA_SIMPLE_COST_MODEL_HPP_
 
 #include <cstddef>
-#include <unordered_map>
 #include <vector>
 
 #include "query_optimizer/cost_model/CostModel.hpp"
@@ -32,6 +31,7 @@
 #include "query_optimizer/physical/HashJoin.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "query_optimizer/physical/Selection.hpp"
+#include "query_optimizer/physical/Sort.hpp"
 #include "query_optimizer/physical/TableGenerator.hpp"
 #include "query_optimizer/physical/TableReference.hpp"
 #include "query_optimizer/physical/TopLevelPlan.hpp"
@@ -67,6 +67,25 @@ class StarSchemaSimpleCostModel : public CostModel {
       const physical::PhysicalPtr &physical_plan) override;
 
   /**
+   * @brief Estimate the number of groups in an aggregation.
+   *
+   * @param aggregate The physical plan of the aggregation.
+   * @return The estimated number of groups.
+   */
+  std::size_t estimateNumGroupsForAggregate(
+      const physical::AggregatePtr &aggregate) override;
+
+  /**
+   * @brief Estimate the number of distinct values of an attribute in a relation.
+   * 
+   * @param attribute_id The expression id of the target attribute.
+   * @param physical_plan The physical plan of the attribute's relation.
+   * @return The estimated number of distinct values for the attribute.
+   */
+  std::size_t estimateNumDistinctValues(const expressions::ExprId attribute_id,
+                                        const physical::PhysicalPtr &physical_plan);
+
+  /**
    * @brief Estimate the "selectivity" of a physical plan under the assumption
    *        that it acts as a filtered dimension table in a hash join.
    *
@@ -75,40 +94,57 @@ class StarSchemaSimpleCostModel : public CostModel {
    */
   double estimateSelectivity(const physical::PhysicalPtr &physical_plan);
 
+  /**
+   * @brief Estimate the filter predicate's selectivity if it is present in
+   *        the input plan's root node.
+   *
+   * @param physical_plan The input physical plan.
+   * @return The estimated selectivity of the filter predicate if physical_plan
+   *         has such a filter predicate; 1.0 otherwise.
+   */
+  double estimateSelectivityForFilterPredicate(
+      const physical::PhysicalPtr &physical_plan);
+
  private:
-  std::size_t estimateCardinalityForTopLevelPlan(
-      const physical::TopLevelPlanPtr &physical_plan);
+  std::size_t estimateCardinalityForAggregate(
+      const physical::AggregatePtr &physical_plan);
 
-  std::size_t estimateCardinalityForTableReference(
-      const physical::TableReferencePtr &physical_plan);
+  std::size_t estimateCardinalityForHashJoin(
+      const physical::HashJoinPtr &physical_plan);
+
+  std::size_t estimateCardinalityForNestedLoopsJoin(
+      const physical::NestedLoopsJoinPtr &physical_plan);
 
   std::size_t estimateCardinalityForSelection(
       const physical::SelectionPtr &physical_plan);
 
+  std::size_t estimateCardinalityForSort(
+      const physical::SortPtr &physical_plan);
+
   std::size_t estimateCardinalityForTableGenerator(
       const physical::TableGeneratorPtr &physical_plan);
 
-  std::size_t estimateCardinalityForHashJoin(
-      const physical::HashJoinPtr &physical_plan);
-
-  std::size_t estimateCardinalityForNestedLoopsJoin(
-      const physical::NestedLoopsJoinPtr &physical_plan);
+  std::size_t estimateCardinalityForTableReference(
+      const physical::TableReferencePtr &physical_plan);
 
-  std::size_t estimateCardinalityForAggregate(
-      const physical::AggregatePtr &physical_plan);
+  std::size_t estimateCardinalityForTopLevelPlan(
+      const physical::TopLevelPlanPtr &physical_plan);
 
   std::size_t estimateCardinalityForWindowAggregate(
       const physical::WindowAggregatePtr &physical_plan);
 
-  double estimateSelectivityForSelection(
-      const physical::SelectionPtr &physical_plan);
-
   double estimateSelectivityForPredicate(
-      const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map,
-      const expressions::PredicatePtr &filter_predicate);
+      const expressions::PredicatePtr &filter_predicate,
+      const physical::PhysicalPtr &physical_plan);
 
   const std::vector<physical::PhysicalPtr> &shared_subplans_;
 
+  // Get the number of distinct values of an attribute in the table reference.
+  // If the stat is not avaiable, simply returns the table's cardinality.
+  std::size_t getNumDistinctValues(const expressions::ExprId attribute_id,
+                                   const physical::TableReferencePtr &table_reference);
+
+
   DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel);
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/expressions/ExpressionUtil.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/expressions/ExpressionUtil.hpp b/query_optimizer/expressions/ExpressionUtil.hpp
index e9a4067..422d5ab 100644
--- a/query_optimizer/expressions/ExpressionUtil.hpp
+++ b/query_optimizer/expressions/ExpressionUtil.hpp
@@ -95,6 +95,25 @@ bool ContainsExpression(
 }
 
 /**
+ * @brief Checks whether expr_id equals any expression's id in \p expressions.
+ *
+ * @param expressions A list of expressions to be checked with.
+ * @param expr_id The ExprId that is to be checked if it is in \p expressions.
+ * @return True if \p expr_id is contained by \p expressions.
+ */
+template <class NamedExpressionType>
+bool ContainsExprId(
+    const std::vector<std::shared_ptr<const NamedExpressionType>> &expressions,
+    const ExprId expr_id) {
+  for (const std::shared_ptr<const NamedExpressionType> &expression : expressions)
{
+    if (expression->id() == expr_id) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
  * @brief Checks whether \p left is a subset of \p right.
  *
  * @param left The left operand of the subset operator (i.e. the one that may be

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
index 4d6765c..c1a7bae 100644
--- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
+++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
@@ -103,7 +103,7 @@ class StarSchemaHashJoinOrderOptimization : public Rule<physical::Physical>
{
 
       if (lhs->estimated_selectivity < rhs->estimated_selectivity) {
         return !swapped;
-      } else if (lhs->estimated_cardinality < 1000u &&
+      } else if (lhs->estimated_cardinality < 100u &&
                  rhs->estimated_cardinality > 10000u &&
                  lhs->estimated_selectivity < rhs->estimated_selectivity * 1.5)
{
         return !swapped;


Mime
View raw message