spark-reviews mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From viirya <...@git.apache.org>
Subject [GitHub] spark pull request #17546: [SPARK-20233] [SQL] Apply star-join filter heuris...
Date Fri, 07 Apr 2017 01:52:42 GMT
Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17546#discussion_r110303409
  
    --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala
---
    @@ -327,3 +345,104 @@ object JoinReorderDP extends PredicateHelper with Logging {
     case class Cost(card: BigInt, size: BigInt) {
       def +(other: Cost): Cost = Cost(this.card + other.card, this.size + other.size)
     }
    +
    +/**
    + * Implements optional filters to reduce the search space for join enumeration.
    + *
    + * 1) Star-join filters: Plan star-joins together since they are assumed
    + *    to have an optimal execution based on their RI relationship.
    + * 2) Cartesian products: Defer their planning later in the graph to avoid
    + *    large intermediate results (expanding joins, in general).
    + * 3) Composite inners: Don't generate "bushy tree" plans to avoid materializing
    + *   intermediate results.
    + *
    + * Filters (2) and (3) are not implemented.
    + */
    +case class JoinReorderDPFilters(conf: SQLConf) extends PredicateHelper {
    +  /**
    +   * Builds join graph information to be used by the filtering strategies.
    +   * Currently, it builds the sets of star/non-star joins.
    +   * It can be extended with the sets of connected/unconnected joins, which
    +   * can be used to filter Cartesian products.
    +   */
    +  def buildJoinGraphInfo(
    +      items: Seq[LogicalPlan],
    +      conditions: Set[Expression],
    +      planIndex: Seq[(LogicalPlan, Int)]): Option[JoinGraphInfo] = {
    +
    +    // Compute the tables in a star-schema relationship.
    +    val starJoin = StarSchemaDetection(conf).findStarJoins(items, conditions.toSeq)
    +    val nonStarJoin = items.filterNot(starJoin.contains(_))
    +
    +    if (starJoin.nonEmpty && nonStarJoin.nonEmpty) {
    +      val (starInt, nonStarInt) = planIndex.collect {
    +        case (p, i) if starJoin.contains(p) =>
    +          (Some(i), None)
    +        case (p, i) if nonStarJoin.contains(p) =>
    +          (None, Some(i))
    +        case _ =>
    +          (None, None)
    +      }.unzip
    +      Some(JoinGraphInfo(starInt.flatten.toSet, nonStarInt.flatten.toSet))
    +    } else {
    +      // Nothing interesting to return.
    +      None
    +    }
    +  }
    +
    +  /**
    +   * Applies star-join filter.
    +   *
    +   * Given the outer/inner and the star/non-star sets,
    +   * the following plan combinations are allowed:
    +   * 1) (outer U inner) is a subset of star-join
    +   * 2) star-join is a subset of (outer U inner)
    +   * 3) (outer U inner) is a subset of non star-join
    +   *
    +   * It assumes the sets are disjoint.
    +   *
    +   * Example query graph:
    +   *
    +   * t1   d1 - t2 - t3
    +   *  \  /
    +   *   f1
    +   *   |
    +   *   d2
    +   *
    +   * star: {d1, f1, d2}
    +   * non-star: {t2, t1, t3}
    +   *
    +   * level 0: (f1 ), (d2 ), (t3 ), (d1 ), (t1 ), (t2 )
    +   * level 1: {t3 t2 }, {f1 d2 }, {f1 d1 }
    +   * level 2: {d2 f1 d1 }
    +   * level 3: {t1 d1 f1 d2 }, {t2 d1 f1 d2 }
    +   * level 4: {d1 t2 f1 t1 d2 }, {d1 t3 t2 f1 d2 }
    +   * level 5: {d1 t3 t2 f1 t1 d2 }
    +   */
    +  def starJoinFilter(
    +      outer: Set[Int],
    +      inner: Set[Int],
    +      filters: JoinGraphInfo) : Boolean = {
    +    val starJoins = filters.starJoins
    +    val nonStarJoins = filters.nonStarJoins
    +    val join = outer.union(inner)
    +
    +    // Disjoint sets
    +    outer.intersect(inner).isEmpty &&
    +      // Either star or non-star is empty
    +      (starJoins.isEmpty || nonStarJoins.isEmpty ||
    +        // Join is a subset of the star-join
    +        join.subsetOf(starJoins) ||
    +        // Star-join is a subset of join
    +        starJoins.subsetOf(join) ||
    --- End diff --
    
    If so, with this added filter, `CostBasedJoinReorder` can also let the star join plans
together, right?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


Mime
View raw message