spark-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From marmb...@apache.org
Subject git commit: [SPARK-2395][SQL] Optimize common LIKE patterns.
Date Tue, 08 Jul 2014 17:38:19 GMT
Repository: spark
Updated Branches:
  refs/heads/master 56e009d4f -> cc3e0a14d


[SPARK-2395][SQL] Optimize common LIKE patterns.

Author: Michael Armbrust <michael@databricks.com>

Closes #1325 from marmbrus/slowLike and squashes the following commits:

023c3eb [Michael Armbrust] add comment.
8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped.
d34d37e [Michael Armbrust] add periods.
3bbf35f [Michael Armbrust] Roll back changes to SparkBuild
53894b1 [Michael Armbrust] Fix grammar.
4094462 [Michael Armbrust] Fix grammar.
6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc3e0a14
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc3e0a14
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc3e0a14

Branch: refs/heads/master
Commit: cc3e0a14daf756ff5c2d4e7916438e175046e5bb
Parents: 56e009d
Author: Michael Armbrust <michael@databricks.com>
Authored: Tue Jul 8 10:36:18 2014 -0700
Committer: Michael Armbrust <michael@databricks.com>
Committed: Tue Jul 8 10:36:18 2014 -0700

----------------------------------------------------------------------
 .../catalyst/expressions/stringOperations.scala | 51 ++++++++++++++++++++
 .../sql/catalyst/optimizer/Optimizer.scala      | 23 +++++++++
 2 files changed, 74 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/cc3e0a14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c074b7b..347471c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE
 
   override def toString() = s"Lower($child)"
 }
+
+/** A base class for functions that compare two strings, returning a boolean. */
+abstract class StringComparison extends Expression {
+  self: Product =>
+
+  type EvaluatedType = Any
+
+  def left: Expression
+  def right: Expression
+
+  override def references = children.flatMap(_.references).toSet
+  override def children = left :: right :: Nil
+
+  override def nullable: Boolean = true
+  override def dataType: DataType = BooleanType
+
+  def compare(l: String, r: String): Boolean
+
+  override def eval(input: Row): Any = {
+    val leftEval = left.eval(input).asInstanceOf[String]
+    if(leftEval == null) {
+      null
+    } else {
+      val rightEval = right.eval(input).asInstanceOf[String]
+      if (rightEval == null) null else compare(leftEval, rightEval)
+    }
+  }
+
+  override def toString() = s"$nodeName($left, $right)"
+}
+
+/**
+ * A function that returns true if the string `left` contains the string `right`.
+ */
+case class Contains(left: Expression, right: Expression) extends StringComparison {
+  override def compare(l: String, r: String) = l.contains(r)
+}
+
+/**
+ * A function that returns true if the string `left` starts with the string `right`.
+ */
+case class StartsWith(left: Expression, right: Expression) extends StringComparison {
+  def compare(l: String, r: String) = l.startsWith(r)
+}
+
+/**
+ * A function that returns true if the string `left` ends with the string `right`.
+ */
+case class EndsWith(left: Expression, right: Expression) extends StringComparison {
+  def compare(l: String, r: String) = l.endsWith(r)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/cc3e0a14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 48ca31e..f0904f5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] {
     Batch("ConstantFolding", FixedPoint(100),
       NullPropagation,
       ConstantFolding,
+      LikeSimplification,
       BooleanSimplification,
       SimplifyFilters,
       SimplifyCasts,
@@ -112,6 +113,28 @@ object ColumnPruning extends Rule[LogicalPlan] {
 }
 
 /**
+ * Simplifies LIKE expressions that do not need full regular expressions to evaluate the
condition.
+ * For example, when the expression is just checking to see if a string starts with a given
+ * pattern.
+ */
+object LikeSimplification extends Rule[LogicalPlan] {
+  // if guards below protect from escapes on trailing %.
+  // Cases like "something\%" are not optimized, but this does not affect correctness.
+  val startsWith = "([^_%]+)%".r
+  val endsWith = "%([^_%]+)".r
+  val contains = "%([^_%]+)%".r
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") =>
+      StartsWith(l, Literal(pattern))
+    case Like(l, Literal(endsWith(pattern), StringType)) =>
+      EndsWith(l, Literal(pattern))
+    case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") =>
+      Contains(l, Literal(pattern))
+  }
+}
+
+/**
  * Replaces [[Expression Expressions]] that can be statically evaluated with
  * equivalent [[Literal]] values. This rule is more specific with
  * Null value propagation from bottom to top of the expression tree.


Mime
View raw message