Return-Path: X-Original-To: apmail-spark-commits-archive@minotaur.apache.org Delivered-To: apmail-spark-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2BF2C119C0 for ; Tue, 8 Jul 2014 17:38:20 +0000 (UTC) Received: (qmail 95395 invoked by uid 500); 8 Jul 2014 17:38:20 -0000 Delivered-To: apmail-spark-commits-archive@spark.apache.org Received: (qmail 95365 invoked by uid 500); 8 Jul 2014 17:38:20 -0000 Mailing-List: contact commits-help@spark.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@spark.apache.org Delivered-To: mailing list commits@spark.apache.org Received: (qmail 95356 invoked by uid 99); 8 Jul 2014 17:38:20 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 08 Jul 2014 17:38:20 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id CFF26946A52; Tue, 8 Jul 2014 17:38:19 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: marmbrus@apache.org To: commits@spark.apache.org Message-Id: <1cad6daf5c6543ce99ae7d0bffa563a3@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: git commit: [SPARK-2395][SQL] Optimize common LIKE patterns. Date: Tue, 8 Jul 2014 17:38:19 +0000 (UTC) Repository: spark Updated Branches: refs/heads/master 56e009d4f -> cc3e0a14d [SPARK-2395][SQL] Optimize common LIKE patterns. Author: Michael Armbrust Closes #1325 from marmbrus/slowLike and squashes the following commits: 023c3eb [Michael Armbrust] add comment. 8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped. d34d37e [Michael Armbrust] add periods. 3bbf35f [Michael Armbrust] Roll back changes to SparkBuild 53894b1 [Michael Armbrust] Fix grammar. 4094462 [Michael Armbrust] Fix grammar. 6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc3e0a14 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc3e0a14 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc3e0a14 Branch: refs/heads/master Commit: cc3e0a14daf756ff5c2d4e7916438e175046e5bb Parents: 56e009d Author: Michael Armbrust Authored: Tue Jul 8 10:36:18 2014 -0700 Committer: Michael Armbrust Committed: Tue Jul 8 10:36:18 2014 -0700 ---------------------------------------------------------------------- .../catalyst/expressions/stringOperations.scala | 51 ++++++++++++++++++++ .../sql/catalyst/optimizer/Optimizer.scala | 23 +++++++++ 2 files changed, 74 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/cc3e0a14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index c074b7b..347471c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE override def toString() = s"Lower($child)" } + +/** A base class for functions that compare two strings, returning a boolean. */ +abstract class StringComparison extends Expression { + self: Product => + + type EvaluatedType = Any + + def left: Expression + def right: Expression + + override def references = children.flatMap(_.references).toSet + override def children = left :: right :: Nil + + override def nullable: Boolean = true + override def dataType: DataType = BooleanType + + def compare(l: String, r: String): Boolean + + override def eval(input: Row): Any = { + val leftEval = left.eval(input).asInstanceOf[String] + if(leftEval == null) { + null + } else { + val rightEval = right.eval(input).asInstanceOf[String] + if (rightEval == null) null else compare(leftEval, rightEval) + } + } + + override def toString() = s"$nodeName($left, $right)" +} + +/** + * A function that returns true if the string `left` contains the string `right`. + */ +case class Contains(left: Expression, right: Expression) extends StringComparison { + override def compare(l: String, r: String) = l.contains(r) +} + +/** + * A function that returns true if the string `left` starts with the string `right`. + */ +case class StartsWith(left: Expression, right: Expression) extends StringComparison { + def compare(l: String, r: String) = l.startsWith(r) +} + +/** + * A function that returns true if the string `left` ends with the string `right`. + */ +case class EndsWith(left: Expression, right: Expression) extends StringComparison { + def compare(l: String, r: String) = l.endsWith(r) +} http://git-wip-us.apache.org/repos/asf/spark/blob/cc3e0a14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 48ca31e..f0904f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] { Batch("ConstantFolding", FixedPoint(100), NullPropagation, ConstantFolding, + LikeSimplification, BooleanSimplification, SimplifyFilters, SimplifyCasts, @@ -112,6 +113,28 @@ object ColumnPruning extends Rule[LogicalPlan] { } /** + * Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition. + * For example, when the expression is just checking to see if a string starts with a given + * pattern. + */ +object LikeSimplification extends Rule[LogicalPlan] { + // if guards below protect from escapes on trailing %. + // Cases like "something\%" are not optimized, but this does not affect correctness. + val startsWith = "([^_%]+)%".r + val endsWith = "%([^_%]+)".r + val contains = "%([^_%]+)%".r + + def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") => + StartsWith(l, Literal(pattern)) + case Like(l, Literal(endsWith(pattern), StringType)) => + EndsWith(l, Literal(pattern)) + case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") => + Contains(l, Literal(pattern)) + } +} + +/** * Replaces [[Expression Expressions]] that can be statically evaluated with * equivalent [[Literal]] values. This rule is more specific with * Null value propagation from bottom to top of the expression tree.