Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id B3282160C38 for ; Wed, 3 Jan 2018 02:35:56 +0100 (CET) Received: (qmail 37861 invoked by uid 500); 3 Jan 2018 01:35:55 -0000 Mailing-List: contact commits-help@impala.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@impala.apache.org Delivered-To: mailing list commits@impala.apache.org Received: (qmail 37847 invoked by uid 99); 3 Jan 2018 01:35:55 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 03 Jan 2018 01:35:55 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id B2306DFFDA; Wed, 3 Jan 2018 01:35:55 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: kwho@apache.org To: commits@impala.apache.org Date: Wed, 03 Jan 2018 01:35:56 -0000 Message-Id: <070b72e0dcdf463caff582390e218e8f@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [2/2] impala git commit: IMPALA-5929: Remove redundant explicit casts to string archived-at: Wed, 03 Jan 2018 01:35:57 -0000 IMPALA-5929: Remove redundant explicit casts to string This patch adds a query rewriter to remove redundant explicit casts to a string type (string, char, varchar) from binary predicates of the form "cast( to ) ". The cast is redundant if the predicate evaluation is the same even if the cast is removed and the constant is converted to the original type of the expression. For example: cast(int_col as string) = '123456' -> int_col = 123456 Performance: For the following query on a table having 6001215 records - select * from tpch.lineitem where cast(l_linenumber as string) = '0' +-----------------+-----------+--------+ | | Scan Time | +-----------------+-----------+--------+ | | Avg | St dev | | Without rewrite | 1s406ms | 44ms | | With rewrite | 1s099ms | 28ms | +-----------------+-----------+--------+ Testing: - Added unit tests to ExprRewriteRulesTest - Added functional test to expr.test - Current FE planner tests and BE expr-test run successfully with this change. Change-Id: I91b7c6452d0693115f9b9ed9ba09f3ffe0f36b2b Reviewed-on: http://gerrit.cloudera.org:8080/8660 Reviewed-by: Dimitris Tsirogiannis Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/545163bb Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/545163bb Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/545163bb Branch: refs/heads/master Commit: 545163bb0a5c86aa02652d0557871f5b694a6c82 Parents: f168a7d Author: Bikramjeet Vig Authored: Mon Nov 27 15:59:56 2017 -0800 Committer: Impala Public Jenkins Committed: Wed Jan 3 01:15:42 2018 +0000 ---------------------------------------------------------------------- .../org/apache/impala/analysis/Analyzer.java | 2 + .../rewrite/RemoveRedundantStringCast.java | 93 ++++++++++++++++++++ .../impala/analysis/ExprRewriteRulesTest.java | 91 +++++++++++++++++++ .../queries/QueryTest/exprs.test | 11 ++- 4 files changed, 196 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/main/java/org/apache/impala/analysis/Analyzer.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index b84f9ef..eab33cb 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -56,6 +56,7 @@ import org.apache.impala.common.Pair; import org.apache.impala.common.RuntimeEnv; import org.apache.impala.planner.PlanNode; import org.apache.impala.rewrite.BetweenToCompoundRule; +import org.apache.impala.rewrite.RemoveRedundantStringCast; import org.apache.impala.rewrite.EqualityDisjunctsToInRule; import org.apache.impala.rewrite.ExprRewriteRule; import org.apache.impala.rewrite.ExprRewriter; @@ -335,6 +336,7 @@ public class Analyzer { rules.add(EqualityDisjunctsToInRule.INSTANCE); rules.add(NormalizeCountStarRule.INSTANCE); rules.add(SimplifyDistinctFromRule.INSTANCE); + rules.add(RemoveRedundantStringCast.INSTANCE); } exprRewriter_ = new ExprRewriter(rules); } http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java b/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java new file mode 100644 index 0000000..40ab0fa --- /dev/null +++ b/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.impala.rewrite; + +import org.apache.impala.analysis.Analyzer; +import org.apache.impala.analysis.BinaryPredicate; +import org.apache.impala.analysis.CastExpr; +import org.apache.impala.analysis.Expr; +import org.apache.impala.analysis.LiteralExpr; +import org.apache.impala.analysis.TypeDef; +import org.apache.impala.catalog.Type; +import org.apache.impala.common.AnalysisException; + +/** + * Removes redundant explicit string casts (includes String, Char and Varchar + * types) from binary predicates of the form + * "cast( to ) " + * because casting and comparing as string is expensive. Redundancy is + * established by making sure that the equivalence relationship will be same + * even if the constant is cast to the non-const expression type, that is, check + * if the following is true: cast(cast( as typeOf()) + * as typeOf()) == + * + * It relies on NormalizeBinaryPredicatesRule and FoldConstantsRule so that + * is always on the right hand side and all constant Exprs have been + * evaluated and converted to LiteralExprs. + * + * Examples: + * Few cases that are successfully rewritten: + * 1. cast(int_col as string) = '123456' -> int_col = 123456 + * 2. cast(timestamp_col as string) = '2009-01-01 00:01:00' -> timestamp_col = + * TIMESTAMP '2009-01-01 00:01:00' + * + * Few cases that are not rewritten: + * 1. cast(int_col as string) = '0123456' -> no change as equivalence relationship + * is not the same if '0123456' is cast to int. + * 2. cast(tinyint_col as char(2)) = '100' -> no change as casting '100' to char(2) would + * give '10' and hence failing the redundancy test. + * 3. cast(smallint_col as string) = '1000000000' -> no change due to limitations of this + * rule as during the intermediate step of the the redundancy check, converting + * '1000000000' to smallint will overflow and give wrong result. + */ +public class RemoveRedundantStringCast implements ExprRewriteRule { + public static ExprRewriteRule INSTANCE = new RemoveRedundantStringCast(); + + @Override + public Expr apply(Expr expr, Analyzer analyzer) throws AnalysisException { + if(!(expr instanceof BinaryPredicate)) return expr; + BinaryPredicate.Operator op = ((BinaryPredicate) expr).getOp(); + boolean isPotentiallyRedundantCast = + (op == BinaryPredicate.Operator.EQ || op == BinaryPredicate.Operator.NE) && + (expr.getChild(0).ignoreImplicitCast() instanceof CastExpr) && + expr.getChild(0).ignoreImplicitCast().getType().isStringType() && + expr.getChild(1).getType().isStringType() && + expr.getChild(1).isLiteral(); + + if (!isPotentiallyRedundantCast) return expr; + // Ignore the implicit casts added during parsing. + Expr castExpr = expr.getChild(0).ignoreImplicitCast(); + Expr castExprChild = castExpr.getChild(0).ignoreImplicitCast(); + LiteralExpr literalExpr = (LiteralExpr) expr.getChild(1); + // Now check for redundancy. + Expr castForRedundancyCheck = new CastExpr(new TypeDef(castExpr.getType()), + new CastExpr(new TypeDef(castExprChild.getType()), literalExpr)); + castForRedundancyCheck.analyze(analyzer); + LiteralExpr resultOfReverseCast = LiteralExpr.create(castForRedundancyCheck, + analyzer.getQueryCtx()); + // Need to trim() while comparing char(n) types as conversion might add trailing + // spaces to the 'resultOfReverseCast'. + if (!resultOfReverseCast.isNullLiteral() && + resultOfReverseCast.getStringValue().trim() + .equals(literalExpr.getStringValue().trim())) { + return new BinaryPredicate(op, castExprChild, + castForRedundancyCheck.getChild(0).ignoreImplicitCast()); + } + return expr; + } +} http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java index 7b9d4b4..fa5bc05 100644 --- a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java @@ -23,6 +23,7 @@ import org.apache.impala.catalog.Catalog; import org.apache.impala.common.AnalysisException; import org.apache.impala.common.FrontendTestBase; import org.apache.impala.rewrite.BetweenToCompoundRule; +import org.apache.impala.rewrite.RemoveRedundantStringCast; import org.apache.impala.rewrite.SimplifyDistinctFromRule; import org.apache.impala.rewrite.EqualityDisjunctsToInRule; import org.apache.impala.rewrite.ExprRewriteRule; @@ -581,6 +582,96 @@ public class ExprRewriteRulesTest extends FrontendTestBase { RewritesOk("if(bool_col <=> NULL, 1, 2)", rules, null); } + @Test + public void TestRemoveRedundantStringCastRule() throws AnalysisException { + ExprRewriteRule removeRule = RemoveRedundantStringCast.INSTANCE; + ExprRewriteRule foldConstantRule = FoldConstantsRule.INSTANCE; + List comboRules = Lists.newArrayList(removeRule, foldConstantRule); + + // Can be simplified. + RewritesOk("cast(tinyint_col as string) = '100'", comboRules, "tinyint_col = 100"); + RewritesOk("cast(smallint_col as string) = '1000'", comboRules, + "smallint_col = 1000"); + RewritesOk("cast(int_col as string) = '123456'", comboRules, "int_col = 123456"); + RewritesOk("cast(bigint_col as string) = '9223372036854775807'", comboRules, + "bigint_col = 9223372036854775807"); + RewritesOk("cast(float_col as string) = '1000.5'", comboRules, "float_col = 1000.5"); + RewritesOk("cast(double_col as string) = '1000.5'", comboRules, + "double_col = 1000.5"); + RewritesOk("cast(timestamp_col as string) = '2009-01-01 00:01:00'", comboRules, + "timestamp_col = TIMESTAMP '2009-01-01 00:01:00'"); + RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '2.2222'", comboRules, + "c1 = 2.2222"); + + RewritesOk("cast(tinyint_col as string) = '-100'", comboRules, "tinyint_col = -100"); + RewritesOk("cast(smallint_col as string) = '-1000'", comboRules, + "smallint_col = -1000"); + RewritesOk("cast(int_col as string) = '-123456'", comboRules, "int_col = -123456"); + RewritesOk("cast(bigint_col as string) = '-9223372036854775807'", comboRules, + "bigint_col = -9223372036854775807"); + RewritesOk("cast(float_col as string) = '-1000.5'", comboRules, + "float_col = -1000.5"); + RewritesOk("cast(double_col as string) = '-1000.5'", comboRules, + "double_col = -1000.5"); + RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '-2.2222'", comboRules, + "c1 = -2.2222"); + + // Works for VARCHAR/CHAR. + RewritesOk("cast(tinyint_col as char(3)) = '100'", comboRules, "tinyint_col = 100"); + RewritesOk("cast(tinyint_col as varchar(3)) = '100'", comboRules, + "tinyint_col = 100"); + RewritesOk("functional.chars_tiny", "cast(cs as string) = 'abc'", + comboRules, "cs = 'abc '"); // column 'cs' is char(5), hence the trailing spaces. + RewritesOk("functional.chars_tiny", "cast(vc as string) = 'abc'", + comboRules, "vc = 'abc'"); + + // Works with complex expressions on both sides. + RewritesOk("cast(cast(int_col + 1 as double) as string) = '123456'", comboRules, + "CAST(int_col + 1 AS DOUBLE) = 123456"); + RewritesOk("cast(int_col + 1 as string) = concat('123', '456')", comboRules, + "int_col + 1 = 123456"); + RewritesOk("cast(int_col as string) = ltrim(concat(' 123', '456'))", comboRules, + "int_col = 123456"); + RewritesOk("cast(int_col as string) = strleft('123456789', 6)", comboRules, + "int_col = 123456"); + RewritesOk("cast(tinyint_col as char(3)) = cast(100 as char(3))", comboRules, + "tinyint_col = 100"); + RewritesOk("cast(tinyint_col as char(3)) = cast(100 as varchar(3))", comboRules, + "tinyint_col = 100"); + + // Verify nothing happens. + RewritesOk("cast(tinyint_col as string) = '0100'", comboRules, null); + RewritesOk("cast(tinyint_col as string) = '01000'", comboRules, null); + RewritesOk("cast(smallint_col as string) = '01000'", comboRules, null); + RewritesOk("cast(smallint_col as string) = '1000000000'", comboRules, null); + RewritesOk("cast(int_col as string) = '02147483647'", comboRules, null); + RewritesOk("cast(int_col as string) = '21474836470'", comboRules, null); + RewritesOk("cast(bigint_col as string) = '09223372036854775807'", comboRules, null); + RewritesOk("cast(bigint_col as string) = '92233720368547758070'", comboRules, null); + RewritesOk("cast(float_col as string) = '01000.5'", comboRules, null); + RewritesOk("cast(double_col as string) = '01000.5'", comboRules, null); + RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '02.2222'", comboRules, + null); + RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '2.22'", + comboRules, null); + RewritesOk("cast(timestamp_col as string) = '2009-15-01 00:01:00'", comboRules, null); + RewritesOk("cast(tinyint_col as char(4)) = '0100'", comboRules, null); + RewritesOk("cast(tinyint_col as varchar(4)) = '0100'", comboRules, null); + RewritesOk("cast(tinyint_col as char(2)) = '100'", comboRules, null); + RewritesOk("cast(tinyint_col as varchar(2)) = '100'", comboRules, null); + + // 'NULL' is treated like any other string, so no conversion should take place. + RewritesOk("cast(tinyint_col as string) = 'NULL'", comboRules, null); + RewritesOk("cast(smallint_col as string) = 'NULL'", comboRules, null); + RewritesOk("cast(int_col as string) = 'NULL'", comboRules, null); + RewritesOk("cast(bigint_col as string) = 'NULL'", comboRules, null); + RewritesOk("cast(float_col as string) = 'NULL'", comboRules, null); + RewritesOk("cast(double_col as string) = 'NULL'", comboRules, null); + RewritesOk("functional.decimal_tiny", "cast(c1 as string) = 'NULL'", + comboRules, null); + RewritesOk("cast(timestamp_col as string) = 'NULL'", comboRules, null); + } + /** * NULLIF gets converted to an IF, and has cases where * it can be further simplified via SimplifyDistinctFromRule. http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/testdata/workloads/functional-query/queries/QueryTest/exprs.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test index 0ed71ee..4a0b7a7 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test +++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test @@ -2909,4 +2909,13 @@ where id = case cast('nan' as double) when cast('nan' as double) then 0 else 1 e 1 ---- TYPES INT -==== \ No newline at end of file +==== +---- QUERY +# Test expr rewrite behavior of RemoveRedundantStringCast with timestamp column +select id, timestamp_col from functional.alltypestiny +where cast(timestamp_col as string) = '2009-02-01 00:00:00' +---- RESULTS +2,2009-02-01 00:00:00 +---- TYPES +int, timestamp +====