impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From k...@apache.org
Subject [2/2] impala git commit: IMPALA-5929: Remove redundant explicit casts to string
Date Wed, 03 Jan 2018 01:35:56 GMT
IMPALA-5929: Remove redundant explicit casts to string

This patch adds a query rewriter to remove redundant explicit casts to
a string type (string, char, varchar) from binary predicates of the form
"cast(<non-const expr> to <string type>) <eq/ne op> <string constant>".
The cast is redundant if the predicate evaluation is the same even if
the cast is removed and the constant is converted to the original type
of the expression. For example:
cast(int_col as string) = '123456' -> int_col = 123456

Performance:
For the following query on a table having 6001215 records -
select * from tpch.lineitem where cast(l_linenumber as string) = '0'

+-----------------+-----------+--------+
|                 |      Scan Time     |
+-----------------+-----------+--------+
|                 | Avg       | St dev |
| Without rewrite | 1s406ms   | 44ms   |
| With rewrite    | 1s099ms   | 28ms   |
+-----------------+-----------+--------+

Testing:
- Added unit tests to ExprRewriteRulesTest
- Added functional test to expr.test
- Current FE planner tests and BE expr-test run successfully with this
change.

Change-Id: I91b7c6452d0693115f9b9ed9ba09f3ffe0f36b2b
Reviewed-on: http://gerrit.cloudera.org:8080/8660
Reviewed-by: Dimitris Tsirogiannis <dtsirogiannis@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/545163bb
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/545163bb
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/545163bb

Branch: refs/heads/master
Commit: 545163bb0a5c86aa02652d0557871f5b694a6c82
Parents: f168a7d
Author: Bikramjeet Vig <bikramjeet.vig@cloudera.com>
Authored: Mon Nov 27 15:59:56 2017 -0800
Committer: Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org>
Committed: Wed Jan 3 01:15:42 2018 +0000

----------------------------------------------------------------------
 .../org/apache/impala/analysis/Analyzer.java    |  2 +
 .../rewrite/RemoveRedundantStringCast.java      | 93 ++++++++++++++++++++
 .../impala/analysis/ExprRewriteRulesTest.java   | 91 +++++++++++++++++++
 .../queries/QueryTest/exprs.test                | 11 ++-
 4 files changed, 196 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index b84f9ef..eab33cb 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -56,6 +56,7 @@ import org.apache.impala.common.Pair;
 import org.apache.impala.common.RuntimeEnv;
 import org.apache.impala.planner.PlanNode;
 import org.apache.impala.rewrite.BetweenToCompoundRule;
+import org.apache.impala.rewrite.RemoveRedundantStringCast;
 import org.apache.impala.rewrite.EqualityDisjunctsToInRule;
 import org.apache.impala.rewrite.ExprRewriteRule;
 import org.apache.impala.rewrite.ExprRewriter;
@@ -335,6 +336,7 @@ public class Analyzer {
         rules.add(EqualityDisjunctsToInRule.INSTANCE);
         rules.add(NormalizeCountStarRule.INSTANCE);
         rules.add(SimplifyDistinctFromRule.INSTANCE);
+        rules.add(RemoveRedundantStringCast.INSTANCE);
       }
       exprRewriter_ = new ExprRewriter(rules);
     }

http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java b/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java
new file mode 100644
index 0000000..40ab0fa
--- /dev/null
+++ b/fe/src/main/java/org/apache/impala/rewrite/RemoveRedundantStringCast.java
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.rewrite;
+
+import org.apache.impala.analysis.Analyzer;
+import org.apache.impala.analysis.BinaryPredicate;
+import org.apache.impala.analysis.CastExpr;
+import org.apache.impala.analysis.Expr;
+import org.apache.impala.analysis.LiteralExpr;
+import org.apache.impala.analysis.TypeDef;
+import org.apache.impala.catalog.Type;
+import org.apache.impala.common.AnalysisException;
+
+/**
+ * Removes redundant explicit string casts (includes String, Char and Varchar
+ * types) from binary predicates of the form
+ * "cast(<non-const expr> to <string type>) <eq/ne op> <string literal>"
+ * because casting and comparing as string is expensive. Redundancy is
+ * established by making sure that the equivalence relationship will be same
+ * even if the constant is cast to the non-const expression type, that is, check
+ * if the following is true: cast(cast(<string literal> as typeOf(<non-const expr>))
+ * as typeOf(<original cast expr>)) == <string literal>
+ *
+ * It relies on NormalizeBinaryPredicatesRule and FoldConstantsRule so that
+ * <string literal> is always on the right hand side and all constant Exprs have been
+ * evaluated and converted to LiteralExprs.
+ *
+ * Examples:
+ * Few cases that are successfully rewritten:
+ * 1. cast(int_col as string) = '123456' -> int_col = 123456
+ * 2. cast(timestamp_col as string) = '2009-01-01 00:01:00' -> timestamp_col =
+ *    TIMESTAMP '2009-01-01 00:01:00'
+ *
+ * Few cases that are not rewritten:
+ * 1. cast(int_col as string) = '0123456' -> no change as equivalence relationship
+ *    is not the same if '0123456' is cast to int.
+ * 2. cast(tinyint_col as char(2)) = '100' -> no change as casting '100' to char(2) would
+ *    give '10' and hence failing the redundancy test.
+ * 3. cast(smallint_col as string) = '1000000000' -> no change due to limitations of this
+ *    rule as during the intermediate step of the the redundancy check, converting
+ *    '1000000000' to smallint will overflow and give wrong result.
+ */
+public class RemoveRedundantStringCast implements ExprRewriteRule {
+  public static ExprRewriteRule INSTANCE = new RemoveRedundantStringCast();
+
+  @Override
+  public Expr apply(Expr expr, Analyzer analyzer) throws AnalysisException {
+    if(!(expr instanceof BinaryPredicate)) return expr;
+    BinaryPredicate.Operator op = ((BinaryPredicate) expr).getOp();
+    boolean isPotentiallyRedundantCast =
+        (op == BinaryPredicate.Operator.EQ || op == BinaryPredicate.Operator.NE) &&
+        (expr.getChild(0).ignoreImplicitCast() instanceof CastExpr) &&
+        expr.getChild(0).ignoreImplicitCast().getType().isStringType() &&
+        expr.getChild(1).getType().isStringType() &&
+        expr.getChild(1).isLiteral();
+
+    if (!isPotentiallyRedundantCast) return expr;
+    // Ignore the implicit casts added during parsing.
+    Expr castExpr = expr.getChild(0).ignoreImplicitCast();
+    Expr castExprChild = castExpr.getChild(0).ignoreImplicitCast();
+    LiteralExpr literalExpr = (LiteralExpr) expr.getChild(1);
+    // Now check for redundancy.
+    Expr castForRedundancyCheck = new CastExpr(new TypeDef(castExpr.getType()),
+        new CastExpr(new TypeDef(castExprChild.getType()), literalExpr));
+    castForRedundancyCheck.analyze(analyzer);
+    LiteralExpr resultOfReverseCast = LiteralExpr.create(castForRedundancyCheck,
+        analyzer.getQueryCtx());
+    // Need to trim() while comparing char(n) types as conversion might add trailing
+    // spaces to the 'resultOfReverseCast'.
+    if (!resultOfReverseCast.isNullLiteral() &&
+        resultOfReverseCast.getStringValue().trim()
+            .equals(literalExpr.getStringValue().trim())) {
+      return new BinaryPredicate(op, castExprChild,
+          castForRedundancyCheck.getChild(0).ignoreImplicitCast());
+    }
+    return expr;
+  }
+}

http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
index 7b9d4b4..fa5bc05 100644
--- a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
@@ -23,6 +23,7 @@ import org.apache.impala.catalog.Catalog;
 import org.apache.impala.common.AnalysisException;
 import org.apache.impala.common.FrontendTestBase;
 import org.apache.impala.rewrite.BetweenToCompoundRule;
+import org.apache.impala.rewrite.RemoveRedundantStringCast;
 import org.apache.impala.rewrite.SimplifyDistinctFromRule;
 import org.apache.impala.rewrite.EqualityDisjunctsToInRule;
 import org.apache.impala.rewrite.ExprRewriteRule;
@@ -581,6 +582,96 @@ public class ExprRewriteRulesTest extends FrontendTestBase {
     RewritesOk("if(bool_col <=> NULL, 1, 2)", rules, null);
   }
 
+  @Test
+  public void TestRemoveRedundantStringCastRule() throws AnalysisException {
+    ExprRewriteRule removeRule = RemoveRedundantStringCast.INSTANCE;
+    ExprRewriteRule foldConstantRule = FoldConstantsRule.INSTANCE;
+    List<ExprRewriteRule> comboRules = Lists.newArrayList(removeRule, foldConstantRule);
+
+    // Can be simplified.
+    RewritesOk("cast(tinyint_col as string) = '100'", comboRules, "tinyint_col = 100");
+    RewritesOk("cast(smallint_col as string) = '1000'", comboRules,
+        "smallint_col = 1000");
+    RewritesOk("cast(int_col as string) = '123456'", comboRules, "int_col = 123456");
+    RewritesOk("cast(bigint_col as string) = '9223372036854775807'", comboRules,
+        "bigint_col = 9223372036854775807");
+    RewritesOk("cast(float_col as string) = '1000.5'", comboRules, "float_col = 1000.5");
+    RewritesOk("cast(double_col as string) = '1000.5'", comboRules,
+        "double_col = 1000.5");
+    RewritesOk("cast(timestamp_col as string) = '2009-01-01 00:01:00'", comboRules,
+        "timestamp_col = TIMESTAMP '2009-01-01 00:01:00'");
+    RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '2.2222'", comboRules,
+        "c1 = 2.2222");
+
+    RewritesOk("cast(tinyint_col as string) = '-100'", comboRules, "tinyint_col = -100");
+    RewritesOk("cast(smallint_col as string) = '-1000'", comboRules,
+        "smallint_col = -1000");
+    RewritesOk("cast(int_col as string) = '-123456'", comboRules, "int_col = -123456");
+    RewritesOk("cast(bigint_col as string) = '-9223372036854775807'", comboRules,
+        "bigint_col = -9223372036854775807");
+    RewritesOk("cast(float_col as string) = '-1000.5'", comboRules,
+        "float_col = -1000.5");
+    RewritesOk("cast(double_col as string) = '-1000.5'", comboRules,
+        "double_col = -1000.5");
+    RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '-2.2222'", comboRules,
+        "c1 = -2.2222");
+
+    // Works for VARCHAR/CHAR.
+    RewritesOk("cast(tinyint_col as char(3)) = '100'", comboRules, "tinyint_col = 100");
+    RewritesOk("cast(tinyint_col as varchar(3)) = '100'", comboRules,
+        "tinyint_col = 100");
+    RewritesOk("functional.chars_tiny", "cast(cs as string) = 'abc'",
+        comboRules, "cs = 'abc  '"); // column 'cs' is char(5), hence the trailing spaces.
+    RewritesOk("functional.chars_tiny", "cast(vc as string) = 'abc'",
+        comboRules, "vc = 'abc'");
+
+    // Works with complex expressions on both sides.
+    RewritesOk("cast(cast(int_col + 1 as double) as string) = '123456'", comboRules,
+        "CAST(int_col + 1 AS DOUBLE) = 123456");
+    RewritesOk("cast(int_col + 1 as string) = concat('123', '456')", comboRules,
+        "int_col + 1 = 123456");
+    RewritesOk("cast(int_col as string) = ltrim(concat('     123', '456'))", comboRules,
+        "int_col = 123456");
+    RewritesOk("cast(int_col as string) = strleft('123456789', 6)", comboRules,
+        "int_col = 123456");
+    RewritesOk("cast(tinyint_col as char(3)) = cast(100 as char(3))", comboRules,
+        "tinyint_col = 100");
+    RewritesOk("cast(tinyint_col as char(3)) = cast(100 as varchar(3))", comboRules,
+        "tinyint_col = 100");
+
+    // Verify nothing happens.
+    RewritesOk("cast(tinyint_col as string) = '0100'", comboRules, null);
+    RewritesOk("cast(tinyint_col as string) = '01000'", comboRules, null);
+    RewritesOk("cast(smallint_col as string) = '01000'", comboRules, null);
+    RewritesOk("cast(smallint_col as string) = '1000000000'", comboRules, null);
+    RewritesOk("cast(int_col as string) = '02147483647'", comboRules, null);
+    RewritesOk("cast(int_col as string) = '21474836470'", comboRules, null);
+    RewritesOk("cast(bigint_col as string) = '09223372036854775807'", comboRules, null);
+    RewritesOk("cast(bigint_col as string) = '92233720368547758070'", comboRules, null);
+    RewritesOk("cast(float_col as string) = '01000.5'", comboRules, null);
+    RewritesOk("cast(double_col as string) = '01000.5'", comboRules, null);
+    RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '02.2222'", comboRules,
+        null);
+    RewritesOk("functional.decimal_tiny", "cast(c1 as string) = '2.22'",
+        comboRules, null);
+    RewritesOk("cast(timestamp_col as string) = '2009-15-01 00:01:00'", comboRules, null);
+    RewritesOk("cast(tinyint_col as char(4)) = '0100'", comboRules, null);
+    RewritesOk("cast(tinyint_col as varchar(4)) = '0100'", comboRules, null);
+    RewritesOk("cast(tinyint_col as char(2)) = '100'", comboRules, null);
+    RewritesOk("cast(tinyint_col as varchar(2)) = '100'", comboRules, null);
+
+    // 'NULL' is treated like any other string, so no conversion should take place.
+    RewritesOk("cast(tinyint_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("cast(smallint_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("cast(int_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("cast(bigint_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("cast(float_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("cast(double_col as string) = 'NULL'", comboRules, null);
+    RewritesOk("functional.decimal_tiny", "cast(c1 as string) = 'NULL'",
+        comboRules, null);
+    RewritesOk("cast(timestamp_col as string) = 'NULL'", comboRules, null);
+  }
+
   /**
    * NULLIF gets converted to an IF, and has cases where
    * it can be further simplified via SimplifyDistinctFromRule.

http://git-wip-us.apache.org/repos/asf/impala/blob/545163bb/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index 0ed71ee..4a0b7a7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -2909,4 +2909,13 @@ where id = case cast('nan' as double) when cast('nan' as double) then
0 else 1 e
 1
 ---- TYPES
 INT
-====
\ No newline at end of file
+====
+---- QUERY
+# Test expr rewrite behavior of RemoveRedundantStringCast with timestamp column
+select id, timestamp_col from functional.alltypestiny
+where cast(timestamp_col as string) = '2009-02-01 00:00:00'
+---- RESULTS
+2,2009-02-01 00:00:00
+---- TYPES
+int, timestamp
+====


Mime
View raw message