hive-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From harisan...@apache.org
Subject hive git commit: HIVE-11141 : Improve RuleRegExp when the Expression node stack gets huge (Hari Subramaniyan, reviewed by Laljo John Pullokkaran, Jesus Camacho Rodriguez)
Date Tue, 21 Jul 2015 00:17:12 GMT
Repository: hive
Updated Branches:
  refs/heads/master 941610f2a -> 0ad4f717a


HIVE-11141 : Improve RuleRegExp when the Expression node stack gets huge (Hari Subramaniyan,
reviewed by Laljo John Pullokkaran, Jesus Camacho Rodriguez)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0ad4f717
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0ad4f717
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0ad4f717

Branch: refs/heads/master
Commit: 0ad4f717a7c06bd7bbd90d4b3e861ba1e25d14b7
Parents: 941610f
Author: Hari Subramaniyan <harisankar@apache.org>
Authored: Mon Jul 20 17:17:03 2015 -0700
Committer: Hari Subramaniyan <harisankar@apache.org>
Committed: Mon Jul 20 17:17:03 2015 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/lib/RuleRegExp.java   | 191 ++++++++++++++++++-
 .../hadoop/hive/ql/lib/TestRuleRegExp.java      | 118 ++++++++++++
 2 files changed, 300 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
index ddc96c2..c88ed68 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
@@ -18,6 +18,9 @@
 
 package org.apache.hadoop.hive.ql.lib;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.Stack;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -31,7 +34,54 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
 public class RuleRegExp implements Rule {
 
   private final String ruleName;
-  private final Pattern pattern;
+  private final Pattern patternWithWildCardChar;
+  private final String patternWithoutWildCardChar;
+  private String[] patternORWildChar;
+  private static final Set<Character> wildCards = new HashSet<Character>(Arrays.asList(
+    '[', '^', '$', '*', ']', '+', '|', '(', '\\', '.', '?', ')', '&'));
+
+  /**
+   * The function iterates through the list of wild card characters and sees if
+   * this regular expression contains a wild card character.
+   *
+   * @param pattern
+   *          pattern expressed as a regular Expression
+   */
+  private static boolean patternHasWildCardChar(String pattern) {
+    if (pattern == null) {
+      return false;
+    }
+    for (char pc : pattern.toCharArray()) {
+      if (wildCards.contains(pc)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * The function iterates through the list of wild card characters and sees if
+   * this regular expression contains  only the given char as wild card character.
+   *
+   * @param pattern
+   *          pattern expressed as a regular Expression
+   * @param wcc
+   *          wild card character
+   */
+  private static boolean patternHasOnlyWildCardChar(String pattern, char wcc) {
+    if (pattern == null) {
+      return false;
+    }
+    boolean ret = true;
+    boolean hasWildCard = false;
+    for (char pc : pattern.toCharArray()) {
+      if (wildCards.contains(pc)) {
+        hasWildCard = true;
+        ret = ret && (pc == wcc);
+      }
+    }
+    return ret && hasWildCard;
+  }
 
   /**
    * The rule specified by the regular expression. Note that, the regular
@@ -46,33 +96,156 @@ public class RuleRegExp implements Rule {
    **/
   public RuleRegExp(String ruleName, String regExp) {
     this.ruleName = ruleName;
-    pattern = Pattern.compile(regExp);
+
+    if (patternHasWildCardChar(regExp)) {
+      if (patternHasOnlyWildCardChar(regExp, '|')) {
+          this.patternWithWildCardChar = null;
+          this.patternWithoutWildCardChar = null;
+          this.patternORWildChar = regExp.split("\\|");
+      } else {
+        this.patternWithWildCardChar = Pattern.compile(regExp);
+        this.patternWithoutWildCardChar = null;
+        this.patternORWildChar = null;
+      }
+    } else {
+      this.patternWithWildCardChar = null;
+      this.patternWithoutWildCardChar = regExp;
+      this.patternORWildChar = null;
+    }
   }
 
   /**
-   * This function returns the cost of the rule for the specified stack. Lower
-   * the cost, the better the rule is matched
-   * 
+   * This function returns the cost of the rule for the specified stack when the pattern
+   * matched for has no wildcard character in it. The function expects patternWithoutWildCardChar
+   * to be not null.
    * @param stack
    *          Node stack encountered so far
    * @return cost of the function
    * @throws SemanticException
    */
-  @Override
-  public int cost(Stack<Node> stack) throws SemanticException {
+  private int costPatternWithoutWildCardChar(Stack<Node> stack) throws SemanticException
{
     int numElems = (stack != null ? stack.size() : 0);
+    String name = new String("");
+    int patLen = patternWithoutWildCardChar.length();
+
+    for (int pos = numElems - 1; pos >= 0; pos--) {
+        name = stack.get(pos).getName() + "%" + name;
+      if (name.length() >= patLen) {
+        if (patternWithoutWildCardChar.equals(name)) {
+          return patLen;
+        } else {
+          return -1;
+        }
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * This function returns the cost of the rule for the specified stack when the pattern
+   * matched for has only OR wildcard character in it. The function expects patternORWildChar
+   * to be not null.
+   * @param stack
+   *          Node stack encountered so far
+   * @return cost of the function
+   * @throws SemanticException
+   */
+  private int costPatternWithORWildCardChar(Stack<Node> stack) throws SemanticException
{
+    int numElems = (stack != null ? stack.size() : 0);
+    for (String pattern : patternORWildChar) {
+      String name = new String("");
+      int patLen = pattern.length();
+
+      for (int pos = numElems - 1; pos >= 0; pos--) {
+        name = stack.get(pos).getName() + "%" + name;
+        if (name.length() >= patLen) {
+          if (pattern.equals(name)) {
+            return patLen;
+          } else {
+            break;
+          }
+        }
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * This function returns the cost of the rule for the specified stack when the pattern
+   * matched for has wildcard character in it. The function expects patternWithWildCardChar
+   * to be not null.
+   *
+   * @param stack
+   *          Node stack encountered so far
+   * @return cost of the function
+   * @throws SemanticException
+   */
+  private int costPatternWithWildCardChar(Stack<Node> stack) throws SemanticException
{
+	int numElems = (stack != null ? stack.size() : 0);
     String name = "";
+    Matcher m = patternWithWildCardChar.matcher("");
     for (int pos = numElems - 1; pos >= 0; pos--) {
       name = stack.get(pos).getName() + "%" + name;
-      Matcher m = pattern.matcher(name);
+      m.reset(name);
       if (m.matches()) {
-        return m.group().length();
+        return name.length();
       }
     }
     return -1;
   }
 
   /**
+   * Returns true if the rule pattern is valid and has wild character in it.
+   */
+  boolean rulePatternIsValidWithWildCardChar() {
+    return patternWithoutWildCardChar == null && patternWithWildCardChar != null
&& this.patternORWildChar == null;
+  }
+
+  /**
+   * Returns true if the rule pattern is valid and has wild character in it.
+   */
+  boolean rulePatternIsValidWithoutWildCardChar() {
+    return patternWithWildCardChar == null && patternWithoutWildCardChar != null
&& this.patternORWildChar == null;
+  }
+
+  /**
+   * Returns true if the rule pattern is valid and has wild character in it.
+   */
+  boolean rulePatternIsValidWithORWildCardChar() {
+    return patternWithoutWildCardChar == null && patternWithWildCardChar == null
&& this.patternORWildChar != null;
+  }
+
+  /**
+   * This function returns the cost of the rule for the specified stack. Lower
+   * the cost, the better the rule is matched
+   *
+   * @param stack
+   *          Node stack encountered so far
+   * @return cost of the function
+   * @throws SemanticException
+   */
+  @Override
+  public int cost(Stack<Node> stack) throws SemanticException {
+    if (rulePatternIsValidWithoutWildCardChar()) {
+      return costPatternWithoutWildCardChar(stack);
+    }
+    if (rulePatternIsValidWithWildCardChar()) {
+      return costPatternWithWildCardChar(stack);
+    }
+    if (rulePatternIsValidWithORWildCardChar()) {
+      return costPatternWithORWildCardChar(stack);
+    }
+    // If we reached here, either :
+    // 1. patternWithWildCardChar and patternWithoutWildCardChar are both nulls.
+    // 2. patternWithWildCardChar and patternWithoutWildCardChar are both not nulls.
+    // This is an internal error and we should not let this happen, so throw an exception.
+    throw new SemanticException (
+      "Rule pattern is invalid for " + getName() + " : patternWithWildCardChar = " +
+      patternWithWildCardChar + " patternWithoutWildCardChar = " +
+      patternWithoutWildCardChar);
+  }
+
+  /**
    * @return the name of the Node
    **/
   @Override

http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
new file mode 100644
index 0000000..f06d0df
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.lib;
+
+import static org.junit.Assert.*;
+
+import java.util.List;
+import java.util.Stack;
+
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.junit.Test;
+
+public class TestRuleRegExp {
+
+  public class TestNode implements Node {
+    private String name;
+
+    TestNode (String name) {
+      this.name = name;
+    }
+
+    @Override
+    public List<? extends Node> getChildren() {
+      return null;
+    }
+
+    @Override
+    public String getName() {
+      return name;
+    }
+  }
+
+  @Test
+  public void testPatternWithoutWildCardChar() {
+    String patternStr =
+      ReduceSinkOperator.getOperatorName() + "%" +
+      SelectOperator.getOperatorName() + "%" +
+      FileSinkOperator.getOperatorName() + "%";
+    RuleRegExp rule1 = new RuleRegExp("R1", patternStr);
+    assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), true);
+    assertEquals(rule1.rulePatternIsValidWithWildCardChar(), false);
+    // positive test
+    Stack<Node> ns1 = new Stack<Node>();
+    ns1.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+    ns1.push(new TestNode(SelectOperator.getOperatorName()));
+    ns1.push(new TestNode(FileSinkOperator.getOperatorName()));
+    try {
+      assertEquals(rule1.cost(ns1), patternStr.length());
+    } catch (SemanticException e) {
+      fail(e.getMessage());
+	}
+    // negative test
+    Stack<Node> ns2 = new Stack<Node>();
+    ns2.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+    ns1.push(new TestNode(TableScanOperator.getOperatorName()));
+    ns1.push(new TestNode(FileSinkOperator.getOperatorName()));
+    try {
+      assertEquals(rule1.cost(ns2), -1);
+    } catch (SemanticException e) {
+      fail(e.getMessage());
+    }
+  }
+
+  @Test
+  public void testPatternWithWildCardChar() {
+    RuleRegExp rule1 =  new RuleRegExp("R1",
+      "(" + TableScanOperator.getOperatorName() + "%"
+      + FilterOperator.getOperatorName() + "%)|("
+      + TableScanOperator.getOperatorName() + "%"
+      + FileSinkOperator.getOperatorName() + "%)");
+    assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), false);
+    assertEquals(rule1.rulePatternIsValidWithWildCardChar(), true);
+    // positive test
+    Stack<Node> ns1 = new Stack<Node>();
+    ns1.push(new TestNode(TableScanOperator.getOperatorName()));
+    ns1.push(new TestNode(FilterOperator.getOperatorName()));
+    Stack<Node> ns2 = new Stack<Node>();
+    ns2.push(new TestNode(TableScanOperator.getOperatorName()));
+    ns2.push(new TestNode(FileSinkOperator.getOperatorName()));
+    try {
+      assertNotEquals(rule1.cost(ns1), -1);
+      assertNotEquals(rule1.cost(ns2), -1);
+    } catch (SemanticException e) {
+      fail(e.getMessage());
+	}
+    // negative test
+    Stack<Node> ns3 = new Stack<Node>();
+    ns3.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+    ns3.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+    ns3.push(new TestNode(FileSinkOperator.getOperatorName()));
+    try {
+      assertEquals(rule1.cost(ns3), -1);
+    } catch (SemanticException e) {
+      fail(e.getMessage());
+    }
+  }
+
+}


Mime
View raw message