phoenix-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gr...@apache.org
Subject git commit: PHOENIX-1254 Add REGEXP_SPLIT function
Date Wed, 17 Sep 2014 07:07:43 GMT
Repository: phoenix
Updated Branches:
  refs/heads/master 173c7d72d -> 89c2f2741


PHOENIX-1254 Add REGEXP_SPLIT function

Add REGEXP_SPLIT function to create VARCHAR_ARRAYs from a VARCHAR
by splitting on a regular expression pattern.


Project: http://git-wip-us.apache.org/repos/asf/phoenix/repo
Commit: http://git-wip-us.apache.org/repos/asf/phoenix/commit/89c2f274
Tree: http://git-wip-us.apache.org/repos/asf/phoenix/tree/89c2f274
Diff: http://git-wip-us.apache.org/repos/asf/phoenix/diff/89c2f274

Branch: refs/heads/master
Commit: 89c2f27411af1ad6c230201e93ab262b7a92fca1
Parents: 173c7d7
Author: Gabriel Reid <gabrielr@ngdata.com>
Authored: Tue Sep 16 09:30:19 2014 +0200
Committer: Gabriel Reid <gabrielr@ngdata.com>
Committed: Wed Sep 17 08:47:21 2014 +0200

----------------------------------------------------------------------
 .../phoenix/end2end/RegexpSplitFunctionIT.java  | 210 +++++++++++++++++++
 .../phoenix/expression/ExpressionType.java      |   6 +-
 .../function/RegexpSplitFunction.java           | 127 +++++++++++
 3 files changed, 341 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/phoenix/blob/89c2f274/phoenix-core/src/it/java/org/apache/phoenix/end2end/RegexpSplitFunctionIT.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/it/java/org/apache/phoenix/end2end/RegexpSplitFunctionIT.java b/phoenix-core/src/it/java/org/apache/phoenix/end2end/RegexpSplitFunctionIT.java
new file mode 100644
index 0000000..73f1828
--- /dev/null
+++ b/phoenix-core/src/it/java/org/apache/phoenix/end2end/RegexpSplitFunctionIT.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.end2end;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.sql.Array;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Types;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+@Category(HBaseManagedTimeTest.class)
+public class RegexpSplitFunctionIT extends BaseHBaseManagedTimeIT {
+
+    private void initTable(Connection conn, String val) throws SQLException {
+        initTable(conn, val, ",");
+    }
+
+    private void initTable(Connection conn, String val, String separator) throws SQLException {
+        String ddl = "CREATE TABLE SPLIT_TEST (" +
+                "ID INTEGER NOT NULL PRIMARY KEY," +
+                "VAL VARCHAR," +
+                "SEP VARCHAR," +
+                "ARR VARCHAR ARRAY)";
+        conn.createStatement().execute(ddl);
+        String dml = "UPSERT INTO SPLIT_TEST (ID, SEP, VAL) VALUES (?, ?, ?)";
+        PreparedStatement stmt = conn.prepareStatement(dml);
+        stmt.setInt(1, 1);
+        if (separator == null) {
+            stmt.setNull(2, Types.VARCHAR);
+        } else {
+            stmt.setString(2, separator);
+        }
+        if (val == null) {
+            stmt.setNull(3, Types.VARCHAR);
+        } else {
+            stmt.setString(3, val);
+        }
+        stmt.execute();
+        conn.commit();
+    }
+
+    @Test
+    public void testSplit_ArrayReference() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, ',')[1] FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        assertEquals("ONE", rs.getString(1));
+        assertFalse(rs.next());
+    }
+
+    @Test
+    public void testSplit_InFilter() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT ID FROM SPLIT_TEST WHERE (REGEXP_SPLIT(VAL, ','))[1] = 'ONE'");
+        assertTrue(rs.next());
+        assertEquals(1, rs.getInt(1));
+        assertFalse(rs.next());
+    }
+
+    @Test
+    public void testSplit_Upsert() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE");
+
+        conn.createStatement().executeUpdate("UPSERT INTO SPLIT_TEST (ID, ARR) SELECT ID, " +
+                "REGEXP_SPLIT(VAL, ',') FROM SPLIT_TEST");
+        conn.commit();
+
+        ResultSet rs = conn.createStatement().executeQuery("SELECT ARR FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[]{ "ONE", "TWO", "THREE" }, values);
+    }
+
+    @Test
+    public void testSplit_AlternateSeparator() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE:TWO:THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, ':') FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[] { "ONE", "TWO", "THREE" }, values);
+    }
+
+    @Test
+    public void testSplit_DynamicPattern() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, SEP) FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[] { "ONE", "TWO", "THREE" }, values);
+    }
+
+    @Test
+    public void testSplit_NoSplitString() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "CANNOT BE SPLIT");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, ',') FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[] { "CANNOT BE SPLIT" }, values);
+    }
+
+    @Test
+    public void testSplit_PatternBasedSplit() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE!:TWO:::!THREE::!:FOUR");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, '[:!]+') FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[] { "ONE", "TWO", "THREE", "FOUR" }, values);
+    }
+
+    @Test
+    public void testSplit_PatternEscape() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE|TWO|THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, '\\\\|') FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        Array array = rs.getArray(1);
+        String[] values = (String[]) array.getArray();
+        assertArrayEquals(new String[] { "ONE", "TWO", "THREE" }, values);
+    }
+
+    @Test
+    public void testSplit_NullString() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, null);
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, ',') FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        assertNull(rs.getString(1));
+        assertFalse(rs.next());
+    }
+
+    @Test
+    public void testSplit_NullSeparator() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE");
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, NULL) FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        assertNull(rs.getString(1));
+        assertFalse(rs.next());
+    }
+
+    @Test
+    public void testSplit_NullDynamicSeparator() throws SQLException {
+        Connection conn = DriverManager.getConnection(getUrl());
+        initTable(conn, "ONE,TWO,THREE", null);
+
+        ResultSet rs = conn.createStatement().executeQuery(
+                "SELECT REGEXP_SPLIT(VAL, SEP) FROM SPLIT_TEST");
+        assertTrue(rs.next());
+        assertNull(rs.getString(1));
+        assertFalse(rs.next());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/89c2f274/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java b/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
index de037fb..1566d64 100644
--- a/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
+++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
@@ -55,6 +55,7 @@ import org.apache.phoenix.expression.function.PercentileContAggregateFunction;
 import org.apache.phoenix.expression.function.PercentileDiscAggregateFunction;
 import org.apache.phoenix.expression.function.RTrimFunction;
 import org.apache.phoenix.expression.function.RegexpReplaceFunction;
+import org.apache.phoenix.expression.function.RegexpSplitFunction;
 import org.apache.phoenix.expression.function.RegexpSubstrFunction;
 import org.apache.phoenix.expression.function.ReverseFunction;
 import org.apache.phoenix.expression.function.RoundDateExpression;
@@ -84,7 +85,7 @@ import com.google.common.collect.Maps;
  * Enumeration of all Expression types that may be evaluated on the server-side.
  * Used during serialization and deserialization to pass Expression between client
  * and server.
- *  
+ *
  *
  *
  * @since 0.1
@@ -181,7 +182,8 @@ public enum ExpressionType {
     ArrayAllComparisonExpression(ArrayAllComparisonExpression.class),
     InlineArrayElemRefExpression(InlineArrayElemRefExpression.class),
     SQLIndexTypeFunction(SQLIndexTypeFunction.class),
-    ModulusExpression(ModulusExpression.class);
+    ModulusExpression(ModulusExpression.class),
+    RegexpSplitFunctiond(RegexpSplitFunction.class);
     ExpressionType(Class<? extends Expression> clazz) {
         this.clazz = clazz;
     }

http://git-wip-us.apache.org/repos/asf/phoenix/blob/89c2f274/phoenix-core/src/main/java/org/apache/phoenix/expression/function/RegexpSplitFunction.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/RegexpSplitFunction.java b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/RegexpSplitFunction.java
new file mode 100644
index 0000000..1a9e961
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/RegexpSplitFunction.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.expression.function;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.phoenix.expression.Expression;
+import org.apache.phoenix.expression.LiteralExpression;
+import org.apache.phoenix.parse.FunctionParseNode;
+import org.apache.phoenix.schema.PDataType;
+import org.apache.phoenix.schema.PhoenixArray;
+import org.apache.phoenix.schema.tuple.Tuple;
+import org.apache.phoenix.util.ByteUtil;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Function to split a string value into a {@code VARCHAR_ARRAY}.
+ * <p>
+ * Usage:
+ * {@code REGEXP_SPLIT(<source_str>, <split_pattern>)}
+ * <p>
+ * {@code source_str} is the string to be split. {@code split_pattern} is a
+ * Java-compatible regular expression on whose matches the source string is split.
+ *
+ * The function returns a {@link org.apache.phoenix.schema.PDataType#VARCHAR_ARRAY}
+ */
+ @FunctionParseNode.BuiltInFunction(name=RegexpSplitFunction.NAME, args= {
+        @FunctionParseNode.Argument(allowedTypes={PDataType.VARCHAR}),
+        @FunctionParseNode.Argument(allowedTypes={PDataType.VARCHAR})})
+public class RegexpSplitFunction extends ScalarFunction {
+
+    public static final String NAME = "REGEXP_SPLIT";
+
+    private ImmutableBytesWritable patternPtr = new ImmutableBytesWritable();
+    private Splitter initializedSplitter = null;
+
+    public RegexpSplitFunction() {}
+
+    public RegexpSplitFunction(List<Expression> children) {
+        super(children);
+        init();
+    }
+
+    private void init() {
+        Expression patternExpression = children.get(1);
+        if (patternExpression instanceof LiteralExpression) {
+            Object patternValue = ((LiteralExpression) patternExpression).getValue();
+            if (patternValue != null) {
+                initializedSplitter = Splitter.onPattern(patternValue.toString());
+            }
+        }
+    }
+
+    @Override
+    public void readFields(DataInput input) throws IOException {
+        super.readFields(input);
+        init();
+    }
+
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    @Override
+    public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) {
+        if (!children.get(0).evaluate(tuple, ptr)) {
+            return false;
+        }
+
+        Expression sourceStrExpression = children.get(0);
+        String sourceStr = (String)PDataType.VARCHAR.toObject(ptr, sourceStrExpression.getSortOrder());
+        if (sourceStr == null) { // sourceStr evaluated to null
+            ptr.set(ByteUtil.EMPTY_BYTE_ARRAY);
+            return true;
+        }
+
+        return split(tuple, ptr, sourceStr);
+    }
+
+    private boolean split(Tuple tuple, ImmutableBytesWritable ptr, String sourceStr) {
+        Splitter splitter = initializedSplitter;
+        if (splitter == null) {
+            Expression patternExpression = children.get(1);
+            if (!patternExpression.evaluate(tuple, ptr)) {
+                return false;
+            }
+            if (ptr.getLength() == 0) {
+                return true; // ptr is already set to null
+            }
+
+            String patternStr = (String) PDataType.VARCHAR.toObject(
+                    ptr, patternExpression.getSortOrder());
+            splitter = Splitter.onPattern(patternStr);
+        }
+
+        List<String> splitStrings = Lists.newArrayList(splitter.split(sourceStr));
+        PhoenixArray splitArray = new PhoenixArray(PDataType.VARCHAR, splitStrings.toArray());
+        ptr.set(PDataType.VARCHAR_ARRAY.toBytes(splitArray));
+        return true;
+    }
+
+
+    @Override
+    public PDataType getDataType() {
+        return PDataType.VARCHAR_ARRAY;
+    }
+}


Mime
View raw message