phoenix-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jamestay...@apache.org
Subject [28/40] phoenix git commit: PHOENIX-4237 Allow sorting on (Java) collation keys for non-English locales (Shehzaad Nakhoda)
Date Wed, 15 Nov 2017 18:34:56 GMT
PHOENIX-4237 Allow sorting on (Java) collation keys for non-English locales (Shehzaad Nakhoda)


Project: http://git-wip-us.apache.org/repos/asf/phoenix/repo
Commit: http://git-wip-us.apache.org/repos/asf/phoenix/commit/c77f3ba2
Tree: http://git-wip-us.apache.org/repos/asf/phoenix/tree/c77f3ba2
Diff: http://git-wip-us.apache.org/repos/asf/phoenix/diff/c77f3ba2

Branch: refs/heads/4.x-HBase-1.2
Commit: c77f3ba2bcbb10ea4a4b3b65e6bd57c7a9cba750
Parents: 7062a48
Author: James Taylor <jtaylor@salesforce.com>
Authored: Fri Nov 3 09:17:29 2017 -0700
Committer: James Taylor <jtaylor@salesforce.com>
Committed: Wed Nov 15 10:02:14 2017 -0800

----------------------------------------------------------------------
 LICENSE                                         |  43 ++--
 phoenix-core/pom.xml                            |   5 +
 .../phoenix/end2end/CollationKeyFunctionIT.java | 181 ++++++++++++++
 .../phoenix/expression/ExpressionType.java      |   4 +-
 .../function/CollationKeyFunction.java          | 199 +++++++++++++++
 .../apache/phoenix/jdbc/PhoenixConnection.java  |   3 +
 .../apache/phoenix/util/VarBinaryFormatter.java |  52 ++++
 .../function/CollationKeyFunctionTest.java      | 243 +++++++++++++++++++
 phoenix-server/pom.xml                          |   1 +
 9 files changed, 713 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index 08e5e10..7bd8ad1 100644
--- a/LICENSE
+++ b/LICENSE
@@ -236,23 +236,32 @@ Font Awesome fonts (http://fontawesome.io/)
 
 3-Clause BSD License:
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ---
 

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/pom.xml
----------------------------------------------------------------------
diff --git a/phoenix-core/pom.xml b/phoenix-core/pom.xml
index 6f3adb4..f82cddc 100644
--- a/phoenix-core/pom.xml
+++ b/phoenix-core/pom.xml
@@ -471,5 +471,10 @@
       <artifactId>stream</artifactId>
       <version>${stream.version}</version>
     </dependency>
+     <dependency>
+      <groupId>com.salesforce.i18n</groupId>
+      <artifactId>i18n-util</artifactId>
+      <version>1.0.1</version>
+    </dependency>
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/it/java/org/apache/phoenix/end2end/CollationKeyFunctionIT.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/it/java/org/apache/phoenix/end2end/CollationKeyFunctionIT.java
b/phoenix-core/src/it/java/org/apache/phoenix/end2end/CollationKeyFunctionIT.java
new file mode 100644
index 0000000..efbab64
--- /dev/null
+++ b/phoenix-core/src/it/java/org/apache/phoenix/end2end/CollationKeyFunctionIT.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.end2end;
+
+import static org.apache.phoenix.util.TestUtil.closeStmtAndConn;
+import static org.junit.Assert.assertEquals;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.text.Collator;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * End2End test that tests the COLLATION_KEY in an ORDER BY clause
+ * 
+ */
+public class CollationKeyFunctionIT extends ParallelStatsDisabledIT {
+
+	private String tableName;
+	private String[] dataArray = new String[] {
+			// (0-6) chinese characters
+			"\u963f", "\u55c4", "\u963e", "\u554a", "\u4ec8", "\u3d9a", "\u9f51",
+			// (7-13) western characters, some with accent
+			"a", "b", "ä", "A", "a", "ä", "A" };
+
+	@Before
+	public void initAndPopulateTable() throws Exception {
+		Connection conn = null;
+		PreparedStatement stmt = null;
+		tableName = generateUniqueName();
+		try {
+			conn = DriverManager.getConnection(getUrl());
+			String ddl = "CREATE TABLE " + tableName + " (id INTEGER PRIMARY KEY, data VARCHAR)";
+			conn.createStatement().execute(ddl);
+
+			// insert dataArray into the table, with the index into the array as
+			// the id
+			for (int i = 0; i < dataArray.length; i++) {
+				PreparedStatement ps = conn.prepareStatement("upsert into " + tableName + " values(?,
?)");
+				ps.setInt(1, i);
+				ps.setString(2, dataArray[i]);
+				ps.executeUpdate();
+			}
+			conn.commit();
+		} finally {
+			closeStmtAndConn(stmt, conn);
+		}
+	}
+
+	@Test
+	public void testZhSort() throws Exception {
+		queryWithCollKeyDefaultArgsWithExpectedOrder("zh", 0, 6, new Integer[] { 3, 0, 1, 6, 5,
4, 2 });
+	}
+
+	@Test
+	public void testZhTwSort() throws Exception {
+		queryWithCollKeyDefaultArgsWithExpectedOrder("zh_TW", 0, 6, new Integer[] { 0, 3, 4, 1,
5, 2, 6 });
+	}
+
+	@Test
+	public void testZhTwStrokeSort() throws Exception {
+		queryWithCollKeyDefaultArgsWithExpectedOrder("zh_TW_STROKE", 0, 6, new Integer[] { 4, 2,
0, 3, 1, 6, 5 });
+	}
+
+	@Test
+	public void testZhStrokeSort() throws Exception {
+		queryWithCollKeyDefaultArgsWithExpectedOrder("zh__STROKE", 0, 6, new Integer[] { 0, 1,
3, 4, 6, 2, 5 });
+	}
+
+	@Test
+	public void testZhPinyinSort() throws Exception {
+		queryWithCollKeyDefaultArgsWithExpectedOrder("zh__PINYIN", 0, 6, new Integer[] { 0, 1,
3, 4, 6, 2, 5 });
+	}
+
+	@Test
+	public void testUpperCaseSort() throws Exception {
+		queryWithCollKeyUpperCaseWithExpectedOrder("en", 7, 13, new Integer[] { 7, 10, 11, 13,
9, 12, 8 });
+	}
+
+	@Test
+	public void testPrimaryStrengthSort() throws Exception {
+		queryWithCollKeyWithStrengthWithExpectedOrder("en", Collator.PRIMARY, false, 7, 13,
+				new Integer[] { 7, 9, 10, 11, 12, 13, 8 });
+	}
+	
+	@Test
+	public void testSecondaryStrengthSort() throws Exception {
+		queryWithCollKeyWithStrengthWithExpectedOrder("en", Collator.SECONDARY, false, 7, 13,
+				new Integer[] { 7, 10, 11, 13, 9, 12, 8 });
+	}
+
+	@Test
+	public void testTertiaryStrengthSort() throws Exception {
+		queryWithCollKeyWithStrengthWithExpectedOrder("en", Collator.TERTIARY, false, 7, 13,
+				new Integer[] { 7, 11, 10, 13, 9, 12, 8 });
+	}
+
+	@Test
+	public void testTertiaryStrengthSortDesc() throws Exception {
+		queryWithCollKeyWithStrengthWithExpectedOrder("en", Collator.TERTIARY, true, 7, 13,
+				new Integer[] { 8, 12, 9, 13, 10, 11, 7 });
+	}
+
+	
+	/**
+	 * Issue a query ordered by the collation key (with COLLATION_KEY called
+	 * with default args) of the data column according to the provided
+	 * localeString, and compare the ID and data columns to the expected order.
+	 * 
+	 * @param expectedIndexOrder
+	 *            an array of indexes into the dataArray in the order we expect.
+	 *            This is the same as the ID column
+	 * @throws SQLException
+	 */
+	private void queryWithCollKeyDefaultArgsWithExpectedOrder(String localeString, Integer beginIndex,
Integer endIndex,
+			Integer[] expectedIndexOrder) throws Exception {
+		String query = String.format(
+				"SELECT id, data FROM %s WHERE ID BETWEEN %d AND %d ORDER BY COLLATION_KEY(data, '%s')",
tableName,
+				beginIndex, endIndex, localeString);
+		queryWithExpectedOrder(query, expectedIndexOrder);
+	}
+
+	/**
+	 * Same as above, except the upperCase collator argument is set to true
+	 */
+	private void queryWithCollKeyUpperCaseWithExpectedOrder(String localeString, Integer beginIndex,
Integer endIndex,
+			Integer[] expectedIndexOrder) throws Exception {
+		String query = String.format(
+				"SELECT id, data FROM %s WHERE ID BETWEEN %d AND %d ORDER BY COLLATION_KEY(data, '%s',
true), id",
+				tableName, beginIndex, endIndex, localeString);
+		queryWithExpectedOrder(query, expectedIndexOrder);
+	}
+
+	/**
+	 * Same as above, except the collator strength is set
+	 */
+	private void queryWithCollKeyWithStrengthWithExpectedOrder(String localeString, Integer
strength, boolean isDescending,
+			Integer beginIndex, Integer endIndex, Integer[] expectedIndexOrder) throws Exception {
+		String sortOrder = isDescending ? "DESC" : "";
+		
+		String query = String.format(
+				"SELECT id, data FROM %s WHERE ID BETWEEN %d AND %d ORDER BY COLLATION_KEY(data, '%s',
false, %d) %s, id %s",
+				tableName, beginIndex, endIndex, localeString, strength, sortOrder, sortOrder);
+		queryWithExpectedOrder(query, expectedIndexOrder);
+	}
+
+	private void queryWithExpectedOrder(String query, Integer[] expectedIndexOrder) throws Exception
{
+		Connection conn = DriverManager.getConnection(getUrl());
+		PreparedStatement ps = conn.prepareStatement(query);
+		ResultSet rs = ps.executeQuery();
+		int i = 0;
+		while (rs.next()) {
+			int expectedId = expectedIndexOrder[i];
+			assertEquals("For row " + i + ": The ID did not match the expected index", expectedId,
rs.getInt(1));
+			assertEquals("For row " + i + ": The data did not match the expected entry from the data
array",
+					dataArray[expectedId], rs.getString(2));
+			i++;
+		}
+		assertEquals("The result set returned a different number of rows from the data array",
expectedIndexOrder.length, i);
+	}
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
b/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
index 4f26e87..9a53eb1 100644
--- a/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
+++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/ExpressionType.java
@@ -42,6 +42,7 @@ import org.apache.phoenix.expression.function.CeilTimestampExpression;
 import org.apache.phoenix.expression.function.CeilWeekExpression;
 import org.apache.phoenix.expression.function.CeilYearExpression;
 import org.apache.phoenix.expression.function.CoalesceFunction;
+import org.apache.phoenix.expression.function.CollationKeyFunction;
 import org.apache.phoenix.expression.function.ConvertTimezoneFunction;
 import org.apache.phoenix.expression.function.CountAggregateFunction;
 import org.apache.phoenix.expression.function.DayOfMonthFunction;
@@ -294,7 +295,8 @@ public enum ExpressionType {
     ArrayColumnExpression(SingleCellColumnExpression.class),
     FirstValuesFunction(FirstValuesFunction.class),
     LastValuesFunction(LastValuesFunction.class),
-    DistinctCountHyperLogLogAggregateFunction(DistinctCountHyperLogLogAggregateFunction.class);
+    DistinctCountHyperLogLogAggregateFunction(DistinctCountHyperLogLogAggregateFunction.class),
+    CollationKeyFunction(CollationKeyFunction.class);
 
     ExpressionType(Class<? extends Expression> clazz) {
         this.clazz = clazz;

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
new file mode 100644
index 0000000..827f70a
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.expression.function;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.sql.SQLException;
+import java.text.Collator;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.lang.BooleanUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.phoenix.expression.Expression;
+import org.apache.phoenix.expression.LiteralExpression;
+import org.apache.phoenix.parse.FunctionParseNode;
+import org.apache.phoenix.schema.tuple.Tuple;
+import org.apache.phoenix.schema.types.PBoolean;
+import org.apache.phoenix.schema.types.PDataType;
+import org.apache.phoenix.schema.types.PInteger;
+import org.apache.phoenix.schema.types.PVarbinary;
+import org.apache.phoenix.schema.types.PVarchar;
+import org.apache.phoenix.util.VarBinaryFormatter;
+
+import com.force.db.i18n.LinguisticSort;
+import com.force.i18n.LocaleUtils;
+
+/**
+ * A Phoenix Function that calculates a collation key for an input string based
+ * on a caller-provided locale and collator strength and decomposition settings.
+ * 
+ * The locale should be specified as xx_yy_variant where xx is the ISO 639-1
+ * 2-letter language code, yy is the the ISO 3166 2-letter country code. Both
+ * countryCode and variant are optional. For example, zh_TW_STROKE, zh_TW and zh
+ * are all valid locale representations. Note the language code, country code
+ * and variant are used as arguments to the constructor of java.util.Locale.
+ *
+ * This function uses the open-source i18n-util package to obtain the collators
+ * it needs from the provided locale.
+ *
+ * The LinguisticSort implementation in i18n-util encapsulates sort-related
+ * functionality for a substantive list of locales. For each locale, it provides
+ * a collator and an Oracle-specific database function that can be used to sort
+ * strings according to the natural language rules of that locale.
+ *
+ * This function uses the collator returned by LinguisticSort.getCollator to
+ * produce a collation key for its input string. A user can expect that the
+ * sorting semantics of this function for a given locale is equivalent to the
+ * sorting behaviour of an Oracle query that is constructed using the Oracle
+ * functions returned by LinguisticSort for that locale.
+ *
+ * The optional third argument to the function is a boolean that specifies
+ * whether to use the upper-case collator (case-insensitive) returned by
+ * LinguisticSort.getUpperCaseCollator.
+ *
+ * The optional fourth and fifth arguments are used to set respectively the
+ * strength and composition of the collator returned by LinguisticSort using the
+ * setStrength and setDecomposition methods of java.text.Collator.
+ * 
+ */
+@FunctionParseNode.BuiltInFunction(name = CollationKeyFunction.NAME, args = {
+		// input string
+		@FunctionParseNode.Argument(allowedTypes = { PVarchar.class }),
+		// ISO Code for Locale
+		@FunctionParseNode.Argument(allowedTypes = { PVarchar.class }, isConstant = true),
+		// whether to use special upper case collator
+		@FunctionParseNode.Argument(allowedTypes = { PBoolean.class }, defaultValue = "false",
isConstant = true),
+		// collator strength
+		@FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant
= true),
+		// collator decomposition
+		@FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant
= true) })
+public class CollationKeyFunction extends ScalarFunction {
+
+	private static final Log LOG = LogFactory.getLog(CollationKeyFunction.class);
+
+	public static final String NAME = "COLLATION_KEY";
+
+	private Collator collator;
+
+	public CollationKeyFunction() {
+	}
+
+	public CollationKeyFunction(List<Expression> children) throws SQLException {
+		super(children);
+		initialize();
+	}
+
+	@Override
+	public void readFields(DataInput input) throws IOException {
+		super.readFields(input);
+		initialize();
+	}
+
+	@Override
+	public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) {
+		Expression expression = getChildren().get(0);
+		if (!expression.evaluate(tuple, ptr)) {
+			return false;
+		}
+		String inputString = (String) PVarchar.INSTANCE.toObject(ptr, expression.getSortOrder());
+		if (LOG.isTraceEnabled()) {
+			LOG.trace("CollationKey inputString: " + inputString);
+		}
+		byte[] collationKeyByteArray = collator.getCollationKey(inputString).toByteArray();
+
+		if (LOG.isTraceEnabled()) {
+			LOG.trace("CollationKey bytes: " + VarBinaryFormatter.INSTANCE.format(collationKeyByteArray));
+		}
+
+		ptr.set(collationKeyByteArray);
+		return true;
+	}
+
+	private void initialize() {
+		String localeISOCode = getLiteralValue(1, String.class);
+		Boolean useSpecialUpperCaseCollator = getLiteralValue(2, Boolean.class);
+		Integer collatorStrength = getLiteralValue(3, Integer.class);
+		Integer collatorDecomposition = getLiteralValue(4, Integer.class);
+
+		if (LOG.isTraceEnabled()) {
+			StringBuilder logInputsMessage = new StringBuilder();
+			logInputsMessage.append("Input (literal) arguments:").append("localeISOCode: " + localeISOCode)
+					.append(", useSpecialUpperCaseCollator: " + useSpecialUpperCaseCollator)
+					.append(", collatorStrength: " + collatorStrength)
+					.append(", collatorDecomposition: " + collatorDecomposition);
+			LOG.trace(logInputsMessage);
+		}
+
+		Locale locale = LocaleUtils.get().getLocaleByIsoCode(localeISOCode);
+
+		if (LOG.isTraceEnabled()) {
+			LOG.trace(String.format("Locale: " + locale.toLanguageTag()));
+		}
+
+		LinguisticSort linguisticSort = LinguisticSort.get(locale);
+
+		collator = BooleanUtils.isTrue(useSpecialUpperCaseCollator) ? linguisticSort.getUpperCaseCollator(false)
+				: linguisticSort.getCollator();
+
+		if (collatorStrength != null) {
+			collator.setStrength(collatorStrength);
+		}
+
+		if (collatorDecomposition != null) {
+			collator.setDecomposition(collatorDecomposition);
+		}
+
+		if (LOG.isTraceEnabled()) {
+			LOG.trace(String.format("Collator: [strength: %d, decomposition: %d], Special-Upper-Case:
%s",
+					collator.getStrength(), collator.getDecomposition(),
+					BooleanUtils.isTrue(useSpecialUpperCaseCollator)));
+		}
+	}
+
+	@Override
+	public PDataType getDataType() {
+		return PVarbinary.INSTANCE;
+	}
+
+	@Override
+	public String getName() {
+		return NAME;
+	}
+
+	@Override
+	public boolean isThreadSafe() {
+		// ICU4J Collators are not thread-safe unless they are frozen.
+		// TODO: Look into calling freeze() on them to be able return true here.
+		return false;
+	}
+
+	private <T> T getLiteralValue(int childIndex, Class<T> type) {
+		Expression expression = getChildren().get(childIndex);
+		if (LOG.isDebugEnabled()) {
+			LOG.debug("child: " + childIndex + ", expression: " + expression);
+		}
+		// It's safe to assume expression is a LiteralExpression since
+		// only arguments marked as isConstant = true should be handled through
+		// this method.
+		return type.cast(((LiteralExpression) expression).getValue());
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/main/java/org/apache/phoenix/jdbc/PhoenixConnection.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/jdbc/PhoenixConnection.java b/phoenix-core/src/main/java/org/apache/phoenix/jdbc/PhoenixConnection.java
index 730f754..4555190 100644
--- a/phoenix-core/src/main/java/org/apache/phoenix/jdbc/PhoenixConnection.java
+++ b/phoenix-core/src/main/java/org/apache/phoenix/jdbc/PhoenixConnection.java
@@ -105,6 +105,7 @@ import org.apache.phoenix.schema.types.PTimestamp;
 import org.apache.phoenix.schema.types.PUnsignedDate;
 import org.apache.phoenix.schema.types.PUnsignedTime;
 import org.apache.phoenix.schema.types.PUnsignedTimestamp;
+import org.apache.phoenix.schema.types.PVarbinary;
 import org.apache.phoenix.trace.util.Tracing;
 import org.apache.phoenix.transaction.PhoenixTransactionContext;
 import org.apache.phoenix.util.DateUtil;
@@ -116,6 +117,7 @@ import org.apache.phoenix.util.ReadOnlyProps;
 import org.apache.phoenix.util.SQLCloseable;
 import org.apache.phoenix.util.SQLCloseables;
 import org.apache.phoenix.util.SchemaUtil;
+import org.apache.phoenix.util.VarBinaryFormatter;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
@@ -336,6 +338,7 @@ public class PhoenixConnection implements Connection, MetaDataMutated,
SQLClosea
         formatters.put(PUnsignedTimestamp.INSTANCE, timestampFormat);
         formatters.put(PDecimal.INSTANCE,
                 FunctionArgumentType.NUMERIC.getFormatter(numberPattern));
+        formatters.put(PVarbinary.INSTANCE, VarBinaryFormatter.INSTANCE);
         // We do not limit the metaData on a connection less than the global
         // one,
         // as there's not much that will be cached here.

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/main/java/org/apache/phoenix/util/VarBinaryFormatter.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/VarBinaryFormatter.java b/phoenix-core/src/main/java/org/apache/phoenix/util/VarBinaryFormatter.java
new file mode 100644
index 0000000..7f0d030
--- /dev/null
+++ b/phoenix-core/src/main/java/org/apache/phoenix/util/VarBinaryFormatter.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.util;
+
+import java.text.FieldPosition;
+import java.text.Format;
+import java.text.ParsePosition;
+
+import org.apache.commons.codec.binary.Hex;
+
+/**
+ * A formatter that formats a byte array to a hexadecimal string
+ * (with each byte converted to a 2-digit hex sequence)
+ *
+ * @author snakhoda-sfdc
+ */
+public class VarBinaryFormatter extends Format {
+
+	private static final long serialVersionUID = -7940880118392024750L;
+
+	public static final VarBinaryFormatter INSTANCE = new VarBinaryFormatter();
+
+	@Override
+	public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) {
+		if (!(obj instanceof byte[])) {
+			throw new IllegalArgumentException("VarBinaryFormatter can only format byte arrays");
+		}
+		String hexString = Hex.encodeHexString((byte[]) obj);
+		toAppendTo.append(hexString);
+		return toAppendTo;
+	}
+
+	@Override
+	public Object parseObject(String source, ParsePosition pos) {
+		return new UnsupportedOperationException();
+	}
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-core/src/test/java/org/apache/phoenix/expression/function/CollationKeyFunctionTest.java
----------------------------------------------------------------------
diff --git a/phoenix-core/src/test/java/org/apache/phoenix/expression/function/CollationKeyFunctionTest.java
b/phoenix-core/src/test/java/org/apache/phoenix/expression/function/CollationKeyFunctionTest.java
new file mode 100644
index 0000000..f57a937
--- /dev/null
+++ b/phoenix-core/src/test/java/org/apache/phoenix/expression/function/CollationKeyFunctionTest.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.phoenix.expression.function;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.fail;
+
+import java.text.Collator;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.lang.builder.ToStringBuilder;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.phoenix.expression.Expression;
+import org.apache.phoenix.expression.LiteralExpression;
+import org.apache.phoenix.schema.SortOrder;
+import org.apache.phoenix.schema.types.PBoolean;
+import org.apache.phoenix.schema.types.PInteger;
+import org.apache.phoenix.schema.types.PVarchar;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.UnsignedBytes;
+
+/**
+ * "Unit" tests for CollationKeyFunction
+ * 
+ */
+public class CollationKeyFunctionTest {
+
+	private static String[] chineseChars = new String[] { "\u963f", "\u55c4", "\u963e", "\u554a",
"\u4ec8", "\u3d9a",
+			"\u9f51" };
+
+	private static Comparator<byte[]> collationKeyComparator = UnsignedBytes.lexicographicalComparator();
+
+	private static Comparator<ByteArrayAndInteger> collationKeyAndIndexComparator = new
Comparator<ByteArrayAndInteger>() {
+		@Override
+		public int compare(ByteArrayAndInteger o1, ByteArrayAndInteger o2) {
+			int compareResult = collationKeyComparator.compare(o1.byteArray, o2.byteArray);
+			if (compareResult == 0) {
+				compareResult = o1.integer.compareTo(o2.integer);
+			}
+			return compareResult;
+		}
+	};
+
+	private static class ByteArrayAndInteger {
+
+		private ByteArrayAndInteger(byte[] byteArray, Integer integer) {
+			super();
+			this.byteArray = byteArray;
+			this.integer = integer;
+		}
+
+		byte[] byteArray;
+		Integer integer;
+
+		public String toString() {
+			return ToStringBuilder.reflectionToString(this);
+		}
+
+		public static ByteArrayAndInteger findFirstIntegerMatch(List<ByteArrayAndInteger>
list,
+				Integer matchingInteger) {
+			for (ByteArrayAndInteger entry : list) {
+				if (entry.integer.equals(matchingInteger)) {
+					return entry;
+				}
+			}
+			return null;
+		}
+	}
+
+	@Test
+	public void testZhSort() throws Exception {
+		testSortOrderNoEquals(chineseChars, "zh", Boolean.FALSE, null, null, new Integer[] { 3,
0, 1, 6, 5, 4, 2 });
+	}
+
+	@Test
+	public void testZhTwSort() throws Exception {
+		testSortOrderNoEquals(chineseChars, "zh_TW", Boolean.FALSE, null, null, new Integer[] {
0, 3, 4, 1, 5, 2, 6 });
+	}
+
+	@Test
+	public void testZhTwStrokeSort() throws Exception {
+		testSortOrderNoEquals(chineseChars, "zh_TW_STROKE", Boolean.FALSE, null, null,
+				new Integer[] { 4, 2, 0, 3, 1, 6, 5 });
+	}
+
+	@Test
+	public void testZhStrokeSort() throws Exception {
+		testSortOrderNoEquals(chineseChars, "zh__STROKE", Boolean.FALSE, null, null,
+				new Integer[] { 0, 1, 3, 4, 6, 2, 5 });
+	}
+
+	@Test
+	public void testZhPinyinSort() throws Exception {
+		testSortOrderNoEquals(chineseChars, "zh__PINYIN", Boolean.FALSE, null, null,
+				new Integer[] { 0, 1, 3, 4, 6, 2, 5 });
+	}
+
+	@Test
+	public void testUpperCaseCollationKeyBytes() throws Exception {
+		testCollationKeysEqual(new String[] { "abcdef", "ABCDEF", "aBcDeF" }, "en", Boolean.TRUE,
null, null);
+	}
+
+	@Test
+	public void testEqualCollationKeysForPrimaryStrength() throws Exception {
+		// "a", "A", "ä" are considered equivalent
+		testCollationKeysEqual(new String[] { "a", "A", "ä" }, "en", Boolean.FALSE, Collator.PRIMARY,
null);
+		testSortOrderNoEquals(new String[] { "b", "a" }, "en", Boolean.FALSE, Collator.PRIMARY,
null,
+				new Integer[] { 1, 0 });
+
+	}
+
+	@Test
+	public void testCollationKeyBytesForSecondaryStrength() throws Exception {
+		// "a" and "A" are considered equivalent but not "ä"
+		testCollationKeysEqual(new String[] { "a", "A" }, "en", Boolean.FALSE, Collator.SECONDARY,
null);
+		testSortOrderNoEquals(new String[] { "b", "a", "ä" }, "en", Boolean.FALSE, Collator.SECONDARY,
null,
+				new Integer[] { 1, 2, 0 });
+	}
+
+	@Test
+	public void testCollationKeyBytesForTertiaryStrength() throws Exception {
+		// none of these are considered equivalent
+		testSortOrderNoEquals(new String[] { "b", "a", "ä", "A" }, "en", Boolean.FALSE, Collator.TERTIARY,
null,
+				new Integer[] { 1, 3, 2, 0 });
+	}
+
+	/**
+	 * Just test that changing the decomposition mode works for basic sorting.
+	 * TODO: Actually test for the accented characters and languages where this
+	 * actually matters.
+	 */
+	@Test
+	public void testCollationKeyBytesForFullDecomposition() throws Exception {
+		testCollationKeysEqual(new String[] { "a", "A" }, "en", Boolean.FALSE, null, Collator.FULL_DECOMPOSITION);
+	}
+
+	/** HELPER METHODS **/
+	private void testSortOrderNoEquals(String[] inputStrings, String locale, Boolean uppercaseCollator,
+			Integer strength, Integer decomposition, Integer[] expectedOrder) throws Exception {
+		List<ByteArrayAndInteger> sortedCollationKeysAndIndexes = calculateCollationKeys(inputStrings,
locale,
+				uppercaseCollator, strength, decomposition);
+		Collections.sort(sortedCollationKeysAndIndexes, collationKeyAndIndexComparator);
+		testCollationKeysNotEqual(inputStrings, sortedCollationKeysAndIndexes);
+
+		Integer[] sortedIndexes = new Integer[sortedCollationKeysAndIndexes.size()];
+		for (int i = 0; i < sortedIndexes.length; i++) {
+			sortedIndexes[i] = sortedCollationKeysAndIndexes.get(i).integer;
+		}
+		assertArrayEquals(expectedOrder, sortedIndexes);
+	}
+
+	private List<ByteArrayAndInteger> calculateCollationKeys(String[] inputStrings, String
locale,
+			Boolean upperCaseCollator, Integer strength, Integer decomposition) throws Exception {
+		List<ByteArrayAndInteger> collationKeysAndIndexes = Lists.newArrayList();
+		for (int i = 0; i < inputStrings.length; i++) {
+			byte[] thisCollationKeyBytes = callFunction(inputStrings[i], locale, upperCaseCollator,
strength,
+					decomposition, SortOrder.ASC);
+			collationKeysAndIndexes.add(new ByteArrayAndInteger(thisCollationKeyBytes, i));
+		}
+		return collationKeysAndIndexes;
+	}
+
+	private void testCollationKeysEqual(String[] inputStrings, String locale, Boolean upperCaseCollator,
+			Integer strength, Integer decomposition) throws Exception {
+		List<ByteArrayAndInteger> collationKeysAndIndexes = calculateCollationKeys(inputStrings,
locale,
+				upperCaseCollator, strength, decomposition);
+
+		for (int i = 0, j = 1; i < inputStrings.length && j < inputStrings.length;
i++, j++) {
+			byte[] iByteArray = ByteArrayAndInteger.findFirstIntegerMatch(collationKeysAndIndexes,
i).byteArray;
+			byte[] jByteArray = ByteArrayAndInteger.findFirstIntegerMatch(collationKeysAndIndexes,
j).byteArray;
+			boolean isPairEqual = collationKeyComparator.compare(iByteArray, jByteArray) == 0;
+			if (!isPairEqual) {
+				fail(String.format("Collation keys for inputStrings [%s] and [%s] ([%s], [%s]) were not
equal",
+						inputStrings[i], inputStrings[j], Hex.encodeHexString(iByteArray),
+						Hex.encodeHexString(jByteArray)));
+			}
+		}
+	}
+
+	private void testCollationKeysNotEqual(String[] inputStrings, List<ByteArrayAndInteger>
collationKeysAndIndexes)
+			throws Exception {
+		for (int i = 0; i < inputStrings.length; i++) {
+			for (int j = i + 1; j < inputStrings.length; j++) {
+				byte[] iByteArray = ByteArrayAndInteger.findFirstIntegerMatch(collationKeysAndIndexes,
i).byteArray;
+				byte[] jByteArray = ByteArrayAndInteger.findFirstIntegerMatch(collationKeysAndIndexes,
j).byteArray;
+				boolean isPairEqual = collationKeyComparator.compare(iByteArray, jByteArray) == 0;
+				if (isPairEqual) {
+					fail(String.format("Collation keys for inputStrings [%s] and [%s] ([%s], [%s]) were
equal",
+							inputStrings[i], inputStrings[j], Hex.encodeHexString(iByteArray),
+							Hex.encodeHexString(jByteArray)));
+				}
+			}
+		}
+	}
+
+	private static byte[] callFunction(String inputStr, String localeIsoCode, Boolean upperCaseCollator,
+			Integer strength, Integer decomposition, SortOrder sortOrder) throws Exception {
+		LiteralExpression inputStrLiteral, localeIsoCodeLiteral, upperCaseBooleanLiteral, strengthLiteral,
+				decompositionLiteral;
+		inputStrLiteral = LiteralExpression.newConstant(inputStr, PVarchar.INSTANCE, sortOrder);
+		localeIsoCodeLiteral = LiteralExpression.newConstant(localeIsoCode, PVarchar.INSTANCE,
sortOrder);
+		upperCaseBooleanLiteral = LiteralExpression.newConstant(upperCaseCollator, PBoolean.INSTANCE,
sortOrder);
+		strengthLiteral = LiteralExpression.newConstant(strength, PInteger.INSTANCE, sortOrder);
+		decompositionLiteral = LiteralExpression.newConstant(decomposition, PInteger.INSTANCE,
sortOrder);
+		return callFunction(inputStrLiteral, localeIsoCodeLiteral, upperCaseBooleanLiteral, strengthLiteral,
+				decompositionLiteral);
+
+	}
+
+	private static byte[] callFunction(LiteralExpression inputStrLiteral, LiteralExpression
localeIsoCodeLiteral,
+			LiteralExpression upperCaseBooleanLiteral, LiteralExpression strengthLiteral,
+			LiteralExpression decompositionLiteral) throws Exception {
+		List<Expression> expressions = Lists.newArrayList((Expression) inputStrLiteral,
+				(Expression) localeIsoCodeLiteral, (Expression) upperCaseBooleanLiteral, (Expression)
strengthLiteral,
+				(Expression) decompositionLiteral);
+		Expression collationKeyFunction = new CollationKeyFunction(expressions);
+		ImmutableBytesWritable ptr = new ImmutableBytesWritable();
+		boolean ret = collationKeyFunction.evaluate(null, ptr);
+		byte[] result = ret
+				? (byte[]) collationKeyFunction.getDataType().toObject(ptr, collationKeyFunction.getSortOrder())
: null;
+		return result;
+	}
+}

http://git-wip-us.apache.org/repos/asf/phoenix/blob/c77f3ba2/phoenix-server/pom.xml
----------------------------------------------------------------------
diff --git a/phoenix-server/pom.xml b/phoenix-server/pom.xml
index 3576425..67832ad 100644
--- a/phoenix-server/pom.xml
+++ b/phoenix-server/pom.xml
@@ -134,6 +134,7 @@
                   <include>io.dropwizard.metrics:metrics-core</include>
                   <include>org.apache.thrift:libthrift</include>
                   <include>com.clearspring.analytics:stream</include>
+                  <include>com.salesforce.i18n:i18n-util</include>
                 </includes>
                   <excludes>
                     <exclude>org.apache.phoenix:phoenix-server</exclude>


Mime
View raw message