tajo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hyun...@apache.org
Subject [48/50] [abbrv] git commit: TAJO-902: Unicode delimiter does not work correctly. (jinho) Closes #53
Date Wed, 09 Jul 2014 04:11:05 GMT
TAJO-902: Unicode delimiter does not work correctly. (jinho)
Closes #53


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/10caff07
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/10caff07
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/10caff07

Branch: refs/heads/window_function
Commit: 10caff074e2f00887134f94b0dc918b3cef0e824
Parents: d84d5ca
Author: jinossy <jinossy@gmail.com>
Authored: Mon Jul 7 14:14:16 2014 +0900
Committer: jinossy <jinossy@gmail.com>
Committed: Mon Jul 7 14:14:16 2014 +0900

----------------------------------------------------------------------
 CHANGES                                         |  2 +
 .../org/apache/tajo/cli/DescTableCommand.java   | 18 +++-
 .../java/org/apache/tajo/util/StringUtils.java  | 15 ++++
 .../org/apache/tajo/util/TestStringUtil.java    | 92 ++++++++++++++++++++
 .../apache/tajo/engine/parser/SQLAnalyzer.java  | 15 +---
 5 files changed, 128 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 74958dd..661b488 100644
--- a/CHANGES
+++ b/CHANGES
@@ -74,6 +74,8 @@ Release 0.9.0 - unreleased
 
   BUG FIXES
 
+    TAJO-902: Unicode delimiter does not work correctly. (jinho)
+
     TAJO-905: When to_date() parses some date without day, the result will be 
     wrong. (hyunsik)
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java
----------------------------------------------------------------------
diff --git a/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java b/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java
index 6bda7c9..d8023f2 100644
--- a/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java
+++ b/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java
@@ -18,6 +18,8 @@
 
 package org.apache.tajo.cli;
 
+import org.apache.commons.lang.CharUtils;
+import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.tajo.catalog.Column;
 import org.apache.tajo.catalog.TableDesc;
 import org.apache.tajo.catalog.partition.PartitionMethodDesc;
@@ -84,8 +86,22 @@ public class DescTableCommand extends TajoShellCommand {
     }
     sb.append("Options: \n");
     for(Map.Entry<String, String> entry : desc.getMeta().toMap().entrySet()){
+
+      /*
+      *  Checks whether the character is ASCII 7 bit printable.
+      *  For example, a printable unicode '\u007c' become the character ‘|’.
+      *
+      *  Control-chars : ctrl-a(\u0001), tab(\u0009) ..
+      *  Printable-chars : '|'(\u007c), ','(\u002c) ..
+      * */
+
+      String value = entry.getValue();
+      String unescaped = StringEscapeUtils.unescapeJava(value);
+      if (unescaped.length() == 1 && CharUtils.isAsciiPrintable(unescaped.charAt(0))) {
+        value = unescaped;
+      }
       sb.append("\t").append("'").append(entry.getKey()).append("'").append("=")
-          .append("'").append(entry.getValue()).append("'").append("\n");
+          .append("'").append(value).append("'").append("\n");
     }
     sb.append("\n");
     sb.append("schema: \n");

http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
index ed9014d..41ea153 100644
--- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
+++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
@@ -18,6 +18,8 @@
 
 package org.apache.tajo.util;
 
+import org.apache.commons.lang.CharUtils;
+import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.SystemUtils;
 import org.apache.hadoop.util.ShutdownHookManager;
 import org.apache.hadoop.util.SignalLogger;
@@ -165,4 +167,17 @@ public class StringUtils {
           }
         }, SHUTDOWN_HOOK_PRIORITY);
   }
+
+  public static String unicodeEscapedDelimiter(String value) {
+    try {
+      String delimiter = StringEscapeUtils.unescapeJava(value);
+      return unicodeEscapedDelimiter(delimiter.charAt(0));
+    } catch (Throwable e) {
+    }
+    return value;
+  }
+
+  public static String unicodeEscapedDelimiter(char c) {
+    return CharUtils.unicodeEscaped(c);
+  }
 }

http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
new file mode 100644
index 0000000..5c13f8f
--- /dev/null
+++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.util;
+
+import org.apache.commons.lang.CharUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+public class TestStringUtil {
+
+  @Test
+  public void testUnicodeEscapedDelimiter() {
+    for (int i = 0; i < 128; i++) {
+      char c = (char) i;
+      String delimiter = CharUtils.unicodeEscaped(c);
+      String escapedDelimiter = StringUtils.unicodeEscapedDelimiter(delimiter);
+      assertEquals(delimiter, escapedDelimiter);
+      assertEquals(1, StringEscapeUtils.unescapeJava(escapedDelimiter).length());
+      assertEquals(c, StringEscapeUtils.unescapeJava(escapedDelimiter).charAt(0));
+    }
+  }
+
+  @Test
+  public void testUnescapedDelimiter() {
+    for (int i = 0; i < 128; i++) {
+      char c = (char) i;
+      String delimiter = String.valueOf(c);
+      String escapedDelimiter = StringUtils.unicodeEscapedDelimiter(delimiter);
+      assertEquals(CharUtils.unicodeEscaped(c), escapedDelimiter);
+      assertEquals(1, StringEscapeUtils.unescapeJava(escapedDelimiter).length());
+      assertEquals(c, StringEscapeUtils.unescapeJava(escapedDelimiter).charAt(0));
+    }
+  }
+
+  @Test
+  public void testVariousDelimiter() {
+    /*
+    * Character         ASCII    Unicode
+    *
+    * Horizontal tab    9        <U0009>
+    * Space Bar         32       <U0020>
+    * 1                 49       <U0031>
+    * |                 124      <U007c>
+    *
+    * */
+
+
+    String escapedDelimiter = "\\u0031";
+
+    assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("1"));
+    assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("\\1"));
+    assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("\\u0031"));
+    assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter((char)49));
+    assertNotEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter('\001'));
+
+    String delimiter = "|";
+    assertEquals("\\u007c", StringUtils.unicodeEscapedDelimiter(delimiter));
+    assertEquals(delimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(delimiter)));
+
+
+    String commaDelimiter = ",";
+    assertEquals("\\u002c", StringUtils.unicodeEscapedDelimiter(commaDelimiter));
+    assertEquals(commaDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(commaDelimiter)));
+
+    String tabDelimiter = "\t";
+    assertEquals("\\u0009", StringUtils.unicodeEscapedDelimiter(tabDelimiter));
+    assertEquals(tabDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(tabDelimiter)));
+
+    String spaceDelimiter = " ";
+    assertEquals("\\u0020", StringUtils.unicodeEscapedDelimiter(spaceDelimiter));
+    assertEquals(spaceDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(spaceDelimiter)));
+  }
+}

http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java b/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java
index a638735..01568af 100644
--- a/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java
+++ b/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java
@@ -24,15 +24,14 @@ import org.antlr.v4.runtime.ANTLRInputStream;
 import org.antlr.v4.runtime.CommonTokenStream;
 import org.antlr.v4.runtime.misc.NotNull;
 import org.antlr.v4.runtime.tree.TerminalNode;
-import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.tajo.algebra.*;
 import org.apache.tajo.algebra.Aggregation.GroupType;
 import org.apache.tajo.algebra.LiteralValue.LiteralType;
 import org.apache.tajo.catalog.CatalogUtil;
 import org.apache.tajo.engine.parser.SQLParser.*;
 import org.apache.tajo.storage.StorageConstants;
+import org.apache.tajo.util.StringUtils;
 
-import java.nio.charset.Charset;
 import java.util.*;
 
 import static org.apache.tajo.algebra.Aggregation.GroupElement;
@@ -1341,7 +1340,7 @@ public class SQLAnalyzer extends SQLParserBaseVisitor<Expr> {
     Map<String, String> params = new HashMap<String, String>();
     for (Map.Entry<String, String> entry : map.entrySet()) {
       if (entry.getKey().equals(StorageConstants.CSVFILE_DELIMITER)) {
-        params.put(entry.getKey(), escapeDelimiter(entry.getValue()));
+        params.put(entry.getKey(), StringUtils.unicodeEscapedDelimiter(entry.getValue()));
       } else {
         params.put(entry.getKey(), entry.getValue());
       }
@@ -1349,16 +1348,6 @@ public class SQLAnalyzer extends SQLParserBaseVisitor<Expr> {
     return params;
   }
 
-  public static String escapeDelimiter(String value) {
-    try {
-      String delimiter = StringEscapeUtils.unescapeJava(value);
-      delimiter = new String(new byte[]{Byte.valueOf(delimiter).byteValue()}, Charset.defaultCharset());
-      return StringEscapeUtils.escapeJava(delimiter);
-    } catch (NumberFormatException e) {
-    }
-    return value;
-  }
-
   private static String stripQuote(String str) {
     return str.substring(1, str.length() - 1);
   }


Mime
View raw message