Return-Path: X-Original-To: apmail-tajo-commits-archive@minotaur.apache.org Delivered-To: apmail-tajo-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id CE4F5178DC for ; Thu, 30 Apr 2015 10:18:40 +0000 (UTC) Received: (qmail 36594 invoked by uid 500); 30 Apr 2015 10:18:40 -0000 Delivered-To: apmail-tajo-commits-archive@tajo.apache.org Received: (qmail 36561 invoked by uid 500); 30 Apr 2015 10:18:40 -0000 Mailing-List: contact commits-help@tajo.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@tajo.apache.org Delivered-To: mailing list commits@tajo.apache.org Received: (qmail 36548 invoked by uid 99); 30 Apr 2015 10:18:40 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 30 Apr 2015 10:18:40 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id A20A5E0061; Thu, 30 Apr 2015 10:18:38 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: jhkim@apache.org To: commits@tajo.apache.org Date: Thu, 30 Apr 2015 10:18:38 -0000 Message-Id: <502a5f1b3c604488abe85a34311051c5@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [1/2] tajo git commit: TAJO-1374: Support multi-bytes delimiter for CSV file. Repository: tajo Updated Branches: refs/heads/branch-0.10.1 7ccd8341a -> b0ab7eafd TAJO-1374: Support multi-bytes delimiter for CSV file. Signed-off-by: Jinho Kim Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/4db8ca05 Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/4db8ca05 Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/4db8ca05 Branch: refs/heads/branch-0.10.1 Commit: 4db8ca055897ba712cb2bdc40594d63905a11e64 Parents: 7ccd834 Author: navis.ryu Authored: Thu Apr 30 19:15:00 2015 +0900 Committer: Jinho Kim Committed: Thu Apr 30 19:15:00 2015 +0900 ---------------------------------------------------------------------- CHANGES | 5 +- .../java/org/apache/tajo/util/BytesUtils.java | 159 +++++++++++-------- .../java/org/apache/tajo/util/StringUtils.java | 6 +- .../org/apache/tajo/util/TestStringUtil.java | 4 +- .../apache/tajo/engine/eval/ExprTestBase.java | 5 +- .../tajo/engine/query/TestSelectQuery.java | 24 +++ .../multibytes_delimiter1/table1.tbl | 5 + .../multibytes_delimiter2/table2.tbl | 5 + .../multibytes_delimiter_table1_ddl.sql | 3 + .../multibytes_delimiter_table2_ddl.sql | 3 + .../testMultiBytesDelimiter1.sql | 1 + .../testMultiBytesDelimiter2.sql | 1 + .../testMultiBytesDelimiter1.result | 7 + .../testMultiBytesDelimiter2.result | 7 + .../org/apache/tajo/storage/TestLazyTuple.java | 4 +- .../tajo/storage/hbase/ColumnMapping.java | 6 +- .../apache/tajo/storage/hbase/HBaseScanner.java | 3 +- .../tajo/storage/hbase/HBaseStorageManager.java | 3 +- .../java/org/apache/tajo/storage/CSVFile.java | 14 +- .../sequencefile/SequenceFileScanner.java | 3 +- 20 files changed, 181 insertions(+), 87 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index 9fdcb09..e19eaac 100644 --- a/CHANGES +++ b/CHANGES @@ -8,7 +8,10 @@ Release 0.10.1 - unreleased (jihun) IMPROVEMENT - + + TAJO-1374: Support multi-bytes delimiter for CSV file. + (Contributed by navis, Committed by jinho) + TAJO-1400: Add TajoStatement::setMaxRows method support. (Contributed by YeonSu Han, Committed by jihoon) http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java index 91165ac..725301c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java @@ -22,6 +22,7 @@ import org.apache.hadoop.io.WritableUtils; import java.io.ByteArrayOutputStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -86,22 +87,23 @@ public class BytesUtils { return buffer; } - public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target) { - return splitWorker(str, 0, -1, separatorChar, true, target); + public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) { + return splitWorker(str, 0, -1, separatorChar, target, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, char separatorChar, int[] target) { - return splitWorker(str, offset, length, separatorChar, true, target); + public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { + return splitWorker(str, offset, length, separator, target, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar) { - return splitWorker(str, 0, -1, separatorChar, true, null); + public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) { + return splitWorker(str, 0, -1, separatorChar, null, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, int length, char separatorChar) { - return splitWorker(str, 0, length, separatorChar, true, null); + private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, + int[] target, int numColumns) { + return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns); } - + /** * Performs the logic for the split and * splitPreserveAllTokens methods that do not return a @@ -109,75 +111,96 @@ public class BytesUtils { * * @param str the String to parse, may be null * @param length amount of bytes to str - * @param separatorChar the ascii separate character - * @param preserveAllTokens if true, adjacent separators are - * treated as empty token separators; if false, adjacent - * separators are treated as one separator. + * @param separator the ascii separate characters * @param target the projection target + * @param numColumns number of columns to be retrieved * @return an array of parsed Strings, null if null String input */ - private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, - boolean preserveAllTokens, int[] target) { - // Performance tuned for 2.0 (JDK1.4) - + private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { if (str == null) { return null; } - int len = length; - if (len == 0) { - return new byte[1][0]; - }else if(len < 0){ - len = str.length - offset; - } - - List list = new ArrayList(); - int i = 0, start = 0; - boolean match = false; - boolean lastMatch = false; - int currentTarget = 0; - int currentIndex = 0; - while (i < len) { - if (str[i + offset] == separatorChar) { - if (match || preserveAllTokens) { - if (target == null) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - } else if (target.length > currentTarget && currentIndex == target[currentTarget]) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - currentTarget++; - } else { - list.add(null); - } - currentIndex++; - match = false; - lastMatch = true; - } - start = ++i; - continue; + if (length == 0) { + return new byte[numColumns][0]; + } + if (length < 0) { + length = str.length - offset; + } + int indexMax = 0; + if (target != null) { + for (int index : target) { + indexMax = Math.max(indexMax, index + 1); } - lastMatch = false; - match = true; - i++; - } - if (match || (preserveAllTokens && lastMatch)) { - if (target == null) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - } else if (target.length > currentTarget && currentIndex == target[currentTarget]) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); //str.substring(start, i)); - currentTarget++; + } else { + indexMax = numColumns; + } + + int[][] indices = split(str, offset, length, separator, new int[indexMax][]); + byte[][] result = new byte[numColumns][]; + + // not-picked -> null, picked but not-exists -> byte[0] + if (target != null) { + for (int i : target) { + int[] index = indices[i]; + result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); + } + } else { + for (int i = 0; i < result.length; i++) { + int[] index = indices[i]; + result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); + } + } + return result; + } + + public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) { + if (indices.length == 0) { + return indices; // trivial + } + final int limit = offset + length; + + int start = offset; + int colIndex = 0; + for (int index = offset; index < limit;) { + if (onDelimiter(str, index, limit, separator)) { + indices[colIndex++] = new int[] {start, index}; + if (colIndex >= indices.length) { + return indices; + } + index += separator.length; + start = index; } else { - list.add(null); + index++; } - currentIndex++; } - return (byte[][]) list.toArray(new byte[list.size()][]); + if (colIndex < indices.length) { + indices[colIndex] = new int[]{start, limit}; + } + return indices; + } + + private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) { + for (int i = 0; i < delimiter.length; i++) { + if (offset + i >= limit || input[offset + i] != delimiter[i]) { + return false; + } + } + return true; + } + + public static byte[][] splitTrivial(byte[] value, byte delimiter) { + List split = new ArrayList(); + int prev = 0; + for (int i = 0; i < value.length; i++) { + if (value[i] == delimiter) { + split.add(Arrays.copyOfRange(value, prev, i)); + prev = i + 1; + } + } + if (prev <= value.length) { + split.add(Arrays.copyOfRange(value, prev, value.length)); + } + return split.toArray(new byte[split.size()][]); } /** http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index 38c0fd8..d035e4a 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -186,7 +186,11 @@ public class StringUtils { public static String unicodeEscapedDelimiter(String value) { try { String delimiter = StringEscapeUtils.unescapeJava(value); - return unicodeEscapedDelimiter(delimiter.charAt(0)); + StringBuilder builder = new StringBuilder(); + for (char achar : delimiter.toCharArray()) { + builder.append(unicodeEscapedDelimiter(achar)); + } + return builder.toString(); } catch (Throwable e) { } return value; http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java index 5272586..c4329a1 100644 --- a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java +++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java @@ -103,7 +103,7 @@ public class TestStringUtil { char separatorChar = '|'; String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar); - byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar); + byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, 3); assertEquals(textArray.length, bytesArray.length); for (int i = 0; i < textArray.length; i++){ @@ -118,7 +118,7 @@ public class TestStringUtil { char separatorChar = '|'; String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar); - byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target); + byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target, 3); assertEquals(textArray.length, bytesArray.length); http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java index 4e4b710..876e3e4 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java @@ -238,8 +238,9 @@ public class ExprTestBase { targetIdx[i] = i; } - lazyTuple = - new LazyTuple(inputSchema, BytesUtils.splitPreserveAllTokens(csvTuple.getBytes(), delimiter, targetIdx),0); + byte[][] tokens = BytesUtils.splitPreserveAllTokens( + csvTuple.getBytes(), delimiter, targetIdx, inputSchema.size()); + lazyTuple = new LazyTuple(inputSchema, tokens,0); vtuple = new VTuple(inputSchema.size()); for (int i = 0; i < inputSchema.size(); i++) { http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index 9ba8a56..dd93dd1 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -635,4 +635,28 @@ public class TestSelectQuery extends QueryTestCaseBase { testingCluster.getConfiguration().setSystemTimezone(TimeZone.getTimeZone("GMT")); } } + + @Test + public void testMultiBytesDelimiter1() throws Exception { + executeDDL("multibytes_delimiter_table1_ddl.sql", "multibytes_delimiter1"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table1"); + } + } + + @Test + public void testMultiBytesDelimiter2() throws Exception { + executeDDL("multibytes_delimiter_table2_ddl.sql", "multibytes_delimiter2"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table2"); + } + } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl new file mode 100644 index 0000000..5acccf6 --- /dev/null +++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl @@ -0,0 +1,5 @@ +1||ooo||1.1||a +2||ppp||2.3|| +3||qqq|||| +4||||4.5|| +||xxx||5.6||e \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl new file mode 100644 index 0000000..b26cdfd --- /dev/null +++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl @@ -0,0 +1,5 @@ +1ㅎoooㅎ1.1ㅎa +2ㅎpppㅎ2.3ㅎ +3ㅎqqqㅎㅎ +4ㅎㅎ4.5ㅎ +ㅎxxxㅎ5.6ㅎe \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql new file mode 100644 index 0000000..2b4a2ce --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql @@ -0,0 +1,3 @@ +create external table table1 (id int, name text, score float, type text) using csv +with ('csvfile.delimiter'='||', 'csvfile.null'='NULL') location ${table.path}; + http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql new file mode 100644 index 0000000..d918ac6 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql @@ -0,0 +1,3 @@ +create external table table2 (id int, name text, score float, type text) using csv +with ('csvfile.delimiter'='ㅎ', 'csvfile.null'='NULL') location ${table.path}; + http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql new file mode 100644 index 0000000..bd6b02d --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql @@ -0,0 +1 @@ +select * from table1; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql new file mode 100644 index 0000000..66a69ec --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql @@ -0,0 +1 @@ +select * from table2; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result new file mode 100644 index 0000000..d8d43b1 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- +1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result ---------------------------------------------------------------------- diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result new file mode 100644 index 0000000..d8d43b1 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- +1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java index c6149f7..fccaf2a 100644 --- a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java +++ b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java @@ -69,7 +69,7 @@ public class TestLazyTuple { sb.append(DatumFactory.createInet4("192.168.0.1")).append('|'); sb.append(new String(nullbytes)).append('|'); sb.append(NullDatum.get()); - textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|'); + textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|', 13); serde = new TextSerializerDeserializer(); } @@ -220,7 +220,7 @@ public class TestLazyTuple { @Test public void testInvalidNumber() { - byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|'); + byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|', 5); Schema schema = new Schema(); schema.addColumn("col1", TajoDataTypes.Type.INT2); schema.addColumn("col2", TajoDataTypes.Type.INT4); http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java index 7ddf09a..c3094fd 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java @@ -85,7 +85,7 @@ public class ColumnMapping { for (String eachToken: columnMappingTokens) { mappingColumns[index] = new byte[2][]; - byte[][] mappingTokens = BytesUtils.splitPreserveAllTokens(eachToken.trim().getBytes(), ':'); + byte[][] mappingTokens = BytesUtils.splitTrivial(eachToken.trim().getBytes(), (byte)':'); if (mappingTokens.length == 3) { if (mappingTokens[0].length == 0) { @@ -230,6 +230,10 @@ public class ColumnMapping { return numRowKeys; } + public int getNumColumns() { + return schema.size(); + } + public boolean[] getIsColumnValues() { return isColumnValues; } http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java index 5cae077..ab56252 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java @@ -218,7 +218,8 @@ public class HBaseScanner implements Scanner { if (!isBinaryColumns[fieldId] && rowKeyFieldIndexes[fieldId] >= 0) { int rowKeyFieldIndex = rowKeyFieldIndexes[fieldId]; - byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(value, rowKeyDelimiter); + byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens( + value, rowKeyDelimiter, columnMapping.getNumColumns()); if (rowKeyFields.length < rowKeyFieldIndex) { return NullDatum.get(); http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java index 2a635d8..a9e5bde 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java @@ -1015,7 +1015,8 @@ public class HBaseStorageManager extends StorageManager { Tuple endTuple = new VTuple(sortSpecs.length); byte[][] rowKeyFields; if (sortSpecs.length > 1) { - byte[][] splitValues = BytesUtils.splitPreserveAllTokens(eachEndKey, columnMapping.getRowKeyDelimiter()); + byte[][] splitValues = BytesUtils.splitPreserveAllTokens( + eachEndKey, columnMapping.getRowKeyDelimiter(), columnMapping.getNumColumns()); if (splitValues.length == sortSpecs.length) { rowKeyFields = splitValues; } else { http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java index dd5366c..bb628b1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java @@ -61,7 +61,7 @@ public class CSVFile { private FSDataOutputStream fos; private DataOutputStream outputStream; private CompressionOutputStream deflateFilter; - private char delimiter; + private byte[] delimiter; private TableStatistics stats = null; private Compressor compressor; private CompressionCodecFactory codecFactory; @@ -83,7 +83,7 @@ public class CSVFile { this.meta = meta; this.schema = schema; this.delimiter = StringEscapeUtils.unescapeJava( - this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0); + this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes(); this.columnNum = schema.size(); @@ -169,8 +169,8 @@ public class CSVFile { rowBytes += serde.serialize(schema.getColumn(i), datum, os, nullChars); if(columnNum - 1 > i){ - os.write((byte) delimiter); - rowBytes += 1; + os.write(delimiter); + rowBytes += delimiter.length; } if (isShuffle) { // it is to calculate min/max values, and it is only used for the intermediate file. @@ -265,7 +265,7 @@ public class CSVFile { //Delimiter this.delimiter = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_DELIMITER, - meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).charAt(0); + meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).getBytes(); String nullCharacters = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_NULL, @@ -279,7 +279,7 @@ public class CSVFile { } private final static int DEFAULT_PAGE_SIZE = 256 * 1024; - private char delimiter; + private byte[] delimiter; private FileSystem fs; private FSDataInputStream fis; private InputStream is; //decompressd stream @@ -476,7 +476,7 @@ public class CSVFile { } byte[][] cells = BytesUtils.splitPreserveAllTokens(buffer.getData(), startOffsets.get(currentIdx), - rowLengthList.get(currentIdx), delimiter, targetColumnIndexes); + rowLengthList.get(currentIdx), delimiter, targetColumnIndexes, schema.size()); currentIdx++; return new LazyTuple(schema, cells, offset, nullChars, serde); } catch (Throwable t) { http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java index 74563ff..92a041c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java @@ -171,7 +171,8 @@ public class SequenceFileScanner extends FileScanner { } else { Text text = new Text(); reader.getCurrentValue(text); - cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap); + cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), + delimiter, projectionMap, schema.getColumns().size()); totalBytes += (long)text.getBytes().length; tuple = new LazyTuple(schema, cells, 0, nullChars, serde); }