tajo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jh...@apache.org
Subject [1/2] tajo git commit: TAJO-1374: Support multi-bytes delimiter for CSV file.
Date Thu, 30 Apr 2015 10:18:38 GMT
Repository: tajo
Updated Branches:
  refs/heads/branch-0.10.1 7ccd8341a -> b0ab7eafd


TAJO-1374: Support multi-bytes delimiter for CSV file.

Signed-off-by: Jinho Kim <jhkim@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/4db8ca05
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/4db8ca05
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/4db8ca05

Branch: refs/heads/branch-0.10.1
Commit: 4db8ca055897ba712cb2bdc40594d63905a11e64
Parents: 7ccd834
Author: navis.ryu <navis@apache.org>
Authored: Thu Apr 30 19:15:00 2015 +0900
Committer: Jinho Kim <jhkim@apache.org>
Committed: Thu Apr 30 19:15:00 2015 +0900

----------------------------------------------------------------------
 CHANGES                                         |   5 +-
 .../java/org/apache/tajo/util/BytesUtils.java   | 159 +++++++++++--------
 .../java/org/apache/tajo/util/StringUtils.java  |   6 +-
 .../org/apache/tajo/util/TestStringUtil.java    |   4 +-
 .../apache/tajo/engine/eval/ExprTestBase.java   |   5 +-
 .../tajo/engine/query/TestSelectQuery.java      |  24 +++
 .../multibytes_delimiter1/table1.tbl            |   5 +
 .../multibytes_delimiter2/table2.tbl            |   5 +
 .../multibytes_delimiter_table1_ddl.sql         |   3 +
 .../multibytes_delimiter_table2_ddl.sql         |   3 +
 .../testMultiBytesDelimiter1.sql                |   1 +
 .../testMultiBytesDelimiter2.sql                |   1 +
 .../testMultiBytesDelimiter1.result             |   7 +
 .../testMultiBytesDelimiter2.result             |   7 +
 .../org/apache/tajo/storage/TestLazyTuple.java  |   4 +-
 .../tajo/storage/hbase/ColumnMapping.java       |   6 +-
 .../apache/tajo/storage/hbase/HBaseScanner.java |   3 +-
 .../tajo/storage/hbase/HBaseStorageManager.java |   3 +-
 .../java/org/apache/tajo/storage/CSVFile.java   |  14 +-
 .../sequencefile/SequenceFileScanner.java       |   3 +-
 20 files changed, 181 insertions(+), 87 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 9fdcb09..e19eaac 100644
--- a/CHANGES
+++ b/CHANGES
@@ -8,7 +8,10 @@ Release 0.10.1 - unreleased
     (jihun)
 
   IMPROVEMENT
-
+  
+    TAJO-1374: Support multi-bytes delimiter for CSV file.
+    (Contributed by navis, Committed by jinho)
+        
     TAJO-1400: Add TajoStatement::setMaxRows method support.
     (Contributed by YeonSu Han, Committed by jihoon)
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
index 91165ac..725301c 100644
--- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
+++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.io.WritableUtils;
 
 import java.io.ByteArrayOutputStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 /**
@@ -86,22 +87,23 @@ public class BytesUtils {
     return buffer;
   }
 
-  public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target) {
-    return splitWorker(str, 0, -1, separatorChar, true, target);
+  public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) {
+    return splitWorker(str, 0, -1, separatorChar, target, numColumns);
   }
 
-  public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, char separatorChar, int[] target) {
-    return splitWorker(str, offset, length, separatorChar, true, target);
+  public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) {
+    return splitWorker(str, offset, length, separator, target, numColumns);
   }
 
-  public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar) {
-    return splitWorker(str, 0, -1, separatorChar, true, null);
+  public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) {
+    return splitWorker(str, 0, -1, separatorChar, null, numColumns);
   }
 
-  public static byte[][] splitPreserveAllTokens(byte[] str, int length, char separatorChar) {
-    return splitWorker(str, 0, length, separatorChar, true, null);
+  private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar,
+                                      int[] target, int numColumns) {
+    return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns);
   }
-
+  
   /**
    * Performs the logic for the <code>split</code> and
    * <code>splitPreserveAllTokens</code> methods that do not return a
@@ -109,75 +111,96 @@ public class BytesUtils {
    *
    * @param str  the String to parse, may be <code>null</code>
    * @param length amount of bytes to str
-   * @param separatorChar the ascii separate character
-   * @param preserveAllTokens if <code>true</code>, adjacent separators are
-   * treated as empty token separators; if <code>false</code>, adjacent
-   * separators are treated as one separator.
+   * @param separator the ascii separate characters
    * @param target the projection target
+   * @param numColumns number of columns to be retrieved              
    * @return an array of parsed Strings, <code>null</code> if null String input
    */
-  private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar,
-                                      boolean preserveAllTokens, int[] target) {
-    // Performance tuned for 2.0 (JDK1.4)
-
+  private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) {
     if (str == null) {
       return null;
     }
-    int len = length;
-    if (len == 0) {
-      return new byte[1][0];
-    }else if(len < 0){
-      len = str.length - offset;
-    }
-
-    List list = new ArrayList();
-    int i = 0, start = 0;
-    boolean match = false;
-    boolean lastMatch = false;
-    int currentTarget = 0;
-    int currentIndex = 0;
-    while (i < len) {
-      if (str[i + offset] == separatorChar) {
-        if (match || preserveAllTokens) {
-          if (target == null) {
-            byte[] bytes = new byte[i - start];
-            System.arraycopy(str, start + offset, bytes, 0, bytes.length);
-            list.add(bytes);
-          } else if (target.length > currentTarget && currentIndex == target[currentTarget]) {
-            byte[] bytes = new byte[i - start];
-            System.arraycopy(str, start + offset, bytes, 0, bytes.length);
-            list.add(bytes);
-            currentTarget++;
-          } else {
-            list.add(null);
-          }
-          currentIndex++;
-          match = false;
-          lastMatch = true;
-        }
-        start = ++i;
-        continue;
+    if (length == 0) {
+      return new byte[numColumns][0];
+    }
+    if (length < 0) {
+      length = str.length - offset;
+    }
+    int indexMax = 0;
+    if (target != null) {
+      for (int index : target) {
+        indexMax = Math.max(indexMax, index + 1);
       }
-      lastMatch = false;
-      match = true;
-      i++;
-    }
-    if (match || (preserveAllTokens && lastMatch)) {
-      if (target == null) {
-        byte[] bytes = new byte[i - start];
-        System.arraycopy(str, start + offset, bytes, 0, bytes.length);
-        list.add(bytes);
-      } else if (target.length > currentTarget && currentIndex == target[currentTarget]) {
-        byte[] bytes = new byte[i - start];
-        System.arraycopy(str, start + offset, bytes, 0, bytes.length);
-        list.add(bytes); //str.substring(start, i));
-        currentTarget++;
+    } else {
+      indexMax = numColumns;
+    }
+
+    int[][] indices = split(str, offset, length, separator, new int[indexMax][]);
+    byte[][] result = new byte[numColumns][];
+
+    // not-picked -> null, picked but not-exists -> byte[0]
+    if (target != null) {
+      for (int i : target) {
+        int[] index = indices[i];
+        result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]);
+      }
+    } else {
+      for (int i = 0; i < result.length; i++) {
+        int[] index = indices[i];
+        result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]);
+      }
+    }
+    return result;
+  }
+
+  public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) {
+    if (indices.length == 0) {
+      return indices;   // trivial
+    }
+    final int limit = offset + length;
+
+    int start = offset;
+    int colIndex = 0;
+    for (int index = offset; index < limit;) {
+      if (onDelimiter(str, index, limit, separator)) {
+        indices[colIndex++] = new int[] {start, index};
+        if (colIndex >= indices.length) {
+          return indices;
+        }
+        index += separator.length;
+        start = index;
       } else {
-        list.add(null);
+        index++;
       }
-      currentIndex++;
     }
-    return (byte[][]) list.toArray(new byte[list.size()][]);
+    if (colIndex < indices.length) {
+      indices[colIndex] = new int[]{start, limit};
+    }
+    return indices;
+  }
+  
+  private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) {
+    for (int i = 0; i < delimiter.length; i++) {
+      if (offset + i >= limit || input[offset + i] != delimiter[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+  
+  public static byte[][] splitTrivial(byte[] value, byte delimiter) {
+    List<byte[]> split = new ArrayList<byte[]>();
+    int prev = 0;
+    for (int i = 0; i < value.length; i++) {
+      if (value[i] == delimiter) {
+        split.add(Arrays.copyOfRange(value, prev, i));
+        prev = i + 1;
+      }
+    }
+    if (prev <= value.length) {
+      split.add(Arrays.copyOfRange(value, prev, value.length));
+    }
+    return split.toArray(new byte[split.size()][]);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
index 38c0fd8..d035e4a 100644
--- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
+++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
@@ -186,7 +186,11 @@ public class StringUtils {
   public static String unicodeEscapedDelimiter(String value) {
     try {
       String delimiter = StringEscapeUtils.unescapeJava(value);
-      return unicodeEscapedDelimiter(delimiter.charAt(0));
+      StringBuilder builder = new StringBuilder();
+      for (char achar : delimiter.toCharArray()) {
+        builder.append(unicodeEscapedDelimiter(achar));
+      }
+      return builder.toString();
     } catch (Throwable e) {
     }
     return value;

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
index 5272586..c4329a1 100644
--- a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
+++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
@@ -103,7 +103,7 @@ public class TestStringUtil {
     char separatorChar = '|';
 
     String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar);
-    byte[][] bytesArray =  BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar);
+    byte[][] bytesArray =  BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, 3);
 
     assertEquals(textArray.length, bytesArray.length);
     for (int i = 0; i < textArray.length; i++){
@@ -118,7 +118,7 @@ public class TestStringUtil {
     char separatorChar = '|';
 
     String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar);
-    byte[][] bytesArray =  BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target);
+    byte[][] bytesArray =  BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target, 3);
 
     assertEquals(textArray.length, bytesArray.length);
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
index 4e4b710..876e3e4 100644
--- a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
+++ b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
@@ -238,8 +238,9 @@ public class ExprTestBase {
         targetIdx[i] = i;
       }
 
-      lazyTuple =
-          new LazyTuple(inputSchema, BytesUtils.splitPreserveAllTokens(csvTuple.getBytes(), delimiter, targetIdx),0);
+      byte[][] tokens = BytesUtils.splitPreserveAllTokens(
+          csvTuple.getBytes(), delimiter, targetIdx, inputSchema.size());
+      lazyTuple = new LazyTuple(inputSchema, tokens,0);
       vtuple = new VTuple(inputSchema.size());
       for (int i = 0; i < inputSchema.size(); i++) {
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
index 9ba8a56..dd93dd1 100644
--- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
+++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
@@ -635,4 +635,28 @@ public class TestSelectQuery extends QueryTestCaseBase {
       testingCluster.getConfiguration().setSystemTimezone(TimeZone.getTimeZone("GMT"));
     }
   }
+  
+  @Test
+  public void testMultiBytesDelimiter1() throws Exception {
+    executeDDL("multibytes_delimiter_table1_ddl.sql", "multibytes_delimiter1");
+    try {
+      ResultSet res = executeQuery();
+      assertResultSet(res);
+      cleanupQuery(res);
+    } finally {
+      executeString("DROP TABLE table1");
+    }
+  }
+  
+  @Test
+  public void testMultiBytesDelimiter2() throws Exception {
+    executeDDL("multibytes_delimiter_table2_ddl.sql", "multibytes_delimiter2");
+    try {
+      ResultSet res = executeQuery();
+      assertResultSet(res);
+      cleanupQuery(res);
+    } finally {
+      executeString("DROP TABLE table2");
+    }
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
new file mode 100644
index 0000000..5acccf6
--- /dev/null
+++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
@@ -0,0 +1,5 @@
+1||ooo||1.1||a
+2||ppp||2.3||
+3||qqq||||
+4||||4.5||
+||xxx||5.6||e
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
new file mode 100644
index 0000000..b26cdfd
--- /dev/null
+++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
@@ -0,0 +1,5 @@
+1ㅎoooㅎ1.1ㅎa
+2ㅎpppㅎ2.3ㅎ
+3ㅎqqqㅎㅎ
+4ㅎㅎ4.5ㅎ
+ㅎxxxㅎ5.6ㅎe
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
new file mode 100644
index 0000000..2b4a2ce
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
@@ -0,0 +1,3 @@
+create external table table1 (id int, name text, score float, type text) using csv
+with ('csvfile.delimiter'='||', 'csvfile.null'='NULL') location ${table.path};
+

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
new file mode 100644
index 0000000..d918ac6
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
@@ -0,0 +1,3 @@
+create external table table2 (id int, name text, score float, type text) using csv
+with ('csvfile.delimiter'='ㅎ', 'csvfile.null'='NULL') location ${table.path};
+

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
new file mode 100644
index 0000000..bd6b02d
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
@@ -0,0 +1 @@
+select * from table1;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
new file mode 100644
index 0000000..66a69ec
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
@@ -0,0 +1 @@
+select * from table2;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
new file mode 100644
index 0000000..d8d43b1
--- /dev/null
+++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
@@ -0,0 +1,7 @@
+id,name,score,type
+-------------------------------
+1,ooo,1.1,a
+2,ppp,2.3,
+3,qqq,null,
+4,,4.5,
+null,xxx,5.6,e
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
new file mode 100644
index 0000000..d8d43b1
--- /dev/null
+++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
@@ -0,0 +1,7 @@
+id,name,score,type
+-------------------------------
+1,ooo,1.1,a
+2,ppp,2.3,
+3,qqq,null,
+4,,4.5,
+null,xxx,5.6,e
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
index c6149f7..fccaf2a 100644
--- a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
+++ b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
@@ -69,7 +69,7 @@ public class TestLazyTuple {
     sb.append(DatumFactory.createInet4("192.168.0.1")).append('|');
     sb.append(new String(nullbytes)).append('|');
     sb.append(NullDatum.get());
-    textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|');
+    textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|', 13);
     serde = new TextSerializerDeserializer();
   }
 
@@ -220,7 +220,7 @@ public class TestLazyTuple {
 
   @Test
   public void testInvalidNumber() {
-    byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|');
+    byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|', 5);
     Schema schema = new Schema();
     schema.addColumn("col1", TajoDataTypes.Type.INT2);
     schema.addColumn("col2", TajoDataTypes.Type.INT4);

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
index 7ddf09a..c3094fd 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
@@ -85,7 +85,7 @@ public class ColumnMapping {
     for (String eachToken: columnMappingTokens) {
       mappingColumns[index] = new byte[2][];
 
-      byte[][] mappingTokens = BytesUtils.splitPreserveAllTokens(eachToken.trim().getBytes(), ':');
+      byte[][] mappingTokens = BytesUtils.splitTrivial(eachToken.trim().getBytes(), (byte)':');
 
       if (mappingTokens.length == 3) {
         if (mappingTokens[0].length == 0) {
@@ -230,6 +230,10 @@ public class ColumnMapping {
     return numRowKeys;
   }
 
+  public int getNumColumns() {
+    return schema.size();
+  }
+  
   public boolean[] getIsColumnValues() {
     return isColumnValues;
   }

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
index 5cae077..ab56252 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
@@ -218,7 +218,8 @@ public class HBaseScanner implements Scanner {
       if (!isBinaryColumns[fieldId] && rowKeyFieldIndexes[fieldId] >= 0) {
         int rowKeyFieldIndex = rowKeyFieldIndexes[fieldId];
 
-        byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(value, rowKeyDelimiter);
+        byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(
+            value, rowKeyDelimiter, columnMapping.getNumColumns());
 
         if (rowKeyFields.length < rowKeyFieldIndex) {
           return NullDatum.get();

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
index 2a635d8..a9e5bde 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
@@ -1015,7 +1015,8 @@ public class HBaseStorageManager extends StorageManager {
           Tuple endTuple = new VTuple(sortSpecs.length);
           byte[][] rowKeyFields;
           if (sortSpecs.length > 1) {
-            byte[][] splitValues = BytesUtils.splitPreserveAllTokens(eachEndKey, columnMapping.getRowKeyDelimiter());
+            byte[][] splitValues = BytesUtils.splitPreserveAllTokens(
+                eachEndKey, columnMapping.getRowKeyDelimiter(), columnMapping.getNumColumns());
             if (splitValues.length == sortSpecs.length) {
               rowKeyFields = splitValues;
             } else {

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
index dd5366c..bb628b1 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
@@ -61,7 +61,7 @@ public class CSVFile {
     private FSDataOutputStream fos;
     private DataOutputStream outputStream;
     private CompressionOutputStream deflateFilter;
-    private char delimiter;
+    private byte[] delimiter;
     private TableStatistics stats = null;
     private Compressor compressor;
     private CompressionCodecFactory codecFactory;
@@ -83,7 +83,7 @@ public class CSVFile {
       this.meta = meta;
       this.schema = schema;
       this.delimiter = StringEscapeUtils.unescapeJava(
-          this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0);
+          this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes();
 
       this.columnNum = schema.size();
 
@@ -169,8 +169,8 @@ public class CSVFile {
         rowBytes += serde.serialize(schema.getColumn(i), datum, os, nullChars);
 
         if(columnNum - 1 > i){
-          os.write((byte) delimiter);
-          rowBytes += 1;
+          os.write(delimiter);
+          rowBytes += delimiter.length;
         }
         if (isShuffle) {
           // it is to calculate min/max values, and it is only used for the intermediate file.
@@ -265,7 +265,7 @@ public class CSVFile {
       //Delimiter
       this.delimiter = StringEscapeUtils.unescapeJava(
           meta.getOption(StorageConstants.TEXT_DELIMITER,
-          meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).charAt(0);
+          meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).getBytes();
 
       String nullCharacters = StringEscapeUtils.unescapeJava(
           meta.getOption(StorageConstants.TEXT_NULL,
@@ -279,7 +279,7 @@ public class CSVFile {
     }
 
     private final static int DEFAULT_PAGE_SIZE = 256 * 1024;
-    private char delimiter;
+    private byte[] delimiter;
     private FileSystem fs;
     private FSDataInputStream fis;
     private InputStream is; //decompressd stream
@@ -476,7 +476,7 @@ public class CSVFile {
         }
 
         byte[][] cells = BytesUtils.splitPreserveAllTokens(buffer.getData(), startOffsets.get(currentIdx),
-            rowLengthList.get(currentIdx), delimiter, targetColumnIndexes);
+            rowLengthList.get(currentIdx), delimiter, targetColumnIndexes, schema.size());
         currentIdx++;
         return new LazyTuple(schema, cells, offset, nullChars, serde);
       } catch (Throwable t) {

http://git-wip-us.apache.org/repos/asf/tajo/blob/4db8ca05/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
index 74563ff..92a041c 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
@@ -171,7 +171,8 @@ public class SequenceFileScanner extends FileScanner {
       } else {
         Text text = new Text();
         reader.getCurrentValue(text);
-        cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap);
+        cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), 
+            delimiter, projectionMap, schema.getColumns().size());
         totalBytes += (long)text.getBytes().length;
         tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
       }


Mime
View raw message