tajo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jh...@apache.org
Subject tajo git commit: TAJO-1242: JSON scanner cannot read some cases of truncated text. (jinho)
Date Fri, 12 Dec 2014 08:01:16 GMT
Repository: tajo
Updated Branches:
  refs/heads/master 5d9a130b7 -> c665ae1f6


TAJO-1242: JSON scanner cannot read some cases of truncated text. (jinho)

Closes #296


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/c665ae1f
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/c665ae1f
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/c665ae1f

Branch: refs/heads/master
Commit: c665ae1f6fc1e35e6a743e7e4e377c7885686b32
Parents: 5d9a130
Author: jhkim <jhkim@apache.org>
Authored: Fri Dec 12 17:00:40 2014 +0900
Committer: jhkim <jhkim@apache.org>
Committed: Fri Dec 12 17:00:40 2014 +0900

----------------------------------------------------------------------
 CHANGES                                         |  4 +-
 .../testErrorTolerance3.json                    |  1 +
 .../tajo/storage/json/JsonLineDeserializer.java | 39 ++++++++++----------
 .../tajo/storage/TestDelimitedTextFile.java     | 17 +++++++++
 4 files changed, 41 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index d758459..e41ea56 100644
--- a/CHANGES
+++ b/CHANGES
@@ -109,7 +109,9 @@ Release 0.9.1 - unreleased
 
   BUG FIXES
 
-    TAJO-1239 ORDER BY with null column desc miss some data. 
+    TAJO-1242: Json scanner can not read some case of trucated text. (jinho) 
+
+    TAJO-1239: ORDER BY with null column desc miss some data. 
     (Hyoungjun Kim via hyunsik)
 
     TAJO-1244: tajo.worker.tmpdir.locations should use a validator for a list 

http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json
----------------------------------------------------------------------
diff --git a/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json
b/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json
new file mode 100644
index 0000000..a7fe424
--- /dev/null
+++ b/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json
@@ -0,0 +1 @@
+{"id":[{"text":"json test
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
index dfe36f6..a7e02a4 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
@@ -32,7 +32,6 @@ import org.apache.tajo.common.exception.NotImplementedException;
 import org.apache.tajo.datum.DatumFactory;
 import org.apache.tajo.datum.NullDatum;
 import org.apache.tajo.datum.TextDatum;
-import org.apache.tajo.datum.protobuf.ProtobufJsonFormat;
 import org.apache.tajo.storage.Tuple;
 import org.apache.tajo.storage.text.TextLineDeserializer;
 import org.apache.tajo.storage.text.TextLineParsingError;
@@ -42,8 +41,8 @@ import java.util.Iterator;
 
 public class JsonLineDeserializer extends TextLineDeserializer {
   private JSONParser parser;
-  private Type [] types;
-  private String [] columnNames;
+  private Type[] types;
+  private String[] columnNames;
 
   public JsonLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) {
     super(schema, meta, targetColumnIndexes);
@@ -54,27 +53,34 @@ public class JsonLineDeserializer extends TextLineDeserializer {
     types = SchemaUtil.toTypes(schema);
     columnNames = SchemaUtil.toSimpleNames(schema);
 
-    parser = new JSONParser(JSONParser.MODE_JSON_SIMPLE);
+    parser = new JSONParser(JSONParser.MODE_JSON_SIMPLE | JSONParser.IGNORE_CONTROL_CHAR);
   }
 
   @Override
   public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError
{
-    byte [] line = new byte[buf.readableBytes()];
+    byte[] line = new byte[buf.readableBytes()];
     buf.readBytes(line);
 
+    JSONObject object;
     try {
-      JSONObject object = (JSONObject) parser.parse(line);
+      object = (JSONObject) parser.parse(line);
+    } catch (ParseException pe) {
+      throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe);
+    } catch (ArrayIndexOutOfBoundsException ae) {
+      // truncated value
+      throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae);
+    }
 
-      for (int i = 0; i < targetColumnIndexes.length; i++) {
-        int actualIdx = targetColumnIndexes[i];
-        String fieldName = columnNames[actualIdx];
+    for (int i = 0; i < targetColumnIndexes.length; i++) {
+      int actualIdx = targetColumnIndexes[i];
+      String fieldName = columnNames[actualIdx];
 
-        if (!object.containsKey(fieldName)) {
-          output.put(actualIdx, NullDatum.get());
-          continue;
-        }
+      if (!object.containsKey(fieldName)) {
+        output.put(actualIdx, NullDatum.get());
+        continue;
+      }
 
-        switch (types[actualIdx]) {
+      switch (types[actualIdx]) {
         case BOOLEAN:
           String boolStr = object.getAsString(fieldName);
           if (boolStr != null) {
@@ -210,12 +216,7 @@ public class JsonLineDeserializer extends TextLineDeserializer {
 
         default:
           throw new NotImplementedException(types[actualIdx].name() + " is not supported.");
-        }
       }
-    } catch (ParseException pe) {
-      throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe);
-    } catch (Throwable e) {
-      throw new IOException(e);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
index 8749925..7e4b7aa 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
@@ -160,4 +160,21 @@ public class TestDelimitedTextFile {
     }
     fail();
   }
+
+  @Test
+  public void testIgnoreTruncatedValueErrorTolerance() throws IOException {
+    TajoConf conf = new TajoConf();
+    TableMeta meta = CatalogUtil.newTableMeta(CatalogProtos.StoreType.JSON);
+    meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "1");
+    FileFragment fragment = getFileFragment("testErrorTolerance3.json");
+    Scanner scanner = StorageManager.getStorageManager(conf).getScanner(meta, schema, fragment);
+    scanner.init();
+
+    try {
+      Tuple tuple = scanner.next();
+      assertNull(tuple);
+    } finally {
+      scanner.close();
+    }
+  }
 }


Mime
View raw message