drill-commits mailing list archives

From amansi...@apache.org
Subject drill git commit: DRILL-4479: For empty fields under all_text_mode enabled (a) use varchar for the default columns and (b) ensure we create fields corresponding to all columns.
Date Mon, 14 Mar 2016 20:46:38 GMT
Repository: drill
Updated Branches:
  refs/heads/master 46e3de790 -> f7197596d


DRILL-4479: For empty fields under all_text_mode enabled (a) use varchar for the default columns
and (b) ensure we create fields corresponding to all columns.

close apache/drill#420


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/f7197596
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/f7197596
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/f7197596

Branch: refs/heads/master
Commit: f7197596d61bf2f3652df8318113636ef1eb5c18
Parents: 46e3de7
Author: Aman Sinha <asinha@maprtech.com>
Authored: Tue Mar 8 09:27:32 2016 -0800
Committer: Aman Sinha <asinha@maprtech.com>
Committed: Mon Mar 14 13:41:11 2016 -0700

----------------------------------------------------------------------
 .../exec/store/easy/json/JSONRecordReader.java  |  4 +-
 .../exec/vector/complex/fn/JsonReader.java      | 55 ++++++++++++++++----
 .../exec/store/json/TestJsonRecordReader.java   |  3 +-
 .../vector/complex/writer/TestJsonReader.java   | 55 ++++++++++++++++++++
 4 files changed, 105 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
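For readers skimming the diff, here is a minimal, self-contained Java sketch of the behavioral change in JsonReader.ensureAtLeastOneField(). The class and method below are hypothetical illustrations, not Drill's writer API: they only model the decision of which default-typed columns are created when an entire batch is empty. With store.json.all_text_mode enabled, every projected column gets a varchar default so that later batches carrying real values cause no schema change; with the option disabled, only the first column gets the old integer default.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class DefaultColumnSketch {

  // Which default-typed fields to materialize when every value in the batch was null/empty.
  // Mirrors the logic in the diff below: all projected columns become VARCHAR under
  // all_text_mode, otherwise only the first projected column becomes INT.
  static Map<String, String> defaultFields(List<String> projectedColumns, boolean allTextMode) {
    Map<String, String> defaults = new LinkedHashMap<>();
    for (String column : projectedColumns) {
      defaults.put(column, allTextMode ? "VARCHAR" : "INT");
      if (!allTextMode) {
        break;  // field types are unknown, so creating more than one placeholder adds no value
      }
    }
    return defaults;
  }

  public static void main(String[] args) {
    // prints {a=VARCHAR, b=VARCHAR, c=VARCHAR}: no schema change when text values arrive later
    System.out.println(defaultFields(List.of("a", "b", "c"), true));
    // prints {a=INT}: a single placeholder column, matching the previous behavior
    System.out.println(defaultFields(List.of("a", "b", "c"), false));
  }
}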


http://git-wip-us.apache.org/repos/asf/drill/blob/f7197596/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java
index 8d82f78..e943401 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java
@@ -48,6 +48,8 @@ import com.google.common.collect.ImmutableList;
 public class JSONRecordReader extends AbstractRecordReader {
   private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(JSONRecordReader.class);
 
+  public static final long DEFAULT_ROWS_PER_BATCH = BaseValueVector.INITIAL_VALUE_ALLOCATION;
+
   private VectorContainerWriter writer;
 
   // Data we're consuming
@@ -192,7 +194,7 @@ public class JSONRecordReader extends AbstractRecordReader {
     ReadState write = null;
 //    Stopwatch p = new Stopwatch().start();
     try{
-      outside: while(recordCount < BaseValueVector.INITIAL_VALUE_ALLOCATION) {
+      outside: while(recordCount < DEFAULT_ROWS_PER_BATCH) {
         writer.setPosition(recordCount);
         write = jsonReader.write(writer);
 

http://git-wip-us.apache.org/repos/asf/drill/blob/f7197596/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/fn/JsonReader.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/fn/JsonReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/fn/JsonReader.java
index 1609541..64ee449 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/fn/JsonReader.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/vector/complex/fn/JsonReader.java
@@ -21,6 +21,7 @@ import io.netty.buffer.DrillBuf;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.BitSet;
 import java.util.List;
 
 import org.apache.drill.common.exceptions.UserException;
@@ -40,6 +41,7 @@ import com.fasterxml.jackson.core.JsonToken;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.google.common.base.Charsets;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
 
 public class JsonReader extends BaseJsonProcessor {
   private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(JsonReader.class);
@@ -89,16 +91,51 @@ public class JsonReader extends BaseJsonProcessor {
 
   @Override
   public void ensureAtLeastOneField(ComplexWriter writer) {
-    // if we had no columns, create one empty one so we can return some data for count purposes.
-    SchemaPath sp = columns.get(0);
-    PathSegment fieldPath = sp.getRootSegment();
-    BaseWriter.MapWriter fieldWriter = writer.rootAsMap();
-    while (fieldPath.getChild() != null && ! fieldPath.getChild().isArray()) {
-      fieldWriter = fieldWriter.map(fieldPath.getNameSegment().getPath());
-      fieldPath = fieldPath.getChild();
+    List<BaseWriter.MapWriter> writerList = Lists.newArrayList();
+    List<PathSegment> fieldPathList = Lists.newArrayList();
+    BitSet emptyStatus = new BitSet(columns.size());
+
+    // first pass: collect which fields are empty
+    for (int i = 0; i < columns.size(); i++) {
+      SchemaPath sp = columns.get(i);
+      PathSegment fieldPath = sp.getRootSegment();
+      BaseWriter.MapWriter fieldWriter = writer.rootAsMap();
+      while (fieldPath.getChild() != null && ! fieldPath.getChild().isArray()) {
+        fieldWriter = fieldWriter.map(fieldPath.getNameSegment().getPath());
+        fieldPath = fieldPath.getChild();
+      }
+      writerList.add(fieldWriter);
+      fieldPathList.add(fieldPath);
+      if (fieldWriter.isEmptyMap()) {
+        emptyStatus.set(i, true);
+      }
+      if (i == 0 && !allTextMode) {
+        // when allTextMode is false, there is not much benefit to producing all the empty
+        // fields; just produce 1 field.  The reason is that the type of the fields is
+        // unknown, so if we produce multiple Integer fields by default, a subsequent batch
+        // that contains non-integer fields will error out in any case.  Whereas, with
+        // allTextMode true, we are sure that all fields are going to be treated as varchar,
+        // so it makes sense to produce all the fields, and in fact is necessary in order to
+        // avoid schema change exceptions by downstream operators.
+        break;
+      }
+
     }
-    if (fieldWriter.isEmptyMap()) {
-      fieldWriter.integer(fieldPath.getNameSegment().getPath());
+
+    // second pass: create default typed vectors corresponding to empty fields
+    // Note: this is not easily do-able in 1 pass because the same fieldWriter may be
+    // shared by multiple fields whereas we want to keep track of all fields independently,
+    // so we rely on the emptyStatus.
+    for (int j = 0; j < fieldPathList.size(); j++) {
+      BaseWriter.MapWriter fieldWriter = writerList.get(j);
+      PathSegment fieldPath = fieldPathList.get(j);
+      if (emptyStatus.get(j)) {
+        if (allTextMode) {
+          fieldWriter.varChar(fieldPath.getNameSegment().getPath());
+        } else {
+          fieldWriter.integer(fieldPath.getNameSegment().getPath());
+        }
+      }
     }
   }
 

http://git-wip-us.apache.org/repos/asf/drill/blob/f7197596/exec/java-exec/src/test/java/org/apache/drill/exec/store/json/TestJsonRecordReader.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/json/TestJsonRecordReader.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/json/TestJsonRecordReader.java
index efe877d..02b98fc 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/json/TestJsonRecordReader.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/json/TestJsonRecordReader.java
@@ -18,13 +18,12 @@
 package org.apache.drill.exec.store.json;
 
 import org.apache.drill.BaseTestQuery;
-import org.apache.drill.TestBuilder;
 import org.apache.drill.common.exceptions.UserException;
 import org.apache.drill.exec.proto.UserBitShared;
 import org.junit.Test;
 import org.junit.Assert;
-import static org.junit.Assert.assertEquals;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 public class TestJsonRecordReader extends BaseTestQuery {

http://git-wip-us.apache.org/repos/asf/drill/blob/f7197596/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java b/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java
index c088e25..1168e37 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/vector/complex/writer/TestJsonReader.java
@@ -33,6 +33,7 @@ import java.util.List;
 import java.util.zip.GZIPOutputStream;
 
 import com.google.common.base.Joiner;
+
 import org.apache.drill.BaseTestQuery;
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.util.FileUtils;
@@ -41,6 +42,7 @@ import org.apache.drill.exec.proto.UserBitShared;
 import org.apache.drill.exec.record.RecordBatchLoader;
 import org.apache.drill.exec.record.VectorWrapper;
 import org.apache.drill.exec.rpc.user.QueryDataBatch;
+import org.apache.drill.exec.store.easy.json.JSONRecordReader;
 import org.apache.drill.exec.vector.IntVector;
 import org.apache.drill.exec.vector.RepeatedBigIntVector;
 import org.junit.Ignore;
@@ -49,6 +51,7 @@ import org.junit.Test;
 
 import com.google.common.base.Charsets;
 import com.google.common.io.Files;
+
 import org.junit.rules.TemporaryFolder;
 
 public class TestJsonReader extends BaseTestQuery {
@@ -597,4 +600,56 @@ public class TestJsonReader extends BaseTestQuery {
     os.close();
     testNoResult("select t.col2.col3 from dfs_test.tmp.drill_4032 t");
   }
+
+  @Test
+  public void drill_4479() throws Exception {
+    try {
+      String dfs_temp = getDfsTestTmpSchemaLocation();
+      File table_dir = new File(dfs_temp, "drill_4479");
+      table_dir.mkdir();
+      BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(new File(table_dir, "mostlynulls.json")));
+      // Create an entire batch of null values for 3 columns
+      for (int i = 0 ; i < JSONRecordReader.DEFAULT_ROWS_PER_BATCH; i++) {
+        os.write("{\"a\": null, \"b\": null, \"c\": null}".getBytes());
+      }
+      // Add a row with {bigint,  float, string} values
+      os.write("{\"a\": 123456789123, \"b\": 99.999, \"c\": \"Hello World\"}".getBytes());
+      os.flush();
+      os.close();
+
+      String query1 = "select c, count(*) as cnt from dfs_test.tmp.drill_4479 t group by c";
+      String query2 = "select a, b, c, count(*) as cnt from dfs_test.tmp.drill_4479 t group by a, b, c";
+      String query3 = "select max(a) as x, max(b) as y, max(c) as z from dfs_test.tmp.drill_4479 t";
+
+      testBuilder()
+        .sqlQuery(query1)
+        .ordered()
+        .optionSettingQueriesForTestQuery("alter session set `store.json.all_text_mode` = true")
+        .baselineColumns("c", "cnt")
+        .baselineValues(null, 4096L)
+        .baselineValues("Hello World", 1L)
+        .go();
+
+      testBuilder()
+        .sqlQuery(query2)
+        .ordered()
+        .optionSettingQueriesForTestQuery("alter session set `store.json.all_text_mode` = true")
+        .baselineColumns("a", "b", "c", "cnt")
+        .baselineValues(null, null, null, 4096L)
+        .baselineValues("123456789123", "99.999", "Hello World", 1L)
+        .go();
+
+      testBuilder()
+        .sqlQuery(query3)
+        .ordered()
+        .optionSettingQueriesForTestQuery("alter session set `store.json.all_text_mode` = true")
+        .baselineColumns("x", "y", "z")
+        .baselineValues("123456789123", "99.999", "Hello World")
+        .go();
+
+    } finally {
+      testNoResult("alter session set `store.json.all_text_mode` = false");
+    }
+  }
+
 }

