drill-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (DRILL-4694) CTAS in JSON format produces extraneous NULL fields
Date Sat, 04 Jun 2016 01:55:59 GMT

    [ https://issues.apache.org/jira/browse/DRILL-4694?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15315251#comment-15315251
] 

ASF GitHub Bot commented on DRILL-4694:
---------------------------------------

Github user amansinha100 commented on a diff in the pull request:

    https://github.com/apache/drill/pull/514#discussion_r65795257
  
    --- Diff: exec/java-exec/src/test/java/org/apache/drill/TestCTASJson.java ---
    @@ -0,0 +1,129 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.drill;
    +
    +
    +import org.apache.drill.common.util.TestTools;
    +import org.apache.drill.exec.ExecConstants;
    +import org.junit.Test;
    +
    +import static org.junit.Assert.assertEquals;
    +
    +public class TestCTASJson extends PlanTestBase {
    +  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestCTASJson.class);
    +
    +  static final String WORKING_PATH = TestTools.getWorkingPath();
    +  static final String TEST_RES_PATH = WORKING_PATH + "/src/test/resources";
    +
    +  @Test
    +  /**
    +   * Test a source json file that contains records that are maps with fields of all types.
    +   * Some records have missing fields. CTAS should skip the missing fields
    +   */ public void testctas_alltypes_map() throws Exception {
    +    String testName = "ctas_alltypes_map";
    +    test("use dfs_test.tmp");
    +    test("alter session set store.format = 'json' ");
    +    test("alter session set store.json.writer.skip_null_fields = true"); // DEFAULT
    +    test("create table " + testName + "_json as select * from cp.`json/" + testName +
".json`");
    +
    +    final String query = "select * from `" + testName + "_json` t1 ";
    +
    +    testBuilder()
    +        .sqlQuery(query)
    +        .ordered()
    +        .jsonBaselineFile("json/" + testName + ".json")
    +        .build()
    +        .run();
    +
    +    test("drop table " + testName + "_json" );
    +  }
    +
    +  @Test
    +  /**
    +   * Test a source json file that contains records that are maps with fields of all types.
    +   * Some records have missing fields. CTAS should NOT skip the missing fields
    +   */
    +  public void testctas_alltypes_map_noskip() throws Exception {
    +    String testName = "ctas_alltypes_map";
    +    test("use dfs_test.tmp");
    +    test("alter session set store.format = 'json' ");
    +    test("alter session set store.json.writer.skip_null_fields = false"); // CHANGE from
default
    +    test("create table " + testName + "_json as select * from cp.`json/" + testName +
".json`");
    +
    +    final String query = "select * from `" + testName + "_json` t1 ";
    +
    +    testBuilder()
    +        .sqlQuery(query)
    +        .ordered()
    +        .jsonBaselineFile("json/" + testName + "_out.json")
    +        .build()
    +        .run();
    +
    +    test("drop table " + testName + "_json" );
    +  }
    +
    +  @Test
    +  /**
    +   * Test a source json file that contains records that are maps with fields of all types.
    +   * Some records have missing fields. CTAS should skip the missing fields
    +   */ public void testctas_alltypes_repeatedmap() throws Exception {
    +    String testName = "ctas_alltypes_repeated_map";
    +    test("use dfs_test.tmp");
    +    test("alter session set store.format = 'json' ");
    +    test("alter session set store.json.writer.skip_null_fields = true"); // DEFAULT
    +    test("create table " + testName + "_json as select * from cp.`json/" + testName +
".json`");
    +
    +    final String query = "select * from `" + testName + "_json` t1 ";
    +
    +    testBuilder()
    +        .sqlQuery(query)
    +        .ordered()
    +        .jsonBaselineFile("json/" + testName + ".json")
    +        .build()
    +        .run();
    +
    +    test("drop table " + testName + "_json" );
    +
    +  }
    +
    +  @Test
    +  /**
    +   * Test a source json file that contains records that are maps with fields of all types.
    +   * Some records have missing fields. CTAS should NOT skip the missing fields
    +   */
    +  public void testctas_alltypes_repeated_map_noskip() throws Exception {
    +    String testName = "ctas_alltypes_repeated_map";
    +    test("use dfs_test.tmp");
    +    test("alter session set store.format = 'json' ");
    +    test("alter session set store.json.writer.skip_null_fields = false"); // CHANGE from
default
    --- End diff --
    
    The testBuilder provides the optionSettingQueriesForTestQuery() method to set the options
per query.  Would be good to use it and have a finally block {} that resets the option. 


> CTAS in JSON format produces extraneous NULL fields
> ---------------------------------------------------
>
>                 Key: DRILL-4694
>                 URL: https://issues.apache.org/jira/browse/DRILL-4694
>             Project: Apache Drill
>          Issue Type: Bug
>          Components: Storage - JSON
>    Affects Versions: 1.6.0
>            Reporter: Aman Sinha
>            Assignee: Aman Sinha
>
> Consider the following JSON file: 
> {noformat}
> // file t2.json
> {
> "X" : {
>   "key1" : "value1",
>   "key2" : "value2"
>   } 
> }
> {
> "X" : {
>   "key3" : "value3",
>   "key4" : "value4"
>   }
> }
> {
> "X" : {
>   "key5" : "value5",
>   "key6" : "value6"
>   }
> }
> {noformat}
> Now create a table in Json format using CTAS: 
> {noformat}
> 0: jdbc:drill:zk=local> alter session set `store.format` = 'json';
> 0: jdbc:drill:zk=local> create table dfs.tmp.jt12 as select t.`X` from `t2.json` t;
> +-----------+----------------------------+
> | Fragment  | Number of records written  |
> +-----------+----------------------------+
> | 0_0       | 3                          |
> +-----------+----------------------------+
> {noformat}
> The output file has rows with union schema of all the fields in all the records.  This
creates extraneous Null fields in the output: 
> {noformat}
> $ cat jt12/0_0_0.json 
> {
>   "X" : {
>     "key1" : "value1",
>     "key2" : "value2",
>     "key3" : null,
>     "key4" : null,
>     "key5" : null,
>     "key6" : null
>   }
> } {
>   "X" : {
>     "key1" : null,
>     "key2" : null,
>     "key3" : "value3",
>     "key4" : "value4",
>     "key5" : null,
>     "key6" : null
>   }
> } {
>   "X" : {
>     "key1" : null,
>     "key2" : null,
>     "key3" : null,
>     "key4" : null,
>     "key5" : "value5",
>     "key6" : "value6"
>   }
> }
> {noformat}
> Note that if I change the output format to CSV or Parquet, there are no Null fields created
in the output file.   The expectation for a CTAS in json format is that the output should
match that of the input json data.  



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message