Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 2D3D5200C8A for ; Sat, 20 May 2017 16:26:03 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 2BE9C160BAD; Sat, 20 May 2017 14:26:03 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id DB197160BD7 for ; Sat, 20 May 2017 16:26:01 +0200 (CEST) Received: (qmail 85803 invoked by uid 500); 20 May 2017 14:26:01 -0000 Mailing-List: contact commits-help@drill.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: commits@drill.apache.org Delivered-To: mailing list commits@drill.apache.org Received: (qmail 85593 invoked by uid 99); 20 May 2017 14:26:00 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 20 May 2017 14:26:00 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 8751AE1103; Sat, 20 May 2017 14:26:00 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: amansinha@apache.org To: commits@drill.apache.org Date: Sat, 20 May 2017 14:26:04 -0000 Message-Id: <5b3bf9656d1a4c7cb632678ebfe4678e@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [5/6] drill git commit: DRILL-5498: Better handling of CSV column headers archived-at: Sat, 20 May 2017 14:26:03 -0000 DRILL-5498: Better handling of CSV column headers See DRILL-5498 for details. Replaced the repeated varchar reader for reading columns with a purpose built column parser. Implemented rules to recover from invalid column headers. Added missing test method Changes re code review comments Back out testing-only change close apache/drill#830 Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/f21edb05 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/f21edb05 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/f21edb05 Branch: refs/heads/master Commit: f21edb057efd83989d7462910541e528a8779b79 Parents: 7f98400 Author: Paul Rogers Authored: Wed May 10 16:17:24 2017 -0700 Committer: Aman Sinha Committed: Fri May 19 10:08:31 2017 -0700 ---------------------------------------------------------------------- .../compliant/CompliantTextRecordReader.java | 9 +- .../easy/text/compliant/HeaderBuilder.java | 274 +++++++++++++++++++ .../exec/store/easy/text/compliant/TestCsv.java | 150 ++++++++++ .../easy/text/compliant/TestHeaderBuilder.java | 223 +++++++++++++++ .../org/apache/drill/test/ClusterFixture.java | 13 +- 5 files changed, 660 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/f21edb05/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/CompliantTextRecordReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/CompliantTextRecordReader.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/CompliantTextRecordReader.java index 93c4ff8..e253730 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/CompliantTextRecordReader.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/CompliantTextRecordReader.java @@ -179,11 +179,7 @@ public class CompliantTextRecordReader extends AbstractRecordReader { // don't skip header in case skipFirstLine is set true settings.setSkipFirstLine(false); - // setup Output using OutputMutator - // we should use a separate output mutator to avoid reshaping query output with header data - HeaderOutputMutator hOutputMutator = new HeaderOutputMutator(); - TextOutput hOutput = new RepeatedVarCharOutput(hOutputMutator, getColumns(), true); - this.allocate(hOutputMutator.fieldVectorMap); + HeaderBuilder hOutput = new HeaderBuilder(); // setup Input using InputStream // we should read file header irrespective of split given given to this reader @@ -198,11 +194,10 @@ public class CompliantTextRecordReader extends AbstractRecordReader { reader.parseNext(); // grab the field names from output - String [] fieldNames = ((RepeatedVarCharOutput)hOutput).getTextOutput(); + String [] fieldNames = hOutput.getHeaders(); // cleanup and set to skip the first line next time we read input reader.close(); - hOutputMutator.close(); settings.setSkipFirstLine(true); return fieldNames; http://git-wip-us.apache.org/repos/asf/drill/blob/f21edb05/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/HeaderBuilder.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/HeaderBuilder.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/HeaderBuilder.java new file mode 100644 index 0000000..8910c26 --- /dev/null +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/text/compliant/HeaderBuilder.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.easy.text.compliant; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.google.common.base.Charsets; + +/** + * Text output that implements a header reader/parser. + * The caller parses out the characters of each header; + * this class assembles UTF-8 bytes into Unicode characters, + * fixes invalid characters (those not legal for SQL symbols), + * and maps duplicate names to unique names. + *

+ * That is, this class is as permissive as possible with file + * headers to avoid spurious query failures for trivial reasons. + */ + +// Note: this class uses Java heap strings and the usual Java +// convenience classes. Since we do heavy Unicode string operations, +// and read a single row, there is no good reason to try to use +// value vectors and direct memory for this task. + +public class HeaderBuilder extends TextOutput { + + /** + * Maximum Drill symbol length, as enforced for headers. + * @see + * identifier documentation + */ + // TODO: Replace with the proper constant, if available + public static final int MAX_HEADER_LEN = 1024; + + /** + * Prefix used to replace non-alphabetic characters at the start of + * a column name. For example, $foo becomes col_foo. Used + * because SQL does not allow _foo. + */ + + public static final String COLUMN_PREFIX = "col_"; + + /** + * Prefix used to create numbered columns for missing + * headers. Typical names: column_1, column_2, ... + */ + + public static final String ANONYMOUS_COLUMN_PREFIX = "column_"; + + /** + * Exception that reports header errors. Is an unchecked exception + * to avoid cluttering the normal field reader interface. + */ + public static class HeaderError extends RuntimeException { + + private static final long serialVersionUID = 1L; + + public HeaderError(String msg) { + super(msg); + } + + public HeaderError(int colIndex, String msg) { + super("Column " + (colIndex + 1) + ": " + msg); + } + } + + public final List headers = new ArrayList<>(); + public final ByteBuffer currentField = ByteBuffer.allocate(MAX_HEADER_LEN); + + @Override + public void startField(int index) { + currentField.clear(); + } + + @Override + public boolean endField() { + String header = new String(currentField.array(), 0, currentField.position(), Charsets.UTF_8); + header = validateSymbol(header); + headers.add(header); + return true; + } + + @Override + public boolean endEmptyField() { + + // Empty header will be rewritten to "column_". + + return endField(); + } + + /** + * Validate the header name according to the SQL lexical rules. + * @see + * identifier documentation + * @param header the header name to validate + */ + + // TODO: Replace with existing code, if any. + private String validateSymbol(String header) { + header = header.trim(); + + // To avoid unnecessary query failures, just make up a column name + // if the name is missing or all blanks. + + if (header.isEmpty()) { + return ANONYMOUS_COLUMN_PREFIX + (headers.size() + 1); + } + if (! Character.isAlphabetic(header.charAt(0))) { + return rewriteHeader(header); + } + for (int i = 1; i < header.length(); i++) { + char ch = header.charAt(i); + if (! Character.isAlphabetic(ch) && + ! Character.isDigit(ch) && ch != '_') { + return rewriteHeader(header); + } + } + return header; + } + + /** + * Given an invalid header, rewrite it to replace illegal characters + * with valid ones. The header won't be what the user specified, + * but it will be a valid SQL identifier. This solution avoids failing + * queries due to corrupted or invalid header data. + *

+ * Names with invalid first characters are mapped to "col_". Example: + * $foo maps to col_foo. If the only character is non-alphabetic, treat + * the column as anonymous and create a generic name: column_4, etc. + *

+ * This mapping could create a column that exceeds the maximum length + * of 1024. Since that is not really a hard limit, we just live with the + * extra few characters. + * + * @param header the original header + * @return the rewritten header, valid for SQL + */ + + private String rewriteHeader(String header) { + final StringBuilder buf = new StringBuilder(); + + // If starts with non-alphabetic, can't map the character to + // underscore, so just tack on a prefix. + + char ch = header.charAt(0); + if (Character.isAlphabetic(ch)) { + buf.append(ch); + } else if (Character.isDigit(ch)) { + buf.append(COLUMN_PREFIX); + buf.append(ch); + + // For the strange case of only one character, format + // the same as an empty header. + + } else if (header.length() == 1) { + return ANONYMOUS_COLUMN_PREFIX + (headers.size() + 1); + } else { + buf.append(COLUMN_PREFIX); + } + + // Convert all remaining invalid characters to underscores + + for (int i = 1; i < header.length(); i++) { + ch = header.charAt(i); + if (Character.isAlphabetic(ch) || + Character.isDigit(ch) || ch == '_') { + buf.append(ch); + } else { + buf.append("_"); + } + } + return buf.toString(); + } + + @Override + public void append(byte data) { + + // Ensure the data fits. Note that, if the name is Unicode, the actual + // number of characters might be less than the limit even though the + // byte count exceeds the limit. Fixing this, in general, would require + // a buffer four times larger, so we leave that as a later improvement + // if ever needed. + + try { + currentField.put(data); + } catch (BufferOverflowException e) { + throw new HeaderError(headers.size(), "Column exceeds maximum length of " + MAX_HEADER_LEN); + } + } + + @Override + public void finishRecord() { + if (headers.isEmpty()) { + throw new HeaderError("The file must define at least one header."); + } + + // Force headers to be unique. + + final Set idents = new HashSet(); + for (int i = 0; i < headers.size(); i++) { + String header = headers.get(i); + String key = header.toLowerCase(); + + // Is the header a duplicate? + + if (idents.contains(key)) { + + // Make header unique by appending a suffix. + // This loop must end because we have a finite + // number of headers. + // The original column is assumed to be "1", so + // the first duplicate is "2", and so on. + // Note that this will map columns of the form: + // "col,col,col_2,col_2_2" to + // "col", "col_2", "col_2_2", "col_2_2_2". + // No mapping scheme is perfect... + + for (int l = 2; ; l++) { + final String rewritten = header + "_" + l; + key = rewritten.toLowerCase(); + if (! idents.contains(key)) { + headers.set(i, rewritten); + break; + } + } + } + idents.add(key); + } + } + + @Override + public long getRecordCount() { return 1; } + + @Override + public void startBatch() { } + + @Override + public void finishBatch() { } + + @Override + public boolean rowHasData() { + return ! headers.isEmpty(); + } + + public String[] getHeaders() { + + // Just return the headers: any needed checks were done in + // finishRecord() + + final String array[] = new String[headers.size()]; + return headers.toArray(array); + } + +} http://git-wip-us.apache.org/repos/asf/drill/blob/f21edb05/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestCsv.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestCsv.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestCsv.java new file mode 100644 index 0000000..7d38cf9 --- /dev/null +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestCsv.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.‰ + */ +package org.apache.drill.exec.store.easy.text.compliant; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.BatchSchema; +import org.apache.drill.exec.store.easy.text.TextFormatPlugin.TextFormatConfig; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.rowSet.RowSet; +import org.apache.drill.test.rowSet.RowSetBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.apache.drill.test.rowSet.SchemaBuilder; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * SQL-level tests for CSV headers. See + * {@link TestHeaderBuilder} for detailed unit tests. + * This test does not attempt to duplicate all the cases + * from the unit tests; instead it just does a sanity check. + */ + +public class TestCsv extends ClusterTest { + + private static File testDir; + + @BeforeClass + public static void setup() throws Exception { + startCluster(ClusterFixture.builder() + .maxParallelization(1) + ); + + // Set up CSV storage plugin using headers. + + testDir = cluster.makeTempDir("csv"); + TextFormatConfig csvFormat = new TextFormatConfig(); + csvFormat.fieldDelimiter = ','; + csvFormat.skipFirstLine = false; + csvFormat.extractHeader = true; + cluster.defineWorkspace("dfs", "data", testDir.getAbsolutePath(), "csv", csvFormat); + } + + String emptyHeaders[] = { + "", + "10,foo,bar" + }; + + @Test + public void testEmptyCsvHeaders() throws IOException { + String fileName = "case1.csv"; + buildFile(fileName, emptyHeaders); + try { + client.queryBuilder().sql(makeStatement(fileName)).run(); + fail(); + } catch (Exception e) { + assertTrue(e.getMessage().contains("must define at least one header")); + } + } + + String validHeaders[] = { + "a,b,c", + "10,foo,bar" + }; + + @Test + public void testValidCsvHeaders() throws IOException { + String fileName = "case2.csv"; + buildFile(fileName, validHeaders); + RowSet actual = client.queryBuilder().sql(makeStatement(fileName)).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .add("a", MinorType.VARCHAR) + .add("b", MinorType.VARCHAR) + .add("c", MinorType.VARCHAR) + .build(); + assertEquals(expectedSchema, actual.batchSchema()); + + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .add("10", "foo", "bar") + .build(); + new RowSetComparison(expected) + .verifyAndClear(actual); + } + + String invalidHeaders[] = { + "$,,9b,c,c,c_2", + "10,foo,bar,fourth,fifth,sixth" + }; + + @Test + public void testInvalidCsvHeaders() throws IOException { + String fileName = "case3.csv"; + buildFile(fileName, invalidHeaders); + RowSet actual = client.queryBuilder().sql(makeStatement(fileName)).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .add("column_1", MinorType.VARCHAR) + .add("column_2", MinorType.VARCHAR) + .add("col_9b", MinorType.VARCHAR) + .add("c", MinorType.VARCHAR) + .add("c_2", MinorType.VARCHAR) + .add("c_2_2", MinorType.VARCHAR) + .build(); + assertEquals(expectedSchema, actual.batchSchema()); + + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .add("10", "foo", "bar", "fourth", "fifth", "sixth") + .build(); + new RowSetComparison(expected) + .verifyAndClear(actual); + } + + private String makeStatement(String fileName) { + return "SELECT * FROM `dfs.data`.`" + fileName + "`"; + } + + private void buildFile(String fileName, String[] data) throws IOException { + try(PrintWriter out = new PrintWriter(new FileWriter(new File(testDir, fileName)))) { + for (String line : data) { + out.println(line); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/drill/blob/f21edb05/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestHeaderBuilder.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestHeaderBuilder.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestHeaderBuilder.java new file mode 100644 index 0000000..47bb903 --- /dev/null +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/easy/text/compliant/TestHeaderBuilder.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.easy.text.compliant; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.drill.exec.store.easy.text.compliant.HeaderBuilder.HeaderError; +import org.apache.drill.test.DrillTest; +import org.junit.Test; + +import com.google.common.base.Charsets; + +public class TestHeaderBuilder extends DrillTest { + + @Test + public void testEmptyHeader() { + HeaderBuilder hb = new HeaderBuilder(); + hb.startBatch(); + try { + hb.finishRecord(); + } catch (HeaderError e) { + assertTrue(e.getMessage().contains("must define at least one header")); + } + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,""); + try { + hb.finishRecord(); + } catch (HeaderError e) { + assertTrue(e.getMessage().contains("must define at least one header")); + } + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb," "); + validateHeader(hb, new String[] {"column_1"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,","); + validateHeader(hb, new String[] {"column_1", "column_2"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb," , "); + validateHeader(hb, new String[] {"column_1", "column_2"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a, "); + validateHeader(hb, new String[] {"a", "column_2"}); + } + + @Test + public void testWhiteSpace() { + HeaderBuilder hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a"); + validateHeader(hb, new String[] {"a"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb," a "); + validateHeader(hb, new String[] {"a"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb," a "); + validateHeader(hb, new String[] {"a"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a,b,c"); + validateHeader(hb, new String[] {"a","b","c"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb," a , b , c "); + validateHeader(hb, new String[] {"a","b","c"}); + } + + @Test + public void testSyntax() { + HeaderBuilder hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a_123"); + validateHeader(hb, new String[] {"a_123"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a_123_"); + validateHeader(hb, new String[] {"a_123_"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"az09_"); + validateHeader(hb, new String[] {"az09_"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"+"); + validateHeader(hb, new String[] {"column_1"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"+,-"); + validateHeader(hb, new String[] {"column_1", "column_2"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"+9a"); + validateHeader(hb, new String[] {"col_9a"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"9a"); + validateHeader(hb, new String[] {"col_9a"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a+b"); + validateHeader(hb, new String[] {"a_b"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"a_b"); + validateHeader(hb, new String[] {"a_b"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"EXPR$0"); + validateHeader(hb, new String[] {"EXPR_0"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"(_-^-_)"); + validateHeader(hb, new String[] {"col_______"}); + } + + @Test + public void testUnicode() { + HeaderBuilder hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"Αθήνα"); + validateHeader(hb, new String[] {"Αθήνα"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"Москва"); + validateHeader(hb, new String[] {"Москва"}); + + hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,"Paris,Αθήνα,Москва"); + validateHeader(hb, new String[] {"Paris","Αθήνα","Москва"}); + } + + @Test + public void testDuplicateNames() { + testParser("a,a", new String[] {"a","a_2"}); + testParser("a,A", new String[] {"a","A_2"}); + // It ain't pretty, but it is unique... + testParser("a,A,A_2", new String[] {"a","A_2", "A_2_2"}); + // Verify with non-ASCII characters + testParser("Αθήνα,ΑθήνΑ", new String[] {"Αθήνα","ΑθήνΑ_2"}); + } + + private void testParser(String input, String[] expected) { + HeaderBuilder hb = new HeaderBuilder(); + hb.startBatch(); + parse(hb,input); + hb.finishRecord(); + validateHeader(hb, expected); + } + + private void parse(HeaderBuilder hb, String input) { + if (input == null) { + return; + } + byte bytes[] = input.getBytes(Charsets.UTF_8); + if (bytes.length == 0) { + return; + } + int fieldIndex = -1; + hb.startField(++fieldIndex); + for (int i = 0; i < bytes.length; i++) { + byte b = bytes[i]; + if (b == ',') { + hb.endField(); + hb.startField(++fieldIndex); + } else { + hb.append(b); + } + } + hb.endField(); + } + + private void validateHeader(HeaderBuilder hb, String[] expected) { + String actual[] = hb.getHeaders(); + assertEquals(expected.length, actual.length); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], actual[i]); + } + } + +} http://git-wip-us.apache.org/repos/asf/drill/blob/f21edb05/exec/java-exec/src/test/java/org/apache/drill/test/ClusterFixture.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterFixture.java b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterFixture.java index 0ce337d..513fe3a 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterFixture.java +++ b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterFixture.java @@ -36,6 +36,7 @@ import org.apache.drill.DrillTestWrapper.TestServices; import org.apache.drill.QueryTestUtil; import org.apache.drill.TestBuilder; import org.apache.drill.common.exceptions.ExecutionSetupException; +import org.apache.drill.common.logical.FormatPluginConfig; import org.apache.drill.exec.ExecConstants; import org.apache.drill.exec.ZookeeperHelper; import org.apache.drill.exec.client.DrillClient; @@ -526,9 +527,14 @@ public class ClusterFixture extends BaseFixture implements AutoCloseable { public void defineWorkspace(String pluginName, String schemaName, String path, String defaultFormat) { + defineWorkspace(pluginName, schemaName, path, defaultFormat, null); + } + + public void defineWorkspace(String pluginName, String schemaName, String path, + String defaultFormat, FormatPluginConfig format) { for (Drillbit bit : drillbits()) { try { - defineWorkspace(bit, pluginName, schemaName, path, defaultFormat); + defineWorkspace(bit, pluginName, schemaName, path, defaultFormat, format); } catch (ExecutionSetupException e) { // This functionality is supposed to work in tests. Change // exception to unchecked to make test code simpler. @@ -539,7 +545,7 @@ public class ClusterFixture extends BaseFixture implements AutoCloseable { } public static void defineWorkspace(Drillbit drillbit, String pluginName, - String schemaName, String path, String defaultFormat) + String schemaName, String path, String defaultFormat, FormatPluginConfig format) throws ExecutionSetupException { @SuppressWarnings("resource") final StoragePluginRegistry pluginRegistry = drillbit.getContext().getStorage(); @@ -550,6 +556,9 @@ public class ClusterFixture extends BaseFixture implements AutoCloseable { pluginConfig.workspaces.remove(schemaName); pluginConfig.workspaces.put(schemaName, newTmpWSConfig); + if (format != null) { + pluginConfig.formats.put(defaultFormat, format); + } pluginRegistry.createOrUpdate(pluginName, pluginConfig, true); }