asterixdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ima...@apache.org
Subject [2/2] incubator-asterixdb-hyracks git commit: Issue 867: Handle delimited files using CR-only line separators
Date Fri, 01 May 2015 02:38:05 GMT
Issue 867: Handle delimited files using CR-only line separators

Also simplify record- and field-counting logic.

Change-Id: Ie28abda93fc9e5996008fac8b60aaf906df49cb7
Reviewed-on: https://asterix-gerrit.ics.uci.edu/246
Reviewed-by: Ian Maxon <imaxon@uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Preston Carman <ecarm002@ucr.edu>


Project: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/commit/ec8d7a2f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/tree/ec8d7a2f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/diff/ec8d7a2f

Branch: refs/heads/master
Commit: ec8d7a2f30ae057674f6c3c92837cb9756955247
Parents: 12bab0d
Author: Chris Hillery <chillery@lambda.nu>
Authored: Thu Apr 30 17:03:35 2015 -0700
Committer: Chris Hillery <ceej@lambda.nu>
Committed: Thu Apr 30 19:27:14 2015 -0700

----------------------------------------------------------------------
 .../file/DelimitedDataTupleParserFactory.java   |  6 +----
 .../file/FieldCursorForDelimitedDataParser.java | 24 +++++++++++---------
 2 files changed, 14 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/ec8d7a2f/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
index 6fd38d2..5be1eab 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
@@ -37,8 +37,6 @@ public class DelimitedDataTupleParserFactory implements ITupleParserFactory {
     private char fieldDelimiter;
     private char quote;
 
-    private int fieldCount;
-
     public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter) {
         this(fieldParserFactories, fieldDelimiter, '\"');
     }
@@ -47,7 +45,6 @@ public class DelimitedDataTupleParserFactory implements ITupleParserFactory {
         this.valueParserFactories = fieldParserFactories;
         this.fieldDelimiter = fieldDelimiter;
         this.quote = quote;
-        this.fieldCount = 0;
     }
 
     @Override
@@ -71,7 +68,7 @@ public class DelimitedDataTupleParserFactory implements ITupleParserFactory {
                     while (cursor.nextRecord()) {
                         tb.reset();
                         for (int i = 0; i < valueParsers.length; ++i) {
-                            if (!cursor.nextField(fieldCount)) {
+                            if (!cursor.nextField()) {
                                 break;
                             }
                             // Eliminate double quotes in the field that we are going to parse
@@ -82,7 +79,6 @@ public class DelimitedDataTupleParserFactory implements ITupleParserFactory {
                             }
                             valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart, dos);
                             tb.addFieldEndOffset();
-                            fieldCount++;
                         }
                         if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                             FrameUtils.flushFrame(frame, writer);

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/ec8d7a2f/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
index 69ea0b1..780574c 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
@@ -32,7 +32,8 @@ public class FieldCursorForDelimitedDataParser {
     public char[] buffer;
     public int fStart;
     public int fEnd;
-    public int lineCount;
+    public int recordCount;
+    public int fieldCount;
     public int doubleQuoteCount;
     public boolean isDoubleQuoteIncludedInThisField;
 
@@ -69,10 +70,13 @@ public class FieldCursorForDelimitedDataParser {
         doubleQuoteCount = 0;
         startedQuote = false;
         isDoubleQuoteIncludedInThisField = false;
-        lineCount = 1;
+        recordCount = 0;
+        fieldCount = 0;
     }
 
     public boolean nextRecord() throws IOException {
+        recordCount++;
+        fieldCount = 0;
         while (true) {
             switch (state) {
                 case INIT:
@@ -119,12 +123,12 @@ public class FieldCursorForDelimitedDataParser {
                         } else if (ch == '\n' && !startedQuote) {
                             start = p + 1;
                             state = State.EOR;
-                            lineCount++;
                             lastDelimiterPosition = p;
                             break;
                         } else if (ch == '\r' && !startedQuote) {
                             start = p + 1;
                             state = State.CR;
+                            lastDelimiterPosition = p;
                             break;
                         }
                         ++p;
@@ -143,7 +147,6 @@ public class FieldCursorForDelimitedDataParser {
                     if (ch == '\n' && !startedQuote) {
                         ++start;
                         state = State.EOR;
-                        lineCount++;
                     } else {
                         state = State.IN_RECORD;
                         return true;
@@ -167,7 +170,8 @@ public class FieldCursorForDelimitedDataParser {
         }
     }
 
-    public boolean nextField(int fieldCount) throws IOException {
+    public boolean nextField() throws IOException {
+        fieldCount++;
         switch (state) {
             case INIT:
             case EOR:
@@ -217,10 +221,10 @@ public class FieldCursorForDelimitedDataParser {
                             } else {
                                 // In this case, we don't have a quote in the beginning of a field.
                                 throw new IOException(
-                                        "At line: "
-                                                + lineCount
+                                        "At record: "
+                                                + recordCount
                                                 + ", field#: "
-                                                + (fieldCount + 1)
+                                                + fieldCount
                                                 + " - a quote enclosing a field needs to be placed in the beginning of that field.");
                             }
                         }
@@ -262,7 +266,7 @@ public class FieldCursorForDelimitedDataParser {
                                 // There is a quote before the delimiter, however it is not directly placed before the delimiter.
                                 // In this case, we throw an exception.
                                 // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
-                                throw new IOException("At line: " + lineCount + ", field#: " + (fieldCount + 1)
+                                throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
                                         + " -  A quote enclosing a field needs to be followed by the delimiter.");
                             }
                         }
@@ -275,7 +279,6 @@ public class FieldCursorForDelimitedDataParser {
                             fEnd = p;
                             start = p + 1;
                             state = State.EOR;
-                            lineCount++;
                             lastDelimiterPosition = p;
                             return true;
                         } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
@@ -286,7 +289,6 @@ public class FieldCursorForDelimitedDataParser {
                             lastDelimiterPosition = p;
                             start = p + 1;
                             state = State.EOR;
-                            lineCount++;
                             startedQuote = false;
                             return true;
                         }


Mime
View raw message