commons-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bay...@apache.org
Subject svn commit: r383468 - in /jakarta/commons/sandbox/csv/trunk/src: java/org/apache/commons/csv/CSVParser.java test/org/apache/commons/csv/CSVParserTest.java
Date Mon, 06 Mar 2006 05:11:23 GMT
Author: bayard
Date: Sun Mar  5 21:11:21 2006
New Revision: 383468

URL: http://svn.apache.org/viewcvs?rev=383468&view=rev
Log:
Javadoc improvements, more unit tests, change of API to a chain style, some bugfixes

Modified:
    jakarta/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
    jakarta/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java

Modified: jakarta/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java?rev=383468&r1=383467&r2=383468&view=diff
==============================================================================
--- jakarta/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java (original)
+++ jakarta/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java Sun Mar  5 21:11:21 2006
@@ -34,7 +34,13 @@
  * <p>Parsing of a csv-string having ';' as separator:</p>
  * <pre>
  *  String[][] data = 
- *         (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ *      (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ * </pre>
+ * 
+ * <p>The API allows chained method calls, if you like this coding style:</p>
+ * <pre>
+ *  String[][] data = (new CSVParser(new StringReader("a;b\nc;d"),';'))
+ *      .setExcelStrategy().setIgnoreEmptyLines(true).getAllValues();
  * </pre>
  * 
  * <p>
@@ -45,14 +51,18 @@
  * for more details</p>
  */
 public class CSVParser {
-  
+
   /** length of the initial token (content-)buffer */
   private static final int INITIAL_TOKEN_LENGTH = 50;
   
   // the token types
+  /** Token has no valid content, i.e. is in its initilized state. */
   protected static final int TT_INVALID = -1;
+  /** Token with content, at beginning or in the middle of a line. */
   protected static final int TT_TOKEN = 0;
+  /** Token (which can have content) when end of file is reached. */
   protected static final int TT_EOF = 1;
+  /** Token with content when end of a line is reached. */
   protected static final int TT_EORECORD = 2;
    
   // the csv definition
@@ -72,12 +82,13 @@
    * It is used as contract between the lexer and the parser. 
    */
   class Token {
-    // token type see TT_xxx constants
+    /** Token type, see TT_xxx constants. */
     int type;
-    // the content buffer
+    /** The content buffer. */
     StringBuffer content;
-    // token ready flag: indicates a valid token (ready for the parser)
+    /** Token ready flag: indicates a valid token with content (ready for the parser). */
     boolean isReady;
+    /** Initializes an empty token. */
     Token() {
       content = new StringBuffer(INITIAL_TOKEN_LENGTH);
       type = TT_INVALID;
@@ -92,6 +103,7 @@
   /**
    * Parses the given String according to the default CSV strategy.
    * 
+   * @param s CSV String to be parsed.
    * @return parsed String matrix (which is never null)
    * @throws IOException in case of error
    * @see #setCSVStrategy()
@@ -100,7 +112,13 @@
     if (s == null) {
       throw new IllegalArgumentException("Null argument not allowed.");
     }
-    return (new CSVParser(new StringReader(s))).getAllValues();
+    String[][] result = (new CSVParser(new StringReader(s))).getAllValues();
+    if (result == null) {
+      // since CSVStrategy ignores empty lines an empty array is returned
+      // (i.e. not "result = new String[][] {{""}};")
+      result = new String[0][0];
+    }
+    return result;
   }
   
   /**
@@ -109,6 +127,7 @@
    * Parsing empty string will be handled as valid records containing zero
    * elements, so the following property holds: parseLine("").length == 0.
    * 
+   * @param s CSV String to be parsed.
    * @return parsed String vector (which is never null)
    * @throws IOException in case of error
    * @see #setCSVStrategy()
@@ -166,8 +185,8 @@
    * Customized csv parser.
    * 
    * The parser parses according to the given CSV dialect settings.
-   * Leading whitespaces are truncated whereas unicode escapes are
-   * not interpreted.
+   * Leading whitespaces are truncated, unicode escapes are
+   * not interpreted and empty lines are ignored.
    * 
    * @param input a Reader based on "csv-formatted" input
    * @param delimiter a Char used for value separation
@@ -201,6 +220,7 @@
    * the stream.
    * 
    * @return matrix of records x values ('null' when end of file)
+   * @throws IOException on parse error or input read-failure
    */
   public String[][] getAllValues() throws IOException {
     Vector records = new Vector();
@@ -221,7 +241,7 @@
    * and returns the next csv-value as string.
    * 
    * @return next value in the input stream ('null' when end of file)
-   * @throws IOException
+   * @throws IOException on parse error or input read-failure
    */
   public String nextValue() throws IOException {
     Token tkn = nextToken();
@@ -266,7 +286,11 @@
         record.add(tkn.content.toString());
         break;
       case TT_EOF:
-        ret = null;
+        if (tkn.isReady) {
+          record.add(tkn.content.toString());
+        } else {
+          ret = null;
+        }
         break;
       case TT_INVALID:
       default:
@@ -290,9 +314,8 @@
    *            number does not correspond to the record-number
    * 
    * @return  current line number
-   * @throws IOException
    */
-  public int getLineNumber() throws IOException {
+  public int getLineNumber() {
     return in.getLineNumber();  
   }
   
@@ -301,15 +324,17 @@
   // ======================================================
  
  /**
-  * Returns the next token 
-  * 
-  * a token coresponds to a term, a record change
-  * or and end-of-file indicator
-  */
+   * Returns the next token.
+   * 
+   * A token corresponds to a term, a record change or an
+   * end-of-file indicator.
+   * 
+   * @return the next token found
+   * @throws IOException on stream access error
+   */
   protected Token nextToken() throws IOException {
     Token tkn = new Token();
     StringBuffer wsBuf = new StringBuffer();
-    // boolean skipEmptyLines = false;
     
     // get the last read char (required for empty line detection)
     int lastChar = in.readAgain();
@@ -342,7 +367,7 @@
     }
 
     // did we reached eof during the last iteration already ? TT_EOF
-    if (isEndOfFile(lastChar)) {
+    if (isEndOfFile(lastChar) || (lastChar != delimiter && isEndOfFile(c))) {
       tkn.type = TT_EOF;
       return tkn;
     } 
@@ -375,8 +400,7 @@
       } else if (isEndOfFile(c)) {
         // end of file return TT_EOF()
         tkn.content.append("");
-        tkn.type = TT_EORECORD;
-        // tkn.type = TT_EOF;
+        tkn.type = TT_EOF;
         tkn.isReady = true;
       } else {
         // next token must be a simple token
@@ -417,23 +441,15 @@
         tkn.isReady = true;
       } else if (isEndOfFile(c)) {
         // end of file
-        // tkn.type = TT_EOF;
-        tkn.type = TT_EORECORD;
+        tkn.type = TT_EOF;
         tkn.isReady = true;
       } else if (c == delimiter) {
         // end of token
         tkn.type = TT_TOKEN;
         tkn.isReady = true;
-      } else if (c == '\\') {
-        // handle escaped delimiters (remove escaping)
-        if (in.lookAhead() == this.delimiter) {
-          tkn.content.append((char) in.read());
-        } else if (interpretUnicodeEscapes && in.lookAhead() == 'u') {
-          // interpret unicode escaped chars (like \u0070 -> p)
-          tkn.content.append((char) unicodeEscapeLexer(c));
-        } else {
-          tkn.content.append((char) c);
-        }
+      } else if (c == '\\' && interpretUnicodeEscapes && in.lookAhead() == 'u') {
+        // interpret unicode escaped chars (like \u0070 -> p)
+        tkn.content.append((char) unicodeEscapeLexer(c));
       } else if (isWhitespace(c)) {
         // gather whitespaces 
         // (as long as they are not at the beginning of a token)
@@ -484,7 +500,9 @@
           c = in.read();
           tkn.content.append((char) c);
         } else if (c == '\\' && in.lookAhead() == '\\') {
-          // doubled escape character -> add single escape char to stream
+          // doubled escape char, it does not escape itself, only encapsulator 
+          // -> add both escape chars to stream
+          tkn.content.append((char) c);
           c = in.read();
           tkn.content.append((char) c);
         } else if (
@@ -493,16 +511,18 @@
           && in.lookAhead() == 'u') {
           // interpret unicode escaped chars (like \u0070 -> p)
           tkn.content.append((char) unicodeEscapeLexer(c));
+        } else if (c == '\\') {
+          // use a single escape character -> add it to stream
+          tkn.content.append((char) c);
         } else {
-          // token finish mark reached: ignore ws till delimiter
+          // token finish mark (encapsulator) reached: ignore whitespace till delimiter
           while (!tkn.isReady) {
             int n = in.lookAhead();
             if (n == delimiter) {
               tkn.type = TT_TOKEN;
               tkn.isReady = true;
             } else if (isEndOfFile(n)) {
-              // tkn.type = TT_EOF;
-              tkn.type = TT_EORECORD;
+              tkn.type = TT_EOF;
               tkn.isReady = true;
             } else if (isEndOfLine(n)) {
               // ok eo token reached
@@ -538,11 +558,11 @@
   
   
   /**
-   * Decodes Unicode escapes 
+   * Decodes Unicode escapes.
    * 
    * Interpretation of "\\uXXXX" escape sequences
-   * where XXXX is a hex-number
-   * @param c
+   * where XXXX is a hex-number.
+   * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
    * @return the decoded character
    * @throws IOException on wrong unicode escape sequence or read error
    */
@@ -576,29 +596,40 @@
    * Sets the "Default CSV" settings.
    * 
    * The default csv settings are relatively restrictive but implement
-   * something like the "least-common-basis" of CSV.
-   * 
-   * Values are separated by ',' (as the C in "CSV"). Complex values must
-   * be surrounded by '"'. Comments are not supported. Leading whitespaces
-   * are ignored, unicode escapes are not interpreted and empty lines
-   * are skiped.
+   * something like the "least-common-basis" of CSV:
+   * <ul>
+   * <li> Delimiter of values is comma ',' (as the C in "CSV") </li>
+   * <li> Complex values encapsulated by '"' </li>
+   * <li> Comments are not supported </li>
+   * <li> Leading whitespaces are ignored </li>
+   * <li> Unicode escapes are not interpreted </li>
+   * <li> empty lines are skiped </li>
+   * </ul>
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setCSVStrategy() {
+  public CSVParser setCSVStrategy() {
     setStrategy(',', '"', (char) 0, true, false, true);
+    return this;
   }
   
   /**
-   * Sets the "Excel CSV" settings.
-   * 
-   * There are companies out there which interpret "C" as an abbreviation for
-   * "Semicolon". For these companies the following settings might be
-   * appropriate: 
-   * <p>
-   * Delimiter Semicolon ';', Complex-values surrounded by '"', leading 
-   * whitespaces are not ignored and unicode escapes are not interpreted.
+   * Sets the "Excel CSV" settings. There are companies out there which
+   * interpret "C" as an abbreviation for "Semicolon". For these companies the
+   * following settings might be appropriate:
+   * <ul>
+   * <li> Delimiter of values is semicolon ';' </li>
+   * <li> Complex values encapsulated by '"' </li>
+   * <li> Comments are not supported </li>
+   * <li> Leading whitespaces are not ignored </li>
+   * <li> Unicode escapes are not interpreted </li>
+   * <li> empty lines are not skiped </li>
+   * </ul>
+   *
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setExcelStrategy() {
+  public CSVParser setExcelStrategy() {
     setStrategy(';', '"', (char) 0, false, false, false);
+    return this;
   }
   
   /**
@@ -612,8 +643,9 @@
    * @param interpretUnicodeEscapes TRUE when unicode escapes should be 
    *                                interpreted
    * @param ignoreEmptyLines TRUE when the parser should skip emtpy lines
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setStrategy(
+  public CSVParser setStrategy(
     char delimiter, 
     char encapsulator, 
     char commentStart, 
@@ -626,15 +658,18 @@
     this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
     this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
     this.setIgnoreEmptyLines(ignoreEmptyLines);
+    return this;
   }
   
   /**
-   * Set the desired delimiter
+   * Set the desired delimiter.
    *
    * @param c a Char used for value separation
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setDelimiter(char c) {
+  public CSVParser setDelimiter(char c) {
     this.delimiter = c;
+    return this;
   }
   
   /**
@@ -647,12 +682,14 @@
   }
   
   /**
-   * Set the desired encapsulator
+   * Set the desired encapsulator.
    * 
    * @param c a Char used as value encapsulation marker
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setEncapsulator(char c) {
+  public CSVParser setEncapsulator(char c) {
     this.encapsulator = c;
+    return this;
   }
   
   /**
@@ -665,16 +702,18 @@
   }
   
   /**
-   * Set the desired comment start character
+   * Set the desired comment start character.
    * 
    * @param c a Char used for comment identification
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setCommentStart(char c) {
+  public CSVParser setCommentStart(char c) {
     this.commentStart = c;
+    return this;
   }
   
   /**
-   * Gets the comment identifier
+   * Gets the comment identifier.
    * 
    * @return the comment identifier character
    */
@@ -683,16 +722,18 @@
   }
   
   /**
-   * Enables unicode escape interpretation
+   * Enables unicode escape interpretation.
    * 
    * @param b TRUE when interpretation should be enabled
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setUnicodeEscapeInterpretation(boolean b) {
+  public CSVParser setUnicodeEscapeInterpretation(boolean b) {
     this.interpretUnicodeEscapes = b;
+    return this;
   }
   
   /**
-   * Shows wether unicode interpretation is enabled
+   * Shows wether unicode interpretation is enabled.
    * 
    * @return TRUE when unicode interpretation is enabled
    */
@@ -704,16 +745,18 @@
    * Sets the ignore-leading-whitespaces behaviour.
    * 
    * Should the lexer ignore leading whitespaces when parsing non 
-   * encapsulated tokens
+   * encapsulated tokens.
    * 
    * @param b TRUE when leading whitespaces should be ignored
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setIgnoreLeadingWhitespaces(boolean b) {
+  public CSVParser setIgnoreLeadingWhitespaces(boolean b) {
     this.ignoreLeadingWhitespaces = b;
+    return this;
   }
   
   /**
-   * Shows wether unicode interpretation is enabled
+   * Shows whether unicode interpretation is enabled.
    * 
    * @return TRUE when unicode interpretation is enabled
    */
@@ -726,10 +769,21 @@
    * 
    * When set to 'true' empty lines in the input will be ignored.
    * 
-   * @param b
+   * @param b TRUE when empty lines in the input should be ignored
+   * @return current instance of CSVParser to allow chained method calls
    */
-  public void setIgnoreEmptyLines(boolean b) {
+  public CSVParser setIgnoreEmptyLines(boolean b) {
     this.ignoreEmptyLines = b;  
+    return this;
+  }
+  
+  /**
+   * Shows whether empty lines in the input are ignored.
+   * 
+   * @return TRUE when empty lines in the input are ignored
+   */
+  public boolean getIgnoreEmptyLines() {
+    return this.ignoreEmptyLines;
   }
   
   // ======================================================

Modified: jakarta/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java
URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java?rev=383468&r1=383467&r2=383468&view=diff
==============================================================================
--- jakarta/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java (original)
+++ jakarta/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java Sun Mar  5 21:11:21 2006
@@ -36,12 +36,22 @@
 public class CSVParserTest extends TestCase {
   
   /**
-   * TestCSVParser
+   * TestCSVParser.
    */
   class TestCSVParser extends CSVParser {
+    /**
+     * Test parser to investigate the type of the internal Token.
+     * @param in a Reader
+     */
     TestCSVParser(Reader in) {
       super(in);
     }
+    /**
+     * Calls super.nextToken() and prints out a String representation of token
+     * type and content.
+     * @return String representation of token type and content
+     * @throws IOException like {@link CSVParser#nextToken()}
+     */
     public String testNextToken() throws IOException {
       Token t = super.nextToken();
       String tmp = Integer.toString(t.type) + ";" + t.content + ";";
@@ -51,13 +61,17 @@
   }
   
   /**
-   * Constructor for CSVParserTest.
-   * @param arg0
+   * Constructor for JUnit.
+   * @param name Name to be used in JUnit Test Environment
    */
-  public CSVParserTest(String arg0) {
-    super(arg0);
+  public CSVParserTest(String name) {
+    super(name);
   }
 
+  /**
+   * Returns a Test suite for JUnit.
+   * @return Test suite for JUnit
+   */
   public static Test suite() {
     return new TestSuite(CSVParserTest.class);
   }
@@ -95,23 +109,40 @@
   public void testSetCSVStrategy() {
     CSVParser parser = new CSVParser(new StringReader("hello world"));
     // default settings
-    assertEquals(parser.getCommentStart(), '\0');
-    assertEquals(parser.getEncapsulator(), '"');
     assertEquals(parser.getDelimiter(), ',');
+    assertEquals(parser.getEncapsulator(), '"');
+    assertEquals(parser.getCommentStart(), '\0');
+    assertEquals(true,  parser.getIgnoreLeadingWhitespaces());
+    assertEquals(false, parser.getUnicodeEscapeInterpretation());
+    assertEquals(true,  parser.getIgnoreEmptyLines());
     // explicit csv settings
     parser.setCSVStrategy();
-    assertEquals(parser.getCommentStart(), '\0');
-    assertEquals(parser.getEncapsulator(), '"');
     assertEquals(parser.getDelimiter(), ',');
+    assertEquals(parser.getEncapsulator(), '"');
+    assertEquals(parser.getCommentStart(), '\0');
+    assertEquals(true,  parser.getIgnoreLeadingWhitespaces());
+    assertEquals(false, parser.getUnicodeEscapeInterpretation());
+    assertEquals(true,  parser.getIgnoreEmptyLines());
   }
   
+  public void testSetExcelStrategy() {
+    CSVParser parser = new CSVParser(new StringReader("hello world"));
+    // explicit Excel settings
+    parser.setExcelStrategy();
+    assertEquals(parser.getDelimiter(), ';');
+    assertEquals(parser.getEncapsulator(), '"');
+    assertEquals(parser.getCommentStart(), '\0');
+    assertEquals(false,  parser.getIgnoreLeadingWhitespaces());
+    assertEquals(false, parser.getUnicodeEscapeInterpretation());
+    assertEquals(false, parser.getIgnoreEmptyLines());
+  }
   
   
   // ======================================================
   //   lexer tests
   // ======================================================
   
-  // single line (without comment)
+  // Single line (without comment)
   public void testNextToken1() throws IOException {
     String code = "abc,def, hijk,  lmnop,   qrst,uv ,wxy   ,z , ,";
     TestCSVParser parser = new TestCSVParser(new StringReader(code));
@@ -126,14 +157,13 @@
     assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken());
     assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken());
     assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
-    assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken());
     assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());  
   }
   
   // multiline including comments (and empty lines)
   public void testNextToken2() throws IOException {
     /*   file:   1,2,3,
-     *           a,b,c
+     *           a,b x,c
      *
      *           # this is a comment 
      *           d,e,
@@ -172,10 +202,13 @@
     parser.setCommentStart('#');
     System.out.println("---------\n" + code + "\n-------------");
     assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken());
+    // an unquoted single backslash is not an escape char
+    assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
+    assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
     assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken());
-    assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken());
+    // an unquoted single backslash is not an escape char
+    assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
+    assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
     assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
   }
   
@@ -183,7 +216,7 @@
   public void testNextToken4() throws IOException {
     /* file:  a,"foo",b
      *        a,   " foo",b
-     *        a,"foo "   ,b
+     *        a,"foo "   ,b     // whitespace after closing encapsulator
      *        a,  " foo " ,b
      */ 
      String code = 
@@ -202,28 +235,29 @@
      assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
      assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
      assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken());
-     assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
-     assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());    
+//     assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
+     assertEquals(CSVParser.TT_EOF + ";b;", parser.testNextToken());    
   }
   
   // encapsulator tokenizer (multi line, delimiter in string)
   public void testNextToken5() throws IOException {   
     String code = 
-      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\",\"\\\"\",\"\"\"\"";
+      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\",\"\\\"\""
+      + ",\"\\,\"" 
+      + ",\"\"\"\"";
     TestCSVParser parser = new TestCSVParser(new StringReader(code));
     parser.setCSVStrategy();
     System.out.println("---------\n" + code + "\n-------------");
     assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
     assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken());
     assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
-    assertEquals(
-      CSVParser.TT_EORECORD + ";foo\n  baar ,,,;", 
-      parser.testNextToken());
+    assertEquals(CSVParser.TT_EORECORD + ";foo\n  baar ,,,;",
+        parser.testNextToken());
     assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
     assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
-    assertEquals(CSVParser.TT_EORECORD + ";\";", parser.testNextToken());
-    assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
-    
+    // escape char in quoted input only escapes delimiter
+    assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
+    assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
   }
   
   // change delimiters, comment, encapsulater
@@ -259,11 +293,10 @@
     {"a", "b", "c", "d"},
     {"a", "b", "1 2"}, 
     {"foo baar", "b", ""}, 
-    {"foo\n,,\n\",,\n\"", "d", "e"},
-    {""}
+    {"foo\n,,\n\",,\n\"", "d", "e"}
   };
   public void testGetLine() throws IOException {
-    TestCSVParser parser = new TestCSVParser(new StringReader(code));
+    CSVParser parser = new CSVParser(new StringReader(code));
     System.out.println("---------\n" + code + "\n-------------");
     String[] tmp = null;
     for (int i = 0; i < res.length; i++) {
@@ -275,7 +308,7 @@
   }
   
   public void testNextValue() throws IOException {
-    TestCSVParser parser = new TestCSVParser(new StringReader(code));
+    CSVParser parser = new CSVParser(new StringReader(code));
     System.out.println("---------\n" + code + "\n-------------");
     String tmp = null;
     for (int i = 0; i < res.length; i++) {
@@ -289,7 +322,7 @@
   }
   
   public void testGetAllValues() throws IOException {
-    TestCSVParser parser = new TestCSVParser(new StringReader(code));
+    CSVParser parser = new CSVParser(new StringReader(code));
     System.out.println("---------\n" + code + "\n-------------");
     String[][] tmp = parser.getAllValues();
     assertEquals(res.length, tmp.length);
@@ -299,7 +332,7 @@
     }
   }
   
-  public void testExcelStrategyTest() throws IOException {
+  public void testExcelStrategy1() throws IOException {
     String code = 
       "value1;value2;value3;value4\r\na;b;c;d\r\n  x;;;"
       + "\r\n\r\n\"\"\"hello\"\"\";\"  \"\"world\"\"\";\"abc\ndef\";\r\n";
@@ -308,10 +341,9 @@
       {"a", "b", "c", "d"},
       {"  x", "", "", ""},
       {""},
-      {"\"hello\"", "  \"world\"", "abc\ndef", ""},
-      {""}
+      {"\"hello\"", "  \"world\"", "abc\ndef", ""}
     };
-    TestCSVParser parser = new TestCSVParser(new StringReader(code));
+    CSVParser parser = new CSVParser(new StringReader(code));
     parser.setExcelStrategy();
     System.out.println("---------\n" + code + "\n-------------");
     String[][] tmp = parser.getAllValues();
@@ -322,17 +354,16 @@
     }
   }
   
-  public void testExcelStrategyTest2() throws Exception {
+  public void testExcelStrategy2() throws Exception {
     String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
     String[][] res = {
       {"foo", "baar"},
       {""},
       {"hello", ""},
       {""},
-      {"world", ""},
-      {""} 
+      {"world", ""}
     };
-    TestCSVParser parser = new TestCSVParser(new StringReader(code));
+    CSVParser parser = new CSVParser(new StringReader(code));
     parser.setExcelStrategy();
     System.out.println("---------\n" + code + "\n-------------");
     String[][] tmp = parser.getAllValues();
@@ -344,7 +375,166 @@
       }
       assertTrue(Arrays.equals(res[i], tmp[i])); 
     }
-    //assertTrue(false);
+  }
+  
+  public void testEndOfFileBehaviourExcel() throws Exception {
+    String[] codes = {
+        "hello;\r\n\r\nworld;\r\n",
+        "hello;\r\n\r\nworld;",
+        "hello;\r\n\r\nworld;\"\"\r\n",
+        "hello;\r\n\r\nworld;\"\"",
+        "hello;\r\n\r\nworld;\n",
+        "hello;\r\n\r\nworld;",
+        "hello;\r\n\r\nworld;\"\"\n",
+        "hello;\r\n\r\nworld;\"\""
+        };
+    String[][] res = {
+      {"hello", ""},
+      {""},  // ExcelStrategy does not ignore empty lines
+      {"world", ""}
+    };
+    String code;
+    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+      code = codes[codeIndex];
+      CSVParser parser = new CSVParser(new StringReader(code));
+      parser.setExcelStrategy();
+      System.out.println("---------\n" + code + "\n-------------");
+      String[][] tmp = parser.getAllValues();
+      assertEquals(res.length, tmp.length);
+      assertTrue(tmp.length > 0);
+      for (int i = 0; i < res.length; i++) {
+        for (int j = 0; j < tmp[i].length; j++) {
+          System.out.println("'" + tmp[i][j] + "'");
+        }
+        assertTrue(Arrays.equals(res[i], tmp[i]));
+      }
+    }
+  }
+  
+  public void testEndOfFileBehaviorCSV() throws Exception {
+    String[] codes = {
+        "hello,\r\n\r\nworld,\r\n",
+        "hello,\r\n\r\nworld,",
+        "hello,\r\n\r\nworld,\"\"\r\n",
+        "hello,\r\n\r\nworld,\"\"",
+        "hello,\r\n\r\nworld,\n",
+        "hello,\r\n\r\nworld,",
+        "hello,\r\n\r\nworld,\"\"\n",
+        "hello,\r\n\r\nworld,\"\""
+        };
+    String[][] res = {
+      {"hello", ""},  // CSV Strategy ignores empty lines
+      {"world", ""}
+    };
+    String code;
+    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+      code = codes[codeIndex];
+      CSVParser parser = new CSVParser(new StringReader(code));
+      parser.setCSVStrategy();
+      System.out.println("---------\n" + code + "\n-------------");
+      String[][] tmp = parser.getAllValues();
+      assertEquals(res.length, tmp.length);
+      assertTrue(tmp.length > 0);
+      for (int i = 0; i < res.length; i++) {
+        for (int j = 0; j < tmp[i].length; j++) {
+          System.out.println("'" + tmp[i][j] + "'");
+        }
+        assertTrue(Arrays.equals(res[i], tmp[i]));
+      }
+    }
+  }
+  
+  public void testEmptyLineBehaviourExcel() throws Exception {
+    String[] codes = {
+        "hello;\r\n\r\n\r\n",
+        "hello;\n\n\n",
+        "hello;\"\"\r\n\r\n\r\n",
+        "hello;\"\"\n\n\n"
+        };
+    String[][] res = {
+      {"hello", ""},
+      {""},  // ExcelStrategy does not ignore empty lines
+      {""}
+    };
+    String code;
+    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+      code = codes[codeIndex];
+      CSVParser parser = new CSVParser(new StringReader(code));
+      parser.setExcelStrategy();
+      System.out.println("---------\n" + code + "\n-------------");
+      String[][] tmp = parser.getAllValues();
+      assertEquals(res.length, tmp.length);
+      assertTrue(tmp.length > 0);
+      for (int i = 0; i < res.length; i++) {
+        for (int j = 0; j < tmp[i].length; j++) {
+          System.out.println("'" + tmp[i][j] + "'");
+        }
+        assertTrue(Arrays.equals(res[i], tmp[i]));
+      }
+    }
+  }
+  
+  public void testEmptyLineBehaviourCSV() throws Exception {
+    String[] codes = {
+        "hello,\r\n\r\n\r\n",
+        "hello,\n\n\n",
+        "hello,\"\"\r\n\r\n\r\n",
+        "hello,\"\"\n\n\n"
+        };
+    String[][] res = {
+      {"hello", ""}  // CSV Strategy ignores empty lines
+    };
+    String code;
+    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+      code = codes[codeIndex];
+      CSVParser parser = new CSVParser(new StringReader(code));
+      parser.setCSVStrategy();
+      System.out.println("---------\n" + code + "\n-------------");
+      String[][] tmp = parser.getAllValues();
+      assertEquals(res.length, tmp.length);
+      assertTrue(tmp.length > 0);
+      for (int i = 0; i < res.length; i++) {
+        for (int j = 0; j < tmp[i].length; j++) {
+          System.out.println("'" + tmp[i][j] + "'");
+        }
+        assertTrue(Arrays.equals(res[i], tmp[i]));
+      }
+    }
+  }
+  
+  public void testBackslashEscaping() throws IOException {
+    String code =
+      "one,two,three\n"
+      + "on\\\"e,two\n"
+      + "on\"e,two\n"
+      + "one,\"tw\\\"o\"\n"
+      + "one,\"t\\,wo\"\n"
+      + "one,two,\"th,ree\"\n"
+      + "\"a\\\\\"\n"
+      + "a\\,b\n"
+      + "\"a\\\\,b\"";
+    String[][] res = {
+        { "one", "two", "three" },
+        { "on\\\"e", "two" },
+        { "on\"e", "two" },
+        { "one", "tw\"o" },
+        { "one", "t\\,wo" },  // backslash in quotes only escapes a delimiter (",")
+        { "one", "two", "th,ree" },
+        { "a\\\\" },     // backslash in quotes only escapes a delimiter (",")
+        { "a\\", "b" },  // a backslash must be returnd 
+        { "a\\\\,b" }    // backslash in quotes only escapes a delimiter (",")
+      };
+    CSVParser parser = new CSVParser(new StringReader(code));
+    System.out.println("---------\n" + code + "\n-------------");
+    String[][] tmp = parser.getAllValues();
+    assertEquals(res.length, tmp.length);
+    assertTrue(tmp.length > 0);
+    for (int i = 0; i < res.length; i++) {
+      for (int j = 0; j < tmp[i].length; j++) {
+        System.out.println("'" + tmp[i][j] + "'");
+      }
+      assertTrue(Arrays.equals(res[i], tmp[i])); 
+    }
   }
   
   // ======================================================
@@ -386,7 +576,8 @@
       assertEquals(2, data[0].length);
       assertEquals(1, data[1].length);
       assertEquals("abc", data[0][0]);
-      assertEquals("def\\nghi", data[0][1]);
+      // an escape char in quotes only escapes a delimiter, not itself
+      assertEquals("def\\\\nghi", data[0][1]);
       assertEquals("jkl", data[1][0]);
     }
 
@@ -402,9 +593,8 @@
     
     public void testParse6() throws IOException {
       String[][] data = CSVParser.parse("");
-      assertEquals(1, data.length);
-      assertEquals(1, data[0].length);
-      assertEquals("", data[0][0]);  
+      // default strategy is CSV, which ignores empty lines
+      assertEquals(0, data.length);
     }
     
     public void testParse7() throws IOException {
@@ -471,7 +661,7 @@
       
     public void testUnicodeEscape() throws IOException {
       String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
-      TestCSVParser parser = new TestCSVParser(new StringReader(code));
+      CSVParser parser = new CSVParser(new StringReader(code));
       System.out.println("---------\n" + code + "\n-------------");
       parser.setUnicodeEscapeInterpretation(true);
       String[] data = parser.getLine();
@@ -482,7 +672,7 @@
     
     public void testCarriageReturnLineFeedEndings() throws IOException {
      String code = "foo\r\nbaar,\r\nhello,world\r\n,kanu";
-     TestCSVParser parser = new TestCSVParser(new StringReader(code));
+     CSVParser parser = new CSVParser(new StringReader(code));
      System.out.println("---------\n" + code + "\n-------------");
      String[][] data = parser.getAllValues();
      assertEquals(4, data.length);
@@ -492,7 +682,7 @@
       String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
       //String code = "world\r\n\n";
       //String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
-      TestCSVParser parser = new TestCSVParser(new StringReader(code));
+      CSVParser parser = new CSVParser(new StringReader(code));
       System.out.println("---------\n" + code + "\n-------------");
       String[][] data = parser.getAllValues();
 //      for (int i = 0; i < data.length; i++) {
@@ -509,11 +699,11 @@
     
     public void testLineTokenConsistency() throws IOException {
       String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
-      TestCSVParser parser = new TestCSVParser(new StringReader(code));
+      CSVParser parser = new CSVParser(new StringReader(code));
       System.out.println("---------\n" + code + "\n-------------");
       String[][] data = parser.getAllValues();
-      parser = new TestCSVParser(new StringReader(code));
-      TestCSVParser parser1 = new TestCSVParser(new StringReader(code));
+      parser = new CSVParser(new StringReader(code));
+      CSVParser parser1 = new CSVParser(new StringReader(code));
       for (int i = 0; i < data.length; i++) {
         assertTrue(Arrays.equals(parser1.getLine(), data[i]));
         for (int j = 0; j < data[i].length; j++) {



---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


Mime
View raw message