Source code

001/*
-002 * Licensed to the Apache Software Foundation (ASF) under one or more
-003 * contributor license agreements.  See the NOTICE file distributed with
-004 * this work for additional information regarding copyright ownership.
-005 * The ASF licenses this file to You under the Apache License, Version 2.0
-006 * (the "License"); you may not use this file except in compliance with
-007 * the License.  You may obtain a copy of the License at
-008 *
-009 *      http://www.apache.org/licenses/LICENSE-2.0
-010 *
-011 * Unless required by applicable law or agreed to in writing, software
-012 * distributed under the License is distributed on an "AS IS" BASIS,
-013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-014 * See the License for the specific language governing permissions and
-015 * limitations under the License.
-016 */
-017
-018package org.apache.commons.csv;
-019
-020import java.io.Closeable;
-021import java.io.File;
-022import java.io.FileInputStream;
-023import java.io.IOException;
-024import java.io.InputStreamReader;
-025import java.io.Reader;
-026import java.io.StringReader;
-027import java.net.URL;
-028import java.nio.charset.Charset;
-029import java.util.ArrayList;
-030import java.util.Arrays;
-031import java.util.Iterator;
-032import java.util.LinkedHashMap;
-033import java.util.List;
-034import java.util.Map;
-035import java.util.NoSuchElementException;
-036
-037import static org.apache.commons.csv.Token.Type.*;
-038
-039/**
-040 * Parses CSV files according to the specified format.
-041 *
-042 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
-043 * specification of a {@link CSVFormat}.
-044 *
-045 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
-046 *
-047 * <h2>Creating instances</h2>
-048 * <p>
-049 * There are several static factory methods that can be used to create instances for various types of resources:
-050 * </p>
-051 * <ul>
-052 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
-053 *     <li>{@link #parse(String, CSVFormat)}</li>
-054 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
-055 * </ul>
-056 * <p>
-057 * Alternatively parsers can also be created by passing a {@link Reader} directly to one of the constructors.
-058 *
-059 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
-060 * </p>
-061 * <pre>
-062 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
-063 *     ...
-064 * }
-065 * </pre>
-066 *
-067 * <h2>Parsing record wise</h2>
-068 * <p>
-069 * To parse a CSV input from a file, you write:
-070 * </p>
-071 *
-072 * <pre>
-073 * File csvData = new File(&quot;/path/to/csv&quot;);
-074 * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.RFC4180);
-075 * for (CSVRecord csvRecord : parser) {
-076 *     ...
-077 * }
-078 * </pre>
-079 *
-080 * <p>
-081 * This will read and parse the contents of the file using the
-082 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
-083 * </p>
-084 *
-085 * <p>
-086 * To parse CSV input in a format like Excel, you write:
-087 * </p>
-088 *
-089 * <pre>
-090 * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.EXCEL);
-091 * for (CSVRecord csvRecord : parser) {
-092 *     ...
-093 * }
-094 * </pre>
-095 *
-096 * <p>
-097 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
-098 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
-099 * </p>
-100 *
-101 * <h2>Parsing into memory</h2>
-102 * <p>
-103 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
-104 * </p>
-105 *
-106 * <pre>
-107 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
-108 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
-109 * List&lt;CSVRecord&gt; list = parser.getRecords();
-110 * </pre>
-111 *
-112 * <p>
-113 * There are two constraints that have to be kept in mind:
-114 * </p>
-115 *
-116 * <ol>
-117 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
-118 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
-119 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
-120 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
-121 * </ol>
-122 *
-123 * <h2>Notes</h2>
-124 * <p>
-125 * Internal parser state is completely covered by the format and the reader-state.
-126 * </p>
-127 *
-128 * @version $Id: CSVParser.java 1637611 2014-11-08 23:38:48Z ggregory $
-129 *
-130 * @see <a href="package-summary.html">package documentation for more details</a>
-131 */
-132public final class CSVParser implements Iterable<CSVRecord>, Closeable {
-133
-134    /**
-135     * Creates a parser for the given {@link File}.
-136     *
-137     * <p><strong>Note:</strong> This method internally creates a FileReader using
-138     * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
-139     * is executing the code. If this is insufficient create a URL to the file and use
-140     * {@link #parse(URL, Charset, CSVFormat)}</p>
-141     *
-142     * @param file
-143     *            a CSV file. Must not be null.
-144     * @param charset
-145     *            A charset
-146     * @param format
-147     *            the CSVFormat used for CSV parsing. Must not be null.
-148     * @return a new parser
-149     * @throws IllegalArgumentException
-150     *             If the parameters of the format are inconsistent or if either file or format are null.
-151     * @throws IOException
-152     *             If an I/O error occurs
-153     */
-154    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
-155        Assertions.notNull(file, "file");
-156        Assertions.notNull(format, "format");
-157        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
-158    }
-159
-160    /**
-161     * Creates a parser for the given {@link String}.
-162     *
-163     * @param string
-164     *            a CSV string. Must not be null.
-165     * @param format
-166     *            the CSVFormat used for CSV parsing. Must not be null.
-167     * @return a new parser
-168     * @throws IllegalArgumentException
-169     *             If the parameters of the format are inconsistent or if either string or format are null.
-170     * @throws IOException
-171     *             If an I/O error occurs
-172     */
-173    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
-174        Assertions.notNull(string, "string");
-175        Assertions.notNull(format, "format");
-176
-177        return new CSVParser(new StringReader(string), format);
-178    }
-179
-180    /**
-181     * Creates a parser for the given URL.
-182     *
-183     * <p>
-184     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
-185     * you close the {@code url}.
-186     * </p>
-187     *
-188     * @param url
-189     *            a URL. Must not be null.
-190     * @param charset
-191     *            the charset for the resource. Must not be null.
-192     * @param format
-193     *            the CSVFormat used for CSV parsing. Must not be null.
-194     * @return a new parser
-195     * @throws IllegalArgumentException
-196     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
-197     * @throws IOException
-198     *             If an I/O error occurs
-199     */
-200    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
-201        Assertions.notNull(url, "url");
-202        Assertions.notNull(charset, "charset");
-203        Assertions.notNull(format, "format");
-204
-205        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
-206    }
-207
-208    // the following objects are shared to reduce garbage
-209
-210    private final CSVFormat format;
-211
-212    /** A mapping of column names to column indices */
-213    private final Map<String, Integer> headerMap;
-214
-215    private final Lexer lexer;
-216
-217    /** A record buffer for getRecord(). Grows as necessary and is reused. */
-218    private final List<String> record = new ArrayList<String>();
-219
-220    /**
-221     * The number of the most recently parsed record; incremented just before the next record is created.
-222     */
-223    private long recordNumber;
-224
-225    /**
-226     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
-227     * with {@link #recordNumber}.
-228     */
-229    private final long characterOffset;
-230
-231    private final Token reusableToken = new Token();
-232
/**
 * Creates a parser over the given reader using the given {@link CSVFormat}, starting at character offset 0 and
 * record number 1.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent or if either reader or format are null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 */
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
    this(reader, format, 0L, 1L);
}
-253
-254    /**
-255     * Customized CSV parser using the given {@link CSVFormat}
-256     *
-257     * <p>
-258     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
-259     * unless you close the {@code reader}.
-260     * </p>
-261     *
-262     * @param reader
-263     *            a Reader containing CSV-formatted input. Must not be null.
-264     * @param format
-265     *            the CSVFormat used for CSV parsing. Must not be null.
-266     * @param characterOffset
-267     *            Lexer offset when the parser does not start parsing at the beginning of the source.
-268     * @param recordNumber
-269     *            The next record number to assign
-270     * @throws IllegalArgumentException
-271     *             If the parameters of the format are inconsistent or if either reader or format are null.
-272     * @throws IOException
-273     *             If there is a problem reading the header or skipping the first record
-274     * @since 1.1
-275     */
-276    public CSVParser(final Reader reader, final CSVFormat format, long characterOffset, long recordNumber)
-277            throws IOException {
-278        Assertions.notNull(reader, "reader");
-279        Assertions.notNull(format, "format");
-280
-281        this.format = format;
-282        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
-283        this.headerMap = this.initializeHeader();
-284        this.characterOffset = characterOffset;
-285        this.recordNumber = recordNumber - 1;
-286    }
-287
-288    private void addRecordValue() {
-289        final String input = this.reusableToken.content.toString();
-290        final String nullString = this.format.getNullString();
-291        if (nullString == null) {
-292            this.record.add(input);
-293        } else {
-294            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
-295        }
-296    }
-297
-298    /**
-299     * Closes resources.
-300     *
-301     * @throws IOException
-302     *             If an I/O error occurs
-303     */
-304    @Override
-305    public void close() throws IOException {
-306        if (this.lexer != null) {
-307            this.lexer.close();
-308        }
-309    }
-310
-311    /**
-312     * Returns the current line number in the input stream.
-313     *
-314     * <p>
-315     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
-316     * the record number.
-317     * </p>
-318     *
-319     * @return current line number
-320     */
-321    public long getCurrentLineNumber() {
-322        return this.lexer.getCurrentLineNumber();
-323    }
-324
-325    /**
-326     * Returns a copy of the header map that iterates in column order.
-327     * <p>
-328     * The map keys are column names. The map values are 0-based indices.
-329     * </p>
-330     * @return a copy of the header map that iterates in column order.
-331     */
-332    public Map<String, Integer> getHeaderMap() {
-333        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
-334    }
-335
-336    /**
-337     * Returns the current record number in the input stream.
-338     *
-339     * <p>
-340     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
-341     * the line number.
-342     * </p>
-343     *
-344     * @return current record number
-345     */
-346    public long getRecordNumber() {
-347        return this.recordNumber;
-348    }
-349
-350    /**
-351     * Parses the CSV input according to the given format and returns the content as a list of
-352     * {@link CSVRecord CSVRecords}.
-353     *
-354     * <p>
-355     * The returned content starts at the current parse-position in the stream.
-356     * </p>
-357     *
-358     * @return list of {@link CSVRecord CSVRecords}, may be empty
-359     * @throws IOException
-360     *             on parse error or input read-failure
-361     */
-362    public List<CSVRecord> getRecords() throws IOException {
-363        CSVRecord rec;
-364        List<CSVRecord> records = new ArrayList<CSVRecord>();
-365        while ((rec = this.nextRecord()) != null) {
-366            records.add(rec);
-367        }
-368        return records;
-369    }
-370
-371    /**
-372     * Initializes the name to index mapping if the format defines a header.
-373     *
-374     * @return null if the format has no header.
-375     * @throws IOException if there is a problem reading the header or skipping the first record
-376     */
-377    private Map<String, Integer> initializeHeader() throws IOException {
-378        Map<String, Integer> hdrMap = null;
-379        final String[] formatHeader = this.format.getHeader();
-380        if (formatHeader != null) {
-381            hdrMap = new LinkedHashMap<String, Integer>();
-382
-383            String[] headerRecord = null;
-384            if (formatHeader.length == 0) {
-385                // read the header from the first line of the file
-386                final CSVRecord nextRecord = this.nextRecord();
-387                if (nextRecord != null) {
-388                    headerRecord = nextRecord.values();
-389                }
-390            } else {
-391                if (this.format.getSkipHeaderRecord()) {
-392                    this.nextRecord();
-393                }
-394                headerRecord = formatHeader;
-395            }
-396
-397            // build the name to index mappings
-398            if (headerRecord != null) {
-399                for (int i = 0; i < headerRecord.length; i++) {
-400                    final String header = headerRecord[i];
-401                    final boolean containsHeader = hdrMap.containsKey(header);
-402                    final boolean emptyHeader = header == null || header.trim().isEmpty();
-403                    if (containsHeader &&
-404                            (!emptyHeader || (emptyHeader && !this.format.getAllowMissingColumnNames()))) {
-405                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header +
-406                                "\" in " + Arrays.toString(headerRecord));
-407                    }
-408                    hdrMap.put(header, Integer.valueOf(i));
-409                }
-410            }
-411        }
-412        return hdrMap;
-413    }
-414
-415    /**
-416     * Gets whether this parser is closed.
-417     *
-418     * @return whether this parser is closed.
-419     */
-420    public boolean isClosed() {
-421        return this.lexer.isClosed();
-422    }
-423
-424    /**
-425     * Returns an iterator on the records.
-426     *
-427     * <p>IOExceptions occurring during the iteration are wrapped in a
-428     * RuntimeException.
-429     * If the parser is closed a call to {@code next()} will throw a
-430     * NoSuchElementException.</p>
-431     */
-432    @Override
-433    public Iterator<CSVRecord> iterator() {
-434        return new Iterator<CSVRecord>() {
-435            private CSVRecord current;
-436
-437            private CSVRecord getNextRecord() {
-438                try {
-439                    return CSVParser.this.nextRecord();
-440                } catch (final IOException e) {
-441                    // TODO: This is not great, throw an ISE instead?
-442                    throw new RuntimeException(e);
-443                }
-444            }
-445
-446            @Override
-447            public boolean hasNext() {
-448                if (CSVParser.this.isClosed()) {
-449                    return false;
-450                }
-451                if (this.current == null) {
-452                    this.current = this.getNextRecord();
-453                }
-454
-455                return this.current != null;
-456            }
-457
-458            @Override
-459            public CSVRecord next() {
-460                if (CSVParser.this.isClosed()) {
-461                    throw new NoSuchElementException("CSVParser has been closed");
-462                }
-463                CSVRecord next = this.current;
-464                this.current = null;
-465
-466                if (next == null) {
-467                    // hasNext() wasn't called before
-468                    next = this.getNextRecord();
-469                    if (next == null) {
-470                        throw new NoSuchElementException("No more CSV records available");
-471                    }
-472                }
-473
-474                return next;
-475            }
-476
-477            @Override
-478            public void remove() {
-479                throw new UnsupportedOperationException();
-480            }
-481        };
-482    }
-483
-484    /**
-485     * Parses the next record from the current point in the stream.
-486     *
-487     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
-488     * @throws IOException
-489     *             on parse error or input read-failure
-490     */
-491    CSVRecord nextRecord() throws IOException {
-492        CSVRecord result = null;
-493        this.record.clear();
-494        StringBuilder sb = null;
-495        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
-496        do {
-497            this.reusableToken.reset();
-498            this.lexer.nextToken(this.reusableToken);
-499            switch (this.reusableToken.type) {
-500            case TOKEN:
-501                this.addRecordValue();
-502                break;
-503            case EORECORD:
-504                this.addRecordValue();
-505                break;
-506            case EOF:
-507                if (this.reusableToken.isReady) {
-508                    this.addRecordValue();
-509                }
-510                break;
-511            case INVALID:
-512                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
-513            case COMMENT: // Ignored currently
-514                if (sb == null) { // first comment for this record
-515                    sb = new StringBuilder();
-516                } else {
-517                    sb.append(Constants.LF);
-518                }
-519                sb.append(this.reusableToken.content);
-520                this.reusableToken.type = TOKEN; // Read another token
-521                break;
-522            default:
-523                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
-524            }
-525        } while (this.reusableToken.type == TOKEN);
-526
-527        if (!this.record.isEmpty()) {
-528            this.recordNumber++;
-529            final String comment = sb == null ? null : sb.toString();
-530            result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
-531                    this.recordNumber, startCharPosition);
-532        }
-533        return result;
-534    }
-535
-536}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

001/*
+002 * Licensed to the Apache Software Foundation (ASF) under one or more
+003 * contributor license agreements.  See the NOTICE file distributed with
+004 * this work for additional information regarding copyright ownership.
+005 * The ASF licenses this file to You under the Apache License, Version 2.0
+006 * (the "License"); you may not use this file except in compliance with
+007 * the License.  You may obtain a copy of the License at
+008 *
+009 *      http://www.apache.org/licenses/LICENSE-2.0
+010 *
+011 * Unless required by applicable law or agreed to in writing, software
+012 * distributed under the License is distributed on an "AS IS" BASIS,
+013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+014 * See the License for the specific language governing permissions and
+015 * limitations under the License.
+016 */
+017
+018package org.apache.commons.csv;
+019
+020import java.io.Closeable;
+021import java.io.File;
+022import java.io.FileInputStream;
+023import java.io.IOException;
+024import java.io.InputStreamReader;
+025import java.io.Reader;
+026import java.io.StringReader;
+027import java.net.URL;
+028import java.nio.charset.Charset;
+029import java.util.ArrayList;
+030import java.util.Arrays;
+031import java.util.Iterator;
+032import java.util.LinkedHashMap;
+033import java.util.List;
+034import java.util.Map;
+035import java.util.NoSuchElementException;
+036
+037import static org.apache.commons.csv.Token.Type.*;
+038
+039/**
+040 * Parses CSV files according to the specified format.
+041 *
+042 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
+043 * specification of a {@link CSVFormat}.
+044 *
+045 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
+046 *
+047 * <h2>Creating instances</h2>
+048 * <p>
+049 * There are several static factory methods that can be used to create instances for various types of resources:
+050 * </p>
+051 * <ul>
+052 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
+053 *     <li>{@link #parse(String, CSVFormat)}</li>
+054 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
+055 * </ul>
+056 * <p>
+057 * Alternatively parsers can also be created by passing a {@link Reader} directly to one of the constructors.
+058 *
+059 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
+060 * </p>
+061 * <pre>
+062 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
+063 *     ...
+064 * }
+065 * </pre>
+066 *
+067 * <h2>Parsing record wise</h2>
+068 * <p>
+069 * To parse a CSV input from a file, you write:
+070 * </p>
+071 *
+072 * <pre>
+073 * File csvData = new File(&quot;/path/to/csv&quot;);
+074 * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.RFC4180);
+075 * for (CSVRecord csvRecord : parser) {
+076 *     ...
+077 * }
+078 * </pre>
+079 *
+080 * <p>
+081 * This will read and parse the contents of the file using the
+082 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
+083 * </p>
+084 *
+085 * <p>
+086 * To parse CSV input in a format like Excel, you write:
+087 * </p>
+088 *
+089 * <pre>
+090 * CSVParser parser = CSVParser.parse(csvData, Charset.forName(&quot;UTF-8&quot;), CSVFormat.EXCEL);
+091 * for (CSVRecord csvRecord : parser) {
+092 *     ...
+093 * }
+094 * </pre>
+095 *
+096 * <p>
+097 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
+098 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}.
+099 * </p>
+100 *
+101 * <h2>Parsing into memory</h2>
+102 * <p>
+103 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
+104 * </p>
+105 *
+106 * <pre>
+107 * Reader in = new StringReader(&quot;a;b\nc;d&quot;);
+108 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
+109 * List&lt;CSVRecord&gt; list = parser.getRecords();
+110 * </pre>
+111 *
+112 * <p>
+113 * There are two constraints that have to be kept in mind:
+114 * </p>
+115 *
+116 * <ol>
+117 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
+118 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
+119 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
+120 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
+121 * </ol>
+122 *
+123 * <h2>Notes</h2>
+124 * <p>
+125 * Internal parser state is completely covered by the format and the reader-state.
+126 * </p>
+127 *
+128 * @version $Id: CSVParser.java 1695167 2015-08-10 21:08:58Z ggregory $
+129 *
+130 * @see <a href="package-summary.html">package documentation for more details</a>
+131 */
+132public final class CSVParser implements Iterable<CSVRecord>, Closeable {
+133
+134    /**
+135     * Creates a parser for the given {@link File}.
+136     *
+137     * <p><strong>Note:</strong> This method internally creates a FileReader using
+138     * {@link java.io.FileReader#FileReader(java.io.File)} which in turn relies on the default encoding of the JVM that
+139     * is executing the code. If this is insufficient create a URL to the file and use
+140     * {@link #parse(URL, Charset, CSVFormat)}</p>
+141     *
+142     * @param file
+143     *            a CSV file. Must not be null.
+144     * @param charset
+145     *            A charset
+146     * @param format
+147     *            the CSVFormat used for CSV parsing. Must not be null.
+148     * @return a new parser
+149     * @throws IllegalArgumentException
+150     *             If the parameters of the format are inconsistent or if either file or format are null.
+151     * @throws IOException
+152     *             If an I/O error occurs
+153     */
+154    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
+155        Assertions.notNull(file, "file");
+156        Assertions.notNull(format, "format");
+157        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
+158    }
+159
+160    /**
+161     * Creates a parser for the given {@link String}.
+162     *
+163     * @param string
+164     *            a CSV string. Must not be null.
+165     * @param format
+166     *            the CSVFormat used for CSV parsing. Must not be null.
+167     * @return a new parser
+168     * @throws IllegalArgumentException
+169     *             If the parameters of the format are inconsistent or if either string or format are null.
+170     * @throws IOException
+171     *             If an I/O error occurs
+172     */
+173    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
+174        Assertions.notNull(string, "string");
+175        Assertions.notNull(format, "format");
+176
+177        return new CSVParser(new StringReader(string), format);
+178    }
+179
+180    /**
+181     * Creates a parser for the given URL.
+182     *
+183     * <p>
+184     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
+185     * you close the {@code url}.
+186     * </p>
+187     *
+188     * @param url
+189     *            a URL. Must not be null.
+190     * @param charset
+191     *            the charset for the resource. Must not be null.
+192     * @param format
+193     *            the CSVFormat used for CSV parsing. Must not be null.
+194     * @return a new parser
+195     * @throws IllegalArgumentException
+196     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
+197     * @throws IOException
+198     *             If an I/O error occurs
+199     */
+200    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
+201        Assertions.notNull(url, "url");
+202        Assertions.notNull(charset, "charset");
+203        Assertions.notNull(format, "format");
+204
+205        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
+206    }
+207
+208    // the following objects are shared to reduce garbage
+209
+210    private final CSVFormat format;
+211
+212    /** A mapping of column names to column indices */
+213    private final Map<String, Integer> headerMap;
+214
+215    private final Lexer lexer;
+216
+217    /** A record buffer for getRecord(). Grows as necessary and is reused. */
+218    private final List<String> record = new ArrayList<String>();
+219
+220    /**
+221     * The number of the most recently parsed record; incremented just before the next record is created.
+222     */
+223    private long recordNumber;
+224
+225    /**
+226     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
+227     * with {@link #recordNumber}.
+228     */
+229    private final long characterOffset;
+230
+231    private final Token reusableToken = new Token();
+232
/**
 * Creates a parser over the given reader using the given {@link CSVFormat}, starting at character offset 0 and
 * record number 1.
 *
 * <p>
 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
 * unless you close the {@code reader}.
 * </p>
 *
 * @param reader
 *            a Reader containing CSV-formatted input. Must not be null.
 * @param format
 *            the CSVFormat used for CSV parsing. Must not be null.
 * @throws IllegalArgumentException
 *             If the parameters of the format are inconsistent or if either reader or format are null.
 * @throws IOException
 *             If there is a problem reading the header or skipping the first record
 */
public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
    this(reader, format, 0L, 1L);
}
+253
+254    /**
+255     * Customized CSV parser using the given {@link CSVFormat}
+256     *
+257     * <p>
+258     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+259     * unless you close the {@code reader}.
+260     * </p>
+261     *
+262     * @param reader
+263     *            a Reader containing CSV-formatted input. Must not be null.
+264     * @param format
+265     *            the CSVFormat used for CSV parsing. Must not be null.
+266     * @param characterOffset
+267     *            Lexer offset when the parser does not start parsing at the beginning of the source.
+268     * @param recordNumber
+269     *            The next record number to assign
+270     * @throws IllegalArgumentException
+271     *             If the parameters of the format are inconsistent or if either reader or format are null.
+272     * @throws IOException
+273     *             If there is a problem reading the header or skipping the first record
+274     * @since 1.1
+275     */
+276    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
+277            throws IOException {
+278        Assertions.notNull(reader, "reader");
+279        Assertions.notNull(format, "format");
+280
+281        this.format = format;
+282        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+283        this.headerMap = this.initializeHeader();
+284        this.characterOffset = characterOffset;
+285        this.recordNumber = recordNumber - 1;
+286    }
+287
+288    private void addRecordValue() {
+289        final String input = this.reusableToken.content.toString();
+290        final String nullString = this.format.getNullString();
+291        if (nullString == null) {
+292            this.record.add(input);
+293        } else {
+294            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
+295        }
+296    }
+297
+298    /**
+299     * Closes resources.
+300     *
+301     * @throws IOException
+302     *             If an I/O error occurs
+303     */
+304    @Override
+305    public void close() throws IOException {
+306        if (this.lexer != null) {
+307            this.lexer.close();
+308        }
+309    }
+310
+311    /**
+312     * Returns the current line number in the input stream.
+313     *
+314     * <p>
+315     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
+316     * the record number.
+317     * </p>
+318     *
+319     * @return current line number
+320     */
+321    public long getCurrentLineNumber() {
+322        return this.lexer.getCurrentLineNumber();
+323    }
+324
+325    /**
+326     * Returns a copy of the header map that iterates in column order.
+327     * <p>
+328     * The map keys are column names. The map values are 0-based indices.
+329     * </p>
+330     * @return a copy of the header map that iterates in column order.
+331     */
+332    public Map<String, Integer> getHeaderMap() {
+333        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
+334    }
+335
+336    /**
+337     * Returns the current record number in the input stream.
+338     *
+339     * <p>
+340     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
+341     * the line number.
+342     * </p>
+343     *
+344     * @return current record number
+345     */
+346    public long getRecordNumber() {
+347        return this.recordNumber;
+348    }
+349
+350    /**
+351     * Parses the CSV input according to the given format and returns the content as a list of
+352     * {@link CSVRecord CSVRecords}.
+353     *
+354     * <p>
+355     * The returned content starts at the current parse-position in the stream.
+356     * </p>
+357     *
+358     * @return list of {@link CSVRecord CSVRecords}, may be empty
+359     * @throws IOException
+360     *             on parse error or input read-failure
+361     */
+362    public List<CSVRecord> getRecords() throws IOException {
+363        CSVRecord rec;
+364        final List<CSVRecord> records = new ArrayList<CSVRecord>();
+365        while ((rec = this.nextRecord()) != null) {
+366            records.add(rec);
+367        }
+368        return records;
+369    }
+370
+371    /**

[... 231 lines stripped ...]