Return-Path: Delivered-To: apmail-cocoon-cvs-archive@www.apache.org Received: (qmail 56304 invoked from network); 17 Dec 2004 20:52:01 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur-2.apache.org with SMTP; 17 Dec 2004 20:52:01 -0000 Received: (qmail 76750 invoked by uid 500); 17 Dec 2004 20:51:53 -0000 Delivered-To: apmail-cocoon-cvs-archive@cocoon.apache.org Received: (qmail 76598 invoked by uid 500); 17 Dec 2004 20:51:52 -0000 Mailing-List: contact cvs-help@cocoon.apache.org; run by ezmlm Precedence: bulk Reply-To: dev@cocoon.apache.org list-help: list-unsubscribe: list-post: Delivered-To: mailing list cvs@cocoon.apache.org Received: (qmail 76577 invoked by uid 99); 17 Dec 2004 20:51:51 -0000 X-ASF-Spam-Status: No, hits=-9.8 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from minotaur.apache.org (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.28) with SMTP; Fri, 17 Dec 2004 12:50:54 -0800 Received: (qmail 55477 invoked by uid 65534); 17 Dec 2004 20:50:37 -0000 Date: 17 Dec 2004 20:50:36 -0000 Message-ID: <20041217205036.55472.qmail@minotaur.apache.org> From: pier@apache.org To: cvs@cocoon.apache.org Subject: svn commit: r122676 - /cocoon/branches/BRANCH_2_1_X/src/blocks/scratchpad/java/org/apache/cocoon/generation/CSVGenerator.java MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 X-Virus-Checked: Checked X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N Author: pier Date: Fri Dec 17 12:50:34 2004 New Revision: 122676 URL: http://svn.apache.org/viewcvs?view=rev&rev=122676 Log: A simple generator from CSV files (might be handy with database dumps, eh?) 
Added: cocoon/branches/BRANCH_2_1_X/src/blocks/scratchpad/java/org/apache/cocoon/generation/CSVGenerator.java Added: cocoon/branches/BRANCH_2_1_X/src/blocks/scratchpad/java/org/apache/cocoon/generation/CSVGenerator.java Url: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/scratchpad/java/org/apache/cocoon/generation/CSVGenerator.java?view=auto&rev=122676 ============================================================================== --- (empty file) +++ cocoon/branches/BRANCH_2_1_X/src/blocks/scratchpad/java/org/apache/cocoon/generation/CSVGenerator.java Fri Dec 17 12:50:34 2004 @@ -0,0 +1,428 @@ +/* =============================================================================== * + * Copyright (C) 1999-2004, The Apache Software Foundation. All rights reserved. * + * * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use * + * this file except in compliance with the License. You may obtain a copy of the * + * License at . * + * * + * Unless required by applicable law or agreed to in writing, software distributed * + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * + * CONDITIONS OF ANY KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations under the License. 
* + * =============================================================================== */ +package org.apache.cocoon.generation; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.CharArrayWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + +import org.apache.avalon.framework.parameters.Parameters; +import org.apache.cocoon.ProcessingException; +import org.apache.cocoon.environment.SourceResolver; +import org.apache.excalibur.source.Source; +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + *

A simple parser converting a Comma Separated Values (CSV) file into XML.

+ * + *

This parser is controlled by the following sitemap parameters:

+ * + *
    + *
  • + * process-header: whether the first line in the CSV is considered + * to be the header defining column names; the resulting output will be + * different depending on whether this is true or false (default: false). + *
  • + *
  • + * encoding: the character encoding (UTF-8, ISO8859-1, ...) used to + * interpret the input CSV source file (default: system default). + *
  • + *
  • + * separator: the field-separator character in the CSV file (comma, + * tab, ...) (default: , comma). + *
  • + *
  • + * escape: the character used to escape fields, or part of them, in + * the CSV file (default: " quote). + *
  • + *
  • + * buffer-size: the size of the buffer used for reading the source + * CSV file (default: 4096 characters). + *
  • + *
+ * + *

The generated output will look something like the following:

+ * + *
+ * <?xml version="1.0" encoding="ISO-8859-1"?>
+ * <csv:document xmlns:csv="http://apache.org/cocoon/csv/1.0">
+ *   <csv:header>
+ *     <csv:column number="1">Column A</csv:column>
+ *     <csv:column number="2">Column B</csv:column>
+ *     <csv:column number="3">Column C</csv:column>
+ *   </csv:header>
+ *   <csv:record number="1">
+ *     <csv:field number="1" column="Column A">Field A1</csv:field>
+ *     <csv:field number="2" column="Column B">Field B1</csv:field>
+ *     <csv:field number="3" column="Column C">Field C1</csv:field>
+ *   </csv:record>
+ *   <csv:record number="2">
+ *     <csv:field number="1" column="Column A">Field A2</csv:field>
+ *     <csv:field number="2" column="Column B">Field B2</csv:field>
+ *     <csv:field number="3" column="Column C">Field C2</csv:field>
+ *   </csv:record>
+ * </csv:document>
+ * 
+ * + *

Note that this generator has been thoroughly tested with CSV files generated + * by Microsoft Excel. + * Unfortunately no official CSV specification has ever been published by + * any standards body, so the interpretation of the format might be slightly + * different in some cases.

+ * + * @author Pier Fumagalli + * @author Copyright © 2000-2004 The Apache + * Software Foundation. All rights reserved. + */ +public class CSVGenerator extends FileGenerator { + + /**

The namespace URI of XML generated by this instance.

*/ + public static final String NAMESPACE_URI = "http://apache.org/cocoon/csv/1.0"; + /**

The namespace prefix of XML generated by this instance.

*/ + public static final String NAMESPACE_PREFIX = "csv"; + + /**

The default encoding configured in the Java VM.

*/ + private static final String DEFAULT_ENCODING = + new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding(); + /**

The default field separator character.

*/ + private static final String DEFAULT_SEPARATOR = ","; + /**

The default escape character.

*/ + private static final String DEFAULT_ESCAPE = "\""; + /**

The default size of the buffer used to read the input.

*/ + private static final int DEFAULT_BUFFER_SIZE = 4096; + /**

A string used for indenting.

*/ + private static final char INDENT_STRING[] = "\n ".toCharArray(); + + /**

The encoding used to read the CSV resource from a stream.

*/ + private String encoding = DEFAULT_ENCODING; + /**

The character used to separate fields.

*/ + private char separator = DEFAULT_SEPARATOR.charAt(0); + /**

The character used to initiate and terminate escaped sequences.

*/ + private char escape = DEFAULT_ESCAPE.charAt(0); + /**

The size of the buffer used to read the input.

*/ + private int buffersize = DEFAULT_BUFFER_SIZE; + /**

The current field (column) number in the current record.

*/ + private int fieldnumber = 1; + /**

The current record (line) number in the current CSV.

*/ + private int recordnumber = 1; + /**

A flag indicating whether the <record> tag was opened.

*/ + private boolean openrecord = false; + /**

The character buffer for the current field.

*/ + private CharArrayWriter buffer = null; + /**

A map of all known columns or null if no headers are processed.

*/ + private Map columns = null; + + /** + *

Create a new {@link CSVGenerator} instance.

+ */ + public CSVGenerator() { + super(); + } + + /** + *

Recycle this component.

. + */ + public void recycle() { + super.recycle(); + + this.encoding = DEFAULT_ENCODING; + this.separator = DEFAULT_SEPARATOR.charAt(0); + this.escape = DEFAULT_ESCAPE.charAt(0); + this.buffersize = DEFAULT_BUFFER_SIZE; + this.buffer = null; + this.columns = null; + this.recordnumber = 1; + this.fieldnumber = 1; + this.openrecord = false; + } + + /** + *

Setup this {@link CSVGenerator} instance.

+ */ + public void setup(SourceResolver resolver, Map object_model, String source, + Parameters parameters) + throws ProcessingException, SAXException, IOException { + super.setup(resolver, object_model, source, parameters); + + boolean header = parameters.getParameterAsBoolean("process-header", false); + + this.encoding = parameters.getParameter("encoding", DEFAULT_ENCODING); + this.separator = parameters.getParameter("separator", DEFAULT_SEPARATOR).charAt(0); + this.escape = parameters.getParameter("escape", DEFAULT_ESCAPE).charAt(0); + this.buffersize = parameters.getParameterAsInteger("buffer-size", DEFAULT_BUFFER_SIZE); + this.buffer = new CharArrayWriter(); + this.columns = (header ? new HashMap() : null); + this.recordnumber = (header ? 0 : 1); + this.fieldnumber = 1; + this.openrecord = false; + } + + /** + *

Generate the unique key.

+ */ + public Serializable getKey() { + String key = this.inputSource.getURI(); + if (this.columns != null) return (key + "+headers"); + return key; + } + + /** + *

Generate XML data from a Comma Separated Value resource.

. + */ + public void generate() + throws IOException, SAXException, ProcessingException { + + /* Create a new Reader correctly decoding the source stream */ + CSVReader csv = new CSVReader(this.inputSource, this.encoding, this.buffersize); + + try { + /* Start the document */ + this.contentHandler.setDocumentLocator(csv); + this.contentHandler.startDocument(); + this.contentHandler.startPrefixMapping(NAMESPACE_PREFIX, NAMESPACE_URI); + this.indent(0); + this.startElement("document"); + + /* Allocate buffer and status for parsing */ + boolean unescaped = true; + int prev = -1; + int curr = -1; + + /* Parse the file reading characters one-by-one */ + while ((curr = csv.read()) >= 0) { + + /* Process any occurrence of the escape character */ + if (curr == this.escape) { + if ((unescaped) && (prev == this.escape)) { + this.buffer.write(this.escape); + } + unescaped = ! unescaped; + prev = curr; + continue; + } + + /* Process any occurrence of the field separator */ + if ((unescaped) && (curr == this.separator)) { + this.dumpField(); + prev = curr; + continue; + } + + /* Process newline characters */ + if ((unescaped) && ((curr == '\r') || (curr == '\n'))) { + this.dumpField(); + this.dumpRecord(); + + /* Record numbering */ + if (((curr == '\n') && (prev != '\r')) || (curr == '\r')) { + this.recordnumber ++; + } + + /* Nothing else to do */ + prev = curr; + continue; + } + + /* Any other character simply gets added to the buffer */ + this.buffer.write(curr); + prev = curr; + } + + /* Terminate any hanging open record element (just in case) */ + this.dumpRecord(); + + /* Terminate the document */ + this.indent(0); + this.endElement("document"); + this.contentHandler.endPrefixMapping(NAMESPACE_PREFIX); + this.contentHandler.endDocument(); + + } finally { + csv.close(); + } + } + + + private void dumpField() + throws SAXException { + if (this.buffer.size() < 1) { + this.fieldnumber ++; + return; + } + + if (! 
this.openrecord) { + this.indent(4); + + if (this.recordnumber > 0) { + AttributesImpl attributes = new AttributesImpl(); + String value = Integer.toString(this.recordnumber); + attributes.addAttribute("", "number", "number", "CDATA", value); + this.startElement("record", attributes); + } else { + this.startElement("header"); + } + this.openrecord = true; + } + + /* Enclode the field in the proper element */ + String element = "field"; + char array[] = this.buffer.toCharArray(); + this.indent(8); + + AttributesImpl attributes = new AttributesImpl(); + String value = Integer.toString(this.fieldnumber); + attributes.addAttribute("", "number", "number", "CDATA", value); + + if (this.recordnumber < 1) { + this.columns.put(new Integer(this.fieldnumber), new String(array)); + element = "column"; + } else if (this.columns != null) { + String header = (String) this.columns.get(new Integer(this.fieldnumber)); + if (header != null) { + attributes.addAttribute("", "column", "column", "CDATA", header); + } + } + + this.startElement(element, attributes); + this.contentHandler.characters(array, 0, array.length); + this.endElement(element); + this.buffer.reset(); + + this.fieldnumber ++; + } + + private void dumpRecord() + throws SAXException { + if (this.openrecord) { + this.indent(4); + if (this.recordnumber > 0) { + this.endElement("record"); + } else { + this.endElement("header"); + } + this.openrecord = false; + } + this.fieldnumber = 1; + } + + private void indent(int level) + throws SAXException { + this.contentHandler.characters(INDENT_STRING, 0, level + 1); + } + + private void startElement(String name) + throws SAXException { + this.startElement(name, new AttributesImpl()); + } + + private void startElement(String name, Attributes atts) + throws SAXException { + if (name == null) throw new NullPointerException("Null name"); + if (atts == null) atts = new AttributesImpl(); + String qual = NAMESPACE_PREFIX + ':' + name; + this.contentHandler.startElement(NAMESPACE_URI, 
name, qual, atts); + } + + private void endElement(String name) + throws SAXException { + String qual = NAMESPACE_PREFIX + ':' + name; + this.contentHandler.endElement(NAMESPACE_URI, name, qual); + } + + private static final class CSVReader extends Reader implements Locator { + + private String uri = null; + private Reader input = null; + private int column = 1; + private int line = 1; + private int last = -1; + + private CSVReader(Source source, String encoding, int buffer) + throws IOException { + InputStream stream = source.getInputStream(); + Reader reader = new InputStreamReader(stream, encoding); + this.input = new BufferedReader(reader, buffer); + this.uri = source.getURI(); + } + + public String getPublicId() { + return null; + } + + public String getSystemId() { + return this.uri; + } + + public int getLineNumber() { + return this.line; + } + + public int getColumnNumber() { + return this.column; + } + + public void close() + throws IOException { + this.input.close(); + } + + public int read() + throws IOException { + int c = this.input.read(); + if (c < 0) return c; + + if (((c == '\n') && (this.last != '\r')) || (c == '\r')) { + this.column = 1; + this.line ++; + } + + this.last = c; + return c; + } + + public int read(char b[], int o, int l) + throws IOException { + if (b == null) throw new NullPointerException(); + if ((o<0)||(o>b.length)||(l<0)||((o+l)>b.length)||((o+l)<0)) { + throw new IndexOutOfBoundsException(); + } + if (l == 0) return 0; + + int c = read(); + if (c == -1) return -1; + b[o] = (char)c; + + int i = 1; + try { + for (i = 1; i < l ; i++) { + c = read(); + if (c == -1) break; + if (b != null) b[o + i] = (char)c; + } + } catch (IOException ee) { + return i; + } + return i; + } + } +}