crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tzo...@apache.org
Subject crunch git commit: CRUNCH-491: Add an Xml File Source
Date Wed, 04 Feb 2015 00:30:47 GMT
Repository: crunch
Updated Branches:
  refs/heads/master 5dccb5330 -> 958d011a4


CRUNCH-491: Add an Xml File Source


Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/958d011a
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/958d011a
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/958d011a

Branch: refs/heads/master
Commit: 958d011a4c4c704bd2e12dfbb8f7216342c7fb1f
Parents: 5dccb53
Author: tzolov <christian.tzolov@gmail.com>
Authored: Mon Jan 26 01:50:38 2015 +0100
Committer: tzolov <christian.tzolov@gmail.com>
Committed: Wed Feb 4 01:14:15 2015 +0100

----------------------------------------------------------------------
 .../apache/crunch/io/text/xml/XmlSourceIT.java  | Bin 0 -> 3325 bytes
 .../crunch/io/text/xml/XmlInputFormat.java      | 193 ++++++++++++
 .../apache/crunch/io/text/xml/XmlSource.java    |  71 +++++
 .../src/main/resources/xmlSourceSample1.xml     | 291 +++++++++++++++++++
 .../src/main/resources/xmlSourceSample2.xml     | Bin 0 -> 248 bytes
 5 files changed, 555 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java b/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java
new file mode 100644
index 0000000..4b4b9e1
Binary files /dev/null and b/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java
differ

http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java
new file mode 100644
index 0000000..58157fe
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text.xml;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Charsets;
+import com.google.common.primitives.Chars;
+
+/**
+ * Reads records that are delimited by a specific begin/end tag.
+ * 
+ * The {@link XmlInputFormat} extends Mahout's XmlInputFormat implementation, adding
+ * encoding support.
+ */
+public class XmlInputFormat extends TextInputFormat {
+
+  private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);
+
+  public static final String START_TAG_KEY = "xmlinput.start";
+  public static final String END_TAG_KEY = "xmlinput.end";
+  public static final String ENCODING = "xml.encoding";
+
+  @Override
+  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext
context) {
+    try {
+      return new XmlRecordReader((FileSplit) split, context.getConfiguration());
+    } catch (IOException ioe) {
+      log.warn("Error while creating XmlRecordReader", ioe);
+      return null;
+    }
+  }
+
+  /**
+   * XMLRecordReader class to read through a given xml document to output xml blocks as records
as specified by the
+   * start tag and end tag
+   */
+  public static class XmlRecordReader extends RecordReader<LongWritable, Text> {
+
+    private static final String DEFAULT_ENCODING = Charsets.UTF_8.name();
+
+    private final char[] startTag;
+    private final char[] endTag;
+    private final long start;
+    private final long end;
+
+    private LongWritable currentKey;
+    private Text currentValue;
+    private final DataOutputBuffer outBuffer;
+    private final BufferedReader inReader;
+    private final OutputStreamWriter outWriter;
+    private final String inputEncoding;
+    private int readByteCounter = 0;
+
+    public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
+      inputEncoding = conf.get(ENCODING, DEFAULT_ENCODING);
+      startTag = new String(conf.get(START_TAG_KEY).getBytes(inputEncoding), inputEncoding).toCharArray();
+      endTag = new String(conf.get(END_TAG_KEY).getBytes(inputEncoding), inputEncoding).toCharArray();
+
+      // open the file and seek to the start of the split
+      start = split.getStart();
+      end = start + split.getLength();
+      Path file = split.getPath();
+      FileSystem fs = file.getFileSystem(conf);
+      FSDataInputStream fsin = fs.open(split.getPath());
+      fsin.seek(start);
+      inReader = new BufferedReader(new InputStreamReader(fsin, Charset.forName(inputEncoding)));
+      outBuffer = new DataOutputBuffer();
+      outWriter = new OutputStreamWriter(outBuffer, inputEncoding);
+    }
+
+    private boolean next(LongWritable key, Text value) throws IOException {
+
+      if (readByteCounter < end && readUntilMatch(startTag, false)) {
+        try {
+          outWriter.write(startTag);
+
+          if (readUntilMatch(endTag, true)) {
+            key.set(readByteCounter);
+            outWriter.flush();
+            value.set(toUTF8(outBuffer.getData()), 0, outBuffer.getLength());
+            return true;
+          }
+        } finally {
+          outWriter.flush();
+          outBuffer.reset();
+        }
+      }
+      return false;
+    }
+
+    private byte[] toUTF8(byte[] in) throws UnsupportedEncodingException {
+      return new String(in, inputEncoding).getBytes(Charsets.UTF_8);
+    }
+
+    @Override
+    public void close() throws IOException {
+      inReader.close();
+    }
+
+    @Override
+    public float getProgress() throws IOException {
+      return (readByteCounter - start) / (float) (end - start);
+    }
+
+    private boolean readUntilMatch(char[] match, boolean withinBlock) throws IOException
{
+      int i = 0;
+      while (true) {
+        int nextInCharacter = inReader.read();
+
+        readByteCounter = +Chars.toByteArray((char) nextInCharacter).length;
+
+        // end of file:
+        if (nextInCharacter == -1) {
+          return false;
+        }
+        // save to buffer:
+        if (withinBlock) {
+          outWriter.write(nextInCharacter);
+        }
+
+        // check if we're matching:
+        if (nextInCharacter == match[i]) {
+          i++;
+          if (i >= match.length) {
+            return true;
+          }
+        } else {
+          i = 0;
+        }
+        // see if we've passed the stop point
+        if (!withinBlock && i == 0 && readByteCounter >= end) {
+          return false;
+        }
+      }
+    }
+
+    @Override
+    public LongWritable getCurrentKey() throws IOException, InterruptedException {
+      return currentKey;
+    }
+
+    @Override
+    public Text getCurrentValue() throws IOException, InterruptedException {
+      return currentValue;
+    }
+
+    @Override
+    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      currentKey = new LongWritable();
+      currentValue = new Text();
+      return next(currentKey, currentValue);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java
new file mode 100644
index 0000000..2e434e7
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text.xml;
+
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Large XML documents composed of repetitive XML elements can be broken into chunks delimited by the element's
+ * start and end tags. The {@link XmlSource} processes XML files and extracts the XML between the pre-configured
+ * start / end tags. The developer should process the content between the tags.
+ * 
+ * The {@link XmlSource} does not parse the input XML files and is not aware of the XML semantics. It just splits
+ * the input file into chunks defined by the start/end tags. Nested XML elements are not supported.
+ */
+public class XmlSource extends FileSourceImpl<String> {
+
+  /**
+   * Builds a source for UTF-8 encoded XML input.
+   *
+   * @param inputPath
+   *          location of the XML input
+   * @param tagStart
+   *          start tag of the element that delimits a record
+   * @param tagEnd
+   *          end tag of the element that delimits a record
+   */
+  public XmlSource(String inputPath, String tagStart, String tagEnd) {
+    this(inputPath, tagStart, tagEnd, Charsets.UTF_8.name());
+  }
+
+  /**
+   * Builds a source for XML input in the given encoding.
+   *
+   * @param inputPath
+   *          location of the XML input
+   * @param tagStart
+   *          start tag of the element that delimits a record
+   * @param tagEnd
+   *          end tag of the element that delimits a record
+   * @param encoding
+   *          charset name of the input files
+   */
+  public XmlSource(String inputPath, String tagStart, String tagEnd, String encoding) {
+    super(new Path(inputPath), Writables.strings(), xmlBundle(tagStart, tagEnd, encoding));
+  }
+
+  /**
+   * Assembles the {@link XmlInputFormat} bundle carrying the record tags and the input encoding.
+   */
+  private static FormatBundle<XmlInputFormat> xmlBundle(String tagStart, String tagEnd, String encoding) {
+    return FormatBundle.forInput(XmlInputFormat.class)
+        .set(XmlInputFormat.START_TAG_KEY, tagStart)
+        .set(XmlInputFormat.END_TAG_KEY, tagEnd)
+        .set(XmlInputFormat.ENCODING, encoding);
+  }
+}

http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-test/src/main/resources/xmlSourceSample1.xml
----------------------------------------------------------------------
diff --git a/crunch-test/src/main/resources/xmlSourceSample1.xml b/crunch-test/src/main/resources/xmlSourceSample1.xml
new file mode 100644
index 0000000..8734c1e
--- /dev/null
+++ b/crunch-test/src/main/resources/xmlSourceSample1.xml
@@ -0,0 +1,291 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CATALOG>
+	<PLANT>
+		<COMMON>Bloodroot</COMMON>
+		<BOTANICAL>Sanguinaria canadensis</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$2.44</PRICE>
+		<AVAILABILITY>031599</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Columbine</COMMON>
+		<BOTANICAL>Aquilegia canadensis</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$9.37</PRICE>
+		<AVAILABILITY>030699</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Marsh Marigold</COMMON>
+		<BOTANICAL>Caltha palustris</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Sunny</LIGHT>
+		<PRICE>$6.81</PRICE>
+		<AVAILABILITY>051799</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Cowslip</COMMON>
+		<BOTANICAL>Caltha palustris</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$9.90</PRICE>
+		<AVAILABILITY>030699</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Dutchman's-Breeches</COMMON>
+		<BOTANICAL>Dicentra cucullaria</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$6.44</PRICE>
+		<AVAILABILITY>012099</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Ginger, Wild</COMMON>
+		<BOTANICAL>Asarum canadense</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$9.03</PRICE>
+		<AVAILABILITY>041899</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Hepatica</COMMON>
+		<BOTANICAL>Hepatica americana</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$4.45</PRICE>
+		<AVAILABILITY>012699</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Liverleaf</COMMON>
+		<BOTANICAL>Hepatica americana</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$3.99</PRICE>
+		<AVAILABILITY>010299</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Jack-In-The-Pulpit</COMMON>
+		<BOTANICAL>Arisaema triphyllum</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$3.23</PRICE>
+		<AVAILABILITY>020199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Mayapple</COMMON>
+		<BOTANICAL>Podophyllum peltatum</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$2.98</PRICE>
+		<AVAILABILITY>060599</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Phlox, Woodland</COMMON>
+		<BOTANICAL>Phlox divaricata</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$2.80</PRICE>
+		<AVAILABILITY>012299</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Phlox, Blue</COMMON>
+		<BOTANICAL>Phlox divaricata</BOTANICAL>
+		<ZONE>3</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$5.59</PRICE>
+		<AVAILABILITY>021699</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Spring-Beauty</COMMON>
+		<BOTANICAL>Claytonia Virginica</BOTANICAL>
+		<ZONE>7</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$6.59</PRICE>
+		<AVAILABILITY>020199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Trillium</COMMON>
+		<BOTANICAL>Trillium grandiflorum</BOTANICAL>
+		<ZONE>5</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$3.90</PRICE>
+		<AVAILABILITY>042999</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Wake Robin</COMMON>
+		<BOTANICAL>Trillium grandiflorum</BOTANICAL>
+		<ZONE>5</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$3.20</PRICE>
+		<AVAILABILITY>022199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Violet, Dog-Tooth</COMMON>
+		<BOTANICAL>Erythronium americanum</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$9.04</PRICE>
+		<AVAILABILITY>020199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Trout Lily</COMMON>
+		<BOTANICAL>Erythronium americanum</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$6.94</PRICE>
+		<AVAILABILITY>032499</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Adder's-Tongue</COMMON>
+		<BOTANICAL>Erythronium americanum</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$9.58</PRICE>
+		<AVAILABILITY>041399</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Anemone</COMMON>
+		<BOTANICAL>Anemone blanda</BOTANICAL>
+		<ZONE>6</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$8.86</PRICE>
+		<AVAILABILITY>122698</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Grecian Windflower</COMMON>
+		<BOTANICAL>Anemone blanda</BOTANICAL>
+		<ZONE>6</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$9.16</PRICE>
+		<AVAILABILITY>071099</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Bee Balm</COMMON>
+		<BOTANICAL>Monarda didyma</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$4.59</PRICE>
+		<AVAILABILITY>050399</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Bergamot</COMMON>
+		<BOTANICAL>Monarda didyma</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$7.16</PRICE>
+		<AVAILABILITY>042799</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Black-Eyed Susan</COMMON>
+		<BOTANICAL>Rudbeckia hirta</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Sunny</LIGHT>
+		<PRICE>$9.80</PRICE>
+		<AVAILABILITY>061899</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Buttercup</COMMON>
+		<BOTANICAL>Ranunculus</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$2.57</PRICE>
+		<AVAILABILITY>061099</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Crowfoot</COMMON>
+		<BOTANICAL>Ranunculus</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$9.34</PRICE>
+		<AVAILABILITY>040399</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Butterfly Weed</COMMON>
+		<BOTANICAL>Asclepias tuberosa</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Sunny</LIGHT>
+		<PRICE>$2.78</PRICE>
+		<AVAILABILITY>063099</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Cinquefoil</COMMON>
+		<BOTANICAL>Potentilla</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$7.06</PRICE>
+		<AVAILABILITY>052599</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Primrose</COMMON>
+		<BOTANICAL>Oenothera</BOTANICAL>
+		<ZONE>3 - 5</ZONE>
+		<LIGHT>Sunny</LIGHT>
+		<PRICE>$6.56</PRICE>
+		<AVAILABILITY>013099</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Gentian</COMMON>
+		<BOTANICAL>Gentiana</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$7.81</PRICE>
+		<AVAILABILITY>051899</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Blue Gentian</COMMON>
+		<BOTANICAL>Gentiana</BOTANICAL>
+		<ZONE>4</ZONE>
+		<LIGHT>Sun or Shade</LIGHT>
+		<PRICE>$8.56</PRICE>
+		<AVAILABILITY>050299</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Jacob's Ladder</COMMON>
+		<BOTANICAL>Polemonium caeruleum</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$9.26</PRICE>
+		<AVAILABILITY>022199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Greek Valerian</COMMON>
+		<BOTANICAL>Polemonium caeruleum</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$4.36</PRICE>
+		<AVAILABILITY>071499</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>California Poppy</COMMON>
+		<BOTANICAL>Eschscholzia californica</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Sun</LIGHT>
+		<PRICE>$7.89</PRICE>
+		<AVAILABILITY>032799</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Shooting Star</COMMON>
+		<BOTANICAL>Dodecatheon</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Mostly Shady</LIGHT>
+		<PRICE>$8.60</PRICE>
+		<AVAILABILITY>051399</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Snakeroot</COMMON>
+		<BOTANICAL>Cimicifuga</BOTANICAL>
+		<ZONE>Annual</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$5.63</PRICE>
+		<AVAILABILITY>071199</AVAILABILITY>
+	</PLANT>
+	<PLANT>
+		<COMMON>Cardinal Flower</COMMON>
+		<BOTANICAL>Lobelia cardinalis</BOTANICAL>
+		<ZONE>2</ZONE>
+		<LIGHT>Shade</LIGHT>
+		<PRICE>$3.02</PRICE>
+		<AVAILABILITY>022299</AVAILABILITY>
+	</PLANT>
+</CATALOG>

http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-test/src/main/resources/xmlSourceSample2.xml
----------------------------------------------------------------------
diff --git a/crunch-test/src/main/resources/xmlSourceSample2.xml b/crunch-test/src/main/resources/xmlSourceSample2.xml
new file mode 100644
index 0000000..7d90532
Binary files /dev/null and b/crunch-test/src/main/resources/xmlSourceSample2.xml differ


Mime
View raw message