Mailing-List: contact commits-help@jackrabbit.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@jackrabbit.apache.org
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: svn commit: r488717 [2/2] - in
 /jackrabbit/trunk/jackrabbit-text-extractor:
 ./ src/ src/main/ src/main/java/ src/main/java/org/
 src/main/java/org/apache/ src/main/java/org/apache/jackrabbit/
 src/main/java/org/apache/jackrabbit/extractor/ src/test/ src/...
Date: Tue, 19 Dec 2006 16:19:06 -0000
To: commits@jackrabbit.apache.org
From: mreutegg@apache.org
Message-Id: <20061219161907.81F731A981D@eris.apache.org>

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import javax.swing.text.BadLocationException;
+import javax.swing.text.DefaultStyledDocument;
+import javax.swing.text.rtf.RTFEditorKit;
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Rich Text Format (RTF)
+ */
+public class RTFTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Creates a new <code>RTFTextExtractor</code> instance.
+     */
+    public RTFTextExtractor() {
+        super(new String[]{"application/rtf"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * {@inheritDoc}
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+
+        try {
+            RTFEditorKit rek = new RTFEditorKit();
+            DefaultStyledDocument doc = new DefaultStyledDocument();
+            rek.read(stream, doc, 0);
+            String text = doc.getText(0, doc.getLength());
+            return new StringReader(text);
+        } catch (BadLocationException e) {
+            throw new IOException(e.getMessage());
+        } finally {
+            stream.close();
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+/**
+ * Interface for extracting text content from binary streams.
+ */
+public interface TextExtractor {
+
+    /**
+     * Returns the MIME types supported by this extractor. The returned
+     * strings must be in lower case, and the returned array must not be empty.
+     * <p>
+     * The returned array must not be modified.
+     *
+     * @return supported MIME types, lower case
+     */
+    String[] getContentTypes();
+
+    /**
+     * Returns a reader for the text content of the given binary document.
+     * The content type and character encoding (if available and applicable)
+     * are given as arguments. The given content type is guaranteed to be
+     * one of the types reported by {@link #getContentTypes()} unless the
+     * implementation explicitly permits other content types.
+     * <p>
+     * The implementation can choose either to read and parse the given
+     * document immediately or to return a reader that does it incrementally.
+     * The only constraint is that the implementation must close the given
+     * stream at the latest when the returned reader is closed. The caller on the
+     * other hand is responsible for closing the returned reader.
+     * <p>
+     * The implementation should only throw an exception on transient
+     * errors, i.e. when it can expect to be able to successfully extract
+     * the text content of the same binary at another time. An effort
+     * should be made to recover from syntax errors and other similar problems.
+     * <p>
+     * This method should be thread-safe, i.e. it is possible that this
+     * method is invoked simultaneously by different threads to extract the
+     * text content of different documents. On the other hand the returned
+     * reader does not need to be thread-safe.
+     *
+     * @param stream   binary document from which to extract text
+     * @param type     MIME type of the given document, lower case
+     * @param encoding the character encoding of the binary data,
+     *                 or <code>null</code> if not available
+     * @return reader for the extracted text content
+     * @throws IOException on transient errors
+     */
+    Reader extractText(InputStream stream, String type, String encoding)
+        throws IOException;
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.CharArrayReader;
+import java.io.CharArrayWriter;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+/**
+ * Text extractor for XML documents. This class extracts the text content
+ * and attribute values from XML documents.
+ * <p>
+ * This class can handle any XML-based format
+ * (<code>application/xml+something</code>), not just the base XML content
+ * types reported by {@link #getContentTypes()}. However, it often makes
+ * sense to use more specialized extractors that better understand the
+ * specific content type.
+ */
+public class XMLTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Creates a new <code>XMLTextExtractor</code> instance.
+     */
+    public XMLTextExtractor() {
+        super(new String[]{"text/xml", "application/xml"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns a reader for the text content of the given XML document.
+     * Returns an empty reader if the given encoding is not supported or
+     * if the XML document could not be parsed.
+     *
+     * @param stream XML document
+     * @param type XML content type
+     * @param encoding character encoding, or <code>null</code>
+     * @return reader for the text content of the given XML document,
+     *         or an empty reader if the document could not be parsed
+     * @throws IOException if the XML document stream cannot be read or closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            CharArrayWriter writer = new CharArrayWriter();
+            ExtractorHandler handler = new ExtractorHandler(writer);
+
+            // TODO: Use a pull parser to avoid the memory overhead
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            SAXParser parser = factory.newSAXParser();
+            XMLReader reader = parser.getXMLReader();
+            reader.setContentHandler(handler);
+            reader.setErrorHandler(handler);
+
+            // It is unspecified whether the XML parser closes the stream when
+            // done parsing. To ensure that the stream gets closed just once,
+            // we prevent the parser from closing it by intercepting the close()
+            // call, and explicitly close the stream in a finally block instead.
+            InputSource source = new InputSource(new FilterInputStream(stream) {
+                public void close() {
+                }
+            });
+            if (encoding != null) {
+                source.setEncoding(encoding);
+            }
+            reader.parse(source);
+
+            return new CharArrayReader(writer.toCharArray());
+        } catch (ParserConfigurationException e) {
+            return new StringReader("");
+        } catch (SAXException e) {
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link CompositeTextExtractor} class.
+ */
+public class CompositeTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private CompositeTextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new CompositeTextExtractor();
+        extractor.addTextExtractor(new PlainTextExtractor());
+        extractor.addTextExtractor(new XMLTextExtractor());
+    }
+
+    /**
+     * Tests that the extractor supports all the content types of the
+     * component extractors.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "CompositeTextExtractor does not support component types",
+                types.contains("text/plain"));
+        assertTrue(
+                "CompositeTextExtractor does not support component types",
+                types.contains("text/xml"));
+        assertTrue(
+                "CompositeTextExtractor does not support component types",
+                types.contains("application/xml"));
+        assertEquals(
+                "CompositeTextExtractor supports unknown content types",
+                3, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testEmptyStream() throws IOException {
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(new byte[0]), "text/plain", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String text = "some test content";
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(text.getBytes()), "text/plain", null);
+        assertEquals(text, ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles unsupported content types.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testUnsupportedEncoding() throws IOException {
+        String text = "some test content";
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(text.getBytes()),
+                "unsupported", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link EmptyTextExtractor} class.
+ */
+public class EmptyTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new EmptyTextExtractor("test/type");
+    }
+
+    /**
+     * Tests that the extractor supports only the given content type.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "EmptyTextExtractor does not support the given content type",
+                types.contains("test/type"));
+        assertEquals(
+                "EmptyTextExtractor supports unknown content types",
+                1, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String text = "some test content";
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(text.getBytes()), "text/plain", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Helper class for text extractor unit tests.
+ */
+class ExtractorHelper {
+
+    /**
+     * Private constructor to prevent instantiation.
+     */
+    private ExtractorHelper() {
+    };
+
+    /**
+     * Returns the entire content of the given reader as a string.
+     *
+     * @param reader reader to be read and closed
+     * @return entire content of the reader
+     * @throws IOException on IO errors
+     */
+    public static String read(Reader reader) throws IOException {
+        try {
+            CharArrayWriter writer = new CharArrayWriter();
+            try {
+                char[] buffer = new char[4096];
+                int n = reader.read(buffer);
+                while (n > 0) {
+                    writer.write(buffer, 0, n);
+                    n = reader.read(buffer);
+                }
+            } finally {
+                writer.close();
+            }
+            return new String(writer.toCharArray());
+        } finally {
+            reader.close();
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link PlainTextExtractor} class.
+ */
+public class PlainTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new PlainTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports <code>text/plain</code>.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "PlainTextExtractor does not support text/plain",
+                types.contains("text/plain"));
+        assertEquals(
+                "PlainTextExtractor supports unknown content types",
+                1, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testEmptyStream() throws IOException {
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(new byte[0]), "text/plain", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String text = "some test content";
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(text.getBytes()), "text/plain", null);
+        assertEquals(text, ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles unsupported encodings.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testUnsupportedEncoding() throws IOException {
+        try {
+            String text = "some test content";
+            Reader reader = extractor.extractText(
+                    new ByteArrayInputStream(text.getBytes()),
+                    "text/plain", "unsupported");
+            assertEquals("", ExtractorHelper.read(reader));
+        } catch (UnsupportedEncodingException e) {
+            fail("PlainTextExtractor does not handle unsupported encodings");
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link XMLTextExtractor} class.
+ */
+public class XMLTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new XMLTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports <code>text/xml</code> and
+     * <code>application/xml</code>.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "XMLTextExtractor does not support text/xml",
+                types.contains("text/xml"));
+        assertTrue(
+                "XMLTextExtractor does not support application/xml",
+                types.contains("application/xml"));
+        assertEquals(
+                "XMLTextExtractor supports unknown content types",
+                2, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     */
+    public void testEmptyStream() {
+        try {
+            Reader reader = extractor.extractText(
+                    new ByteArrayInputStream(new byte[0]), "text/xml", null);
+            assertEquals("", ExtractorHelper.read(reader));
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle empty streams");
+        }
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String xml = "<a b=\"attribute value\">text content</a>";
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(xml.getBytes()), "text/xml", null);
+        assertEquals("attribute value text content", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles XML parse errors.
+     */
+    public void testInvalidStream() {
+        try {
+            String xml = "<a b=\"attribute value\">text content</c>";
+            Reader reader = extractor.extractText(
+                    new ByteArrayInputStream(xml.getBytes()), "text/xml", null);
+            assertEquals("", ExtractorHelper.read(reader));
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle XML parse errors");
+        }
+    }
+
+    /**
+     * Tests that the extractor correctly handles unsupported encodings.
+     */
+    public void testUnsupportedEncoding() {
+        try {
+            String xml = "<a b=\"attribute value\">text content</a>";
+            Reader reader = extractor.extractText(
+                    new ByteArrayInputStream(xml.getBytes()),
+                    "text/xml", "unsupported");
+            assertEquals("", ExtractorHelper.read(reader));
+        } catch (UnsupportedEncodingException e) {
+            fail("XMLTextExtractor does not handle unsupported encodings");
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle unsupported encodings");
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native