Return-Path: Delivered-To: apmail-jackrabbit-commits-archive@www.apache.org Received: (qmail 73033 invoked from network); 19 Dec 2006 16:19:57 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 19 Dec 2006 16:19:57 -0000 Received: (qmail 41755 invoked by uid 500); 19 Dec 2006 16:20:05 -0000 Delivered-To: apmail-jackrabbit-commits-archive@jackrabbit.apache.org Received: (qmail 41728 invoked by uid 500); 19 Dec 2006 16:20:05 -0000 Mailing-List: contact commits-help@jackrabbit.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@jackrabbit.apache.org Delivered-To: mailing list commits@jackrabbit.apache.org Received: (qmail 41719 invoked by uid 99); 19 Dec 2006 16:20:04 -0000 Received: from herse.apache.org (HELO herse.apache.org) (140.211.11.133) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Dec 2006 08:20:04 -0800 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Dec 2006 08:19:55 -0800 Received: by eris.apache.org (Postfix, from userid 65534) id 81F731A981D; Tue, 19 Dec 2006 08:19:07 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r488717 [2/2] - in /jackrabbit/trunk/jackrabbit-text-extractor: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/jackrabbit/ src/main/java/org/apache/jackrabbit/extractor/ src/test/ src/... 
Date: Tue, 19 Dec 2006 16:19:06 -0000 To: commits@jackrabbit.apache.org From: mreutegg@apache.org X-Mailer: svnmailer-1.1.0 Message-Id: <20061219161907.81F731A981D@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jackrabbit.extractor; + +import javax.swing.text.BadLocationException; +import javax.swing.text.DefaultStyledDocument; +import javax.swing.text.rtf.RTFEditorKit; +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; + +/** + * Text extractor for Rich Text Format (RTF) + */ +public class RTFTextExtractor extends AbstractTextExtractor { + + /** + * Creates a new RTFTextExtractor instance. + */ + public RTFTextExtractor() { + super(new String[]{"application/rtf"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + + try { + RTFEditorKit rek = new RTFEditorKit(); + DefaultStyledDocument doc = new DefaultStyledDocument(); + rek.read(stream, doc, 0); + String text = doc.getText(0, doc.getLength()); + return new StringReader(text); + } catch (BadLocationException e) { + throw new IOException(e.getMessage()); + } finally { + stream.close(); + } + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +/** + * Interface for extracting text content from binary streams. + */ +public interface TextExtractor { + + /** + * Returns the MIME types supported by this extractor. The returned + * strings must be in lower case, and the returned array must not be empty. + *

+ * The returned array must not be modified. + * + * @return supported MIME types, lower case + */ + String[] getContentTypes(); + + /** + * Returns a reader for the text content of the given binary document. + * The content type and character encoding (if available and applicable) + * are given as arguments. The given content type is guaranteed to be + * one of the types reported by {@link #getContentTypes()} unless the + * implementation explicitly permits other content types. + *

+ * The implementation can choose either to read and parse the given + * document immediately or to return a reader that does it incrementally. + * The only constraint is that the implementation must close the given + * stream at the latest when the returned reader is closed. The caller on the + * other hand is responsible for closing the returned reader. + *

+ * The implementation should only throw an exception on transient + * errors, i.e. when it can expect to be able to successfully extract + * the text content of the same binary at another time. An effort + * should be made to recover from syntax errors and other similar problems. + *

+ * This method should be thread-safe, i.e. it is possible that this + * method is invoked simultaneously by different threads to extract the + * text content of different documents. On the other hand the returned + * reader does not need to be thread-safe. + * + * @param stream binary document from which to extract text + * @param type MIME type of the given document, lower case + * @param encoding the character encoding of the binary data, + * or null if not available + * @return reader for the extracted text content + * @throws IOException on transient errors + */ + Reader extractText(InputStream stream, String type, String encoding) + throws IOException; + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.CharArrayReader; +import java.io.CharArrayWriter; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +/** + * Text extractor for XML documents. This class extracts the text content + * and attribute values from XML documents. + *

+ * This class can handle any XML-based format + * (application/xml+something), not just the base XML content + * types reported by {@link #getContentTypes()}. However, it often makes + * sense to use more specialized extractors that better understand the + * specific content type. + */ +public class XMLTextExtractor extends AbstractTextExtractor { + + /** + * Creates a new XMLTextExtractor instance. + */ + public XMLTextExtractor() { + super(new String[]{"text/xml", "application/xml"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns a reader for the text content of the given XML document. + * Returns an empty reader if the given encoding is not supported or + * if the XML document could not be parsed. + * + * @param stream XML document + * @param type XML content type + * @param encoding character encoding, or null + * @return reader for the text content of the given XML document, + * or an empty reader if the document could not be parsed + * @throws IOException if the XML document stream can not be closed + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + try { + CharArrayWriter writer = new CharArrayWriter(); + ExtractorHandler handler = new ExtractorHandler(writer); + + // TODO: Use a pull parser to avoid the memory overhead + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser parser = factory.newSAXParser(); + XMLReader reader = parser.getXMLReader(); + reader.setContentHandler(handler); + reader.setErrorHandler(handler); + + // It is unspecified whether the XML parser closes the stream when + // done parsing. To ensure that the stream gets closed just once, + // we prevent the parser from closing it by catching the close() + // call and explicitly close the stream in a finally block. 
+ InputSource source = new InputSource(new FilterInputStream(stream) { + public void close() { + } + }); + if (encoding != null) { + source.setEncoding(encoding); + } + reader.parse(source); + + return new CharArrayReader(writer.toCharArray()); + } catch (ParserConfigurationException e) { + return new StringReader(""); + } catch (SAXException e) { + return new StringReader(""); + } finally { + stream.close(); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link CompositeTextExtractor} class. + */ +public class CompositeTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private CompositeTextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() throws Exception { + super.setUp(); + extractor = new CompositeTextExtractor(); + extractor.addTextExtractor(new PlainTextExtractor()); + extractor.addTextExtractor(new XMLTextExtractor()); + } + + /** + * Tests that the extractor supports all the content types of the + * component extractors. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("text/plain")); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("text/xml")); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("application/xml")); + assertEquals( + "CompositeTextExtractor supports unknown content types", + 3, types.size()); + } + + /** + * Tests that the extractor correctly handles an empty stream. 
+ * + * @throws IOException on IO errors + */ + public void testEmptyStream() throws IOException { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/plain", null); + assertEquals("", ExtractorHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals(text, ExtractorHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles unsupported content types. + * + * @throws IOException on IO errors + */ + public void testUnsupportedEncoding() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), + "unsupported", null); + assertEquals("", ExtractorHelper.read(reader)); + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link EmptyTextExtractor} class. + */ +public class EmptyTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() throws Exception { + super.setUp(); + extractor = new EmptyTextExtractor("test/type"); + } + + /** + * Tests that the extractor supports only the given content type. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "EmptyTextExtractor does not support the given content type", + types.contains("test/type")); + assertEquals( + "EmptyTextExtractor supports unknown content types", + 1, types.size()); + } + + /** + * Tests that the extractor correctly handles a normal stream.
+ * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals("", ExtractorHelper.read(reader)); + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jackrabbit.extractor; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.io.Reader; + +/** + * Helper class for text extractor unit tests. + */ +class ExtractorHelper { + + /** + * Private constructor to prevent instantiation. + */ + private ExtractorHelper() { + }; + + /** + * Returns the entire content of the given reader as a string. + * + * @param reader reader to be read and closed + * @return entire content of the reader + * @throws IOException on IO errors + */ + public static String read(Reader reader) throws IOException { + try { + CharArrayWriter writer = new CharArrayWriter(); + try { + char[] buffer = new char[4096]; + int n = reader.read(buffer); + while (n > 0) { + writer.write(buffer, 0, n); + n = reader.read(buffer); + } + } finally { + writer.close(); + } + return new String(writer.toCharArray()); + } finally { + reader.close(); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link PlainTextExtractor} class. + */ +public class PlainTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() throws Exception { + super.setUp(); + extractor = new PlainTextExtractor(); + } + + /** + * Tests that the extractor supports text/plain. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "PlainTextExtractor does not support text/plain", + types.contains("text/plain")); + assertEquals( + "PlainTextExtractor supports unknown content types", + 1, types.size()); + } + + /** + * Tests that the extractor correctly handles an empty stream.
+ * + * @throws IOException on IO errors + */ + public void testEmptyStream() throws IOException { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/plain", null); + assertEquals("", ExtractorHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals(text, ExtractorHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles unsupported encodings. + * + * @throws IOException on IO errors + */ + public void testUnsupportedEncoding() throws IOException { + try { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), + "text/plain", "unsupported"); + assertEquals("", ExtractorHelper.read(reader)); + } catch (UnsupportedEncodingException e) { + fail("PlainTextExtractor does not handle unsupported encodings"); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java 
Tue Dec 19 08:19:04 2006 @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link XMLTextExtractor} class. + */ +public class XMLTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() throws Exception { + super.setUp(); + extractor = new XMLTextExtractor(); + } + + /** + * Tests that the extractor supports text/xml and + * application/xml.
+ */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "XMLTextExtractor does not support text/xml", + types.contains("text/xml")); + assertTrue( + "XMLTextExtractor does not support application/xml", + types.contains("application/xml")); + assertEquals( + "XMLTextExtractor supports unknown content types", + 2, types.size()); + } + + /** + * Tests that the extractor correctly handles an empty stream. + */ + public void testEmptyStream() { + try { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/xml", null); + assertEquals("", ExtractorHelper.read(reader)); + } catch (IOException e) { + fail("XMLTextExtractor does not handle empty streams"); + } + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), "text/xml", null); + assertEquals("attribute value text content", ExtractorHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles XML parse errors. + */ + public void testInvalidStream() { + try { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), "text/xml", null); + assertEquals("", ExtractorHelper.read(reader)); + } catch (IOException e) { + fail("XMLTextExtractor does not handle XML parse errors"); + } + } + + /** + * Tests that the extractor correctly handles unsupported encodings. 
+ */ + public void testUnsupportedEncoding() { + try { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), + "text/xml", "unsupported"); + assertEquals("", ExtractorHelper.read(reader)); + } catch (UnsupportedEncodingException e) { + fail("XMLTextExtractor does not handle unsupported encodings"); + } catch (IOException e) { + fail("XMLTextExtractor does not handle unsupported encodings"); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java ------------------------------------------------------------------------------ svn:eol-style = native