Return-Path: Delivered-To: apmail-jackrabbit-commits-archive@www.apache.org Received: (qmail 73079 invoked from network); 19 Dec 2006 16:19:59 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 19 Dec 2006 16:19:59 -0000 Received: (qmail 41866 invoked by uid 500); 19 Dec 2006 16:20:06 -0000 Delivered-To: apmail-jackrabbit-commits-archive@jackrabbit.apache.org Received: (qmail 41846 invoked by uid 500); 19 Dec 2006 16:20:06 -0000 Mailing-List: contact commits-help@jackrabbit.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@jackrabbit.apache.org Delivered-To: mailing list commits@jackrabbit.apache.org Received: (qmail 41837 invoked by uid 99); 19 Dec 2006 16:20:06 -0000 Received: from herse.apache.org (HELO herse.apache.org) (140.211.11.133) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Dec 2006 08:20:06 -0800 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Dec 2006 08:19:56 -0800 Received: by eris.apache.org (Postfix, from userid 65534) id 79D571A981A; Tue, 19 Dec 2006 08:19:07 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r488717 [1/2] - in /jackrabbit/trunk/jackrabbit-text-extractor: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/jackrabbit/ src/main/java/org/apache/jackrabbit/extractor/ src/test/ src/... 
Date: Tue, 19 Dec 2006 16:19:06 -0000 To: commits@jackrabbit.apache.org From: mreutegg@apache.org X-Mailer: svnmailer-1.1.0 Message-Id: <20061219161907.79D571A981A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: mreutegg Date: Tue Dec 19 08:19:04 2006 New Revision: 488717 URL: http://svn.apache.org/viewvc?view=rev&rev=488717 Log: JCR-415: Enhance indexing of binary content - Create new module jackrabbit-text-extractors - Migrated existing index-filter implementations to text-extractor module Added: jackrabbit/trunk/jackrabbit-text-extractor/ (with props) jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt (with props) jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt (with props) jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt (with props) jackrabbit/trunk/jackrabbit-text-extractor/README.txt (with props) jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml (with props) jackrabbit/trunk/jackrabbit-text-extractor/pom.xml (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (with props) 
jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/test/ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/ 
jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (with props) jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (with props) Propchange: jackrabbit/trunk/jackrabbit-text-extractor/ ------------------------------------------------------------------------------ --- svn:ignore (added) +++ svn:ignore Tue Dec 19 08:19:04 2006 @@ -0,0 +1,4 @@ +*.iml +*.ipr +*.iws +target Added: jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt Tue Dec 19 08:19:04 2006 @@ -0,0 +1,16 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ Propchange: jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt Tue Dec 19 08:19:04 2006 @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Propchange: jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt Tue Dec 19 08:19:04 2006 @@ -0,0 +1,5 @@ +Apache Jackrabbit +Copyright 2006 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
Propchange: jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/README.txt URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/README.txt?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/README.txt (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/README.txt Tue Dec 19 08:19:04 2006 @@ -0,0 +1,107 @@ +===================================== +Welcome to Jackrabbit Text Extractors +===================================== + +This is the Text Extractors component of the Apache Jackrabbit project. +This component contains extractor classes that allow Jackrabbit to +extract text content from binary properties for full text indexing. +The following file formats and MIME types are currently supported: + + * Microsoft Word + [org.apache.jackrabbit.extractor.MsWordTextExtractor] + * application/vnd.ms-word + * application/msword + + * Microsoft Excel + [org.apache.jackrabbit.extractor.MsExcelTextExtractor] + * application/vnd.ms-excel + + * Microsoft PowerPoint + [org.apache.jackrabbit.extractor.MsPowerPointTextExtractor] + * application/vnd.ms-powerpoint + * application/mspowerpoint + + * Portable Document Format (PDF) + [org.apache.jackrabbit.extractor.PdfTextExtractor] + * application/pdf + + * OpenOffice.org + [org.apache.jackrabbit.extractor.OpenOfficeTextExtractor] + * application/vnd.oasis.opendocument.database + * application/vnd.oasis.opendocument.formula + * application/vnd.oasis.opendocument.graphics + * application/vnd.oasis.opendocument.presentation + * application/vnd.oasis.opendocument.spreadsheet + * application/vnd.oasis.opendocument.text + + * Rich Text Format (RTF) + [org.apache.jackrabbit.extractor.RTFTextExtractor] + * application/rtf + + * HyperText Markup Language (HTML) + 
[org.apache.jackrabbit.extractor.HTMLTextExtractor] + * text/html + + * Extensible Markup Language (XML) + [org.apache.jackrabbit.extractor.XMLTextExtractor] + * text/xml + +To use these text extractors with the Jackrabbit Core: + + 1) add the jackrabbit-text-extractors jar file and the dependencies defined + in the Maven POM in the Jackrabbit classpath, and + 2) add the fully qualified class names listed above in the "textFilterClasses" + parameter of the "SearchIndex" configuration element of a Jackrabbit + workspace configuration file (workspace.xml). + +See the Apache Jackrabbit web site (http://jackrabbit.apache.org/) +for documentation and other information. You are welcome to join the +Jackrabbit mailing lists (http://jackrabbit.apache.org/mail-lists.html) +to discuss this component and to use the Jackrabbit issue tracker +(http://issues.apache.org/jira/browse/JCR) to report issues or request +new features. + +Apache Jackrabbit is a project of the Apache Software Foundation +(http://www.apache.org). + + +License (see also LICENSE.txt) +============================== + +Collective work: Copyright 2006 The Apache Software Foundation. + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+ + +Getting Started +=============== + +This compoment uses a Maven 2 (http://maven.apache.org/) build +environment. If you have Maven 2 installed, you can compile and +package the jacrabbit-text-extractors jar using the following command: + + mvn package + +See the Maven 2 documentation for other build features. + +The latest source code for this compoment is available in the +Subversion (http://subversion.tigris.org/) source repository of +the Apache Software Foundation. If you have Subversion installed, +you can checkout the latest source using the following command: + + svn checkout http://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractors + +See the Subversion documentation for other source control features. Propchange: jackrabbit/trunk/jackrabbit-text-extractor/README.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml Tue Dec 19 08:19:04 2006 @@ -0,0 +1,166 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Propchange: jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/pom.xml URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/pom.xml?view=auto&rev=488717 
============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/pom.xml (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/pom.xml Tue Dec 19 08:19:04 2006 @@ -0,0 +1,78 @@ + + + + + + 4.0.0 + + + + + + org.apache.jackrabbit + jackrabbit + 1.2-SNAPSHOT + .. + + jackrabbit-text-extractors + Jackrabbit Text Extractors + Classes to extract text content from binary documents + + + + scm:svn:http://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractors + + + scm:svn:https://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractors + + http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors + + + + + poi + poi + 2.5.1-final-20040804 + + + pdfbox + pdfbox + 0.6.4 + + + org.textmining + tm-extractors + 0.4 + + + nekohtml + nekohtml + 0.9.4 + + + junit + junit + 3.8.1 + test + + + + Propchange: jackrabbit/trunk/jackrabbit-text-extractor/pom.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +/** + * Base class for text extractor implementations. + */ +public abstract class AbstractTextExtractor implements TextExtractor { + + /** + * The supported content types by this text extractor. + */ + private final String[] contentTypes; + + /** + * @param contentTypes the supported content types by this text extractor. + */ + public AbstractTextExtractor(String[] contentTypes) { + this.contentTypes = new String[contentTypes.length]; + System.arraycopy(contentTypes, 0, this.contentTypes, 0, contentTypes.length); + } + + /** + * @inheritDoc + */ + public String[] getContentTypes() { + return contentTypes; + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (added) +++ 
jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * Composite text extractor. This class presents a unified interface + * for a set of {@link TextExtractor} instances. The composite extractor + * supports all the content types supported by the component extractors, + * and delegates text extraction calls to the appropriate components. + */ +public class CompositeTextExtractor implements TextExtractor { + + /** + * Configured {@link TextExtractor} instances, keyed by content types. + */ + private final Map extractors = new HashMap(); + + /** + * Adds a component text extractor. The given extractor is registered + * to process all the content types it claims to support. 
+ * + * @param extractor component extractor + */ + public void addTextExtractor(TextExtractor extractor) { + String[] types = extractor.getContentTypes(); + for (int i = 0; i < types.length; i++) { + extractors.put(types[i], extractor); + } + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns all the content types supported by the component extractors. + * + * @return supported content types + */ + public String[] getContentTypes() { + Set types = extractors.keySet(); + return (String[]) types.toArray(new String[types.size()]); + } + + /** + * Extracts text content using one of the component extractors. If an + * extractor for the given content type does not exist, then the binary + * stream is just closed and an empty reader is returned. + * + * @param stream binary stream + * @param type content type + * @param encoding optional character encoding + * @return reader for the text content of the binary stream + * @throws IOException if the binary stream can not be read + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + TextExtractor extractor = (TextExtractor) extractors.get(type); + if (extractor != null) { + return extractor.extractText(stream, type, encoding); + } else { + stream.close(); + return new StringReader(""); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?view=auto&rev=488717 ============================================================================== --- 
jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +/** + * Composite text extractor that by default contains the standard + * text extractors found in this package. + */ +public class DefaultTextExtractor extends CompositeTextExtractor { + + /** + * Creates the default text extractor by adding instances of the standard + * text extractors as components. 
+ */ + public DefaultTextExtractor() { + addTextExtractor(new PlainTextExtractor()); + addTextExtractor(new XMLTextExtractor()); + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jackrabbit.extractor; + +/** + * Interface for text extractors that need to delegate the extraction + * of parts of content documents to another text extractor. This interface + * is usually implemented by extractors of composite multimedia or archive + * file formats. + *

+ * The configured delegate text extractor is usually a composite extractor + * that may contain also the delegating extractor, thus it is possible for + * the extractor to be invoked recursively within a single thread. An + * implementation should never pass the full content document to the + * delegate extractor to avoid infinite loops. + */ +public interface DelegatingTextExtractor extends TextExtractor { + + /** + * Sets the text extractor to which this extractor should delegate + * any partial text extraction tasks. The given delegate extractor + * is expected to be able to handle any content types passed to it. + * + * @param extractor delegate text extractor + */ + void setDelegateTextExtractor(TextExtractor extractor); + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; + +/** + * Dummy text extractor that always returns an empty reader for all documents. + * Useful as a dummy handler for unsupported content types. + */ +public class EmptyTextExtractor implements TextExtractor { + + /** + * Supported content types. + */ + private final String[] types; + + /** + * Creates a dummy text extractor for the given content types. + * The given array must not be modified after it has been passed + * to this constructor. + * + * @param types supported content types + */ + public EmptyTextExtractor(String[] types) { + this.types = types; + } + + /** + * Creates a dummy text extractor for the given content type. + * + * @param type supported content type + */ + public EmptyTextExtractor(String type) { + this(new String[] { type }); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns the supported content types. + * + * @return supported content types + */ + public String[] getContentTypes() { + return types; + } + + /** + * Closes the given stream and returns an empty reader. 
+ * + * @param stream binary stream that simply gets closed + * @param type ignored + * @param encoding ignored + * @return empty reader + * @throws IOException if the binary stream can not be closed + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + stream.close(); + return new StringReader(""); + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.Writer; + +import org.xml.sax.Attributes; +import org.xml.sax.ErrorHandler; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Utility class for extracting text content from an XML document. + * An instance of this class is a SAX event handler that extracts + * character data and attribute values from the SAX events and writes + * the extracted content to a given {@link Writer}. + *

+ * Any whitespace sequences are imploded into a single space character + * and consecutive attribute values and character data are delimited + * using spaces. + *

+ * This class also implements the {@link ErrorHandler} interface by + * ignoring all errors and warnings. This is useful in avoiding the + * default console output or other error logging of many XML parsers. + * + * @see XMLTextExtractor + */ +class ExtractorHandler extends DefaultHandler implements ErrorHandler { + + /** + * Separator that is written between consecutive text and attribute values. + */ + private static final char SPACE = ' '; + + /** + * The writer to which the selected text content is written. + */ + private final Writer writer; + + /** + * Flag for outputting a space before the next character to be outputted. + * Used to implode all whitespace sequences and to separate consecutive + * attribute values and text elements. + */ + private boolean space; + + /** + * Creates an extractor handler that writes text content to the given + * writer. + * + * @param writer writer to which the XML text content is written + */ + public ExtractorHandler(Writer writer) { + this.writer = writer; + this.space = false; + } + + //------------------------------------------------------< DefaultHandler > + + /** + * Writes attribute values to the underlying writer. + * + * @param uri ignored + * @param local ignored + * @param name ignored + * @param attributes attributes, whose values to extract + * @throws SAXException on IO errors + */ + public void startElement( + String uri, String local, String name, Attributes attributes) + throws SAXException { + for (int i = 0; i < attributes.getLength(); i++) { + String value = attributes.getValue(i); + characters(value.toCharArray(), 0, value.length()); + } + } + + /** + * Writes the given characters to the underlying writer. 
+ * + * @param ch character array that contains the characters to be written + * @param start start index within the array + * @param length number of characters to write + * @throws SAXException on IO errors + */ + public void characters(char[] ch, int start, int length) + throws SAXException { + try { + for (int i = 0; i < length; i++) { + if (Character.isSpaceChar(ch[start + i])) { + space = true; + } else { + if (space) { + writer.write(SPACE); + space = false; + } + writer.write(ch[start + i]); + } + } + space = true; + } catch (IOException e) { + throw new SAXException(e.getMessage()); + } + } + + //--------------------------------------------------------< ErrorHandler > + + /** + * Ignored. + * + * @param exception ignored + */ + public void warning(SAXParseException exception) { + } + + /** + * Ignored. + * + * @param exception ignored + */ + public void error(SAXParseException exception) { + } + + /** + * Ignored. + * + * @param exception ignored + */ + public void fatalError(SAXParseException exception) { + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.apache.xerces.parsers.AbstractSAXParser; +import org.apache.xerces.xni.Augmentations; +import org.apache.xerces.xni.NamespaceContext; +import org.apache.xerces.xni.XMLLocator; +import org.apache.xerces.xni.XMLString; +import org.apache.xerces.xni.XNIException; +import org.cyberneko.html.HTMLConfiguration; + +/** + * Helper class for HTML parsing + */ +public class HTMLParser extends AbstractSAXParser { + + private StringBuffer buffer; + + public HTMLParser() { + + super(new HTMLConfiguration()); + } + + public void startDocument(XMLLocator arg0, + String arg1, + NamespaceContext arg2, + Augmentations arg3) throws XNIException { + + super.startDocument(arg0, arg1, arg2, arg3); + + buffer = new StringBuffer(); + } + + public void characters(XMLString xmlString, Augmentations augmentations) + throws XNIException { + + super.characters(xmlString, augmentations); + + buffer.append(xmlString.toString()); + } + + private String filterAndJoin(String text) { + + boolean space = false; + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + + if ((c == '\n') || (c == ' ') || Character.isWhitespace(c)) { + if (space) { + continue; + } else { + space = true; + 
buffer.append(' '); + continue; + } + } else { + if (!Character.isLetter(c)) { + if (!space) { + space = true; + buffer.append(' '); + continue; + } + continue; + } + } + space = false; + buffer.append(c); + } + return buffer.toString(); + } + + /** + * Returns parsed content + * + * @return String Parsed content + */ + public String getContents() { + + String text = filterAndJoin(buffer.toString()); + return text; + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.xml.sax.InputSource; +import org.xml.sax.helpers.DefaultHandler; + +import javax.xml.transform.sax.SAXSource; +import javax.xml.transform.sax.SAXResult; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerConfigurationException; +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; + +/** + * Text extractor for HyperText Markup Language (HTML). + */ +public class HTMLTextExtractor extends AbstractTextExtractor { + + /** + * Creates a new HTMLTextExtractor instance. + */ + public HTMLTextExtractor() { + super(new String[]{"text/html"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + TransformerFactory factory = TransformerFactory.newInstance(); + Transformer transformer = factory.newTransformer(); + HTMLParser parser = new HTMLParser(); + SAXResult result = new SAXResult(new DefaultHandler()); + + SAXSource source = new SAXSource(parser, new InputSource(stream)); + transformer.transform(source, result); + + return new StringReader(parser.getContents()); + } catch (TransformerConfigurationException e) { + return new StringReader(""); + } catch (TransformerException e) { + return new StringReader(""); + } finally { + stream.close(); + } + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: 
jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFRow; +import org.apache.poi.hssf.usermodel.HSSFCell; + +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.CharArrayWriter; +import java.io.CharArrayReader; +import java.util.Iterator; + +/** + * Text extractor for Microsoft Excel sheets. 
+ */ +public class MsExcelTextExtractor extends AbstractTextExtractor { + + /** + * Force loading of dependent class. + */ + static { + POIFSFileSystem.class.getName(); + } + + /** + * Creates a new MsExcelTextExtractor instance. + */ + public MsExcelTextExtractor() { + super(new String[]{"application/vnd.ms-excel"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + CharArrayWriter writer = new CharArrayWriter(); + try { + POIFSFileSystem fs = new POIFSFileSystem(stream); + HSSFWorkbook workbook = new HSSFWorkbook(fs); + + for (int i = 0; i < workbook.getNumberOfSheets(); i++) { + HSSFSheet sheet = workbook.getSheetAt(i); + + Iterator rows = sheet.rowIterator(); + while (rows.hasNext()) { + HSSFRow row = (HSSFRow) rows.next(); + + Iterator cells = row.cellIterator(); + while (cells.hasNext()) { + HSSFCell cell = (HSSFCell) cells.next(); + switch (cell.getCellType()) { + case HSSFCell.CELL_TYPE_NUMERIC: + String num = Double.toString(cell.getNumericCellValue()).trim(); + if (num.length() > 0) { + writer.write(num + " "); + } + break; + case HSSFCell.CELL_TYPE_STRING: + String text = cell.getStringCellValue().trim(); + if (text.length() > 0) { + writer.write(text + " "); + } + break; + } + } + } + } + + return new CharArrayReader(writer.toCharArray()); + } finally { + stream.close(); + } + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java URL: 
http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.apache.poi.poifs.eventfilesystem.POIFSReader; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.util.LittleEndian; + +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStreamReader; +import java.io.ByteArrayInputStream; + +/** + * Text extractor for Microsoft PowerPoint presentations. 
+ */ +public class MsPowerPointTextExtractor extends AbstractTextExtractor { + + /** + * Force loading of dependent class. + */ + static { + POIFSReader.class.getName(); + } + + /** + * Creates a new MsPowerPointTextExtractor instance. + */ + public MsPowerPointTextExtractor() { + super(new String[]{"application/vnd.ms-powerpoint", + "application/mspowerpoint"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + MsPowerPointListener listener = new MsPowerPointListener(baos); + POIFSReader reader = new POIFSReader(); + reader.registerListener(listener); + reader.read(stream); + return new InputStreamReader( + new ByteArrayInputStream(baos.toByteArray())); + } finally { + stream.close(); + } + } + + //------------------------------------------------< MsPowerPointListener > + + /** + * Reader listener. 
+ */ + private class MsPowerPointListener implements POIFSReaderListener { + private OutputStream os; + + MsPowerPointListener(OutputStream os) { + this.os = os; + } + + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + try { + if (!event.getName().equalsIgnoreCase("PowerPoint Document")) { + return; + } + DocumentInputStream input = event.getStream(); + byte[] buffer = new byte[input.available()]; + input.read(buffer, 0, input.available()); + for (int i = 0; i < buffer.length - 20; i++) { + long type = LittleEndian.getUShort(buffer, i + 2); + long size = LittleEndian.getUInt(buffer, i + 4); + if (type == 4008) { + os.write(buffer, i + 4 + 1, (int) size + 3); + i = i + 4 + 1 + (int) size - 1; + } + } + } catch (Exception e) { + + } + } + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.textmining.text.extraction.WordExtractor; + +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; + +/** + * Text extractor for Microsoft Word documents. + */ +public class MsWordTextExtractor extends AbstractTextExtractor { + + /** + * Force loading of dependent class. + */ + static { + WordExtractor.class.getName(); + } + + /** + * Creates a new MsWordTextExtractor instance. + */ + public MsWordTextExtractor() { + super(new String[]{"application/vnd.ms-word", "application/msword"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + * Returns an empty reader if an error occurred extracting text from + * the word document. 
+ */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + WordExtractor extractor = new WordExtractor(); + + // This throws raw Exception - not nice + String text = extractor.extractText(stream); + + return new StringReader(text); + } catch (Exception e) { + return new StringReader(""); + } finally { + stream.close(); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,133 @@ +/* + * $URL:$ + * $Id:$ + * + * Copyright 1997-2005 Day Management AG + * Barfuesserplatz 6, 4001 Basel, Switzerland + * All Rights Reserved. + * + * This software is the confidential and proprietary information of + * Day Management AG, ("Confidential Information"). You shall not + * disclose such Confidential Information and shall use it only in + * accordance with the terms of the license agreement you entered into + * with Day. 
+ */ +package org.apache.jackrabbit.extractor; + +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.InputSource; +import org.xml.sax.XMLReader; + +import javax.xml.parsers.SAXParserFactory; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.ParserConfigurationException; +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipEntry; + +/** + * Text extractor for OpenOffice documents. + */ +public class OpenOfficeTextExtractor extends AbstractTextExtractor { + + /** + * Creates a new OpenOfficeTextExtractor instance. + */ + public OpenOfficeTextExtractor() { + super(new String[]{"application/vnd.oasis.opendocument.database", + "application/vnd.oasis.opendocument.formula", + "application/vnd.oasis.opendocument.graphics", + "application/vnd.oasis.opendocument.presentation", + "application/vnd.oasis.opendocument.spreadsheet", + "application/vnd.oasis.opendocument.text"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); + saxParserFactory.setValidating(false); + SAXParser saxParser = saxParserFactory.newSAXParser(); + XMLReader xmlReader = saxParser.getXMLReader(); + xmlReader.setFeature("http://xml.org/sax/features/validation", false); + xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + + ZipInputStream zis = new ZipInputStream(stream); + ZipEntry ze = zis.getNextEntry(); + while (!ze.getName().equals("content.xml")) { + ze = zis.getNextEntry(); + } + + OpenOfficeContentHandler contentHandler = + new OpenOfficeContentHandler(); + xmlReader.setContentHandler(contentHandler); + 
try { + xmlReader.parse(new InputSource(zis)); + } finally { + zis.close(); + } + + return new StringReader(contentHandler.getContent()); + } catch (ParserConfigurationException e) { + return new StringReader(""); + } catch (SAXException e) { + return new StringReader(""); + } finally { + stream.close(); + } + } + + //--------------------------------------------< OpenOfficeContentHandler > + + private class OpenOfficeContentHandler extends DefaultHandler { + + private StringBuffer content; + private boolean appendChar; + + public OpenOfficeContentHandler() { + content = new StringBuffer(); + appendChar = false; + } + + /** + * Returns the text content extracted from parsed content.xml + */ + public String getContent() { + return content.toString(); + } + + public void startElement(String namespaceURI, String localName, + String rawName, Attributes atts) + throws SAXException { + if (rawName.startsWith("text:")) { + appendChar = true; + } + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + if (appendChar) { + content.append(ch, start, length).append(" "); + } + } + + public void endElement(java.lang.String namespaceURI, + java.lang.String localName, + java.lang.String qName) + throws SAXException { + appendChar = false; + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java?view=auto&rev=488717 ============================================================================== --- 
jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.pdfbox.pdfparser.PDFParser; +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.util.PDFTextStripper; + +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.BufferedInputStream; +import java.io.CharArrayWriter; +import java.io.CharArrayReader; +import java.io.StringReader; + +/** + * Text extractor for Portable Document Format (PDF). + */ +public class PdfTextExtractor extends AbstractTextExtractor { + + /** + * Force loading of dependent class. + */ + static { + PDFParser.class.getName(); + } + + /** + * Creates a new PdfTextExtractor instance. 
+ */ + public PdfTextExtractor() { + super(new String[]{"application/pdf"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + PDFParser parser = new PDFParser(new BufferedInputStream(stream)); + parser.parse(); + + PDDocument document = parser.getPDDocument(); + try { + CharArrayWriter writer = new CharArrayWriter(); + + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setLineSeparator("\n"); + stripper.writeText(document, writer); + + return new CharArrayReader(writer.toCharArray()); + } finally { + document.close(); + } + } catch (Exception e) { + // it may happen that PDFParser throws a runtime + // exception when parsing certain pdf documents + return new StringReader(""); + } finally { + stream.close(); + } + } +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java?view=auto&rev=488717 ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (added) +++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java Tue Dec 19 08:19:04 2006 @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; + +/** + * Text extractor for plain text. + */ +public class PlainTextExtractor extends AbstractTextExtractor { + + /** + * Creates a new PlainTextExtractor instance. + */ + public PlainTextExtractor() { + super(new String[]{"text/plain"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Wraps the given input stream to an {@link InputStreamReader} using + * the given encoding, or the platform default encoding if the encoding + * is not given. Closes the stream and returns an empty reader if the + * given encoding is not supported. 
+ * + * @param stream binary stream + * @param type ignored + * @param encoding character encoding, optional + * @return reader for the plain text content + * @throws IOException if the binary stream can not be closed in case + * of an encoding issue + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + try { + if (encoding != null) { + return new InputStreamReader(stream, encoding); + } else { + return new InputStreamReader(stream); + } + } catch (UnsupportedEncodingException e) { + stream.close(); + return new StringReader(""); + } + } + +} Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native