jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r488717 [1/2] - in /jackrabbit/trunk/jackrabbit-text-extractor: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/jackrabbit/ src/main/java/org/apache/jackrabbit/extractor/ src/test/ src/...
Date Tue, 19 Dec 2006 16:19:06 GMT
Author: mreutegg
Date: Tue Dec 19 08:19:04 2006
New Revision: 488717

URL: http://svn.apache.org/viewvc?view=rev&rev=488717
Log:
JCR-415: Enhance indexing of binary content
- Create new module jackrabbit-text-extractors
- Migrated existing index-filter implementations to text-extractor module 

Added:
    jackrabbit/trunk/jackrabbit-text-extractor/   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/README.txt   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/pom.xml   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/ExtractorHelper.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java   (with props)
    jackrabbit/trunk/jackrabbit-text-extractor/src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java   (with props)

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Dec 19 08:19:04 2006
@@ -0,0 +1,4 @@
+*.iml
+*.ipr
+*.iws
+target

Added: jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt Tue Dec 19 08:19:04 2006
@@ -0,0 +1,16 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/HEADER.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt Tue Dec 19 08:19:04 2006
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/LICENSE.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt Tue Dec 19 08:19:04 2006
@@ -0,0 +1,5 @@
+Apache Jackrabbit
+Copyright 2006 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/NOTICE.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/README.txt
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/README.txt?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/README.txt (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/README.txt Tue Dec 19 08:19:04 2006
@@ -0,0 +1,107 @@
+=====================================
+Welcome to Jackrabbit Text Extractors
+=====================================
+
+This is the Text Extractors component of the Apache Jackrabbit project.
+This component contains extractor classes that allow Jackrabbit to
+extract text content from binary properties for full text indexing.
+The following file formats and MIME types are currently supported:
+
+    * Microsoft Word
+      [org.apache.jackrabbit.extractor.MsWordTextExtractor]
+      * application/vnd.ms-word
+      * application/msword
+
+    * Microsoft Excel
+      [org.apache.jackrabbit.extractor.MsExcelTextExtractor]
+      * application/vnd.ms-excel
+
+    * Microsoft PowerPoint
+      [org.apache.jackrabbit.extractor.MsPowerPointTextExtractor] 
+      * application/vnd.ms-powerpoint
+      * application/mspowerpoint
+
+    * Portable Document Format (PDF)
+      [org.apache.jackrabbit.extractor.PdfTextExtractor]
+      * application/pdf
+
+    * OpenOffice.org
+      [org.apache.jackrabbit.extractor.OpenOfficeTextExtractor]
+      * application/vnd.oasis.opendocument.database
+      * application/vnd.oasis.opendocument.formula
+      * application/vnd.oasis.opendocument.graphics
+      * application/vnd.oasis.opendocument.presentation
+      * application/vnd.oasis.opendocument.spreadsheet
+      * application/vnd.oasis.opendocument.text
+
+    * Rich Text Format (RTF)
+      [org.apache.jackrabbit.extractor.RTFTextExtractor]
+      * application/rtf
+
+    * HyperText Markup Language (HTML)
+      [org.apache.jackrabbit.extractor.HTMLTextExtractor]
+      * text/html
+
+    * Extensible Markup Language (XML)
+      [org.apache.jackrabbit.extractor.XMLTextExtractor]
+      * text/xml
+
+To use these text extractors with the Jackrabbit Core:
+
+   1) add the jackrabbit-text-extractors jar file and the dependencies defined
+      in the Maven POM to the Jackrabbit classpath, and
+   2) add the fully qualified class names listed above in the "textFilterClasses"
+      parameter of the "SearchIndex" configuration element of a Jackrabbit
+      workspace configuration file (workspace.xml).
+
+See the Apache Jackrabbit web site (http://jackrabbit.apache.org/)
+for documentation and other information. You are welcome to join the
+Jackrabbit mailing lists (http://jackrabbit.apache.org/mail-lists.html)
+to discuss this component and to use the Jackrabbit issue tracker
+(http://issues.apache.org/jira/browse/JCR) to report issues or request
+new features.
+
+Apache Jackrabbit is a project of the Apache Software Foundation
+(http://www.apache.org).
+
+
+License (see also LICENSE.txt)
+==============================
+
+Collective work: Copyright 2006 The Apache Software Foundation.
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+Getting Started
+===============
+
+This component uses a Maven 2 (http://maven.apache.org/) build
+environment. If you have Maven 2 installed, you can compile and
+package the jackrabbit-text-extractors jar using the following command:
+
+    mvn package
+
+See the Maven 2 documentation for other build features.
+
+The latest source code for this component is available in the
+Subversion (http://subversion.tigris.org/) source repository of
+the Apache Software Foundation. If you have Subversion installed,
+you can check out the latest source using the following command:
+
+    svn checkout http://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractor
+
+See the Subversion documentation for other source control features.

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/README.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml Tue Dec 19 08:19:04 2006
@@ -0,0 +1,166 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+  -->
+  
+<!DOCTYPE module PUBLIC
+    "-//Puppy Crawl//DTD Check Configuration 1.1//EN"
+    "http://www.puppycrawl.com/dtds/configuration_1_1.dtd">
+
+<!--
+  Checkstyle checks configured for Maven.
+-->
+
+<module name="Checker">
+
+    <!-- Checks that a package.html file exists for each package.     -->
+    <!-- See http://checkstyle.sf.net/config_javadoc.html#PackageHtml -->
+    <module name="PackageHtml"/>
+
+    <!-- Checks whether files end with a new line.                        -->
+    <!-- See http://checkstyle.sf.net/config_misc.html#NewlineAtEndOfFile -->
+    <module name="NewlineAtEndOfFile"/>
+
+    <!-- Checks that property files contain the same keys.         -->
+    <!-- See http://checkstyle.sf.net/config_misc.html#Translation -->
+    <module name="Translation"/>
+
+    <module name="TreeWalker">
+
+        <property name="cacheFile" value="${checkstyle.cache.file}"/>
+
+        <!-- ************************************************************** -->
+        <!-- Checks that are different from the sun coding conventions ones -->
+        <!-- ************************************************************** -->
+
+        <module name="Header">
+            <property name="headerFile" value="${basedir}/HEADER.txt"/>
+        </module>
+        <!-- <property name="tabWidth" value="4"/> -->
+        <module name="LeftCurly">
+          <property name="option" value="eol"/>
+        </module>
+        <module name="LineLength">
+          <property name="max" value="132"/>
+          <property name="ignorePattern" value="\* \$"/>
+        </module>
+        <module name="MethodLength">
+          <property name="max" value="175"/>
+        </module>
+        <module name="ConstantName">
+          <property name="format" value="log|^[a-zA-Z][a-zA-Z0-9_]*$"/>
+        </module>
+
+        <!-- ************************************************************** -->
+        <!-- Default Sun coding conventions checks                          -->
+        <!-- ************************************************************** -->
+
+        <!-- Checks for Javadoc comments.                     -->
+        <!-- See http://checkstyle.sf.net/config_javadoc.html -->
+        <module name="JavadocMethod"/>
+        <module name="JavadocType"/>
+        <module name="JavadocVariable"/>
+
+        <!-- Checks for Naming Conventions.                  -->
+        <!-- See http://checkstyle.sf.net/config_naming.html -->
+        <module name="LocalFinalVariableName"/>
+        <module name="LocalVariableName"/>
+        <module name="MethodName"/>
+        <module name="PackageName"/>
+        <module name="ParameterName"/>
+        <module name="StaticVariableName"/>
+        <module name="TypeName"/>
+        <module name="MemberName"/>
+
+        <!-- Checks for imports                              -->
+        <!-- See http://checkstyle.sf.net/config_import.html -->
+        <module name="AvoidStarImport"/>
+        <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
+        <module name="RedundantImport"/>
+        <module name="UnusedImports"/>
+
+
+        <!-- Checks for Size Violations.                    -->
+        <!-- See http://checkstyle.sf.net/config_sizes.html -->
+        <module name="FileLength"/>
+        <module name="ParameterNumber"/>
+
+
+        <!-- Checks for whitespace                               -->
+        <!-- See http://checkstyle.sf.net/config_whitespace.html -->
+        <module name="EmptyForIteratorPad"/>
+        <module name="NoWhitespaceAfter"/>
+        <module name="NoWhitespaceBefore"/>
+        <module name="OperatorWrap"/>
+        <module name="TabCharacter"/>
+        <module name="WhitespaceAfter"/>
+        <module name="WhitespaceAround"/>
+
+
+        <!-- Modifier Checks                                    -->
+        <!-- See http://checkstyle.sf.net/config_modifiers.html -->
+        <module name="ModifierOrder"/>
+        <module name="RedundantModifier"/>
+
+
+        <!-- Checks for blocks. You know, those {}'s         -->
+        <!-- See http://checkstyle.sf.net/config_blocks.html -->
+        <module name="AvoidNestedBlocks"/>     
+        <module name="NeedBraces"/>
+
+        <!-- Checks for common coding problems               -->
+        <!-- See http://checkstyle.sf.net/config_coding.html -->
+        <!-- <module name="AvoidInlineConditionals"/> -->      <!-- DISABLED-->
+        <module name="DoubleCheckedLocking"/>
+        <module name="EqualsHashCode"/>
+        <module name="IllegalInstantiation"/>
+        <module name="InnerAssignment"/>
+        <module name="MissingSwitchDefault"/>
+        <module name="RedundantThrows">
+            <property name="allowUnchecked" value="true"/>   <!-- DISABLED -->
+            <property name="allowSubclasses" value="true"/>   <!-- DISABLED -->
+        </module>
+        <module name="SimplifyBooleanExpression"/>
+        <module name="SimplifyBooleanReturn"/>
+
+        <!-- Checks for class design                         -->
+        <!-- See http://checkstyle.sf.net/config_design.html -->
+        <module name="DesignForExtension">
+            <property name="severity" value="ignore"/>   <!-- DISABLED -->
+        </module>
+        <module name="HideUtilityClassConstructor"/>
+        <module name="InterfaceIsType"/>
+        <module name="VisibilityModifier">
+            <!-- Protected member variables are widely used in Jackrabbit -->
+            <property name="protectedAllowed" value="true"/>
+        </module>
+
+
+        <!-- Miscellaneous other checks.                   -->
+        <!-- See http://checkstyle.sf.net/config_misc.html -->
+        <module name="ArrayTypeStyle"/>
+        <module name="FinalParameters">
+            <property name="severity" value="ignore"/>   <!-- DISABLED -->
+        </module>
+        <module name="GenericIllegalRegexp">
+            <property name="format" value="\s+$"/>
+            <property name="message" value="Line has trailing spaces."/>
+        </module>
+        <module name="TodoComment"/>
+        <module name="UpperEll"/>
+
+    </module>
+</module>

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/checkstyle.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/pom.xml?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/pom.xml (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/pom.xml Tue Dec 19 08:19:04 2006
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                             http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+<!-- ====================================================================== -->
+<!-- P R O J E C T  D E S C R I P T I O N                                   -->
+<!-- ====================================================================== -->
+  <parent>
+    <groupId>org.apache.jackrabbit</groupId>
+    <artifactId>jackrabbit</artifactId>
+    <version>1.2-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>jackrabbit-text-extractors</artifactId>
+  <name>Jackrabbit Text Extractors</name>
+  <description>Classes to extract text content from binary documents</description>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractors
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/jackrabbit/trunk/jackrabbit-text-extractors
+    </developerConnection>
+    <url>http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors</url>
+  </scm>
+
+  <dependencies>
+    <dependency>
+      <groupId>poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>2.5.1-final-20040804</version>
+    </dependency>
+    <dependency>
+      <groupId>pdfbox</groupId>
+      <artifactId>pdfbox</artifactId>
+      <version>0.6.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.textmining</groupId>
+      <artifactId>tm-extractors</artifactId>
+      <version>0.4</version>
+    </dependency>
+    <dependency>
+      <groupId>nekohtml</groupId>
+      <artifactId>nekohtml</artifactId>
+      <version>0.9.4</version>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>3.8.1</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+</project>

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Base class for text extractor implementations.
+ */
+public abstract class AbstractTextExtractor implements TextExtractor {
+
+    /**
+     * The content types supported by this text extractor. This is a private
+     * copy of the array given to the constructor and is never handed out.
+     */
+    private final String[] contentTypes;
+
+    /**
+     * Creates a text extractor for the given content types. The array is
+     * copied, so later changes to it do not affect this extractor.
+     *
+     * @param contentTypes the content types supported by this text extractor
+     */
+    public AbstractTextExtractor(String[] contentTypes) {
+        this.contentTypes = new String[contentTypes.length];
+        System.arraycopy(contentTypes, 0, this.contentTypes, 0, contentTypes.length);
+    }
+
+    /**
+     * Returns the supported content types. A fresh copy is returned on each
+     * call so that callers can not modify the internal state of this
+     * extractor, matching the defensive copy made by the constructor.
+     *
+     * @return copy of the supported content types
+     */
+    public String[] getContentTypes() {
+        return (String[]) contentTypes.clone();
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/AbstractTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Composite text extractor. This class presents a unified interface
+ * for a set of {@link TextExtractor} instances. The composite extractor
+ * supports all the content types supported by the component extractors,
+ * and delegates text extraction calls to the appropriate components.
+ */
+public class CompositeTextExtractor implements TextExtractor {
+
+    /**
+     * Component {@link TextExtractor} instances, looked up by content type.
+     */
+    private final Map extractors = new HashMap();
+
+    /**
+     * Registers a component text extractor for every content type that
+     * the extractor reports through {@link TextExtractor#getContentTypes()}.
+     * A content type that was already registered is remapped to the given
+     * extractor.
+     *
+     * @param extractor component extractor
+     */
+    public void addTextExtractor(TextExtractor extractor) {
+        String[] supported = extractor.getContentTypes();
+        int index = 0;
+        while (index < supported.length) {
+            extractors.put(supported[index], extractor);
+            index++;
+        }
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the union of the content types that the registered
+     * component extractors have been mapped to.
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        Set registered = extractors.keySet();
+        String[] result = new String[registered.size()];
+        registered.toArray(result);
+        return result;
+    }
+
+    /**
+     * Delegates the extraction to the component extractor registered for
+     * the given content type. When no component is registered for the type,
+     * the binary stream is closed and an empty reader is returned.
+     *
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding optional character encoding
+     * @return reader for the text content of the binary stream
+     * @throws IOException if the binary stream can not be read
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        Object registered = extractors.get(type);
+        if (registered == null) {
+            // Unknown content type: nothing to extract, just release the stream
+            stream.close();
+            return new StringReader("");
+        }
+        TextExtractor delegate = (TextExtractor) registered;
+        return delegate.extractText(stream, type, encoding);
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Composite text extractor that by default contains the standard
+ * text extractors found in this package.
+ */
+public class DefaultTextExtractor extends CompositeTextExtractor {
+
+    /**
+     * Creates the default text extractor by adding instances of the standard
+     * text extractors as components.
+     */
+    public DefaultTextExtractor() {
+        addTextExtractor(new PlainTextExtractor());
+        addTextExtractor(new XMLTextExtractor());
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Interface for text extractors that need to delegate the extraction
+ * of parts of content documents to another text extractor. This interface
+ * is usually implemented by extractors of composite multimedia or archive
+ * file formats.
+ * <p>
+ * The configured delegate text extractor is usually a composite extractor
+ * that may contain also the delegating extractor, thus it is possible for
+ * the extractor to be invoked recursively within a single thread. An
+ * implementation should never pass the full content document to the
+ * delegate extractor to avoid infinite loops.
+ */
+public interface DelegatingTextExtractor extends TextExtractor {
+
+    /**
+     * Sets the text extractor to which this extractor should delegate
+     * any partial text extraction tasks. The given delegate extractor
+     * is expected to be able to handle any content types passed to it.
+     *
+     * @param extractor delegate text extractor
+     */
+    void setDelegateTextExtractor(TextExtractor extractor);
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Dummy text extractor that always returns an empty reader for all documents.
+ * Useful as a dummy handler for unsupported content types.
+ */
+public class EmptyTextExtractor implements TextExtractor {
+
+    /**
+     * Content types this dummy extractor claims to support. The array is
+     * the one handed to the constructor; it is not copied.
+     */
+    private final String[] supportedTypes;
+
+    /**
+     * Creates a dummy text extractor for several content types.
+     * The array is stored as-is, so the caller must not modify it
+     * once it has been passed to this constructor.
+     *
+     * @param types supported content types
+     */
+    public EmptyTextExtractor(String[] types) {
+        supportedTypes = types;
+    }
+
+    /**
+     * Creates a dummy text extractor for a single content type.
+     *
+     * @param type supported content type
+     */
+    public EmptyTextExtractor(String type) {
+        this(new String[] { type });
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the content types given at construction time.
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        return supportedTypes;
+    }
+
+    /**
+     * Extracts nothing: the given stream is closed right away and a reader
+     * without any content is returned.
+     *
+     * @param stream binary stream that simply gets closed
+     * @param type ignored
+     * @param encoding ignored
+     * @return empty reader
+     * @throws IOException if the binary stream can not be closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        stream.close();
+        Reader nothing = new StringReader("");
+        return nothing;
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Utility class for extracting text content from an XML document.
+ * An instance of this class is a SAX event handler that extracts
+ * character data and attribute values from the SAX events and writes
+ * the extracted content to a given {@link Writer}.
+ * <p>
+ * Any whitespace sequences are imploded into a single space character
+ * and consecutive attribute values and character data are delimited
+ * using spaces.
+ * <p>
+ * This class also implements the {@link ErrorHandler} interface by
+ * ignoring all errors and warnings. This is useful in avoiding the
+ * default console output or other error logging of many XML parsers.
+ *
+ * @see XMLTextExtractor
+ */
+class ExtractorHandler extends DefaultHandler implements ErrorHandler {
+
+    /**
+     * Separator that is written between consecutive text and attribute values.
+     */
+    private static final char SPACE = ' ';
+
+    /**
+     * The writer to which the selected text content is written.
+     */
+    private final Writer writer;
+
+    /**
+     * Flag for outputting a space before the next character to be outputted.
+     * Used to implode all whitespace sequences and to separate consecutive
+     * attribute values and text elements.
+     */
+    private boolean space;
+
+    /**
+     * Creates an extractor handler that writes text content to the given
+     * writer.
+     *
+     * @param writer writer to which the XML text content is written
+     */
+    public ExtractorHandler(Writer writer) {
+        this.writer = writer;
+        this.space = false;
+    }
+
+    //------------------------------------------------------< DefaultHandler >
+
+    /**
+     * Writes attribute values to the underlying writer.
+     *
+     * @param uri ignored
+     * @param local ignored
+     * @param name ignored
+     * @param attributes attributes, whose values to extract
+     * @throws SAXException on IO errors
+     */
+    public void startElement(
+            String uri, String local, String name, Attributes attributes)
+            throws SAXException {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            String value = attributes.getValue(i);
+            characters(value.toCharArray(), 0, value.length());
+        }
+    }
+
+    /**
+     * Writes the given characters to the underlying writer.
+     *
+     * @param ch character array that contains the characters to be written
+     * @param start start index within the array
+     * @param length number of characters to write
+     * @throws SAXException on IO errors
+     */
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        try {
+            for (int i = 0; i < length; i++) {
+                if (Character.isSpaceChar(ch[start + i])) {
+                    space = true;
+                } else {
+                    if (space) {
+                        writer.write(SPACE);
+                        space = false;
+                    }
+                    writer.write(ch[start + i]);
+                }
+            }
+            space = true;
+        } catch (IOException e) {
+            throw new SAXException(e.getMessage());
+        }
+    }
+
+    //--------------------------------------------------------< ErrorHandler >
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void warning(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void error(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void fatalError(SAXParseException exception) {
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.apache.xerces.xni.Augmentations;
+import org.apache.xerces.xni.NamespaceContext;
+import org.apache.xerces.xni.XMLLocator;
+import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.xni.XNIException;
+import org.cyberneko.html.HTMLConfiguration;
+
+/**
+ * Helper class for HTML parsing
+ */
+public class HTMLParser extends AbstractSAXParser {
+
+    /**
+     * Collects the raw character data reported by the parser. Initialized
+     * eagerly so that {@link #getContents()} is safe to call even when no
+     * document has been parsed yet.
+     */
+    private StringBuffer buffer = new StringBuffer();
+
+    /**
+     * Creates a parser that uses an {@link HTMLConfiguration}.
+     */
+    public HTMLParser() {
+
+        super(new HTMLConfiguration());
+    }
+
+    /**
+     * Forwards the event to the superclass and then discards any text
+     * collected from a previously parsed document.
+     *
+     * @param arg0 passed through to the superclass
+     * @param arg1 passed through to the superclass
+     * @param arg2 passed through to the superclass
+     * @param arg3 passed through to the superclass
+     * @throws XNIException if thrown by the superclass
+     */
+    public void startDocument(XMLLocator arg0,
+                              String arg1,
+                              NamespaceContext arg2,
+                              Augmentations arg3) throws XNIException {
+
+        super.startDocument(arg0, arg1, arg2, arg3);
+
+        buffer = new StringBuffer();
+    }
+
+    /**
+     * Forwards the event to the superclass and appends the reported
+     * character data to the internal buffer.
+     *
+     * @param xmlString character data reported by the parser
+     * @param augmentations passed through to the superclass
+     * @throws XNIException if thrown by the superclass
+     */
+    public void characters(XMLString xmlString, Augmentations augmentations)
+            throws XNIException {
+
+        super.characters(xmlString, augmentations);
+
+        buffer.append(xmlString.toString());
+    }
+
+    /**
+     * Keeps only the letters of the given text. Every run of characters
+     * that are not letters (whitespace, digits, punctuation, ...) is
+     * replaced by a single space.
+     *
+     * @param text raw text
+     * @return filtered text
+     */
+    private String filterAndJoin(String text) {
+
+        // true while the last character appended was a separating space
+        boolean space = false;
+        StringBuffer filtered = new StringBuffer();
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+
+            if (Character.isLetter(c)) {
+                space = false;
+                filtered.append(c);
+            } else if (!space) {
+                // first non-letter of a run: emit one separator
+                space = true;
+                filtered.append(' ');
+            }
+        }
+        return filtered.toString();
+    }
+
+    /**
+     * Returns the filtered text of the most recently parsed document, or
+     * an empty string if nothing has been parsed yet.
+     *
+     * @return String Parsed content
+     */
+    public String getContents() {
+
+        return filterAndJoin(buffer.toString());
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.transform.sax.SAXSource;
+import javax.xml.transform.sax.SAXResult;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerConfigurationException;
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for HyperText Markup Language (HTML).
+ */
+public class HTMLTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Creates a new <code>HTMLTextExtractor</code> instance that is
+     * registered for the <code>text/html</code> content type.
+     */
+    public HTMLTextExtractor() {
+        super(new String[]{"text/html"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Extracts the text content of an HTML document. The document is run
+     * through an {@link HTMLParser} and the text collected by the parser
+     * is returned. If the transformation fails, an empty reader is
+     * returned instead. The given stream is always closed.
+     *
+     * @param stream binary stream containing the HTML document
+     * @param type content type, not used by this extractor
+     * @param encoding character encoding of the stream, or
+     *                 <code>null</code> if not known
+     * @return reader for the extracted text content
+     * @throws IOException if the binary stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            TransformerFactory factory = TransformerFactory.newInstance();
+            Transformer transformer = factory.newTransformer();
+            HTMLParser parser = new HTMLParser();
+            SAXResult result = new SAXResult(new DefaultHandler());
+
+            InputSource input = new InputSource(stream);
+            if (encoding != null) {
+                // Pass the known encoding on to the parser instead of
+                // silently dropping it
+                input.setEncoding(encoding);
+            }
+            SAXSource source = new SAXSource(parser, input);
+            transformer.transform(source, result);
+
+            return new StringReader(parser.getContents());
+        } catch (TransformerConfigurationException e) {
+            // Best effort: no transformer available, extract nothing
+            return new StringReader("");
+        } catch (TransformerException e) {
+            // Best effort: document could not be processed, extract nothing
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFCell;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.CharArrayWriter;
+import java.io.CharArrayReader;
+import java.util.Iterator;
+
+/**
+ * Text extractor for Microsoft Excel sheets.
+ */
+public class MsExcelTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        POIFSFileSystem.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsExcelTextExtractor</code> instance.
+     */
+    public MsExcelTextExtractor() {
+        super(new String[]{"application/vnd.ms-excel"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * {@inheritDoc}
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        CharArrayWriter writer = new CharArrayWriter();
+        try {
+            POIFSFileSystem fs = new POIFSFileSystem(stream);
+            HSSFWorkbook workbook = new HSSFWorkbook(fs);
+
+            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
+                HSSFSheet sheet = workbook.getSheetAt(i);
+
+                Iterator rows = sheet.rowIterator();
+                while (rows.hasNext()) {
+                    HSSFRow row = (HSSFRow) rows.next();
+
+                    Iterator cells = row.cellIterator();
+                    while (cells.hasNext()) {
+                        HSSFCell cell = (HSSFCell) cells.next();
+                        switch (cell.getCellType()) {
+                        case HSSFCell.CELL_TYPE_NUMERIC:
+                            String num = Double.toString(cell.getNumericCellValue()).trim();
+                            if (num.length() > 0) {
+                                writer.write(num + " ");
+                            }
+                            break;
+                        case HSSFCell.CELL_TYPE_STRING:
+                            String text = cell.getStringCellValue().trim();
+                            if (text.length() > 0) {
+                                writer.write(text + " ");
+                            }
+                            break;
+                        }
+                    }
+                }
+            }
+
+            return new CharArrayReader(writer.toCharArray());
+        } finally {
+            stream.close();
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.LittleEndian;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStreamReader;
+import java.io.ByteArrayInputStream;
+
+/**
+ * Text extractor for Microsoft PowerPoint presentations.
+ */
+public class MsPowerPointTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Record type of the PowerPoint record that stores text with one
+     * byte per character (<code>TextBytesAtom</code>).
+     */
+    private static final int TEXT_BYTES_ATOM = 4008;
+
+    /**
+     * Size of a PowerPoint record header: two bytes version and instance,
+     * two bytes record type and four bytes record length.
+     */
+    private static final int RECORD_HEADER_SIZE = 8;
+
+    /**
+     * Encoding used to decode the extracted bytes. Every byte of a
+     * <code>TextBytesAtom</code> is the low byte of a UTF-16 character
+     * whose high byte is zero, which is exactly what ISO-8859-1 maps to.
+     * The platform default encoding must not be used here.
+     */
+    private static final String TEXT_ENCODING = "ISO-8859-1";
+
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        POIFSReader.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsPowerPointTextExtractor</code> instance.
+     */
+    public MsPowerPointTextExtractor() {
+        super(new String[]{"application/vnd.ms-powerpoint",
+                           "application/mspowerpoint"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Extracts the one-byte-per-character text records found in the
+     * "PowerPoint Document" stream of the given presentation. The
+     * stream is always closed before this method returns.
+     *
+     * @param stream   binary stream containing the presentation
+     * @param type     ignored
+     * @param encoding ignored, the encoding is fixed by the file format
+     * @return reader for the extracted text, text runs are separated by
+     *         a single space
+     * @throws IOException if the presentation can not be read or the
+     *                     stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            POIFSReader reader = new POIFSReader();
+            reader.registerListener(new MsPowerPointListener(baos));
+            reader.read(stream);
+            return new InputStreamReader(
+                    new ByteArrayInputStream(baos.toByteArray()),
+                    TEXT_ENCODING);
+        } finally {
+            stream.close();
+        }
+    }
+
+    //------------------------------------------------< MsPowerPointListener >
+
+    /**
+     * Reader listener that copies the text found in the
+     * "PowerPoint Document" stream to an output stream. It needs no
+     * state of the enclosing extractor and is therefore static.
+     */
+    private static class MsPowerPointListener implements POIFSReaderListener {
+
+        /** Target of the extracted text bytes. */
+        private final OutputStream os;
+
+        MsPowerPointListener(OutputStream os) {
+            this.os = os;
+        }
+
+        /**
+         * Scans the document stream for text records and writes their
+         * payload, followed by a space, to the output stream. Streams
+         * other than "PowerPoint Document" are ignored.
+         *
+         * @param event the event describing the document that was read
+         */
+        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+            try {
+                if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
+                    return;
+                }
+                DocumentInputStream input = event.getStream();
+                byte[] buffer = new byte[input.available()];
+
+                // A single read() is not guaranteed to fill the buffer.
+                int length = 0;
+                while (length < buffer.length) {
+                    int count =
+                            input.read(buffer, length, buffer.length - length);
+                    if (count <= 0) {
+                        break;
+                    }
+                    length += count;
+                }
+
+                // The record tree is not walked; every offset is probed
+                // for something that looks like a text record header.
+                int i = 0;
+                while (i + RECORD_HEADER_SIZE <= length) {
+                    long type = LittleEndian.getUShort(buffer, i + 2);
+                    long size = LittleEndian.getUInt(buffer, i + 4);
+                    int data = i + RECORD_HEADER_SIZE;
+                    // Only accept a record whose payload fits into the
+                    // buffer, a bogus length must not abort the scan.
+                    if (type == TEXT_BYTES_ATOM && size <= length - data) {
+                        // Copy the payload only, not the upper bytes of
+                        // the length field that precede it.
+                        os.write(buffer, data, (int) size);
+                        os.write(' ');
+                        i = data + (int) size;
+                    } else {
+                        i++;
+                    }
+                }
+            } catch (Exception e) {
+                // Deliberately ignored: extraction is best effort and the
+                // listener interface does not allow checked exceptions.
+                // Whatever was written so far is still returned.
+            }
+        }
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.textmining.text.extraction.WordExtractor;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Microsoft Word documents.
+ */
+public class MsWordTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * References the Word extractor class from the static initializer so
+     * that the dependency is loaded together with this extractor.
+     */
+    static {
+        WordExtractor.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsWordTextExtractor</code> instance.
+     */
+    public MsWordTextExtractor() {
+        super(new String[]{"application/vnd.ms-word", "application/msword"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Extracts the text of a Word document. Any error that occurs while
+     * extracting the text results in an empty reader. The stream is
+     * always closed before this method returns.
+     *
+     * @param stream   binary stream containing the Word document
+     * @param type     ignored
+     * @param encoding ignored
+     * @return reader for the extracted text, or an empty reader if the
+     *         extraction failed
+     * @throws IOException if the stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        Reader result;
+        try {
+            // WordExtractor declares a raw Exception, hence the broad
+            // catch clause below.
+            String extracted = new WordExtractor().extractText(stream);
+            result = new StringReader(extracted);
+        } catch (Exception e) {
+            result = new StringReader("");
+        } finally {
+            stream.close();
+        }
+        return result;
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,133 @@
+/*
+ * $URL:$
+ * $Id:$
+ *
+ * Copyright 1997-2005 Day Management AG
+ * Barfuesserplatz 6, 4001 Basel, Switzerland
+ * All Rights Reserved.
+ *
+ * This software is the confidential and proprietary information of
+ * Day Management AG, ("Confidential Information"). You shall not
+ * disclose such Confidential Information and shall use it only in
+ * accordance with the terms of the license agreement you entered into
+ * with Day.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.InputSource;
+import org.xml.sax.XMLReader;
+
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipEntry;
+
+/**
+ * Text extractor for OpenOffice documents.
+ */
+public class OpenOfficeTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Creates a new <code>OpenOfficeTextExtractor</code> instance.
+     */
+    public OpenOfficeTextExtractor() {
+        super(new String[]{"application/vnd.oasis.opendocument.database",
+                           "application/vnd.oasis.opendocument.formula",
+                           "application/vnd.oasis.opendocument.graphics",
+                           "application/vnd.oasis.opendocument.presentation",
+                           "application/vnd.oasis.opendocument.spreadsheet",
+                           "application/vnd.oasis.opendocument.text"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Extracts the text of an OpenOffice document. The stream is read as
+     * a zip archive, its <code>content.xml</code> entry is parsed and the
+     * character data found inside <code>text:</code> elements is
+     * collected. The stream is always closed before this method returns.
+     *
+     * @param stream   binary stream containing the document
+     * @param type     ignored
+     * @param encoding ignored
+     * @return reader for the extracted text, or an empty reader if the
+     *         archive has no <code>content.xml</code> entry, the parser
+     *         can not be configured or the content can not be parsed
+     * @throws IOException if the archive can not be read or the stream
+     *                     can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+            saxParserFactory.setValidating(false);
+            SAXParser saxParser = saxParserFactory.newSAXParser();
+            XMLReader xmlReader = saxParser.getXMLReader();
+            // Neither validate nor fetch the DTD the document refers to.
+            xmlReader.setFeature("http://xml.org/sax/features/validation", false);
+            xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+
+            ZipInputStream zis = new ZipInputStream(stream);
+            ZipEntry ze = zis.getNextEntry();
+            // getNextEntry() returns null once the archive is exhausted
+            while (ze != null && !ze.getName().equals("content.xml")) {
+                ze = zis.getNextEntry();
+            }
+            if (ze == null) {
+                // No content.xml in the archive: nothing to extract.
+                return new StringReader("");
+            }
+
+            OpenOfficeContentHandler contentHandler =
+                    new OpenOfficeContentHandler();
+            xmlReader.setContentHandler(contentHandler);
+            try {
+                xmlReader.parse(new InputSource(zis));
+            } finally {
+                zis.close();
+            }
+
+            return new StringReader(contentHandler.getContent());
+        } catch (ParserConfigurationException e) {
+            return new StringReader("");
+        } catch (SAXException e) {
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+    //--------------------------------------------< OpenOfficeContentHandler >
+
+    /**
+     * Content handler that collects the character data found inside
+     * elements whose qualified name starts with <code>text:</code>.
+     */
+    private static class OpenOfficeContentHandler extends DefaultHandler {
+
+        /** The text collected so far. */
+        private final StringBuffer content = new StringBuffer();
+
+        /**
+         * Number of currently open <code>text:</code> elements. A counter
+         * is needed because these elements nest: a plain flag reset on
+         * every end tag would drop the text that follows a nested
+         * element, e.g. "c" in
+         * <code>&lt;text:p&gt;a&lt;text:span&gt;b&lt;/text:span&gt;c&lt;/text:p&gt;</code>.
+         */
+        private int textDepth = 0;
+
+        public OpenOfficeContentHandler() {
+        }
+
+        /**
+         * Returns the text content extracted from parsed content.xml
+         */
+        public String getContent() {
+            return content.toString();
+        }
+
+        public void startElement(String namespaceURI, String localName,
+                                 String rawName, Attributes atts)
+                throws SAXException {
+            if (rawName.startsWith("text:")) {
+                textDepth++;
+            }
+        }
+
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            if (textDepth > 0) {
+                content.append(ch, start, length).append(" ");
+            }
+        }
+
+        public void endElement(java.lang.String namespaceURI,
+                               java.lang.String localName,
+                               java.lang.String qName)
+                throws SAXException {
+            if (textDepth > 0 && qName.startsWith("text:")) {
+                textDepth--;
+            }
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.BufferedInputStream;
+import java.io.CharArrayWriter;
+import java.io.CharArrayReader;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Portable Document Format (PDF).
+ */
+public class PdfTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * References the PDF parser class from the static initializer so
+     * that the dependency is loaded together with this extractor.
+     */
+    static {
+        PDFParser.class.getName();
+    }
+
+    /**
+     * Creates a new <code>PdfTextExtractor</code> instance.
+     */
+    public PdfTextExtractor() {
+        super(new String[]{"application/pdf"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Extracts the text of a PDF document, using "\n" as the line
+     * separator. Any exception raised while parsing the document or
+     * stripping its text results in an empty reader. The stream is
+     * always closed before this method returns.
+     *
+     * @param stream   binary stream containing the PDF document
+     * @param type     ignored
+     * @param encoding ignored
+     * @return reader for the extracted text, or an empty reader if the
+     *         extraction failed
+     * @throws IOException if the stream can not be closed
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        Reader result;
+        try {
+            PDFParser pdfParser =
+                    new PDFParser(new BufferedInputStream(stream));
+            pdfParser.parse();
+
+            PDDocument pdf = pdfParser.getPDDocument();
+            try {
+                CharArrayWriter buffer = new CharArrayWriter();
+
+                PDFTextStripper textStripper = new PDFTextStripper();
+                textStripper.setLineSeparator("\n");
+                textStripper.writeText(pdf, buffer);
+
+                result = new CharArrayReader(buffer.toCharArray());
+            } finally {
+                pdf.close();
+            }
+        } catch (Exception e) {
+            // PDFParser is known to throw runtime exceptions on certain
+            // documents, so everything is caught here
+            result = new StringReader("");
+        } finally {
+            stream.close();
+        }
+        return result;
+    }
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java?view=auto&rev=488717
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java Tue Dec 19 08:19:04 2006
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Text extractor for plain text.
+ */
+public class PlainTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Creates a new <code>PlainTextExtractor</code> instance.
+     */
+    public PlainTextExtractor() {
+        super(new String[]{"text/plain"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns an {@link InputStreamReader} on top of the given stream.
+     * The reader decodes the stream with the given encoding, or with the
+     * platform default encoding if no encoding is given. If the given
+     * encoding is not supported, the stream is closed and an empty
+     * reader is returned instead.
+     *
+     * @param stream binary stream
+     * @param type ignored
+     * @param encoding character encoding, optional
+     * @return reader for the plain text content
+     * @throws IOException if the binary stream can not be closed in case
+     *                     of an encoding issue
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        if (encoding == null) {
+            // no encoding given, fall back to the platform default
+            return new InputStreamReader(stream);
+        }
+        try {
+            return new InputStreamReader(stream, encoding);
+        } catch (UnsupportedEncodingException e) {
+            stream.close();
+            return new StringReader("");
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractor/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message