jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r794633 - in /jackrabbit/trunk/jackrabbit-text-extractors/src/main: java/org/apache/jackrabbit/extractor/ resources/org/apache/jackrabbit/extractor/
Date Thu, 16 Jul 2009 11:26:35 GMT
Author: jukka
Date: Thu Jul 16 11:26:35 2009
New Revision: 794633

URL: http://svn.apache.org/viewvc?rev=794633&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Remove the Java 1.4 check as we already require Java 5.

Use a custom Tika configuration file that enables the MS Office parsers but disables parsing
of generic zip files and other package formats. Such packages are often expensive to parse
and contribute little to the full text index.

TODO: Add a configuration mechanism that allows alternative Tika configurations to be used.

Added:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml
      - copied, changed from r793830, jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
Removed:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=794633&r1=794632&r2=794633&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
(original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
Thu Jul 16 11:26:35 2009
@@ -24,7 +24,6 @@
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
 
 /**
@@ -35,7 +34,7 @@
     /**
      * Auto-detecting parser.
      */
-    private static final Parser PARSER;
+    private static final AutoDetectParser PARSER;
 
     /**
      * Supported content types.
@@ -43,34 +42,21 @@
     private static final String[] TYPES;
 
     static {
-        // The default Tika configuration refers to Apache POI libraries that
-        // are compiled for Java 5, and can thus not be loaded in Java 1.4.
-        // This makes it impossible to load the default Tika configuration
-        // (see TIKA-217 for background), and so we need to use the following
-        // workaround to instantiate the Tika AutoDetectParser without the
-        // POI classes (and thus support for MS Office formats) when running
-        // on Java 1.4.
-        AutoDetectParser parser;
-        if ("1.4".equals(System.getProperty("java.specification.version"))) {
-            InputStream stream =
-                DefaultTextExtractor.class.getResourceAsStream("tika-config-jdk14.xml");
+        InputStream stream =
+            DefaultTextExtractor.class.getResourceAsStream("tika-config.xml");
+        try {
             try {
-                try {
-                    parser = new AutoDetectParser(new TikaConfig(stream));
-                } finally {
-                    stream.close();
-                }
-            } catch (Exception e) {
-                throw new RuntimeException(
-                        "Unable to load Tika configuration", e);
+                PARSER = new AutoDetectParser(new TikaConfig(stream));
+
+                Set<String> types = PARSER.getParsers().keySet();
+                TYPES = types.toArray(new String[types.size()]);
+            } finally {
+                stream.close();
             }
-        } else {
-            parser = new AutoDetectParser();
+        } catch (Exception e) {
+            throw new RuntimeException(
+                    "Unable to load Tika configuration", e);
         }
-        PARSER = parser;
-
-        Set types = parser.getParsers().keySet();
-        TYPES = (String[]) types.toArray(new String[types.size()]);
     }
 
     public String[] getContentTypes() {

Copied: jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml
(from r793830, jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml)
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml?p2=jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml&p1=jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml&r1=793830&r2=794633&rev=794633&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config-jdk14.xml
(original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml
Thu Jul 16 11:26:35 2009
@@ -28,6 +28,22 @@
       <mime>image/svg+xml</mime>
     </parser>
 
+    <parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser">
+      <mime>application/x-tika-msoffice</mime>
+      <mime>application/msword</mime>
+      <mime>application/vnd.ms-excel</mime>
+      <mime>application/vnd.ms-powerpoint</mime>
+      <mime>application/vnd.visio</mime>
+      <mime>application/vnd.ms-outlook</mime>
+    </parser>
+
+    <parser name="parse-ooxml" class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+      <mime>application/vnd.openxmlformats-package.core-properties+xml</mime>
+      <mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime>
+      <mime>application/vnd.openxmlformats-officedocument.presentationml.presentation</mime>
+      <mime>application/vnd.openxmlformats-officedocument.wordprocessingml.document</mime>
+    </parser>
+
     <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
       <mime>text/html</mime>
       <mime>application/xhtml+xml</mime>
@@ -94,22 +110,6 @@
       <mime>image/x-xcf</mime>
     </parser>
 
-    <parser name="parse-zip" class="org.apache.tika.parser.pkg.ZipParser">
-      <mime>application/zip</mime>
-    </parser>
-
-    <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser">
-      <mime>application/x-tar</mime>
-    </parser>
-
-    <parser name="parse-gzip" class="org.apache.tika.parser.pkg.GzipParser">
-      <mime>application/x-gzip</mime>
-    </parser>
-
-    <parser name="parse-bzip2" class="org.apache.tika.parser.pkg.Bzip2Parser">
-      <mime>application/x-bzip</mime>
-    </parser>
-
     <parser name="parse-class" class="org.apache.tika.parser.asm.ClassParser">
       <mime>application/x-tika-java-class</mime>
     </parser>



Mime
View raw message