jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r883382 - in /jackrabbit/trunk: jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml jackrabbit-parent/pom.xml
Date Mon, 23 Nov 2009 15:50:38 GMT
Author: jukka
Date: Mon Nov 23 15:50:38 2009
New Revision: 883382

URL: http://svn.apache.org/viewvc?rev=883382&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Upgrade to Tika 0.5

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
    jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
    jackrabbit/trunk/jackrabbit-parent/pom.xml

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java?rev=883382&r1=883381&r2=883382&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java Mon Nov 23 15:50:38 2009
@@ -26,11 +26,12 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.microsoft.OfficeParser;
-import org.apache.tika.parser.opendocument.OpenOfficeParser;
+import org.apache.tika.parser.odf.OpenDocumentParser;
 import org.apache.tika.parser.pdf.PDFParser;
 import org.apache.tika.parser.rtf.RTFParser;
 import org.apache.tika.parser.txt.TXTParser;
@@ -136,7 +137,7 @@
                 parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
             } else if (name.equals(
                     "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
-                Parser parser = new OpenOfficeParser();
+                Parser parser = new OpenDocumentParser();
                 parsers.put("application/vnd.oasis.opendocument.database", parser);
                 parsers.put("application/vnd.oasis.opendocument.formula", parser);
                 parsers.put("application/vnd.oasis.opendocument.graphics", parser);
@@ -181,10 +182,17 @@
      * Delegates the call to the configured {@link AutoDetectParser}.
      */
     public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         waitIfBlocked();
-        parser.parse(stream, handler, metadata);
+        parser.parse(stream, handler, metadata, context);
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parser.parse(stream, handler, metadata, new ParseContext());
     }
 
     /**

Modified: jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml?rev=883382&r1=883381&r2=883382&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml Mon Nov 23 15:50:38 2009
@@ -113,7 +113,7 @@
     </parser>
 
     <parser name="parse-class" class="org.apache.tika.parser.asm.ClassParser">
-      <mime>application/x-tika-java-class</mime>
+      <mime>application/java-vm</mime>
     </parser>
 
     <parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
@@ -131,6 +131,14 @@
       <mime>audio/x-aiff</mime>
     </parser>
 
+    <parser name="parse-mbox" class="org.apache.tika.parser.mbox.MboxParser">
+      <mime>application/mbox</mime>
+    </parser>
+
+    <parser name="parse-epub" class="org.apache.tika.parser.epub.EpubParser">
+      <mime>application/epub+zip</mime>
+    </parser>
+
   </parsers>
 
-</properties>
\ No newline at end of file
+</properties>

Modified: jackrabbit/trunk/jackrabbit-parent/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-parent/pom.xml?rev=883382&r1=883381&r2=883382&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-parent/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-parent/pom.xml Mon Nov 23 15:50:38 2009
@@ -42,7 +42,7 @@
   <properties>
     <slf4j.version>1.5.3</slf4j.version>
     <jetty.version>6.1.14</jetty.version>
-    <tika.version>0.4</tika.version>
+    <tika.version>0.5</tika.version>
   </properties>
 
   <build>



Mime
View raw message