jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r762804 - /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
Date Wed, 08 Apr 2009 04:11:08 GMT
Author: jukka
Date: Tue Apr  7 15:03:05 2009
New Revision: 762804

URL: http://svn.apache.org/viewvc?rev=762804&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction

Use a static Tika Parser instance to avoid separate instances being created for each search
index and TikaTextExtractor subclass.

Modified:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java

Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java?rev=762804&r1=762803&r2=762804&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/TikaTextExtractor.java Tue Apr  7 15:03:05 2009
@@ -24,15 +24,30 @@
 import org.apache.jackrabbit.extractor.TextExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
 
 public class TikaTextExtractor implements TextExtractor {
 
-    private final AutoDetectParser parser = new AutoDetectParser();
+    /**
+     * Auto-detecting parser.
+     */
+    private static final Parser PARSER;
 
-    public String[] getContentTypes() {
+    /**
+     * Supported content types.
+     */
+    private static final String[] TYPES;
+
+    static {
+        AutoDetectParser parser = new AutoDetectParser();
+        PARSER = parser;
         Set types = parser.getParsers().keySet();
-        return (String[]) types.toArray(new String[types.size()]);
+        TYPES = (String[]) types.toArray(new String[types.size()]);
+    }
+
+    public String[] getContentTypes() {
+        return TYPES;
     }
 
     public Reader extractText(InputStream stream, String type, String encoding)
@@ -41,7 +56,8 @@
         if (type != null && type.trim().length() > 0) {
             metadata.set(Metadata.CONTENT_TYPE, type.trim());
         }
-        return new ParsingReader(parser, stream, metadata);
+        // TODO: This creates a background thread. Is that a problem?
+        return new ParsingReader(PARSER, stream, metadata);
     }
 
 }



Mime
View raw message