cxf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject git commit: CXF-5549: Introduce Tika Search Visitor
Date Thu, 19 Jun 2014 02:24:09 GMT
Repository: cxf
Updated Branches:
  refs/heads/master 507b03b8e -> 459b988fa


CXF-5549: Introduce Tika Search Visitor


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/459b988f
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/459b988f
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/459b988f

Branch: refs/heads/master
Commit: 459b988fa64bb1b87470c8aa0fe51ac5c136c7c6
Parents: 507b03b
Author: reta <drreta@gmail.com>
Authored: Wed Jun 18 22:23:49 2014 -0400
Committer: reta <drreta@gmail.com>
Committed: Wed Jun 18 22:23:49 2014 -0400

----------------------------------------------------------------------
 .../ext/search/tika/TikaContentExtractor.java   | 72 +++++++++++++++++---
 .../search/tika/TikaContentExtractorTest.java   |  3 +-
 2 files changed, 64 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/459b988f/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 258917e..72760e7 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -18,6 +18,9 @@
  */
 package org.apache.cxf.jaxrs.ext.search.tika;
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.logging.Level;
@@ -36,27 +39,56 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.ToTextContentHandler;
 
 public class TikaContentExtractor {
     private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class);
     
-    private final PDFParser parser;
+    private final Parser parser;
     private final DefaultDetector detector;
+    private final boolean validateMediaType;
     
-    public TikaContentExtractor() {
-        detector = new DefaultDetector();
-        parser = new PDFParser();
+    /**
+     * Create new Tika-based content extractor using the provided parser instance.  
+     * @param parser parser instance
+     */
+    public TikaContentExtractor(final Parser parser) {
+        this(parser, true);
     }
     
+    /**
+     * Create new Tika-based content extractor using the provided parser instance and
+     * optional media type validation. If validation is enabled, the implementation 
+     * will try to detect the media type of the input and validate it against media types
+     * supported by the parser.
+     * @param parser parser instance
+     * @param validateMediaType enabled or disable media type validation
+     */
+    public TikaContentExtractor(final Parser parser, final boolean validateMediaType) {
+        this.detector = new DefaultDetector();
+        this.parser = parser;
+        this.validateMediaType = validateMediaType;
+    }
+    
+    /**
+     * Extract the content and metadata from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the content and metadata from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
     public Document extract(final InputStream in) {
         try {
-            final Metadata metadata = new Metadata();
-            final MediaType mediaType = detector.detect(in, metadata);
-            final ParseContext context = new ParseContext(); 
-            if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) {
-                return null;
+            final Metadata metadata = new Metadata();            
+            final ParseContext context = new ParseContext();
+            
+            // Try to validate that input stream media type is supported by the parser 
+            if (validateMediaType) {
+                final MediaType mediaType = detector.detect(in, metadata);
+                if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) {
+                    return null;
+                }
             }
             
             final ToTextContentHandler handler = new ToTextContentHandler();
@@ -80,4 +112,24 @@ public class TikaContentExtractor {
      
         return null;
     }
+    
+    /**
+     * Extract the content and metadata from the file. Depending on media type validation,
+     * the detector could be run against file content in order to ensure that parser supports this
+     * type of content. 
+     * @param file file to extract the content and metadata from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */    
+    public Document extract(final File file) throws FileNotFoundException  {
+        InputStream in = null;
+        
+        try {
+            in = new FileInputStream(file);
+            return extract(in);
+        } finally {
+            if (in != null) {
+                try { in.close(); } catch (final IOException ex) { /* do nothing */ }
+            }
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/cxf/blob/459b988f/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
index 19ab4ce..df33d69 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -36,6 +36,7 @@ import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
+import org.apache.tika.parser.pdf.PDFParser;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -57,7 +58,7 @@ public class TikaContentExtractorTest extends Assert {
         writer.commit();
         
         parser = new FiqlParser<SearchBean>(SearchBean.class);
-        extractor = new TikaContentExtractor();
+        extractor = new TikaContentExtractor(new PDFParser());
     }
     
     @Test


Mime
View raw message