cxf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From serg...@apache.org
Subject git commit: Updating Tika version to 1.6, trying to use a null content handler if only the metadata needs to be extracted
Date Tue, 09 Sep 2014 10:04:02 GMT
Repository: cxf
Updated Branches:
  refs/heads/master bee82ba7d -> eba07e615


Updating Tika version to 1.6, trying to use a null content handler if only the metadata needs to be extracted


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/eba07e61
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/eba07e61
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/eba07e61

Branch: refs/heads/master
Commit: eba07e615684e475772a7a2d23d7e93a0b099485
Parents: bee82ba
Author: Sergey Beryozkin <sberyozkin@talend.com>
Authored: Tue Sep 9 11:03:31 2014 +0100
Committer: Sergey Beryozkin <sberyozkin@talend.com>
Committed: Tue Sep 9 11:03:31 2014 +0100

----------------------------------------------------------------------
 parent/pom.xml                                   |  2 +-
 .../ext/search/tika/TikaContentExtractor.java    | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/eba07e61/parent/pom.xml
----------------------------------------------------------------------
diff --git a/parent/pom.xml b/parent/pom.xml
index 13a2f68..4d4482a 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -185,7 +185,7 @@
         <cxf.dom4j.bundle.version>1.6.1_5</cxf.dom4j.bundle.version>
         <cxf.jdom.bundle.version>1.1_4</cxf.jdom.bundle.version>
         <cxf.olingo.version>1.2.0</cxf.olingo.version>
-        <cxf.tika.version>1.5</cxf.tika.version>
+        <cxf.tika.version>1.6</cxf.tika.version>
         <cxf.jexl.version>2.1.1</cxf.jexl.version>
         <cxf.checkstyle.extension />
         <cxf.jaxb.context.class />

http://git-wip-us.apache.org/repos/asf/cxf/blob/eba07e61/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 4904201..b46d7ce 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -173,7 +173,19 @@ public class TikaContentExtractor {
             if (context == null) {
                 context = new ParseContext();
             }
-            parser.parse(in, handler, metadata, context);
+            try {
+                parser.parse(in, handler, metadata, context);
+            } catch (Exception ex) {
+                // Starting from Tika 1.6 PDFParser (with other parsers to be updated in the future) will skip 
+                // the content processing if the content handler is null. This can be used to optimize the 
+                // extraction process. If we get an exception with a null handler then a given parser is still 
+                // not ready to accept null handlers so lets retry with IgnoreContentHandler.
+                if (handler == null) {
+                    parser.parse(in, new IgnoreContentHandler(), metadata, context);
+                } else {
+                    throw ex;
+                }
+            }
             return new TikaContent(handler, metadata, mediaType);
         } catch (final IOException ex) {
             LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
@@ -187,8 +199,7 @@ public class TikaContentExtractor {
     }
     
     TikaContent extract(final InputStream in, boolean extractContent) {
-        final ToTextContentHandler handler = extractContent 
-            ? new ToTextContentHandler() : new IgnoreContentHandler();
+        final ToTextContentHandler handler = extractContent ? new ToTextContentHandler() : null;
         return extract(in, handler, null);
     }
     
@@ -210,7 +221,7 @@ public class TikaContentExtractor {
          *         to parse the content  
          */
         public String getContent() {
-            return contentHandler.toString();
+            return contentHandler == null ? null : contentHandler.toString();
         }
         /**
          * Return the metadata


Mime
View raw message