Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 5537A200B85 for ; Thu, 15 Sep 2016 12:22:06 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 53C09160AB7; Thu, 15 Sep 2016 10:22:06 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 9C306160AB5 for ; Thu, 15 Sep 2016 12:22:05 +0200 (CEST) Received: (qmail 12996 invoked by uid 500); 15 Sep 2016 10:22:04 -0000 Mailing-List: contact commits-help@cxf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cxf.apache.org Delivered-To: mailing list commits@cxf.apache.org Received: (qmail 12987 invoked by uid 99); 15 Sep 2016 10:22:04 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 15 Sep 2016 10:22:04 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 570CFE0159; Thu, 15 Sep 2016 10:22:04 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: sergeyb@apache.org To: commits@cxf.apache.org Message-Id: <4d883ab76dd241b392027a1deb9203a9@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: cxf git commit: Updating TikaContentExtractor to support the embedded attachments Date: Thu, 15 Sep 2016 10:22:04 +0000 (UTC) archived-at: Thu, 15 Sep 2016 10:22:06 -0000 Repository: cxf Updated Branches: refs/heads/master 9810a8448 -> cc2341a45 Updating TikaContentExtractor to support the embedded attachments Project: http://git-wip-us.apache.org/repos/asf/cxf/repo Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/cc2341a4 Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/cc2341a4 Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/cc2341a4 Branch: refs/heads/master Commit: cc2341a453a8edd467d83fdeb2c09ea62aee0ffa Parents: 9810a84 Author: Sergey Beryozkin Authored: Thu Sep 15 11:21:46 2016 +0100 Committer: Sergey Beryozkin Committed: Thu Sep 15 11:21:46 2016 +0100 ---------------------------------------------------------------------- .../ext/search/tika/TikaContentExtractor.java | 40 +++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cxf/blob/cc2341a4/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java index fd3511a..e4d1918 100644 --- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java +++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java @@ -36,6 +36,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.ToTextContentHandler; @@ -47,6 +48,13 @@ public class TikaContentExtractor { private final Detector detector; /** + * Create new Tika-based content extractor using AutoDetectParser. + */ + public TikaContentExtractor() { + this(new AutoDetectParser(), false); + } + + /** * Create new Tika-based content extractor using the provided parser instance. * @param parser parser instance */ @@ -159,9 +167,6 @@ public class TikaContentExtractor { if (in == null) { return null; } - if (context == null) { - context = new ParseContext(); - } final Metadata metadata = new Metadata(); try { @@ -171,20 +176,37 @@ public class TikaContentExtractor { mediaType = MediaType.parse(mtHint.toString()); } else if (detector != null && in.markSupported()) { mediaType = detector.detect(in, metadata); - } + } + if (mediaType != null) { + metadata.set(Metadata.CONTENT_TYPE, mediaType.toString()); + } Parser parser = null; - for (Parser p : parsers) { - if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) { - continue; + if (parsers.size() == 1) { + parser = parsers.get(0); + } else { + for (Parser p : parsers) { + if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) { + continue; + } + parser = p; + break; } - parser = p; - break; } if (parser == null) { return null; } + if (context == null) { + context = new ParseContext(); + } + if (context.get(Parser.class) == null) { + // to process the embedded attachments + context.set(Parser.class, + parser instanceof AutoDetectParser ? parser : new AutoDetectParser()); + } + + try { parser.parse(in, handler, metadata, context); } catch (Exception ex) {