Return-Path: X-Original-To: apmail-cxf-commits-archive@www.apache.org Delivered-To: apmail-cxf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id EB0A4112C3 for ; Fri, 20 Jun 2014 12:37:10 +0000 (UTC) Received: (qmail 26625 invoked by uid 500); 20 Jun 2014 12:37:10 -0000 Delivered-To: apmail-cxf-commits-archive@cxf.apache.org Received: (qmail 26556 invoked by uid 500); 20 Jun 2014 12:37:10 -0000 Mailing-List: contact commits-help@cxf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cxf.apache.org Delivered-To: mailing list commits@cxf.apache.org Received: (qmail 26547 invoked by uid 99); 20 Jun 2014 12:37:10 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 20 Jun 2014 12:37:10 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 820C5986CAE; Fri, 20 Jun 2014 12:37:10 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: sergeyb@apache.org To: commits@cxf.apache.org Message-Id: <0bb05c9e49164097995782954c247a5c@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: git commit: [CXF-5549] Customizing the contents field, adding a test checking that metadata can be matched, adding methods allowing to extract either the content or metadata only Date: Fri, 20 Jun 2014 12:37:10 +0000 (UTC) Repository: cxf Updated Branches: refs/heads/master 7d5f8b519 -> 25549fb80 [CXF-5549] Customizing the contents field, adding a test checking that metadata can be matched, adding methods allowing to extract either the content or metadata only Project: http://git-wip-us.apache.org/repos/asf/cxf/repo Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/25549fb8 Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/25549fb8 Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/25549fb8 Branch: refs/heads/master Commit: 25549fb8036df20b1aa14b1274c236771184fc62 Parents: 7d5f8b5 Author: Sergey Beryozkin Authored: Fri Jun 20 13:36:27 2014 +0100 Committer: Sergey Beryozkin Committed: Fri Jun 20 13:36:27 2014 +0100 ---------------------------------------------------------------------- rt/rs/extensions/search/pom.xml | 1 - .../ext/search/tika/TikaContentExtractor.java | 90 ++++++++++++-------- .../search/tika/TikaContentExtractorTest.java | 16 +--- 3 files changed, 58 insertions(+), 49 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/pom.xml ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/pom.xml b/rt/rs/extensions/search/pom.xml index fc956fc..7dd664c 100644 --- a/rt/rs/extensions/search/pom.xml +++ b/rt/rs/extensions/search/pom.xml @@ -78,7 +78,6 @@ org.apache.tika tika-parsers - true test http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java index f1cefce..3b0a52c 100644 --- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java +++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java @@ -18,15 +18,13 @@ */ package org.apache.cxf.jaxrs.ext.search.tika; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.logging.Level; import java.util.logging.Logger; import org.xml.sax.SAXException; + import org.apache.cxf.common.logging.LogUtils; import org.apache.cxf.common.util.StringUtils; import org.apache.lucene.document.Document; @@ -48,6 +46,7 @@ public class TikaContentExtractor { private final Parser parser; private final DefaultDetector detector; private final boolean validateMediaType; + private final String contentFieldName; /** * Create new Tika-based content extractor using the provided parser instance. @@ -66,9 +65,24 @@ public class TikaContentExtractor { * @param validateMediaType enabled or disable media type validation */ public TikaContentExtractor(final Parser parser, final boolean validateMediaType) { - this.detector = new DefaultDetector(); + this(parser, validateMediaType, "contents"); + } + + /** + * Create new Tika-based content extractor using the provided parser instance and + * optional media type validation. If validation is enabled, the implementation + * will try to detect the media type of the input and validate it against media types + * supported by the parser. + * @param parser parser instance + * @param validateMediaType enabled or disable media type validation + * @param contentFieldName name of the content field, default is "contents" + */ + public TikaContentExtractor(final Parser parser, final boolean validateMediaType, + final String contentFieldName) { this.parser = parser; this.validateMediaType = validateMediaType; + this.detector = validateMediaType ? new DefaultDetector() : null; + this.contentFieldName = contentFieldName; } /** @@ -79,6 +93,32 @@ public class TikaContentExtractor { * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extract(final InputStream in) { + return extractAll(in, true, true); + } + + /** + * Extract the content only from the input stream. Depending on media type validation, + * the detector could be run against input stream in order to ensure that parser supports this + * type of content. + * @param in input stream to extract the content from + * @return the extracted document or null if extraction is not possible or was unsuccessful + */ + public Document extractContent(final InputStream in) { + return extractAll(in, true, false); + } + + /** + * Extract the metadata only from the input stream. Depending on media type validation, + * the detector could be run against input stream in order to ensure that parser supports this + * type of content. + * @param in input stream to extract the metadata from + * @return the extracted document or null if extraction is not possible or was unsuccessful + */ + public Document extractMetadata(final InputStream in) { + return extractAll(in, false, true); + } + + private Document extractAll(final InputStream in, boolean extractContent, boolean extractMetadata) { if (in == null) { return null; } @@ -99,14 +139,17 @@ public class TikaContentExtractor { parser.parse(in, handler, metadata, context); final Document document = new Document(); - final String content = handler.toString(); - - if (!StringUtils.isEmpty(content)) { - document.add(new Field("contents", content, TextField.TYPE_STORED)); - } - - for (final String property: metadata.names()) { - document.add(new StringField(property, metadata.get(property), Store.YES)); + if (extractContent) { + final String content = handler.toString(); + + if (!StringUtils.isEmpty(content)) { + document.add(new Field(contentFieldName, content, TextField.TYPE_STORED)); + } + } + if (extractMetadata) { + for (final String property: metadata.names()) { + document.add(new StringField(property, metadata.get(property), Store.YES)); + } } return document; @@ -120,27 +163,4 @@ public class TikaContentExtractor { return null; } - - /** - * Extract the content and metadata from the file. Depending on media type validation, - * the detector could be run against file content in order to ensure that parser supports this - * type of content. - * @param file file to extract the content and metadata from - * @return the extracted document or null if extraction is not possible or was unsuccessful - */ - public Document extract(final File file) throws FileNotFoundException { - if (file == null) { - return null; - } - - InputStream in = null; - try { - in = new FileInputStream(file); - return extract(in); - } finally { - if (in != null) { - try { in.close(); } catch (final IOException ex) { /* do nothing */ } - } - } - } } http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java index e169ee0..9f6649d 100644 --- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java +++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java @@ -18,8 +18,6 @@ */ package org.apache.cxf.jaxrs.ext.search.tika; -import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -40,6 +38,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.apache.tika.parser.pdf.PDFParser; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -75,6 +74,8 @@ public class TikaContentExtractorTest extends Assert { assertEquals(1, getHits("ct==tika").length); assertEquals(1, getHits("ct==incubation").length); assertEquals(0, getHits("ct==toolsuite").length); + // meta-data + assertEquals(1, getHits("Author==Bertrand*").length); } @Test @@ -101,17 +102,6 @@ public class TikaContentExtractorTest extends Assert { assertNull("Document should be null, it is encrypted", extractor.extract((InputStream)null)); } - @Test - public void testExtractionFromNullFileFails() throws FileNotFoundException { - assertNull("Document should be null, it is encrypted", extractor.extract((File)null)); - } - - @Test(expected = FileNotFoundException.class) - public void testExtractionFromNonExistingFileFails() throws FileNotFoundException { - assertNull("Document should be null, it is encrypted", - extractor.extract(new File("a.txt"))); - } - private ScoreDoc[] getHits(final String expression) throws IOException { IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader);