Mailing-List: contact commits-help@cxf.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@cxf.apache.org
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: sergeyb@apache.org
To: commits@cxf.apache.org
Message-Id: <0bb05c9e49164097995782954c247a5c@git.apache.org>
Subject: git commit: [CXF-5549] Customizing the contents field,
 adding a test checking that metadata can be matched,
 adding methods allowing to extract either the content or metadata only
Date: Fri, 20 Jun 2014 12:37:10 +0000 (UTC)

Repository: cxf
Updated Branches:
  refs/heads/master 7d5f8b519 -> 25549fb80


[CXF-5549] Customizing the contents field, adding a test checking that metadata can be matched, adding methods allowing to extract either the content or metadata only


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/25549fb8
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/25549fb8
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/25549fb8

Branch: refs/heads/master
Commit: 25549fb8036df20b1aa14b1274c236771184fc62
Parents: 7d5f8b5
Author: Sergey Beryozkin <sberyozkin@talend.com>
Authored: Fri Jun 20 13:36:27 2014 +0100
Committer: Sergey Beryozkin <sberyozkin@talend.com>
Committed: Fri Jun 20 13:36:27 2014 +0100

----------------------------------------------------------------------
 rt/rs/extensions/search/pom.xml                 |  1 -
 .../ext/search/tika/TikaContentExtractor.java   | 90 ++++++++++++--------
 .../search/tika/TikaContentExtractorTest.java   | 16 +---
 3 files changed, 58 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/pom.xml
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/pom.xml b/rt/rs/extensions/search/pom.xml
index fc956fc..7dd664c 100644
--- a/rt/rs/extensions/search/pom.xml
+++ b/rt/rs/extensions/search/pom.xml
@@ -78,7 +78,6 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-parsers</artifactId>
-            <optional>true</optional>
             <scope>test</scope>
             <exclusions>
             	<exclusion>

http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index f1cefce..3b0a52c 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -18,15 +18,13 @@
  */
 package org.apache.cxf.jaxrs.ext.search.tika;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.xml.sax.SAXException;
+
 import org.apache.cxf.common.logging.LogUtils;
 import org.apache.cxf.common.util.StringUtils;
 import org.apache.lucene.document.Document;
@@ -48,6 +46,7 @@ public class TikaContentExtractor {
     private final Parser parser;
     private final DefaultDetector detector;
     private final boolean validateMediaType;
+    private final String contentFieldName;
     
     /**
      * Create new Tika-based content extractor using the provided parser instance.  
@@ -66,9 +65,24 @@ public class TikaContentExtractor {
      * @param validateMediaType enabled or disable media type validation
      */
     public TikaContentExtractor(final Parser parser, final boolean validateMediaType) {
-        this.detector = new DefaultDetector();
+        this(parser, validateMediaType, "contents");
+    }
+    
+    /**
+     * Create new Tika-based content extractor using the provided parser instance and
+     * optional media type validation. If validation is enabled, the implementation 
+     * will try to detect the media type of the input and validate it against media types
+     * supported by the parser.
+     * @param parser parser instance
+     * @param validateMediaType enable or disable media type validation
+     * @param contentFieldName name of the content field, default is "contents"
+     */
+    public TikaContentExtractor(final Parser parser, final boolean validateMediaType, 
+                                final String contentFieldName) {
         this.parser = parser;
         this.validateMediaType = validateMediaType;
+        this.detector = validateMediaType ? new DefaultDetector() : null;
+        this.contentFieldName = contentFieldName;
     }
     
     /**
@@ -79,6 +93,32 @@ public class TikaContentExtractor {
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */
     public Document extract(final InputStream in) {
+        return extractAll(in, true, true);
+    }
+    
+    /**
+     * Extract the content only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the content from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
+    public Document extractContent(final InputStream in) {
+        return extractAll(in, true, false);
+    }
+    
+    /**
+     * Extract the metadata only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the metadata from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
+    public Document extractMetadata(final InputStream in) {
+        return extractAll(in, false, true);
+    }
+    
+    private Document extractAll(final InputStream in, boolean extractContent, boolean extractMetadata) {
         if (in == null) {
             return null;
         }
@@ -99,14 +139,17 @@ public class TikaContentExtractor {
             parser.parse(in, handler, metadata, context);
             
             final Document document = new Document();
-            final String content = handler.toString();
-            
-            if (!StringUtils.isEmpty(content)) {
-                document.add(new Field("contents", content, TextField.TYPE_STORED));
-            }
-            
-            for (final String property: metadata.names()) {
-                document.add(new StringField(property, metadata.get(property), Store.YES));
+            if (extractContent) {
+                final String content = handler.toString();
+                
+                if (!StringUtils.isEmpty(content)) {
+                    document.add(new Field(contentFieldName, content, TextField.TYPE_STORED));
+                }
+            } 
+            if (extractMetadata) {
+                for (final String property: metadata.names()) {
+                    document.add(new StringField(property, metadata.get(property), Store.YES));
+                }
             }
             
             return document;
@@ -120,27 +163,4 @@ public class TikaContentExtractor {
      
         return null;
     }
-    
-    /**
-     * Extract the content and metadata from the file. Depending on media type validation,
-     * the detector could be run against file content in order to ensure that parser supports this
-     * type of content. 
-     * @param file file to extract the content and metadata from  
-     * @return the extracted document or null if extraction is not possible or was unsuccessful
-     */    
-    public Document extract(final File file) throws FileNotFoundException  {
-        if (file == null) {
-            return null;
-        }
-        
-        InputStream in = null;        
-        try {
-            in = new FileInputStream(file);
-            return extract(in);
-        } finally {
-            if (in != null) {
-                try { in.close(); } catch (final IOException ex) { /* do nothing */ }
-            }
-        }
-    }
 }

http://git-wip-us.apache.org/repos/asf/cxf/blob/25549fb8/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
index e169ee0..9f6649d 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -18,8 +18,6 @@
  */
 package org.apache.cxf.jaxrs.ext.search.tika;
 
-import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 
@@ -40,6 +38,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.tika.parser.pdf.PDFParser;
+
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -75,6 +74,8 @@ public class TikaContentExtractorTest extends Assert {
         assertEquals(1, getHits("ct==tika").length);
         assertEquals(1, getHits("ct==incubation").length);
         assertEquals(0, getHits("ct==toolsuite").length);
+        // meta-data
+        assertEquals(1, getHits("Author==Bertrand*").length);
     }
 
     @Test
@@ -101,17 +102,6 @@ public class TikaContentExtractorTest extends Assert {
         assertNull("Document should be null, it is encrypted", extractor.extract((InputStream)null));        
     }
 
-    @Test
-    public void testExtractionFromNullFileFails() throws FileNotFoundException {
-        assertNull("Document should be null, it is encrypted", extractor.extract((File)null));        
-    }
-    
-    @Test(expected = FileNotFoundException.class)
-    public void testExtractionFromNonExistingFileFails() throws FileNotFoundException {
-        assertNull("Document should be null, it is encrypted", 
-            extractor.extract(new File("a.txt")));        
-    }
-
     private ScoreDoc[] getHits(final String expression) throws IOException {
         IndexReader reader = DirectoryReader.open(directory);
         IndexSearcher searcher = new IndexSearcher(reader);