cxf-commits mailing list archives

From: serg...@apache.org
Subject: git commit: [CXF-5549] Splitting Tika extractor into 2 handlers so that the tika content can be fed directly into CXF SearchCondition API
Date: Fri, 20 Jun 2014 15:06:14 GMT
Repository: cxf
Updated Branches:
  refs/heads/master 018b563f8 -> df94a7fe5


[CXF-5549] Splitting Tika extractor into 2 handlers so that the tika content can be fed directly
into CXF SearchCondition API


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/df94a7fe
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/df94a7fe
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/df94a7fe

Branch: refs/heads/master
Commit: df94a7fe524549455ec65c3068a39a1d1d37acb1
Parents: 018b563
Author: Sergey Beryozkin <sberyozkin@talend.com>
Authored: Fri Jun 20 16:05:56 2014 +0100
Committer: Sergey Beryozkin <sberyozkin@talend.com>
Committed: Fri Jun 20 16:05:56 2014 +0100

----------------------------------------------------------------------
 .../ext/search/tika/TikaContentExtractor.java   |  80 +++++-------
 .../search/tika/TikaLuceneContentExtractor.java | 125 +++++++++++++++++++
 .../search/tika/TikaContentExtractorTest.java   |  72 +++--------
 .../tika/TikaLuceneContentExtractorTest.java    | 102 +++++++++++++++
 4 files changed, 274 insertions(+), 105 deletions(-)
----------------------------------------------------------------------
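
Editor's note: the net effect of this split is that Tika metadata can now be matched against a
CXF SearchCondition without involving Lucene at all. The sketch below is illustrative only
(the class name MetadataSearchExample is made up, and it assumes the sample PDF used by the
tests is on the classpath); it is based on the new extractMetadataToSearchBean method and the
updated TikaContentExtractorTest further down in this diff:

import java.io.InputStream;

import org.apache.cxf.jaxrs.ext.search.SearchBean;
import org.apache.cxf.jaxrs.ext.search.SearchCondition;
import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor;
import org.apache.tika.parser.pdf.PDFParser;

public class MetadataSearchExample {
    public static void main(String[] args) {
        // Extract PDF metadata into a SearchBean; no Lucene types are involved.
        TikaContentExtractor extractor = new TikaContentExtractor(new PDFParser());
        InputStream pdf = MetadataSearchExample.class.getResourceAsStream("/files/testPDF.pdf");
        SearchBean bean = extractor.extractMetadataToSearchBean(pdf);

        // Evaluate a FIQL expression directly against the extracted metadata.
        SearchCondition<SearchBean> condition =
            new FiqlParser<SearchBean>(SearchBean.class).parse("Author==Bertrand*");
        System.out.println("Matches: " + condition.isMet(bean));
    }
}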


http://git-wip-us.apache.org/repos/asf/cxf/blob/df94a7fe/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 3b0a52c..904a672 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -26,12 +26,7 @@ import java.util.logging.Logger;
 import org.xml.sax.SAXException;
 
 import org.apache.cxf.common.logging.LogUtils;
-import org.apache.cxf.common.util.StringUtils;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -46,7 +41,6 @@ public class TikaContentExtractor {
     private final Parser parser;
     private final DefaultDetector detector;
     private final boolean validateMediaType;
-    private final String contentFieldName;
     
     /**
      * Create new Tika-based content extractor using the provided parser instance.  
@@ -65,24 +59,9 @@ public class TikaContentExtractor {
      * @param validateMediaType enabled or disable media type validation
      */
     public TikaContentExtractor(final Parser parser, final boolean validateMediaType) {
-        this(parser, validateMediaType, "contents");
-    }
-    
-    /**
-     * Create new Tika-based content extractor using the provided parser instance and
-     * optional media type validation. If validation is enabled, the implementation 
-     * will try to detect the media type of the input and validate it against media types
-     * supported by the parser.
-     * @param parser parser instance
-     * @param validateMediaType enabled or disable media type validation
-     * @param contentFieldName name of the content field, default is "contents"
-     */
-    public TikaContentExtractor(final Parser parser, final boolean validateMediaType, 
-                                final String contentFieldName) {
         this.parser = parser;
         this.validateMediaType = validateMediaType;
         this.detector = validateMediaType ? new DefaultDetector() : null;
-        this.contentFieldName = contentFieldName;
     }
     
     /**
@@ -92,19 +71,19 @@ public class TikaContentExtractor {
      * @param in input stream to extract the content and metadata from  
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */
-    public Document extract(final InputStream in) {
-        return extractAll(in, true, true);
+    public TikaContent extract(final InputStream in) {
+        return extractAll(in, true);
     }
     
     /**
-     * Extract the content only from the input stream. Depending on media type validation,
+     * Extract the metadata only from the input stream. Depending on media type validation,
      * the detector could be run against input stream in order to ensure that parser supports this
      * type of content. 
-     * @param in input stream to extract the content from  
+     * @param in input stream to extract the metadata from  
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */
-    public Document extractContent(final InputStream in) {
-        return extractAll(in, true, false);
+    public TikaContent extractMetadata(final InputStream in) {
+        return extractAll(in, false);
     }
     
     /**
@@ -114,11 +93,16 @@ public class TikaContentExtractor {
      * @param in input stream to extract the metadata from  
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */
-    public Document extractMetadata(final InputStream in) {
-        return extractAll(in, false, true);
+    public SearchBean extractMetadataToSearchBean(final InputStream in) {
+        Metadata metadata = extractMetadata(in).getMetadata();
+        SearchBean bean = new SearchBean();
+        for (final String property: metadata.names()) {
+            bean.set(property, metadata.get(property));
+        }
+        return bean;
     }
     
-    private Document extractAll(final InputStream in, boolean extractContent, boolean extractMetadata) {
+    TikaContent extractAll(final InputStream in, boolean extractContent) {
         if (in == null) {
             return null;
         }
@@ -137,22 +121,9 @@ public class TikaContentExtractor {
             
             final ToTextContentHandler handler = new ToTextContentHandler();
             parser.parse(in, handler, metadata, context);
-            
-            final Document document = new Document();
-            if (extractContent) {
-                final String content = handler.toString();
-                
-                if (!StringUtils.isEmpty(content)) {
-                    document.add(new Field(contentFieldName, content, TextField.TYPE_STORED));
-                }
-            } 
-            if (extractMetadata) {
-                for (final String property: metadata.names()) {
-                    document.add(new StringField(property, metadata.get(property), Store.YES));
-                }
-            }
-            
-            return document;
+            // TODO: use a content handler which will ignore parser content events 
+            String content = extractContent ? handler.toString() : ""; 
+            return new TikaContent(content, metadata);
         } catch (final IOException ex) {
             LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
         } catch (final SAXException ex) {
@@ -163,4 +134,19 @@ public class TikaContentExtractor {
      
         return null;
     }
+    public static class TikaContent {
+        private String content;
+        private Metadata metadata;
+        public TikaContent(String content, Metadata metadata) {
+            this.content = content;
+            this.metadata = metadata;
+        }
+        public String getContent() {
+            return content;
+        }
+        public Metadata getMetadata() {
+            return metadata;
+        }
+    }
+    
 }
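
Editor's note: to illustrate the refactored class above (a rough sketch only; TikaContentExample
is a made-up name and the sample PDF path is taken from the tests), extract() now returns the
TikaContent wrapper rather than a Lucene Document:

import java.io.InputStream;

import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor;
import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
import org.apache.tika.parser.pdf.PDFParser;

public class TikaContentExample {
    public static void main(String[] args) {
        TikaContentExtractor extractor = new TikaContentExtractor(new PDFParser());
        InputStream pdf = TikaContentExample.class.getResourceAsStream("/files/testPDF.pdf");

        // extract() parses both the body text and the metadata into the new TikaContent holder.
        TikaContent tikaContent = extractor.extract(pdf);
        if (tikaContent != null) {
            System.out.println("Author: " + tikaContent.getMetadata().get("Author"));
            System.out.println("Text length: " + tikaContent.getContent().length());
        }
    }
}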

http://git-wip-us.apache.org/repos/asf/cxf/blob/df94a7fe/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
new file mode 100644
index 0000000..1bad780
--- /dev/null
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.InputStream;
+
+import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+
+public class TikaLuceneContentExtractor {
+    private final TikaContentExtractor extractor;
+    private final String contentFieldName;
+    
+    /**
+     * Create new Tika-based content extractor using the provided parser instance.  
+     * @param parser parser instance
+     */
+    public TikaLuceneContentExtractor(final Parser parser) {
+        this(parser, true);
+    }
+    
+    /**
+     * Create new Tika-based content extractor using the provided parser instance and
+     * optional media type validation. If validation is enabled, the implementation 
+     * will try to detect the media type of the input and validate it against media types
+     * supported by the parser.
+     * @param parser parser instance
+     * @param validateMediaType enabled or disable media type validation
+     */
+    public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType) {
+        this(parser, validateMediaType, "contents");
+    }
+    
+    /**
+     * Create new Tika-based content extractor using the provided parser instance and
+     * optional media type validation. If validation is enabled, the implementation 
+     * will try to detect the media type of the input and validate it against media types
+     * supported by the parser.
+     * @param parser parser instance
+     * @param validateMediaType enabled or disable media type validation
+     * @param contentFieldName name of the content field, default is "contents"
+     */
+    public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType, 
+                                final String contentFieldName) {
+        extractor = new TikaContentExtractor(parser, validateMediaType);
+        this.contentFieldName = contentFieldName;
+    }
+    
+    /**
+     * Extract the content and metadata from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the content and metadata from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
+    public Document extract(final InputStream in) {
+        return extractAll(in, true, true);
+    }
+    
+    /**
+     * Extract the content only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the content from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
+    public Document extractContent(final InputStream in) {
+        return extractAll(in, true, false);
+    }
+    
+    /**
+     * Extract the metadata only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the metadata from  
+     * @return the extracted document or null if extraction is not possible or was unsuccessful
+     */
+    public Document extractMetadata(final InputStream in) {
+        return extractAll(in, false, true);
+    }
+    
+    private Document extractAll(final InputStream in, boolean extractContent, boolean extractMetadata) {
+        
+        TikaContent content = extractor.extractAll(in, extractContent);
+        
+        if (content == null) {
+            return null;
+        }
+        final Document document = new Document();
+        if (content.getContent() != null) {
+            document.add(new Field(contentFieldName, content.getContent(), TextField.TYPE_STORED));
+        } 
+        if (extractMetadata) {
+            Metadata metadata = content.getMetadata();
+            for (final String property: metadata.names()) {
+                document.add(new StringField(property, metadata.get(property), Store.YES));
+            }
+        }
+        
+        return document;
+        
+    }
+}
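
Editor's note: the new TikaLuceneContentExtractor keeps the previous Lucene-oriented behaviour.
A rough usage sketch, lifted from the new test below (LuceneIndexingExample is a made-up name,
and it assumes Lucene 4.0 and the test PDF are on the classpath):

import java.io.InputStream;

import org.apache.cxf.jaxrs.ext.search.SearchBean;
import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
import org.apache.cxf.jaxrs.ext.search.tika.TikaLuceneContentExtractor;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.pdf.PDFParser;

public class LuceneIndexingExample {
    public static void main(String[] args) throws Exception {
        Directory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory,
            new IndexWriterConfig(Version.LUCENE_40, new StandardAnalyzer(Version.LUCENE_40)));

        // The Lucene-specific extractor still produces an indexable Document.
        TikaLuceneContentExtractor extractor = new TikaLuceneContentExtractor(new PDFParser());
        InputStream pdf = LuceneIndexingExample.class.getResourceAsStream("/files/testPDF.pdf");
        writer.addDocument(extractor.extract(pdf));
        writer.commit();

        // Convert a FIQL expression into a Lucene query against the "contents" field.
        LuceneQueryVisitor<SearchBean> visitor = new LuceneQueryVisitor<SearchBean>("ct", "contents");
        visitor.visit(new FiqlParser<SearchBean>(SearchBean.class).parse("ct==tika"));

        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        ScoreDoc[] hits = searcher.search(visitor.getQuery(), null, 1000).scoreDocs;
        System.out.println("Hits: " + hits.length);

        reader.close();
        writer.close();
        directory.close();
    }
}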

http://git-wip-us.apache.org/repos/asf/cxf/blob/df94a7fe/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
index 9f6649d..6fce551 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -18,64 +18,43 @@
  */
 package org.apache.cxf.jaxrs.ext.search.tika;
 
-import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchCondition;
 import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
 import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
-import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 import org.apache.tika.parser.pdf.PDFParser;
 
-import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
 public class TikaContentExtractorTest extends Assert {
     private TikaContentExtractor extractor;
-    private Directory directory;
-    private IndexWriter writer;
     private SearchConditionParser< SearchBean > parser;
     
     @Before
     public void setUp() throws Exception {
-        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
-        directory = new RAMDirectory();
-        
-        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
-        writer = new IndexWriter(directory, config);    
-        writer.commit();
-        
         parser = new FiqlParser<SearchBean>(SearchBean.class);
         extractor = new TikaContentExtractor(new PDFParser());
     }
     
     @Test
     public void testExtractedTextContentMatchesSearchCriteria() throws Exception {
-        final Document document = extractor.extract(getClass().getResourceAsStream("/files/testPDF.pdf"));
-        assertNotNull("Document should not be null", document);
-        
-        writer.addDocument(document);
-        writer.commit();
-
-        assertEquals(1, getHits("ct==tika").length);
-        assertEquals(1, getHits("ct==incubation").length);
-        assertEquals(0, getHits("ct==toolsuite").length);
-        // meta-data
-        assertEquals(1, getHits("Author==Bertrand*").length);
+        SearchCondition<SearchBean> sc = parser.parse("Author==Bertrand*");
+        final SearchBean bean = extractor.extractMetadataToSearchBean(
+            getClass().getResourceAsStream("/files/testPDF.pdf"));
+        assertNotNull("Document should not be null", bean);
+        assertTrue(sc.isMet(bean));
+    }
+    @Test
+    public void testExtractedTextContentDoesNotMatchSearchCriteria() throws Exception {
+        SearchCondition<SearchBean> sc = parser.parse("Author==Barry*");
+        final SearchBean bean = extractor.extractMetadataToSearchBean(
+            getClass().getResourceAsStream("/files/testPDF.pdf"));
+        assertNotNull("Document should not be null", bean);
+        assertFalse(sc.isMet(bean));
     }
 
     @Test
@@ -101,27 +80,4 @@ public class TikaContentExtractorTest extends Assert {
     public void testExtractionFromNullInputStreamFails() {
         assertNull("Document should be null, it is encrypted", extractor.extract((InputStream)null));
       
     }
-
-    private ScoreDoc[] getHits(final String expression) throws IOException {
-        IndexReader reader = DirectoryReader.open(directory);
-        IndexSearcher searcher = new IndexSearcher(reader);        
-
-        try {
-            LuceneQueryVisitor<SearchBean> visitor = new LuceneQueryVisitor<SearchBean>("ct", "contents");
-            visitor.visit(parser.parse(expression));
-    
-            ScoreDoc[] hits = searcher.search(visitor.getQuery(), null, 1000).scoreDocs;
-            assertNotNull(hits);
-            
-            return hits;            
-        } finally {
-            reader.close();
-        }
-    }
-    
-    @After
-    public void tearDown() throws Exception {
-        writer.close();        
-        directory.close();
-    }
 }

http://git-wip-us.apache.org/repos/asf/cxf/blob/df94a7fe/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
new file mode 100644
index 0000000..cce75cb
--- /dev/null
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.IOException;
+
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
+import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
+import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.apache.tika.parser.pdf.PDFParser;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TikaLuceneContentExtractorTest extends Assert {
+    private TikaLuceneContentExtractor extractor;
+    private Directory directory;
+    private IndexWriter writer;
+    private SearchConditionParser< SearchBean > parser;
+    
+    @Before
+    public void setUp() throws Exception {
+        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
+        directory = new RAMDirectory();
+        
+        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
+        writer = new IndexWriter(directory, config);    
+        writer.commit();
+        
+        parser = new FiqlParser<SearchBean>(SearchBean.class);
+        extractor = new TikaLuceneContentExtractor(new PDFParser());
+    }
+    
+    @Test
+    public void testExtractedTextContentMatchesSearchCriteria() throws Exception {
+        final Document document = extractor.extract(getClass().getResourceAsStream("/files/testPDF.pdf"));
+        assertNotNull("Document should not be null", document);
+        
+        writer.addDocument(document);
+        writer.commit();
+
+        assertEquals(1, getHits("ct==tika").length);
+        assertEquals(1, getHits("ct==incubation").length);
+        assertEquals(0, getHits("ct==toolsuite").length);
+        // meta-data
+        assertEquals(1, getHits("Author==Bertrand*").length);
+    }
+
+    private ScoreDoc[] getHits(final String expression) throws IOException {
+        IndexReader reader = DirectoryReader.open(directory);
+        IndexSearcher searcher = new IndexSearcher(reader);        
+
+        try {
+            LuceneQueryVisitor<SearchBean> visitor = new LuceneQueryVisitor<SearchBean>("ct", "contents");
+            visitor.visit(parser.parse(expression));
+    
+            ScoreDoc[] hits = searcher.search(visitor.getQuery(), null, 1000).scoreDocs;
+            assertNotNull(hits);
+            
+            return hits;            
+        } finally {
+            reader.close();
+        }
+    }
+    
+    @After
+    public void tearDown() throws Exception {
+        writer.close();        
+        directory.close();
+    }
+}

