cxf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject git commit: CXF-5549: Introduce Tika Search Visitor. Added integration tests (systest) for JAX-RS/Tika/Lucene/Search
Date Fri, 11 Jul 2014 02:20:55 GMT
Repository: cxf
Updated Branches:
  refs/heads/master f5d2a0332 -> 2209258ce


CXF-5549: Introduce Tika Search Visitor. Added integration tests (systest) for JAX-RS/Tika/Lucene/Search


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/2209258c
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/2209258c
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/2209258c

Branch: refs/heads/master
Commit: 2209258ce1246f3f48d23d51c4d981dd47b3d600
Parents: f5d2a03
Author: reta <drreta@gmail.com>
Authored: Thu Jul 10 22:20:37 2014 -0400
Committer: reta <drreta@gmail.com>
Committed: Thu Jul 10 22:20:37 2014 -0400

----------------------------------------------------------------------
 .../ext/search/tika/LuceneDocumentMetadata.java |  10 ++
 .../search/tika/TikaLuceneContentExtractor.java |   5 +
 .../tika/TikaLuceneContentExtractorTest.java    |  17 +++
 systests/jaxrs/pom.xml                          |  23 ++++
 .../systest/jaxrs/extraction/BookCatalog.java   | 118 ++++++++++++++++++
 .../extraction/JAXRSClientServerTikaTest.java   | 121 +++++++++++++++++++
 .../jaxrs/src/test/resources/files/testPDF.pdf  | Bin 0 -> 34824 bytes
 7 files changed, 294 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
index dcb8f5a..f3e0b7e 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
@@ -28,6 +28,7 @@ import org.apache.cxf.jaxrs.ext.search.DefaultParamConverterProvider;
 public class LuceneDocumentMetadata {
     private final Map< String, Class< ? > > fieldTypes;
     private final String contentFieldName;
+    private String source;
     private ParamConverterProvider converterProvider = new DefaultParamConverterProvider();
     
     public LuceneDocumentMetadata() {
@@ -54,10 +55,19 @@ public class LuceneDocumentMetadata {
         return this;
     }
     
+    public LuceneDocumentMetadata withSource(final String src) {
+        this.source = src;
+        return this;
+    }
+    
     public String getContentFieldName() {
         return contentFieldName;
     }
     
+    public String getSource() {
+        return source;
+    }
+    
     public Class<?> getFieldType(String name) {
         return fieldTypes.get(name);
     }

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
index 8ca0b29..dc086ac 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
@@ -24,6 +24,7 @@ import java.util.List;
 
 import javax.ws.rs.ext.ParamConverterProvider;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoubleField;
@@ -191,6 +192,10 @@ public class TikaLuceneContentExtractor {
             }
         }
         
+        if (!StringUtils.isBlank(documentMetadata.getSource())) {
+            document.add(new StringField("source", documentMetadata.getSource(), Store.YES));
+        }
+        
         return document;
         
     }

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
index 1012db6..cbdef04 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
@@ -198,6 +198,23 @@ public class TikaLuceneContentExtractorTest extends Assert {
         assertEquals(0, getHits("xmpTPg:NPages=lt=1.0", documentMetadata.getFieldTypes()).length);
     }
     
+    @Test
+    public void testContentSourceMatchesSearchCriteria() throws Exception {
+        final LuceneDocumentMetadata documentMetadata = new LuceneDocumentMetadata()
+            .withSource("testPDF.pdf");
+
+        final Document document = extractor.extract(
+            getClass().getResourceAsStream("/files/testPDF.pdf"), documentMetadata);
+        assertNotNull("Document should not be null", document);
+        
+        writer.addDocument(document);
+        writer.commit();
+
+        // Should work by exact match only
+        assertEquals(1, getHits("source==testPDF.pdf").length);
+        assertEquals(0, getHits("source==testPDF").length);
+    }
+    
     private ScoreDoc[] getHits(final String expression) throws IOException {
         return getHits(expression, new HashMap<String, Class<?>>());
     }

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/systests/jaxrs/pom.xml
----------------------------------------------------------------------
diff --git a/systests/jaxrs/pom.xml b/systests/jaxrs/pom.xml
index 4e1aa5f..0b219a4 100644
--- a/systests/jaxrs/pom.xml
+++ b/systests/jaxrs/pom.xml
@@ -423,6 +423,29 @@
             <version>${cxf.netty3.version}</version>
             <scope>test</scope>
         </dependency>
+         <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>${cxf.lucene.version}</version>
+            <scope>test</scope>
+        </dependency>
+         <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${cxf.lucene.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <scope>test</scope>
+            <exclusions>
+            	<exclusion>
+            		<groupId>org.apache.poi</groupId>
+            		<artifactId>poi-ooxml</artifactId>            	
+            	</exclusion>       
+            </exclusions>
+        </dependency>
     </dependencies>
     <build>
         <plugins>

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/BookCatalog.java
----------------------------------------------------------------------
diff --git a/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/BookCatalog.java b/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/BookCatalog.java
new file mode 100644
index 0000000..8ff5113
--- /dev/null
+++ b/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/BookCatalog.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cxf.systest.jaxrs.extraction;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.activation.DataHandler;
+import javax.ws.rs.Consumes;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchContext;
+import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
+import org.apache.cxf.jaxrs.ext.search.tika.LuceneDocumentMetadata;
+import org.apache.cxf.jaxrs.ext.search.tika.TikaLuceneContentExtractor;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.apache.tika.parser.pdf.PDFParser;
+
+@Path("/catalog")
+public class BookCatalog {
+    private final TikaLuceneContentExtractor extractor = new TikaLuceneContentExtractor(new PDFParser());    
+    private final Directory directory = new RAMDirectory();
+    private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
+    private final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
+    
+    @POST
+    @Consumes("multipart/form-data")
+    public Response addBook(final MultipartBody body) throws Exception {
+        for (final Attachment attachment: body.getAllAttachments()) {
+            final DataHandler handler =  attachment.getDataHandler();
+            
+            if (handler != null) {
+                final String source = handler.getName();                
+                final LuceneDocumentMetadata metadata = new LuceneDocumentMetadata()
+                    .withSource(source)
+                    .withField("modified", Date.class);
+                
+                final Document document = extractor.extract(handler.getInputStream(), metadata);
+                if (document != null) {                    
+                    final IndexWriter writer = new IndexWriter(directory, config);
+                    
+                    try {
+                        writer.addDocument(document);
+                        writer.commit();
+                    } finally {
+                        writer.close();
+                    }
+                }
+            }
+        }        
+        
+        return Response.ok().build();
+    }
+    
+    @GET
+    @Produces(MediaType.APPLICATION_JSON)
+    public Collection<ScoreDoc> findBook(@Context SearchContext searchContext) throws IOException {
+        IndexReader reader = DirectoryReader.open(directory);
+        IndexSearcher searcher = new IndexSearcher(reader);        
+
+        try {
+            final Map< String, Class< ? > > fieldTypes = new HashMap< String, Class< ? > >();
+            fieldTypes.put("modified", Date.class);
+            
+            LuceneQueryVisitor<SearchBean> visitor = new LuceneQueryVisitor<SearchBean>("ct", "contents");
+            visitor.setPrimitiveFieldTypeMap(fieldTypes);
+            visitor.visit(searchContext.getCondition(SearchBean.class));
+    
+            return Arrays.asList(searcher.search(visitor.getQuery(), null, 1000).scoreDocs);
+        } finally {
+            reader.close();
+        }
+    }
+}
+
+

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/JAXRSClientServerTikaTest.java
----------------------------------------------------------------------
diff --git a/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/JAXRSClientServerTikaTest.java b/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/JAXRSClientServerTikaTest.java
new file mode 100644
index 0000000..8744247
--- /dev/null
+++ b/systests/jaxrs/src/test/java/org/apache/cxf/systest/jaxrs/extraction/JAXRSClientServerTikaTest.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cxf.systest.jaxrs.extraction;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.ws.rs.core.MediaType;
+
+import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchContextProvider;
+import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.cxf.jaxrs.model.AbstractResourceInfo;
+import org.apache.cxf.jaxrs.provider.MultipartProvider;
+import org.apache.cxf.testutil.common.AbstractBusClientServerTestBase;
+import org.apache.cxf.testutil.common.AbstractBusTestServerBase;
+import org.apache.lucene.search.ScoreDoc;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class JAXRSClientServerTikaTest extends AbstractBusClientServerTestBase {
+    public static final String PORT = allocatePort(JAXRSClientServerTikaTest.class);
+    
+    @Ignore
+    public static class Server extends AbstractBusTestServerBase {        
+        protected void run() {
+            JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
+            
+            final Map< String, Object > properties = new HashMap< String, Object >();        
+            properties.put("search.query.parameter.name", "$filter");
+            properties.put("search.parser", new FiqlParser< SearchBean >(SearchBean.class));
+            
+            sf.setResourceClasses(BookCatalog.class);
+            sf.setResourceProvider(BookCatalog.class, new SingletonResourceProvider(new BookCatalog()));
+            sf.setAddress("http://localhost:" + PORT + "/");
+            sf.setProperties(properties);
+            sf.setProvider(new MultipartProvider());
+            sf.setProvider(new SearchContextProvider());
+            sf.setProvider(new JacksonJsonProvider());
+            
+            sf.create();
+        }
+
+        public static void main(String[] args) {
+            try {
+                Server s = new Server();
+                s.start();
+            } catch (Exception ex) {
+                ex.printStackTrace();
+                System.exit(-1);
+            } finally {
+                System.out.println("done!");
+            }
+        }
+    }
+    
+    @BeforeClass
+    public static void startServers() throws Exception {
+        AbstractResourceInfo.clearAllMaps();
+        //keep out of process due to stack traces testing failures
+        assertTrue("server did not launch correctly", launchServer(Server.class, true));
+        createStaticBus();
+    }
+    
+    @Test
+    public void testUploadIndexAndSearchPdfFile() {
+        final WebClient wc = createWebClient("/catalog").type(MediaType.MULTIPART_FORM_DATA);
+        
+        final ContentDisposition disposition = new ContentDisposition("attachment;filename=testPDF.pdf");
+        final Attachment attachment = new Attachment("root", 
+            getClass().getResourceAsStream("/files/testPDF.pdf"), disposition);
+        wc.post(new MultipartBody(attachment));
+        
+        final Collection<ScoreDoc> hits = search("modified=le=2007-09-15T09:02:31");        
+        assertEquals(hits.size(), 1);
+    }
+
+    @SuppressWarnings("unchecked")
+    private Collection<ScoreDoc> search(final String expression) {
+        return (Collection<ScoreDoc>)createWebClient("/catalog")
+            .accept(MediaType.APPLICATION_JSON)
+            .query("$filter", expression)
+            .get(Collection.class);
+    }
+    
+    @SuppressWarnings("unchecked")
+    private WebClient createWebClient(final String url) {
+        WebClient wc = WebClient.create("http://localhost:" + PORT + url, 
+            Arrays.asList(new MultipartProvider(), new JacksonJsonProvider()));
+        WebClient.getConfig(wc).getHttpConduit().getClient().setReceiveTimeout(10000000L);
+        return wc;
+    }
+}

http://git-wip-us.apache.org/repos/asf/cxf/blob/2209258c/systests/jaxrs/src/test/resources/files/testPDF.pdf
----------------------------------------------------------------------
diff --git a/systests/jaxrs/src/test/resources/files/testPDF.pdf b/systests/jaxrs/src/test/resources/files/testPDF.pdf
new file mode 100644
index 0000000..1f1bcff
Binary files /dev/null and b/systests/jaxrs/src/test/resources/files/testPDF.pdf differ


Mime
View raw message