cxf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From r...@apache.org
Subject git commit: CXF-5549: Introduce Tika Search Visitor
Date Fri, 20 Jun 2014 01:56:45 GMT
Repository: cxf
Updated Branches:
  refs/heads/master a980e2ff8 -> 667214435


CXF-5549: Introduce Tika Search Visitor


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/66721443
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/66721443
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/66721443

Branch: refs/heads/master
Commit: 6672144354475a53bd087e6c1ecbb2a47cf88ba2
Parents: a980e2f
Author: reta <drreta@gmail.com>
Authored: Thu Jun 19 21:53:14 2014 -0400
Committer: reta <drreta@gmail.com>
Committed: Thu Jun 19 21:53:14 2014 -0400

----------------------------------------------------------------------
 .../ext/search/tika/TikaContentExtractor.java   | 17 +++++++--
 .../search/tika/TikaContentExtractorTest.java   | 38 ++++++++++++++++++++
 .../search/src/test/resources/files/testRTF.rtf | 17 +++++++++
 .../search/src/test/resources/files/testTXT.txt |  2 ++
 4 files changed, 71 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/66721443/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 72760e7..f1cefce 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -27,8 +27,8 @@ import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.xml.sax.SAXException;
-
 import org.apache.cxf.common.logging.LogUtils;
+import org.apache.cxf.common.util.StringUtils;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Store;
@@ -79,6 +79,10 @@ public class TikaContentExtractor {
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */
     public Document extract(final InputStream in) {
+        if (in == null) {
+            return null;
+        }
+        
         try {
             final Metadata metadata = new Metadata();            
             final ParseContext context = new ParseContext();
@@ -95,7 +99,11 @@ public class TikaContentExtractor {
             parser.parse(in, handler, metadata, context);
             
             final Document document = new Document();
-            document.add(new Field("contents", handler.toString(), TextField.TYPE_STORED));
+            final String content = handler.toString();
+            
+            if (!StringUtils.isEmpty(content)) {
+                document.add(new Field("contents", content, TextField.TYPE_STORED));
+            }
             
             for (final String property: metadata.names()) {
                 document.add(new StringField(property, metadata.get(property), Store.YES));
@@ -121,8 +129,11 @@ public class TikaContentExtractor {
      * @return the extracted document or null if extraction is not possible or was unsuccessful
      */    
     public Document extract(final File file) throws FileNotFoundException  {
-        InputStream in = null;
+        if (file == null) {
+            return null;
+        }
         
+        InputStream in = null;        
         try {
             in = new FileInputStream(file);
             return extract(in);

http://git-wip-us.apache.org/repos/asf/cxf/blob/66721443/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
index df33d69..e169ee0 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -18,7 +18,10 @@
  */
 package org.apache.cxf.jaxrs.ext.search.tika;
 
+import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
 
 import org.apache.cxf.jaxrs.ext.search.SearchBean;
 import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
@@ -74,6 +77,41 @@ public class TikaContentExtractorTest extends Assert {
         assertEquals(0, getHits("ct==toolsuite").length);
     }
 
+    @Test
+    public void testExtractionFromTextFileUsingPdfParserFails() {        
+        assertNull("Document should be null, it is not a PDF", 
+            extractor.extract(getClass().getResourceAsStream("/files/testTXT.txt")));   
    
+    }
+
+    @Test
+    public void testExtractionFromRtfFileUsingPdfParserWithoutMediaTypeValidationFails() {
+        final TikaContentExtractor another = new TikaContentExtractor(new PDFParser(), false);
+        assertNull("Document should be null, it is not a PDF", 
+            another.extract(getClass().getResourceAsStream("/files/testRTF.rtf")));     
  
+    }
+
+    @Test
+    public void testExtractionFromEncryptedPdfFails() {
+        assertNull("Document should be null, it is encrypted", 
+            extractor.extract(getClass().getResourceAsStream("/files/testPDF.Encrypted.pdf")));
       
+    }
+    
+    @Test
+    public void testExtractionFromNullInputStreamFails() {
+        assertNull("Document should be null, it is encrypted", extractor.extract((InputStream)null));
       
+    }
+
+    @Test
+    public void testExtractionFromNullFileFails() throws FileNotFoundException {
+        assertNull("Document should be null, it is encrypted", extractor.extract((File)null));
       
+    }
+    
+    @Test(expected = FileNotFoundException.class)
+    public void testExtractionFromNonExistingFileFails() throws FileNotFoundException {
+        assertNull("Document should be null, it is encrypted", 
+            extractor.extract(new File("a.txt")));        
+    }
+
     private ScoreDoc[] getHits(final String expression) throws IOException {
         IndexReader reader = DirectoryReader.open(directory);
         IndexSearcher searcher = new IndexSearcher(reader);        

http://git-wip-us.apache.org/repos/asf/cxf/blob/66721443/rt/rs/extensions/search/src/test/resources/files/testRTF.rtf
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testRTF.rtf b/rt/rs/extensions/search/src/test/resources/files/testRTF.rtf
new file mode 100644
index 0000000..487e6f4
--- /dev/null
+++ b/rt/rs/extensions/search/src/test/resources/files/testRTF.rtf
@@ -0,0 +1,17 @@
+{\rtf1\ansi\ansicpg1252\uc1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1036\deflangfe1036{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose
02020603050405020304}Times New Roman;}{\f37\froman\fcharset238\fprq2 Times New Roman CE;}
+{\f38\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f40\froman\fcharset161\fprq2 Times
New Roman Greek;}{\f41\froman\fcharset162\fprq2 Times New Roman Tur;}{\f42\froman\fcharset177\fprq2
Times New Roman (Hebrew);}
+{\f43\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f44\froman\fcharset186\fprq2 Times
New Roman Baltic;}{\f45\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;
+\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;
+\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \snext0 Normal;}{\*\cs10 \additive
\ssemihidden 
+Default Paragraph Font;}{\*\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv

+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024
\snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\rsidtbl
\rsid2954171\rsid10375891}
+{\*\generator Microsoft Word 11.0.6568;}{\info{\title Test d\'92indexation Word}{\author
Bibliotheque}{\operator Bibliotheque}{\creatim\yr2006\mo5\dy18\hr12\min19}{\revtim\yr2006\mo5\dy18\hr12\min19}{\version2}{\edmins0}{\nofpages1}{\nofwords3}
+{\nofchars21}{\*\company Universite Laval}{\nofcharsws23}{\vern24579}}\paperw11906\paperh16838\margl1417\margr1417\margt1417\margb1417

+\deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1417\dgvorigin1417\dghshow1\dgvshow1
+\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\nojkernpunct\rsidroot2954171
\fet0
+\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sftnbj
{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang
{\pntxta .}}{\*\pnseclvl3
+\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang
{\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang
{\pntxtb (}{\pntxta )}}
+{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang
{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}\pard\plain 
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036
{\insrsid2954171 Test d\rquote indexation Word
+\par 
+\par }}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/cxf/blob/66721443/rt/rs/extensions/search/src/test/resources/files/testTXT.txt
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testTXT.txt b/rt/rs/extensions/search/src/test/resources/files/testTXT.txt
new file mode 100644
index 0000000..0b5605a
--- /dev/null
+++ b/rt/rs/extensions/search/src/test/resources/files/testTXT.txt
@@ -0,0 +1,2 @@
+Test d'indexation de Txt
+http://www.apache.org


Mime
View raw message