lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r1040815 - in /lucene/dev/trunk/solr/contrib/extraction: ./ lib/ src/main/java/org/apache/solr/handler/extraction/ src/test/java/org/apache/solr/handler/
Date Tue, 30 Nov 2010 22:33:33 GMT
Author: gsingers
Date: Tue Nov 30 22:33:30 2010
New Revision: 1040815

URL: http://svn.apache.org/viewvc?rev=1040815&view=rev
Log:
SOLR-2241: upgrade to Tika 0.8

Added:
    lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar   (with props)
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar   (with props)
Removed:
    lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.0.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.1.0.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.1.0.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.1.0.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.6.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.6.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
    lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
Modified:
    lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java

Modified: lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/extraction/CHANGES.txt Tue Nov 30 22:33:30 2010
@@ -20,13 +20,13 @@ to your Solr Home lib directory.  See ht
  Tika Dependency
  ---------------
 
-Current Version: Tika 0.8-SNAPSHOT (rev 942725)
+Current Version: Tika 0.8 (released 11/07/2010)
 
 $Id:$
 
-================== Release 1.5-dev ==================
-
+================== Release 3.1-dev ==================
 
+* Upgraded to Tika 0.8 and changed deprecated parse call
 
 * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
   parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)

Added: lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/fontbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/jempbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/netcdf-4.2.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/pdfbox-1.3.1.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/rome-0.9.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-0.8.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-0.8.jar?rev=1040815&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Tue Nov 30 22:33:30 2010
@@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentSt
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
@@ -190,7 +191,8 @@ public class ExtractingDocumentLoader ex
         } //else leave it as is
 
         //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        parser.parse(inputStream, parsingHandler, metadata);
+        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+        parser.parse(inputStream, parsingHandler, metadata, context);
         if (extractOnly == false) {
           addDoc(handler);
         } else {

Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=1040815&r1=1040814&r2=1040815&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Tue Nov 30 22:33:30 2010
@@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTes
 
   @Test
   public void testExtraction() throws Exception {
-    // broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
-    String defLang = Locale.getDefault().getLanguage();
-    assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
     ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
     assertTrue("handler is null and it shouldn't be", handler != null);
-    loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+    loadLocal("solr-word.pdf",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
             "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+            "fmap.Creation-Date", "extractedDate",
+            "fmap.AAPL:Keywords", "ignored_a",
+            "fmap.xmpTPg:NPages", "ignored_a",
             "fmap.Author", "extractedAuthor",
             "fmap.content", "extractedContent",
            "literal.id", "one",
@@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTes
 
   }
 
+
   @Test
   public void testDefaultField() throws Exception {
     ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
@@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTes
 
     loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
         "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.AAPL:Keywords", "ignored_a",
+        "fmap.xmpTPg:NPages", "ignored_a",
         "fmap.Author", "extractedAuthor",
         "fmap.content", "wdf_nocase",
        "literal.id", "one",



Mime
View raw message