tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1212573 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/resources/test-documents/
Date Fri, 09 Dec 2011 18:48:49 GMT
Author: mikemccand
Date: Fri Dec  9 18:48:49 2011
New Revision: 1212573

URL: http://svn.apache.org/viewvc?rev=1212573&view=rev
Log:
TIKA-801: fixed NPE when filtering Outlook docs with RTF or HTML content

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testMSG_forwarded.msg   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1212573&r1=1212572&r2=1212573&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Dec  9 18:48:49 2011
@@ -17,7 +17,6 @@ Release 1.1 - Current Development
    XHTML output, causing NPE when opening some PDFs with the GUI
    (TIKA-778).
 
-
  * RTF: Fixed case where a font change would result in processing
    bytes in the wrong font's charset, producing bogus text output
    (TIKA-777).  Don't output whitespace in ignored group states,
@@ -40,6 +39,9 @@ Release 1.1 - Current Development
  * Microsoft Project (MPP): Filetype detection has been fixed,
    and basic metadata (but no text) is now extracted. (TIKA-789)
 
+ * Outlook: fixed NullPointerException in TikaGUI when messages with
+   embedded RTF or HTML content were filtered (TIKA-801).
+
 Release 1.0 - 11/4/2011
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1212573&r1=1212572&r2=1212573&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri Dec  9 18:48:49 2011
@@ -42,6 +42,7 @@ import org.apache.tika.parser.rtf.RTFPar
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -189,10 +190,11 @@ public class OutlookExtractor extends Ab
                  data = ((StringChunk)htmlChunk).getRawValue();
               }
               if(data != null) {
+                  // nocommit same problem here?
                  HtmlParser htmlParser = new HtmlParser();
                  htmlParser.parse(
                        new ByteArrayInputStream(data),
-                       new BodyContentHandler(xhtml), 
+                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)), 
                        new Metadata(), new ParseContext()
                  );
                  doneBody = true;
@@ -206,8 +208,8 @@ public class OutlookExtractor extends Ab
               RTFParser rtfParser = new RTFParser();
               rtfParser.parse(
                               new ByteArrayInputStream(rtf.getData()),
-                              xhtml, new Metadata(), new ParseContext()
-                              );
+                              new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                              new Metadata(), new ParseContext());
               doneBody = true;
            }
            if(textChunk != null && !doneBody) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1212573&r1=1212572&r2=1212573&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri Dec  9 18:48:49 2011
@@ -170,7 +170,34 @@ public class OutlookParserTest extends T
         
         // Make sure we don't have nested html docs
         assertEquals(2, content.split("<body>").length);
-        //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+
+    public void testOutlookForwarded() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+       
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        InputStream stream = OutlookParserTest.class.getResourceAsStream(
+               "/test-documents/testMSG_forwarded.msg");
+        try {
+           parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+           stream.close();
+        }
+         
+        // Make sure we don't have nested docs
+        String content = sw.toString();
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
     }
     
     public void testOutlookHTMLfromRTF() throws Exception {

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMSG_forwarded.msg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMSG_forwarded.msg?rev=1212573&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMSG_forwarded.msg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message