tika-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r1210322 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/WordExtractor.java test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Date Mon, 05 Dec 2011 03:44:43 GMT
Author: nick
Date: Mon Dec  5 03:44:42 2011
New Revision: 1210322

URL: http://svn.apache.org/viewvc?rev=1210322&view=rev
Log:
TIKA-410 Word Parser support for extracting textbox content (Patch from John Mastarone)

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1210322&r1=1210321&r2=1210322&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Dec  5 03:44:42 2011
@@ -98,7 +98,11 @@ public class WordExtractor extends Abstr
         }
 
         // Do everything else
-        for (String paragraph : wordExtractor.getFootnoteText()) {
+        for (String paragraph: wordExtractor.getMainTextboxText()) {
+            xhtml.element("p", paragraph);
+        }
+
+	for (String paragraph : wordExtractor.getFootnoteText()) {
             xhtml.element("p", paragraph);
         }
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1210322&r1=1210321&r2=1210322&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Dec  5 03:44:42 2011
@@ -202,8 +202,7 @@ public class WordParserTest extends Tika
         assertContains("This is a footnote.", content);
         assertContains("This is the header text.", content);
         assertContains("This is the footer text.", content);
-        // TODO: WordExtractor misses this
-        //assertContains("Here is a text box", content);
+        assertContains("Here is a text box", content);
         assertContains("Bold", content);
         assertContains("italic", content);
         assertContains("underline", content);



Mime
View raw message