poi-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r377372 - in /jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf: data/test2.doc extractor/ extractor/TestDifferentRoutes.java extractor/TestWordExtractor.java
Date Mon, 13 Feb 2006 12:59:01 GMT
Author: nick
Date: Mon Feb 13 04:59:00 2006
New Revision: 377372

URL: http://svn.apache.org/viewcvs?rev=377372&view=rev
Log:
Friendly wrapper on HWPF for extracting text from Word Documents

Added:
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc   (with props)
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc?rev=377372&view=auto
==============================================================================
Binary file - no diff available.

Propchange: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java?rev=377372&view=auto
==============================================================================
--- jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java (added)
+++ jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java Mon Feb 13 04:59:00 2006
@@ -0,0 +1,106 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests the different routes to extracting text from a Word document:
+ * walking the paragraphs of the usermodel Range, and decoding the raw
+ * text pieces. Both are checked against the known text of test2.doc.
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestDifferentRoutes extends TestCase {
+    /** Expected text of each paragraph of test2.doc, in order. */
+    private String[] p_text = new String[] {
+            "This is a simple word document\r",
+            "\r",
+            "It has a number of paragraphs in it\r",
+            "\r",
+            "Some of them even feature bold, italic and underlined text\r",
+            "\r",
+            "\r",
+            "This bit is in a different font and size\r",
+            "\r",
+            "\r",
+            "This bit features some red text.\r",
+            "\r",
+            "\r",
+            "It is otherwise very very boring.\r"
+    };
+
+    /** Document under test, loaded by setUp(). */
+    private HWPFDocument doc;
+
+    /**
+     * Loads test2.doc from the directory named by the
+     * HWPF.testdata.path system property.
+     */
+    protected void setUp() throws Exception {
+        String dirname = System.getProperty("HWPF.testdata.path");
+        String filename = dirname + "/test2.doc";
+
+        FileInputStream in = new FileInputStream(filename);
+        try {
+            doc = new HWPFDocument(in);
+        } finally {
+            // Release the file handle even if parsing fails.
+            // NOTE(review): assumes HWPFDocument reads the whole stream
+            // in its constructor - confirm against the POI docs.
+            in.close();
+        }
+    }
+
+    /**
+     * Test model based extraction
+     */
+    public void testExtractFromModel() {
+        Range r = doc.getRange();
+
+        String[] text = new String[r.numParagraphs()];
+        for(int i=0; i < r.numParagraphs(); i++) {
+            Paragraph p = r.getParagraph(i);
+            text[i] = p.text();
+        }
+
+        assertEquals(p_text.length, text.length);
+        for(int i=0; i<p_text.length; i++) {
+            // Name the paragraph so a failure says where it diverged
+            assertEquals("Paragraph " + i, p_text[i], text[i]);
+        }
+    }
+
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+        StringBuffer textBuf = new StringBuffer();
+
+        Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
+        while (textPieces.hasNext()) {
+            TextPiece piece = (TextPiece) textPieces.next();
+
+            // Decode as Cp1252 unless the piece reports that it uses unicode
+            String encoding = "Cp1252";
+            if (piece.usesUnicode()) {
+                encoding = "UTF-16LE";
+            }
+            String text = new String(piece.getRawBytes(), encoding);
+            textBuf.append(text);
+        }
+
+        // The decoded pieces, joined, are compared to all the paragraphs joined
+        StringBuffer exp = new StringBuffer();
+        for(int i=0; i<p_text.length; i++) {
+            exp.append(p_text[i]);
+        }
+        assertEquals(exp.toString(), textBuf.toString());
+    }
+}

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java?rev=377372&view=auto
==============================================================================
--- jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java (added)
+++ jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java Mon Feb 13 04:59:00 2006
@@ -0,0 +1,110 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests WordExtractor, the friendly wrapper for extracting text from
+ * Word documents, on a well behaved document and on a corrupted one.
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestWordExtractor extends TestCase {
+    /** Expected text of each paragraph of test2.doc, in order. */
+    private String[] p_text1 = new String[] {
+            "This is a simple word document\r\n",
+            "\r\n",
+            "It has a number of paragraphs in it\r\n",
+            "\r\n",
+            "Some of them even feature bold, italic and underlined text\r\n",
+            "\r\n",
+            "\r\n",
+            "This bit is in a different font and size\r\n",
+            "\r\n",
+            "\r\n",
+            "This bit features some red text.\r\n",
+            "\r\n",
+            "\r\n",
+            "It is otherwise very very boring.\r\n"
+    };
+    /** All of p_text1 joined into one string; built in setUp(). */
+    private String p_text1_block = "";
+
+    // Well behaved document
+    private WordExtractor extractor;
+    // Corrupted document - can't do paragraph based stuff
+    private WordExtractor extractor2;
+
+    /**
+     * Opens both test documents from the directory named by the
+     * HWPF.testdata.path system property, and builds the expected flat text.
+     */
+    protected void setUp() throws Exception {
+        String dirname = System.getProperty("HWPF.testdata.path");
+
+        extractor = openExtractor(dirname + "/test2.doc");
+        extractor2 = openExtractor(dirname + "/test.doc");
+
+        // Build splat'd out text version
+        StringBuffer block = new StringBuffer();
+        for(int i=0; i<p_text1.length; i++) {
+            block.append(p_text1[i]);
+        }
+        p_text1_block = block.toString();
+    }
+
+    /**
+     * Creates a WordExtractor for the given file, closing the file
+     * stream again once the extractor has been constructed.
+     */
+    private static WordExtractor openExtractor(String filename) throws Exception {
+        FileInputStream in = new FileInputStream(filename);
+        try {
+            return new WordExtractor(in);
+        } finally {
+            // NOTE(review): assumes WordExtractor reads the whole stream
+            // in its constructor - confirm against the POI docs.
+            in.close();
+        }
+    }
+
+    /**
+     * Test paragraph based extraction
+     */
+    public void testExtractFromParagraphs() {
+        String[] text = extractor.getParagraphText();
+
+        assertEquals(p_text1.length, text.length);
+        for(int i=0; i<p_text1.length; i++) {
+            assertEquals("Paragraph " + i, p_text1[i], text[i]);
+        }
+
+        // On second one, should fall back
+        assertEquals(1, extractor2.getParagraphText().length);
+    }
+
+    /**
+     * Test the paragraph -> flat extraction
+     */
+    public void testGetText() {
+        assertEquals(p_text1_block, extractor.getText());
+
+        // On second one, should fall back to text piece
+        assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+    }
+
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+        String text = extractor.getTextFromPieces();
+        assertEquals(p_text1_block, text);
+    }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: poi-dev-unsubscribe@jakarta.apache.org
Mailing List:    http://jakarta.apache.org/site/mail2.html#poi
The Apache Jakarta POI Project: http://jakarta.apache.org/poi/


Mime
View raw message