poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r687443 - in /poi/trunk/src/scratchpad: src/org/apache/poi/hpbf/extractor/ src/org/apache/poi/hpbf/model/qcbits/ testcases/org/apache/poi/hpbf/data/ testcases/org/apache/poi/hpbf/extractor/ testcases/org/apache/poi/hpbf/model/
Date Wed, 20 Aug 2008 20:13:09 GMT
Author: nick
Date: Wed Aug 20 13:13:08 2008
New Revision: 687443

URL: http://svn.apache.org/viewvc?rev=687443&view=rev
Log:
HPBF text extractor and unit tests

Added:
    poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/
    poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java   (with props)
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub   (with props)
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java   (with props)
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java   (with props)
Modified:
    poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java

Added: poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java?rev=687443&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java (added)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java Wed Aug 20 13:13:08 2008
@@ -0,0 +1,78 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Extract text from HPBF Publisher files 
+ */
+public class PublisherTextExtractor extends POIOLE2TextExtractor {
+	private HPBFDocument doc; // document the text bits are read from
+
+	/** Wraps an already-parsed Publisher document. */
+	public PublisherTextExtractor(HPBFDocument doc) {
+		super(doc);
+		this.doc = doc;
+	}
+	/** Builds an HPBFDocument from the given filesystem and wraps it. */
+	public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
+		this(new HPBFDocument(fs));
+	}
+	/** Reads a POIFS filesystem from the stream and wraps its document. */
+	public PublisherTextExtractor(InputStream is) throws IOException {
+		this(new POIFSFileSystem(is));
+	}
+
+	/** Returns the Quill Contents text, with \r line endings turned into \n. */
+	public String getText() {
+		StringBuffer text = new StringBuffer();
+		QCBit[] bits = doc.getQuillContents().getBits();
+		for(int i=0; i<bits.length; i++) {
+			if(bits[i] instanceof QCTextBit) { // instanceof is false for null
+				text.append( ((QCTextBit)bits[i]).getText().replace('\r', '\n') );
+			}
+		}
+		// TODO - get more text
+		return text.toString();
+	}
+
+	/** Prints the extracted text of every file named on the command line. */
+	public static void main(String[] args) throws Exception {
+		if(args.length == 0) {
+			System.err.println("Use:");
+			System.err.println("  PublisherTextExtractor <file.pub>");
+			return;
+		}
+		for(int i=0; i<args.length; i++) {
+			InputStream in = new FileInputStream(args[i]);
+			try {
+				System.out.println(new PublisherTextExtractor(in).getText());
+			} finally {
+				in.close(); // release the file handle before the next file
+			}
+		}
+	}
+}

Propchange: poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java?rev=687443&r1=687442&r2=687443&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java Wed Aug 20 13:13:08 2008
@@ -25,7 +25,11 @@
 	public QCTextBit(String thingType, String bitType, byte[] data) {
 		super(thingType, bitType, data);
 	}
-	
+
+	/**
+	 * Returns the text. Note that line endings
+	 *  are \r and not \n
+	 */
 	public String getText() {
 		return StringUtil.getFromUnicodeLE(
 				data, 0, data.length/2

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub?rev=687443&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
------------------------------------------------------------------------------
    svn:executable = *

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java?rev=687443&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java (added)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java Wed Aug 20 13:13:08 2008
@@ -0,0 +1,105 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+public class TextPublisherTextExtractor extends TestCase {
+	private String dir;
+
+	/**
+	 * Reads the test data directory from the
+	 *  HPBF.testdata.path system property.
+	 */
+	protected void setUp() throws Exception {
+		dir = System.getProperty("HPBF.testdata.path");
+	}
+
+	/**
+	 * Opens a file from the test data directory.
+	 * @param name file name relative to that directory
+	 */
+	private FileInputStream open(String name) throws Exception {
+		return new FileInputStream(new File(dir, name));
+	}
+
+	/**
+	 * Extraction should run without throwing, whether the
+	 *  extractor is built from a document or from a stream.
+	 */
+	public void testBasics() throws Exception {
+		HPBFDocument sample = new HPBFDocument(open("Sample.pub"));
+		new PublisherTextExtractor(sample).getText();
+
+		PublisherTextExtractor fromStream =
+			new PublisherTextExtractor(open("Simple.pub"));
+		fromStream.getText();
+	}
+
+	/**
+	 * Compares the extracted text of both sample files
+	 *  against the exact expected strings.
+	 */
+	public void testContents() throws Exception {
+		HPBFDocument sample = new HPBFDocument(open("Sample.pub"));
+		String sampleText = new PublisherTextExtractor(sample).getText();
+		String expectedSample =
+"This is some text on the first page\n" +
+"It’s in times new roman, font size 10, all normal\n" +
+"" +
+"This is in bold and italic\n" +
+"It’s Arial, 20 point font\n" +
+"It’s in the second textbox on the first page\n" +
+"" +
+"This is the second page\n\n" +
+"" +
+"It is also times new roman, 10 point\n" +
+"" +
+"Table on page 2\nTop right\n" +
+"P2 table left\nP2 table right\n" +
+"Bottom Left\nBottom Right\n" +
+"" +
+"This text is on page two\n" +
+"#This is a link to Apache POI\n" +
+"More normal text\n" +
+"Link to a file\n" +
+"" +
+"More text, more hyperlinks\n" +
+"email link\n" +
+"Final hyperlink\n" +
+"Within doc to page 1\n";
+		assertEquals(expectedSample, sampleText);
+
+		// Now a simpler one
+		String simpleText =
+			new PublisherTextExtractor(open("Simple.pub")).getText();
+		String expectedSimple =
+"0123456789\n" +
+"0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef\n" +
+"0123456789\n" +
+"0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef0123456789abcdef\n";
+		assertEquals(expectedSimple, simpleText);
+	}
+}

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java?rev=687443&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java (added)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java Wed Aug 20 13:13:08 2008
@@ -0,0 +1,50 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.model;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+public class TestEscherParts extends TestCase {
+	private String dir;
+
+	protected void setUp() throws Exception {
+		dir = System.getProperty("HPBF.testdata.path");
+	}
+
+	public void testBasics() throws Exception {
+		File f = new File(dir, "Sample.pub");
+		HPBFDocument doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+
+		EscherStm es = doc.getEscherStm();
+		EscherDelayStm eds = doc.getEscherDelayStm();
+		
+		assertNotNull(es);
+		assertNotNull(eds);
+		
+		assertEquals(13, es.getEscherRecords().length);
+		assertEquals(0, eds.getEscherRecords().length);
+		
+		// TODO - check the contents
+	}
+}

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
------------------------------------------------------------------------------
    svn:eol-style = native



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message