poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From n...@apache.org
Subject svn commit: r903182 - in /poi/trunk/src: documentation/content/xdocs/ ooxml/java/org/apache/poi/extractor/ ooxml/java/org/apache/poi/xssf/extractor/ ooxml/testcases/org/apache/poi/extractor/ ooxml/testcases/org/apache/poi/xssf/ ooxml/testcases/org/apac...
Date Tue, 26 Jan 2010 11:39:44 GMT
Author: nick
Date: Tue Jan 26 11:39:44 2010
New Revision: 903182

URL: http://svn.apache.org/viewvc?rev=903182&view=rev
Log:
New event based xssf text extractor (XSSFEventBasedExcelExtractor)

Added:
    poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=903182&r1=903181&r2=903182&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Tue Jan 26 11:39:44 2010
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">New event based xssf text extractor (XSSFEventBasedExcelExtractor)</action>
            <action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (current Excel only) on a per-thread or overall basis</action>
            <action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
            <action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=903182&r1=903181&r2=903182&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Tue Jan 26 11:39:44 2010
@@ -50,6 +50,7 @@
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -161,8 +162,7 @@
             corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
             corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
            if(getPreferEventExtractor()) {
-              // TODO
-              return new XSSFExcelExtractor(pkg);
+              return new XSSFEventBasedExcelExtractor(pkg);
            } else {
               return new XSSFExcelExtractor(pkg);
            }

Added: poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java?rev=903182&view=auto
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java (added)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java Tue Jan 26 11:39:44 2010
@@ -0,0 +1,357 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Implementation of a text extractor from OOXML Excel
+ *  files that uses SAX event based parsing.
+ */
+public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
+   private OPCPackage container;
+	private boolean includeSheetNames = true;
+	private boolean formulasNotResults = false;
+
+   /**
+    * These are the different kinds of cells we support.
+    * We keep track of the current one between
+    *  the start and end.
+    */
+   enum xssfDataType {
+       BOOLEAN,
+       ERROR,
+       FORMULA,
+       INLINE_STRING,
+       SST_STRING,
+       NUMBER,
+   }
+   
+	public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+		this(OPCPackage.open(path));
+	}
+	public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+		super(null);
+		this.container = container;
+	}
+
+	public static void main(String[] args) throws Exception {
+		if(args.length < 1) {
+			System.err.println("Use:");
+			System.err.println("  XSSFEventBasedExcelExtractor <filename.xlsx>");
+			System.exit(1);
+		}
+		POIXMLTextExtractor extractor =
+			new XSSFEventBasedExcelExtractor(args[0]);
+		System.out.println(extractor.getText());
+	}
+
+	/**
+	 * Should sheet names be included? Default is true
+	 */
+	public void setIncludeSheetNames(boolean includeSheetNames) {
+		this.includeSheetNames = includeSheetNames;
+	}
+	/**
+	 * Should we return the formula itself, and not
+	 *  the result it produces? Default is false
+	 */
+	public void setFormulasNotResults(boolean formulasNotResults) {
+		this.formulasNotResults = formulasNotResults;
+	}
+	
+	
+   /**
+    * Handler for sheets. Processes each row and cell,
+    *  formatting Cells as best as it can.
+    */
+   class MyXSSFSheetHandler extends DefaultHandler {
+       /**
+        * Table with the styles used for formatting
+        */
+       private StylesTable stylesTable;
+
+       private ReadOnlySharedStringsTable sharedStringsTable;
+
+       /**
+        * Where our text is going
+        */
+       private final StringBuffer output;
+
+       // Set when V start element is seen
+       private boolean vIsOpen;
+       // Set when F start element is seen
+       private boolean fIsOpen;
+
+       // Set when cell start element is seen;
+       // used when cell close element is seen.
+       private xssfDataType nextDataType;
+
+       // Used to format numeric cell values.
+       private short formatIndex;
+       private String formatString;
+       private final DataFormatter formatter;
+
+       // Gathers characters as they are seen.
+       private StringBuffer value = new StringBuffer();
+       private StringBuffer formula = new StringBuffer();
+       private boolean firstCellOfRow = true;
+
+       /**
+        * Accepts objects needed while parsing.
+        *
+        * @param styles  Table of styles
+        * @param strings Table of shared strings
+        * @param cols    Minimum number of columns to show
+        * @param output  Sink for output
+        */
+       public MyXSSFSheetHandler(
+               StylesTable styles,
+               ReadOnlySharedStringsTable strings,
+               StringBuffer output) {
+           this.stylesTable = styles;
+           this.sharedStringsTable = strings;
+           this.output = output;
+           this.nextDataType = xssfDataType.NUMBER;
+           this.formatter = new DataFormatter();
+       }
+
+       public void startElement(String uri, String localName, String name,
+                                Attributes attributes) throws SAXException {
+
+           if ("inlineStr".equals(name) || "v".equals(name)) {
+               vIsOpen = true;
+               // Clear contents cache
+               value.setLength(0);
+           } else if ("f".equals(name)) {
+              // Clear contents cache
+              formula.setLength(0);
+              
+              // Mark us as being a formula if not already
+              if(nextDataType == xssfDataType.NUMBER) {
+                 nextDataType = xssfDataType.FORMULA;
+              }
+              
+              // Decide where to get the formula string from
+              String type = attributes.getValue("t"); 
+              if(type != null && type.equals("shared")) {
+                 System.err.println("Warning - shared formulas not yet supported!");
+              } else {
+                 fIsOpen = true;
+              }
+           }
+           else if("row".equals(name)) {
+               firstCellOfRow = true;
+           }
+           // c => cell
+           else if ("c".equals(name)) {
+               // Set up defaults.
+               this.nextDataType = xssfDataType.NUMBER;
+               this.formatIndex = -1;
+               this.formatString = null;
+               String cellType = attributes.getValue("t");
+               String cellStyleStr = attributes.getValue("s");
+               if ("b".equals(cellType))
+                   nextDataType = xssfDataType.BOOLEAN;
+               else if ("e".equals(cellType))
+                   nextDataType = xssfDataType.ERROR;
+               else if ("inlineStr".equals(cellType))
+                   nextDataType = xssfDataType.INLINE_STRING;
+               else if ("s".equals(cellType))
+                   nextDataType = xssfDataType.SST_STRING;
+               else if ("str".equals(cellType))
+                   nextDataType = xssfDataType.FORMULA;
+               else if (cellStyleStr != null) {
+                  // Number, but almost certainly with a special style or format
+                   int styleIndex = Integer.parseInt(cellStyleStr);
+                   XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
+                   this.formatIndex = style.getDataFormat();
+                   this.formatString = style.getDataFormatString();
+                   if (this.formatString == null)
+                       this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
+               }
+           }
+       }
+
+       public void endElement(String uri, String localName, String name)
+               throws SAXException {
+           String thisStr = null;
+
+           // v => contents of a cell
+           if ("v".equals(name)) {
+               vIsOpen = false;
+               
+               // Process the value contents as required, now we have it all
+               switch (nextDataType) {
+                   case BOOLEAN:
+                       char first = value.charAt(0);
+                       thisStr = first == '0' ? "FALSE" : "TRUE";
+                       break;
+
+                   case ERROR:
+                       thisStr = "ERROR:" + value.toString();
+                       break;
+
+                   case FORMULA:
+                       if(formulasNotResults) {
+                          thisStr = formula.toString();
+                       } else {
+                          thisStr = value.toString();
+                       }
+                       break;
+
+                   case INLINE_STRING:
+                       // TODO: have not seen an example of this, so it's untested.
+                       XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
+                       thisStr = rtsi.toString();
+                       break;
+
+                   case SST_STRING:
+                       String sstIndex = value.toString();
+                       try {
+                           int idx = Integer.parseInt(sstIndex);
+                           XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
+                           thisStr = rtss.toString();
+                       }
+                       catch (NumberFormatException ex) {
+                           System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
+                       }
+                       break;
+
+                   case NUMBER:
+                       String n = value.toString();
+                       if (this.formatString != null)
+                           thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
+                       else
+                           thisStr = n;
+                       break;
+
+                   default:
+                       thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
+                       break;
+               }
+               
+               // Output
+               if(!firstCellOfRow) {
+                  output.append('\t');
+               }
+               firstCellOfRow = false;
+               
+               output.append(thisStr);
+           } else if ("f".equals(name)) {
+              fIsOpen = false;
+           } else if ("row".equals(name)) {
+              // Finish the line
+              output.append('\n');
+           }
+       }
+
+       /**
+        * Captures characters only if a suitable element is open.
+        * Originally was just "v"; extended for inlineStr also.
+        */
+       public void characters(char[] ch, int start, int length)
+               throws SAXException {
+           if (vIsOpen) {
+               value.append(ch, start, length);
+           }
+           if (fIsOpen) {
+              formula.append(ch, start, length);
+           }
+       }
+   }
+
+   /**
+    * Processes the given sheet
+    */
+   public void processSheet(
+           StringBuffer output,
+           StylesTable styles,
+           ReadOnlySharedStringsTable strings,
+           InputStream sheetInputStream)
+           throws IOException, SAXException {
+
+       InputSource sheetSource = new InputSource(sheetInputStream);
+       SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+       try {
+          SAXParser saxParser = saxFactory.newSAXParser();
+          XMLReader sheetParser = saxParser.getXMLReader();
+          ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output);
+          sheetParser.setContentHandler(handler);
+          sheetParser.parse(sheetSource);
+       } catch(ParserConfigurationException e) {
+          throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
+       }
+   }
+
+   /**
+    * Processes the file and returns the text
+    */
+   public String getText() {
+       try {
+          ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
+          XSSFReader xssfReader = new XSSFReader(container);
+          StylesTable styles = xssfReader.getStylesTable();
+          XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+   
+          StringBuffer text = new StringBuffer();
+          while (iter.hasNext()) {
+              InputStream stream = iter.next();
+              if(includeSheetNames) {
+                 text.append(iter.getSheetName());
+                 text.append('\n');
+              }
+              processSheet(text, styles, strings, stream);
+              stream.close();
+          }
+          
+          return text.toString();
+       } catch(IOException e) {
+          System.err.println(e);
+          return null;
+       } catch(SAXException se) {
+          System.err.println(se);
+          return null;
+       } catch(OpenXML4JException o4je) {
+          System.err.println(o4je);
+          return null;
+       }
+   }
+}

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java?rev=903182&r1=903181&r2=903182&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java Tue Jan 26 11:39:44 2010
@@ -56,7 +56,7 @@
 	public static void main(String[] args) throws Exception {
 		if(args.length < 1) {
 			System.err.println("Use:");
-			System.err.println("  HXFExcelExtractor <filename.xlsx>");
+			System.err.println("  XSSFExcelExtractor <filename.xlsx>");
 			System.exit(1);
 		}
 		POIXMLTextExtractor extractor =

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=903182&r1=903181&r2=903182&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Tue Jan 26 11:39:44 2010
@@ -32,6 +32,7 @@
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 
@@ -427,7 +428,7 @@
       
       assertTrue(
             ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
-            instanceof XSSFExcelExtractor // TODO
+            instanceof XSSFEventBasedExcelExtractor
       );
       assertTrue(
             ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java?rev=903182&r1=903181&r2=903182&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java Tue Jan 26 11:39:44 2010
@@ -19,18 +19,14 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.poi.util.TempFile;
 
 /**
  * Centralises logic for finding/opening sample files in the src/testcases/org/apache/poi/hssf/hssf/data folder. 
@@ -39,6 +35,15 @@
  */
 public class XSSFTestDataSamples {
 
+   public static OPCPackage openSamplePackage(String sampleName) {
+      try {
+         return OPCPackage.open(
+               HSSFTestDataSamples.openSampleFileStream(sampleName)
+         );
+      } catch(Exception e) {
+         throw new RuntimeException(e);
+      }
+   }
 	public static XSSFWorkbook openSampleWorkbook(String sampleName) {
 		InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleName);
 		try {

Added: poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java?rev=903182&view=auto
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java (added)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java Tue Jan 26 11:39:44 2010
@@ -0,0 +1,141 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.extractor;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.HSSFTestDataSamples;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.xssf.XSSFTestDataSamples;
+
+/**
+ * Tests for {@link XSSFEventBasedExcelExtractor}
+ */
+public final class TestXSSFEventBasedExcelExtractor extends TestCase {
+
+
+	private static final XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
+		return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
+		      openSamplePackage(sampleName));
+	}
+
+	/**
+	 * Get text out of the simple file
+	 */
+	public void testGetSimpleText() throws Exception {
+		// a very simple file
+	   XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
+		extractor.getText();
+		
+		String text = extractor.getText();
+		assertTrue(text.length() > 0);
+		
+		// Check sheet names
+		assertTrue(text.startsWith("Sheet1"));
+		assertTrue(text.endsWith("Sheet3\n"));
+		
+		// Now without, will have text
+		extractor.setIncludeSheetNames(false);
+		text = extractor.getText();
+		String CHUNK1 =
+			"Lorem\t111\n" + 
+    		"ipsum\t222\n" + 
+    		"dolor\t333\n" + 
+    		"sit\t444\n" + 
+    		"amet\t555\n" + 
+    		"consectetuer\t666\n" + 
+    		"adipiscing\t777\n" + 
+    		"elit\t888\n" + 
+    		"Nunc\t999\n";
+		String CHUNK2 =
+			"The quick brown fox jumps over the lazy dog\n" +
+			"hello, xssf	hello, xssf\n" +
+			"hello, xssf	hello, xssf\n" +
+			"hello, xssf	hello, xssf\n" +
+			"hello, xssf	hello, xssf\n";
+		assertEquals(
+				CHUNK1 + 
+				"at\t4995\n" + 
+				CHUNK2
+				, text);
+		
+		// Now get formulas not their values
+		extractor.setFormulasNotResults(true);
+		text = extractor.getText();
+		assertEquals(
+				CHUNK1 +
+				"at\tSUM(B1:B9)\n" + 
+				CHUNK2, text);
+		
+		// With sheet names too
+		extractor.setIncludeSheetNames(true);
+		text = extractor.getText();
+		assertEquals(
+				"Sheet1\n" +
+				CHUNK1 +
+				"at\tSUM(B1:B9)\n" + 
+				"rich test\n" +
+				CHUNK2 +
+				"Sheet3\n"
+				, text);
+	}
+	
+	public void testGetComplexText() throws Exception {
+		// A fairly complex file
+	   XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
+		extractor.getText();
+		
+		String text = extractor.getText();
+		assertTrue(text.length() > 0);
+		
+		// Might not have all formatting it should do!
+		assertTrue(text.startsWith(
+						"Avgtxfull\n" +
+						"(iii) AVERAGE TAX RATES ON ANNUAL"	
+		));
+	}
+	
+	/**
+	 * Test that we return pretty much the same as
+	 *  ExcelExtractor does, when we're both passed
+	 *  the same file, just saved as xls and xlsx
+	 */
+	public void testComparedToOLE2() throws Exception {
+		// A fairly simple file - ooxml
+	   XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
+
+		ExcelExtractor ole2Extractor =
+			new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
+		
+		POITextExtractor[] extractors =
+			new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
+		for (int i = 0; i < extractors.length; i++) {
+			POITextExtractor extractor = extractors[i];
+			
+			String text = extractor.getText().replaceAll("[\r\t]", "");
+			assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
+			Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
+			Matcher m = pattern.matcher(text);
+			assertTrue(m.matches());			
+		}
+	}
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message