poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cen...@apache.org
Subject svn commit: r1662652 - in /poi/trunk/src: integrationtest/org/apache/poi/ integrationtest/org/apache/poi/stress/ ooxml/java/org/apache/poi/extractor/ ooxml/java/org/apache/poi/xssf/extractor/
Date Fri, 27 Feb 2015 09:59:15 GMT
Author: centic
Date: Fri Feb 27 09:59:14 2015
New Revision: 1662652

URL: http://svn.apache.org/r1662652
Log:
* Add text-extraction verification to integration-tests via a new abstract base FileHandler
* Fix NullPointerException found in some documents when running against the test-data
* Add support for extracting text from Dir-Entries WORKBOOK and BOOK to support some old/strangely formatted XLS files.

Added:
    poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
Modified:
    poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
    poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java

Modified: poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/TestAllFiles.java Fri Feb 27 09:59:14 2015
@@ -253,20 +253,26 @@ public class TestAllFiles {
     @Test
     public void testAllFiles() throws Exception {
 		assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
-		InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100);
+		File inputFile = new File(ROOT_DIR, file);
+		
 		try {
-			handler.handleFile(stream);
-			
-			assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", 
-			        EXPECTED_FAILURES.contains(file));
-		} catch (Exception e) {
-		    // check if we expect failure for this file
-			if(!EXPECTED_FAILURES.contains(file)) {
-			    throw new Exception("While handling " + file, e);
-			}
-		} finally {
-			stream.close();
-		}
+            InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
+    		try {
+    			handler.handleFile(stream);
+    
+    			assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", 
+    			        EXPECTED_FAILURES.contains(file));
+    		} finally {
+    			stream.close();
+    		}
+
+            handler.handleExtracting(inputFile);
+        } catch (Exception e) {
+            // check if we expect failure for this file
+            if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+                throw new Exception("While handling " + file, e);
+            }
+        }
 	}
 
 	private static String getExtension(String file) {
@@ -282,5 +288,9 @@ public class TestAllFiles {
 		@Override
         public void handleFile(InputStream stream) throws Exception {
 		}
+
+		@Override
+        public void handleExtracting(File file) throws Exception {
+        }
 	}
 }

Added: poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java?rev=1662652&view=auto
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java (added)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java Fri Feb 27 09:59:14 2015
@@ -0,0 +1,55 @@
+package org.apache.poi.stress;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.File;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+
+public abstract class AbstractFileHandler implements FileHandler {
+    public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();
+    static {
+        // password protected files
+        EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");
+        
+        // unsupported file-types, no supported OLE2 parts
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");
+        EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");
+        EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");
+    }
+
+    public void handleExtracting(File file) throws Exception {
+        POITextExtractor extractor = ExtractorFactory.createExtractor(file);
+        try  {
+            assertNotNull(extractor);
+
+            assertNotNull(extractor.getText());
+            
+            // also try metadata
+            POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
+            assertNotNull(metadataExtractor.getText());
+
+            assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!", 
+                    EXPECTED_EXTRACTOR_FAILURES.contains(file));
+        } catch (IllegalArgumentException e) {
+            if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+                throw new Exception("While handling " + file, e);
+            }
+        } finally {
+            extractor.close();
+        }
+    }
+}

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/FileHandler.java Fri Feb 27 09:59:14 2015
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.InputStream;
 
 /**
@@ -34,4 +35,10 @@ public interface FileHandler {
 	 * @throws Exception
 	 */
 	void handleFile(InputStream stream) throws Exception;
+	
+	/**
+	 * Ensures that extracting text from the given file
+	 * is returning some text. 
+	 */
+	void handleExtracting(File file) throws Exception;
 }

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAP
 import org.apache.poi.hmef.attribute.MAPIStringAttribute;
 import org.junit.Test;
 
-public class HMEFFileHandler implements FileHandler {
+public class HMEFFileHandler extends AbstractFileHandler {
 
 	@Override
     public void handleFile(InputStream stream) throws Exception {

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertie
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.Test;
 
-public class HPSFFileHandler implements FileHandler {
+public class HPSFFileHandler extends AbstractFileHandler {
 	@Override
     public void handleFile(InputStream stream) throws Exception {
 		HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 
@@ -49,4 +50,10 @@ public class HSSFFileHandler extends Spr
 			stream.close();
 		}
 	}
+
+	// a test-case to test this locally without executing the full TestAllFiles
+    @Test
+    public void testExtractor() throws Exception {
+        handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
+    }
 }
\ No newline at end of file

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import java.io.InputStream;
 import org.apache.poi.POIDocument;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
-public class POIFSFileHandler implements FileHandler {
+public class POIFSFileHandler extends AbstractFileHandler {
 
 	@Override
     public void handleFile(InputStream stream) throws Exception {

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java Fri Feb 27 09:59:14 2015
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.usermodel.WorkbookFactory;
 
-public abstract class SpreadsheetHandler implements FileHandler {
+public abstract class SpreadsheetHandler extends AbstractFileHandler {
 	public void handleWorkbook(Workbook wb, String extension) throws IOException {
 		// try to access some of the content
 		readContent(wb);

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCP
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.junit.Test;
 
-public class XSLFFileHandler implements FileHandler {
+public class XSLFFileHandler extends AbstractFileHandler {
 	@Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files

Modified: poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java (original)
+++ poi/trunk/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java Fri Feb 27 09:59:14 2015
@@ -22,7 +22,7 @@ import java.io.InputStream;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.junit.Test;
 
-public class XWPFFileHandler implements FileHandler {
+public class XWPFFileHandler extends AbstractFileHandler {
 	@Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri Feb 27 09:59:14 2015
@@ -213,7 +213,9 @@ public class ExtractorFactory {
     {
         // Look for certain entries in the stream, to figure it
         // out from
-        if (poifsDir.hasEntry("Workbook")) {
+        if (poifsDir.hasEntry("Workbook") ||
+                // some XLS files have different entry-names
+                poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
             if (getPreferEventExtractor()) {
                 return new EventBasedExcelExtractor(poifsDir);
             }

Modified: poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java?rev=1662652&r1=1662651&r2=1662652&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java Fri Feb 27 09:59:14 2015
@@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends
         }
         POIXMLTextExtractor extractor =
                 new XSSFExcelExtractor(args[0]);
-        System.out.println(extractor.getText());
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            extractor.close();
+        }
     }
 
     /**
@@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends
         if (type == Cell.CELL_TYPE_NUMERIC) {
             CellStyle cs = cell.getCellStyle();
 
-            if (cs.getDataFormatString() != null) {
+            if (cs != null && cs.getDataFormatString() != null) {
                 text.append(formatter.formatRawCellContents(
                         cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
                         ));



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message