poi-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cen...@apache.org
Subject svn commit: r1721064 - in /poi/trunk/src: java/org/apache/poi/POITextExtractor.java ooxml/java/org/apache/poi/extractor/ExtractorFactory.java ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
Date Sun, 20 Dec 2015 20:39:01 GMT
Author: centic
Date: Sun Dec 20 20:39:01 2015
New Revision: 1721064

URL: http://svn.apache.org/viewvc?rev=1721064&view=rev
Log:
Close file handles that the ExtractorFactory previously left open, mostly in cases where
opening a file failed, but also when an NPOIFSFileSystem was used for initialization.

Modified:
    poi/trunk/src/java/org/apache/poi/POITextExtractor.java
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

Modified: poi/trunk/src/java/org/apache/poi/POITextExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/POITextExtractor.java?rev=1721064&r1=1721063&r2=1721064&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/POITextExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/POITextExtractor.java Sun Dec 20 20:39:01 2015
@@ -31,6 +31,8 @@ import java.io.IOException;
  * @see org.apache.poi.hwpf.extractor.WordExtractor
  */
 public abstract class POITextExtractor implements Closeable {
+    private Closeable fsToClose = null;
+    
 	/**
 	 * Retrieves all the text from the document.
 	 * How cells, paragraphs etc are separated in the text
@@ -46,6 +48,13 @@ public abstract class POITextExtractor i
 	 *  metadata / properties, such as author and title.
 	 */
 	public abstract POITextExtractor getMetadataTextExtractor();
+
+	/**
+	 * Used to ensure file handle cleanup.
+	 */
+	public void setFilesystem(Closeable fs) {
+	    fsToClose = fs;
+	}
 	
 	/**
 	 * Allows to free resources of the Extractor as soon as
@@ -55,6 +64,8 @@ public abstract class POITextExtractor i
 	 * The Extractor cannot be used after close has been called.
 	 */
 	public void close() throws IOException {
-		// nothing to do in abstract class, derived classes may perform actions.
+		if(fsToClose != null) {
+		    fsToClose.close();
+		}
 	}
 }

Modified: poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=1721064&r1=1721063&r2=1721064&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Sun Dec 20 20:39:01 2015
@@ -128,20 +128,25 @@ public class ExtractorFactory {
       return threadPreferEventExtractors.get();
    }
 
-
 	public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException,
OpenXML4JException, XmlException {
-		InputStream inp = null;
+	    NPOIFSFileSystem fs = null;
         try {
-            try {
-                NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
-                return createExtractor(fs);
-            } catch (OfficeXmlFileException e) {
-                return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
-            } catch (NotOLE2FileException ne) {
-                throw new IllegalArgumentException("Your File was neither an OLE2 file, nor
an OOXML file");
+            fs = new NPOIFSFileSystem(f);
+            POIOLE2TextExtractor extractor = createExtractor(fs);
+            extractor.setFilesystem(fs);
+            return extractor;
+        } catch (OfficeXmlFileException e) {
+            // ensure file-handle release
+            if(fs != null) {
+                fs.close();
+            }
+            return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+        } catch (NotOLE2FileException ne) {
+            // ensure file-handle release
+            if(fs != null) {
+                fs.close();
             }
-        } finally {
-            if(inp != null) inp.close();
+            throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an
OOXML file");
         }
     }
 
@@ -161,65 +166,95 @@ public class ExtractorFactory {
 		throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an
OOXML stream");
 	}
 
+	/**
+	 * Tries to determine the actual type of file and produces a matching text-extractor for
it.
+	 *
+	 * @param pkg An {@link OPCPackage}.
+	 * @return A {@link POIXMLTextExtractor} for the given file.
+	 * @throws IOException If an error occurs while reading the file 
+	 * @throws OpenXML4JException If an error parsing the OpenXML file format is found. 
+	 * @throws XmlException If an XML parsing error occurs.
+	 * @throws IllegalArgumentException If no matching file type could be found.
+	 */
 	public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException,
XmlException {
-	   // Check for the normal Office core document
-       PackageRelationshipCollection core =
-            pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-       
-       // If nothing was found, try some of the other OOXML-based core types
-       if (core.size() == 0) {
-           // Could it be an OOXML-Strict one?
-           core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
-       }
-       if (core.size() == 0) {
-           // Could it be a visio one?
-           core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
-           if (core.size() == 1)
-               return new XDGFVisioExtractor(pkg);
-       }
-       
-       // Should just be a single core document, complain if not
-       if (core.size() != 1) {
-           throw new IllegalArgumentException("Invalid OOXML Package received - expected
1 core document, found " + core.size());
-       }
-
-       // Grab the core document part, and try to identify from that
-       PackagePart corePart = pkg.getPart(core.getRelationship(0));
-
-       // Is it XSSF?
-       for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             if(getPreferEventExtractor()) {
-                return new XSSFEventBasedExcelExtractor(pkg);
-             }
-
-             return new XSSFExcelExtractor(pkg);
-          }
-       }
-
-       // Is it XWPF?
-       for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             return new XWPFWordExtractor(pkg);
-          }
-       }
-
-       // Is it XSLF?
-       for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             return new XSLFPowerPointExtractor(pkg);
-          }
-       }
-
-       // special handling for SlideShow-Theme-files, 
-       if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType()))
{
-           return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
-       }
-
-       // ensure that we close the package again if there is an error opening it, however
-       // we need to revert the package to not re-write the file via close(), which is very
likely not wanted for a TextExtractor!
-       pkg.revert();
-       throw new IllegalArgumentException("No supported documents found in the OOXML package
(found "+corePart.getContentType()+")");
+        try {
+    	   // Check for the normal Office core document
+           PackageRelationshipCollection core =
+                pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
+           
+           // If nothing was found, try some of the other OOXML-based core types
+           if (core.size() == 0) {
+               // Could it be an OOXML-Strict one?
+               core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
+           }
+           if (core.size() == 0) {
+               // Could it be a visio one?
+               core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+               if (core.size() == 1)
+                   return new XDGFVisioExtractor(pkg);
+           }
+           
+           // Should just be a single core document, complain if not
+           if (core.size() != 1) {
+               throw new IllegalArgumentException("Invalid OOXML Package received - expected
1 core document, found " + core.size());
+           }
+    
+           // Grab the core document part, and try to identify from that
+           PackagePart corePart = pkg.getPart(core.getRelationship(0));
+    
+           // Is it XSSF?
+           for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 if(getPreferEventExtractor()) {
+                    return new XSSFEventBasedExcelExtractor(pkg);
+                 }
+    
+                 return new XSSFExcelExtractor(pkg);
+              }
+           }
+    
+           // Is it XWPF?
+           for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 return new XWPFWordExtractor(pkg);
+              }
+           }
+    
+           // Is it XSLF?
+           for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 return new XSLFPowerPointExtractor(pkg);
+              }
+           }
+    
+           // special handling for SlideShow-Theme-files, 
+           if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType()))
{
+               return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+           }
+           
+           throw new IllegalArgumentException("No supported documents found in the OOXML
package (found "+corePart.getContentType()+")");
+	    } catch (IOException e) {
+	        // ensure that we close the package again if there is an error opening it, however
+	        // we need to revert the package to not re-write the file via close(), which is
very likely not wanted for a TextExtractor!
+	        pkg.revert();
+	        throw e;
+        } catch (OpenXML4JException e) {
+            // ensure that we close the package again if there is an error opening it, however
+            // we need to revert the package to not re-write the file via close(), which
is very likely not wanted for a TextExtractor!
+            pkg.revert();
+            throw e;
+        } catch (XmlException e) {
+            // ensure that we close the package again if there is an error opening it, however
+            // we need to revert the package to not re-write the file via close(), which
is very likely not wanted for a TextExtractor!
+            pkg.revert();
+            throw e;
+	    } catch (RuntimeException e) {
+           // ensure that we close the package again if there is an error opening it, however
+           // we need to revert the package to not re-write the file via close(), which is
very likely not wanted for a TextExtractor!
+           pkg.revert();
+           
+           throw e;
+	    }
 	}
 
 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException,
InvalidFormatException, OpenXML4JException, XmlException {

Modified: poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=1721064&r1=1721063&r2=1721064&view=diff
==============================================================================
--- poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java (original)
+++ poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java Sun Dec 20 20:39:01 2015
@@ -193,29 +193,35 @@ public class TestExtractorFactory {
 
 
         // Word
+        extractor = ExtractorFactory.createExtractor(doc);
         assertTrue(
-                ExtractorFactory.createExtractor(doc)
+                extractor
                 instanceof WordExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
+        extractor = ExtractorFactory.createExtractor(doc6);
         assertTrue(
-                ExtractorFactory.createExtractor(doc6)
+                extractor
                 instanceof Word6Extractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc6).getText().length() > 20
+                extractor.getText().length() > 20
         );
+        extractor.close();
 
+        extractor = ExtractorFactory.createExtractor(doc95);
         assertTrue(
-                ExtractorFactory.createExtractor(doc95)
+                extractor
                 instanceof Word6Extractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc95).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
         extractor = ExtractorFactory.createExtractor(docx);
         assertTrue(
@@ -241,62 +247,71 @@ public class TestExtractorFactory {
         );
         extractor.close();
 
-        // PowerPoint
+        // PowerPoint (PPT)
+        extractor = ExtractorFactory.createExtractor(ppt);
         assertTrue(
-                ExtractorFactory.createExtractor(ppt)
+                extractor
                 instanceof PowerPointExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(ppt).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
+        // PowerPoint (PPTX)
         extractor = ExtractorFactory.createExtractor(pptx);
         assertTrue(
                 extractor
                 instanceof XSLFPowerPointExtractor
         );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(pptx);
         assertTrue(
                 extractor.getText().length() > 120
         );
         extractor.close();
 
         // Visio - binary
+        extractor = ExtractorFactory.createExtractor(vsd);
         assertTrue(
-                ExtractorFactory.createExtractor(vsd)
+                extractor
                 instanceof VisioTextExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(vsd).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
+
         // Visio - vsdx
+        extractor = ExtractorFactory.createExtractor(vsdx);
         assertTrue(
-                ExtractorFactory.createExtractor(vsdx)
+                extractor
                 instanceof XDGFVisioExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(vsdx).getText().length() > 20
+                extractor.getText().length() > 20
         );
+        extractor.close();
 
         // Publisher
+        extractor = ExtractorFactory.createExtractor(pub);
         assertTrue(
-                ExtractorFactory.createExtractor(pub)
+                extractor
                 instanceof PublisherTextExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(pub).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
 
         // Outlook msg
+        extractor = ExtractorFactory.createExtractor(msg);
         assertTrue(
-                ExtractorFactory.createExtractor(msg)
+                extractor
                 instanceof OutlookTextExtactor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(msg).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
 
         // Text
         try {
@@ -557,13 +572,15 @@ public class TestExtractorFactory {
         extractor.close();
         
         // Visio
+        extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
         assertTrue(
-                ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
+                extractor
                 instanceof XDGFVisioExtractor
         );
         assertTrue(
                 extractor.getText().length() > 20
         );
+        extractor.close();
 
         // Text
         try {
@@ -670,6 +687,7 @@ public class TestExtractorFactory {
                 ExtractorFactory.createExtractor(xls);
         embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
         assertEquals(0, embeds.length);
+        ext.close();
 
         // Excel
         ext = (POIOLE2TextExtractor)
@@ -690,6 +708,7 @@ public class TestExtractorFactory {
         assertEquals(2, numXls);
         assertEquals(2, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Word
         ext = (POIOLE2TextExtractor)
@@ -709,6 +728,7 @@ public class TestExtractorFactory {
         assertEquals(2, numXls);
         assertEquals(1, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Word which contains an OOXML file
         ext = (POIOLE2TextExtractor)
@@ -730,6 +750,7 @@ public class TestExtractorFactory {
         assertEquals(0, numWord);
         assertEquals(1, numWordX);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Outlook
         ext = (OutlookTextExtactor)
@@ -749,6 +770,7 @@ public class TestExtractorFactory {
         assertEquals(0, numXls);
         assertEquals(1, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Outlook with another outlook file in it
         ext = (OutlookTextExtactor)
@@ -768,7 +790,7 @@ public class TestExtractorFactory {
         assertEquals(0, numXls);
         assertEquals(0, numWord);
         assertEquals(1, numMsg);
-
+        ext.close();
 
         // TODO - PowerPoint
         // TODO - Publisher



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org


Mime
View raw message