pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From til...@apache.org
Subject svn commit: r1709640 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: contentstream/operator/DrawObject.java pdmodel/PDResources.java
Date Tue, 20 Oct 2015 16:34:19 GMT
Author: tilman
Date: Tue Oct 20 16:34:19 2015
New Revision: 1709640

URL: http://svn.apache.org/viewvc?rev=1709640&view=rev
Log:
PDFBOX-3037: check for images to avoid decoding them when doing text extraction

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/DrawObject.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDResources.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/DrawObject.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/DrawObject.java?rev=1709640&r1=1709639&r2=1709640&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/DrawObject.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/DrawObject.java Tue Oct 20 16:34:19 2015
@@ -21,7 +21,6 @@ import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
-import org.apache.pdfbox.text.PDFMarkedContentExtractor;
 
 import java.io.IOException;
 import java.util.List;
@@ -47,11 +46,14 @@ public class DrawObject extends Operator
             return;
         }
         COSName name = (COSName) base0;
-        PDXObject xobject =  context.getResources().getXObject(name);
-        if (context instanceof PDFMarkedContentExtractor)
+
+        if (context.getResources().isImageXObject(name))
         {
-            ((PDFMarkedContentExtractor) context).xobject(xobject);
+            // we're done here, don't decode images when doing text extraction
+            return;
         }
+        
+        PDXObject xobject = context.getResources().getXObject(name);
 
         if (xobject instanceof PDTransparencyGroup)
         {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDResources.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDResources.java?rev=1709640&r1=1709639&r2=1709640&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDResources.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDResources.java Tue Oct 20 16:34:19 2015
@@ -23,6 +23,7 @@ import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList;
 import org.apache.pdfbox.pdmodel.font.PDFont;
@@ -309,6 +310,32 @@ public final class PDResources implement
     }
 
     /**
+     * Tells whether the XObject resource with the given name is an image.
+     *
+     * @param name Name of the XObject resource.
+     * @return true if it is an image XObject, false if not.
+     */
+    public boolean isImageXObject(COSName name)
+    {
+        // get the instance
+        COSBase value = get(COSName.XOBJECT, name);
+        if (value == null)
+        {
+            return false;
+        }
+        else if (value instanceof COSObject)
+        {
+            value = ((COSObject) value).getObject();
+        }
+        if (!(value instanceof COSStream))
+        {
+            return false;
+        }
+        COSStream stream = (COSStream) value;
+        return COSName.IMAGE.equals(stream.getCOSName(COSName.SUBTYPE));
+    }
+
+    /**
      * Returns the XObject resource with the given name, or null if none exists.
      * 
      * @param name Name of the XObject resource.



Mime
View raw message