pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From le...@apache.org
Subject svn commit: r1809890 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Date Wed, 27 Sep 2017 17:37:14 GMT
Author: lehmi
Date: Wed Sep 27 17:37:14 2017
New Revision: 1809890

URL: http://svn.apache.org/viewvc?rev=1809890&view=rev
Log:
PDFBOX-3934: include compressed objects in brute force search when rebuilding the trailer

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1809890&r1=1809889&r2=1809890&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Wed Sep 27 17:37:14 2017
@@ -41,6 +41,7 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSInputStream;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNull;
 import org.apache.pdfbox.cos.COSNumber;
@@ -111,6 +112,11 @@ public class COSParser extends BaseParse
      */
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
+    /**
+     * ObjStream-marker.
+     */
+    private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
+
     private long trailerOffset;
     
     /**
@@ -630,14 +636,27 @@ public class COSParser extends BaseParse
                                 // negative offset means we have a compressed
                                 // object within object stream;
                                 // get offset of object stream
-                                fileOffset = document.getXrefTable()
-                                        .get(
-                                        new COSObjectKey((int)-fileOffset, 0));
+                                COSObjectKey key = new COSObjectKey((int) -fileOffset, 0);
+                                fileOffset = document.getXrefTable().get(key);
                                 if ((fileOffset == null) || (fileOffset <= 0))
                                 {
-                                    throw new IOException(
-                                            "Invalid object stream xref object reference for key '" + objKey + "': "
-                                                    + fileOffset);
+                                    if (isLenient)
+                                    {
+                                        Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets();
+                                        fileOffset = bfCOSObjectKeyOffsets.get(key);
+                                        if (fileOffset != null)
+                                        {
+                                            LOG.debug("Set missing " + fileOffset + " for object "
+                                                    + key);
+                                            document.getXrefTable().put(key, fileOffset);
+                                        }
+                                    }
+                                    else
+                                    {
+                                        throw new IOException(
+                                                "Invalid object stream xref object reference for key '"
+                                                        + objKey + "': " + fileOffset);
+                                    }
                                 }
 
                                 List<COSObject> stmObjects = objToBeParsed.get(fileOffset);
@@ -1542,6 +1561,7 @@ public class COSParser extends BaseParse
             bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
                     lastObjOffset);
         }
+        bfSearchForObjStreams();
         // reestablish origin position
         source.seek(originOffset);
     }
@@ -1671,6 +1691,119 @@ public class COSParser extends BaseParse
     }
 
     /**
+     * Brute force search for all object streams.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForObjStreams() throws IOException
+    {
+        HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
+        long originOffset = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        char[] string = " obj".toCharArray();
+        while (!source.isEOF())
+        {
+            // search for EOF marker
+            if (isString(OBJ_STREAM))
+            {
+                long currentPosition = source.getPosition();
+                // search backwards for the beginning of the object
+                long newOffset = -1;
+                COSObjectKey streamObjectKey = null;
+                boolean objFound = false;
+                for (int i = 1; i < 40 && !objFound; i++)
+                {
+                    long currentOffset = currentPosition - (i * 10);
+                    if (currentOffset > 0)
+                    {
+                        source.seek(currentOffset);
+                        for (int j = 0; j < 10; j++)
+                        {
+                            if (isString(string))
+                            {
+                                long tempOffset = currentOffset - 1;
+                                source.seek(tempOffset);
+                                int genID = source.peek();
+                                // is the next char a digit?
+                                if (isDigit(genID))
+                                {
+                                    tempOffset--;
+                                    source.seek(tempOffset);
+                                    if (isSpace())
+                                    {
+                                        int length = 0;
+                                        source.seek(--tempOffset);
+                                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+                                        {
+                                            source.seek(--tempOffset);
+                                            length++;
+                                        }
+                                        if (length > 0)
+                                        {
+                                            source.read();
+                                            newOffset = source.getPosition();
+                                            long objNumber = readObjectNumber();
+                                            int genNumber = readGenerationNumber();
+                                            streamObjectKey = new COSObjectKey(objNumber,
+                                                    genNumber);
+                                            bfSearchObjStreamsOffsets.put(newOffset,
+                                                    streamObjectKey);
+                                        }
+                                    }
+                                }
+                                LOG.debug("Dictionary start for object stream -> " + newOffset);
+                                objFound = true;
+                                break;
+                            }
+                            else
+                            {
+                                currentOffset++;
+                                source.read();
+                            }
+                        }
+                    }
+                }
+                source.seek(currentPosition + OBJ_STREAM.length);
+            }
+            source.read();
+        }
+        // add all found compressed objects to the brute force search result
+        for (Long offset : bfSearchObjStreamsOffsets.keySet())
+        {
+            long bfOffset = bfSearchCOSObjectKeyOffsets.get(bfSearchObjStreamsOffsets.get(offset));
+            // check if the object was overwritten
+            if (offset == bfOffset)
+            {
+                source.seek(offset);
+                long stmObjNumber = readObjectNumber();
+                readGenerationNumber();
+                readExpectedString(OBJ_MARKER, true);
+                COSDictionary dict = parseCOSDictionary();
+                int offsetFirstStream = dict.getInt(COSName.FIRST);
+                int nrOfObjects = dict.getInt(COSName.N);
+                COSStream stream = parseCOSStream(dict);
+                COSInputStream is = stream.createInputStream();
+                byte[] numbersStr = new byte[offsetFirstStream];
+                is.read(numbersStr);
+                is.close();
+                stream.close();
+                String[] numbers = new String(numbersStr, "ISO-8859-1").split(" ");
+                for (int i = 0; i < nrOfObjects; i++)
+                {
+                    long objNumber = Long.parseLong(numbers[i * 2]);
+                    COSObjectKey objKey = new COSObjectKey(objNumber, 0);
+                    Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
+                    if (existingOffset == null || offset > existingOffset)
+                    {
+                        bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
+                    }
+                }
+            }
+        }
+        source.seek(originOffset);
+    }
+
+    /**
      * Brute force search for all xref entries (tables).
      * 
      * @throws IOException if something went wrong
@@ -1812,6 +1945,11 @@ public class COSParser extends BaseParse
         for (Entry<COSObjectKey, Long> entry : bfCOSObjectKeyOffsets.entrySet())
         {
             Long offset = entry.getValue();
+            // skip compressed objects
+            if (offset < 0)
+            {
+                continue;
+            }
             source.seek(offset);
             readObjectNumber();
             readGenerationNumber();



Mime
View raw message