pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From le...@apache.org
Subject svn commit: r1812933 - /pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Date Sun, 22 Oct 2017 17:04:37 GMT
Author: lehmi
Date: Sun Oct 22 17:04:37 2017
New Revision: 1812933

URL: http://svn.apache.org/viewvc?rev=1812933&view=rev
Log:
PDFBOX-3957: search for valid trailer entries when rebuilding the trailer

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1812933&r1=1812932&r2=1812933&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Oct 22 17:04:37 2017
@@ -113,6 +113,11 @@ public class COSParser extends BaseParse
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
     /**
+     * trailer-marker.
+     */
+    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
+
+    /**
      * ObjStream-marker.
      */
     private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
@@ -1618,6 +1623,75 @@ public class COSParser extends BaseParse
     }
     
     /**
+     * Brute force search for all trailer marker.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private List<COSObjectKey[]> bfSearchForTrailer() throws IOException
+    {
+        List<COSObjectKey[]> trailerDicts = new ArrayList<COSObjectKey[]>();
+        long originOffset = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        while (!source.isEOF())
+        {
+            // search for trailer marker
+            if (isString(TRAILER_MARKER))
+            {
+                source.seek(source.getPosition() + TRAILER_MARKER.length);
+                try
+                {
+                    skipSpaces();
+                    COSDictionary trailerDict = parseCOSDictionary();
+                    COSObjectKey[] trailerKeys = new COSObjectKey[2];
+                    if (trailerDict.containsKey(COSName.ROOT))
+                    {
+                        COSBase rootObj = trailerDict.getItem(COSName.ROOT);
+                        if (rootObj instanceof COSObject)
+                        {
+                            long objNumber = ((COSObject) rootObj).getObjectNumber();
+                            int genNumber = ((COSObject) rootObj).getGenerationNumber();
+                            trailerKeys[0] = new COSObjectKey(objNumber, genNumber);
+                        }
+                    }
+                    if (trailerDict.containsKey(COSName.INFO))
+                    {
+                        COSBase infoObj = trailerDict.getItem(COSName.INFO);
+                        long objNumber = ((COSObject) infoObj).getObjectNumber();
+                        int genNumber = ((COSObject) infoObj).getGenerationNumber();
+                        trailerKeys[1] = new COSObjectKey(objNumber, genNumber);
+                    }
+                    if (trailerKeys[0] != null || trailerKeys[1] != null)
+                    {
+                        trailerDicts.add(trailerKeys);
+                    }
+                }
+                catch (IOException exception)
+                {
+                    continue;
+                }
+            }
+            source.read();
+        }
+        source.seek(originOffset);
+        // eliminate double entries
+        int trailerdictsSize = trailerDicts.size();
+        if (trailerdictsSize > 1)
+        {
+            COSObjectKey[] first = trailerDicts.get(0);
+            for (int i = trailerdictsSize - 1; i > 0; i--)
+            {
+                COSObjectKey[] other = trailerDicts.get(i);
+                if (first[0].equals(other[0]) && first[1].equals(other[1]))
+                {
+                    trailerDicts.remove(other);
+                }
+            }
+
+        }
+        return trailerDicts;
+    }
+
+    /**
      * Brute force search for the last EOF marker.
      * 
      * @throws IOException if something went wrong
@@ -1966,78 +2040,100 @@ public class COSParser extends BaseParse
             xrefTrailerResolver.setStartxref(0);
             trailer = xrefTrailerResolver.getTrailer();
             getDocument().setTrailer(trailer);
-            // search for the different parts of the trailer dictionary
-            for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet())
+            List<COSObjectKey[]> trailerObjects = bfSearchForTrailer();
+            if (trailerObjects.size() == 1)
             {
-                Long offset = entry.getValue();
-                COSDictionary dictionary = null;
-                // handle compressed objects
-                if (offset < 0)
+                COSObjectKey[] trailerObj = trailerObjects.get(0);
+                COSObjectKey rootKey = trailerObj[0];
+                Long rootOffset = rootKey != null ? bfSearchCOSObjectKeyOffsets.get(rootKey) : null;
+                COSObjectKey infoKey = trailerObj[1];
+                Long infoOffset = infoKey != null ? bfSearchCOSObjectKeyOffsets.get(infoKey) : null;
+                if (rootKey != null && rootOffset != null)
                 {
-                    COSObject compressedObject = document.getObjectFromPool(entry.getKey());
-                    if (compressedObject.getObject() == null)
+                    COSDictionary rootDict = retrieveCOSDictionary(rootKey, rootOffset);
+                    if (rootDict != null && isCatalog(rootDict))
                     {
-                        parseObjectStream((int) -offset);
+                        trailer.setItem(COSName.ROOT, document.getObjectFromPool(rootKey));
                     }
-                    COSBase baseObject = compressedObject.getObject();
-                    if (baseObject instanceof COSDictionary)
-                    {
-                        dictionary = (COSDictionary) baseObject;
-                    }
-                    else
+                }
+                if (infoKey != null && infoOffset != null)
+                {
+                    COSDictionary infoDict = retrieveCOSDictionary(infoKey, infoOffset);
+                    if (infoDict != null && isInfo(infoDict))
                     {
-                        continue;
+                        trailer.setItem(COSName.INFO, document.getObjectFromPool(infoKey));
                     }
                 }
-                else
+            }
+            else
+            {
+                // search for the different parts of the trailer dictionary
+                for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet())
                 {
-                    source.seek(offset);
-                    readObjectNumber();
-                    readGenerationNumber();
-                    readExpectedString(OBJ_MARKER, true);
-                    if (source.peek() != '<')
+                    COSDictionary dictionary = retrieveCOSDictionary(entry.getKey(),
+                            entry.getValue());
+                    if (dictionary == null)
                     {
                         continue;
                     }
-                    try
+                    // document catalog
+                    if (isCatalog(dictionary))
                     {
-                        dictionary = parseCOSDictionary();
+                        trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey()));
                     }
-                    catch (IOException exception)
+                    // info dictionary
+                    else if (isInfo(dictionary))
                     {
-                        LOG.debug("Skipped object " + entry.getKey()
-                                + ", either it's corrupt or not a dictionary");
-                        continue;
+                        trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey()));
                     }
+                    // encryption dictionary, if existing, is lost
+                    // We can't run "Algorithm 2" from PDF specification because of missing ID
                 }
-                // document catalog
-                if (isCatalog(dictionary))
-                {
-                    trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey()));
-                }
-                // info dictionary
-                else if (!dictionary.containsKey(COSName.PARENT)
-                      && !dictionary.containsKey(COSName.A)
-                      && !dictionary.containsKey(COSName.DEST)
-                        && (dictionary.containsKey(COSName.MOD_DATE)
-                                || dictionary.containsKey(COSName.TITLE)
-                                || dictionary.containsKey(COSName.AUTHOR)
-                                || dictionary.containsKey(COSName.SUBJECT)
-                                || dictionary.containsKey(COSName.KEYWORDS)
-                                || dictionary.containsKey(COSName.CREATOR)
-                                || dictionary.containsKey(COSName.PRODUCER)
-                                || dictionary.containsKey(COSName.CREATION_DATE)))
-                {
-                    trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey()));
-                }
-                // encryption dictionary, if existing, is lost
-                // We can't run "Algorithm 2" from PDF specification because of missing ID
             }
         }
         trailerWasRebuild = true;
         return trailer;
     }
 
+    private COSDictionary retrieveCOSDictionary(COSObjectKey key, Long offset) throws IOException
+    {
+        COSDictionary dictionary = null;
+        // handle compressed objects
+        if (offset < 0)
+        {
+            COSObject compressedObject = document.getObjectFromPool(key);
+            if (compressedObject.getObject() == null)
+            {
+                parseObjectStream((int) -offset);
+            }
+            COSBase baseObject = compressedObject.getObject();
+            if (baseObject instanceof COSDictionary)
+            {
+                dictionary = (COSDictionary) baseObject;
+            }
+        }
+        else
+        {
+            source.seek(offset);
+            readObjectNumber();
+            readGenerationNumber();
+            readExpectedString(OBJ_MARKER, true);
+            if (source.peek() != '<')
+            {
+                return null;
+            }
+            try
+            {
+                dictionary = parseCOSDictionary();
+            }
+            catch (IOException exception)
+            {
+                LOG.debug("Skipped object " + key
+                        + ", either it's corrupt or not a dictionary");
+            }
+        }
+        return dictionary;
+    }
     /**
      * Check if all entries of the pages dictionary are present. Those which can't be dereferenced are removed.
      * 
@@ -2101,7 +2197,7 @@ public class COSParser extends BaseParse
      * Tell if the dictionary is a PDF catalog. Override this for an FDF catalog.
      * 
      * @param dictionary
-     * @return 
+     * @return true if the given dictionary is a root dictionary
      */
     protected boolean isCatalog(COSDictionary dictionary)
     {
@@ -2109,8 +2205,32 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * This will parse the startxref section from the stream.
-     * The startxref value is ignored.
+     * Tell if the dictionary is an info dictionary.
+     * 
+     * @param dictionary
+     * @return true if the given dictionary is an info dictionary
+     */
+    private boolean isInfo(COSDictionary dictionary)
+    {
+        if (dictionary.containsKey(COSName.PARENT) || dictionary.containsKey(COSName.A) || dictionary.containsKey(COSName.DEST))
+        {
+            return false;
+        }
+        if (!dictionary.containsKey(COSName.MOD_DATE) && !dictionary.containsKey(COSName.TITLE)
+                && !dictionary.containsKey(COSName.AUTHOR)
+                && !dictionary.containsKey(COSName.SUBJECT)
+                && !dictionary.containsKey(COSName.KEYWORDS)
+                && !dictionary.containsKey(COSName.CREATOR)
+                && !dictionary.containsKey(COSName.PRODUCER)
+                && !dictionary.containsKey(COSName.CREATION_DATE))
+        {
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * This will parse the startxref section from the stream. The startxref value is ignored.
      *
      * @return the startxref value or -1 on parsing error
      * @throws IOException If an IO error occurs.



Mime
View raw message