pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From til...@apache.org
Subject svn commit: r1817522 - /pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
Date Fri, 08 Dec 2017 16:07:49 GMT
Author: tilman
Date: Fri Dec  8 16:07:49 2017
New Revision: 1817522

URL: http://svn.apache.org/viewvc?rev=1817522&view=rev
Log:
PDFBOX-3999: improved search for orphan pages in /ParentTree number tree, search also in /K tree

Modified:
    pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java

Modified: pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java?rev=1817522&r1=1817521&r2=1817522&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java Fri Dec  8 16:07:49 2017
@@ -31,8 +31,8 @@ import org.apache.pdfbox.pdmodel.PDDocum
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
-import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
 import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitDestination;
 import org.apache.pdfbox.rendering.PDFRenderer;
@@ -162,11 +162,12 @@ public class PDFMergerUtilityTest extend
     }
 
     /**
-     * PDFBOX-3999: check that entries in the number tree only reference pages from the page tree.
+     * PDFBOX-3999: check that page entries in the structure tree only reference pages from the page
+     * tree, i.e. that no orphan pages exist.
      * 
      * @throws IOException 
      */
-     public void testStructureTreeMerge() throws IOException
+    public void testStructureTreeMerge() throws IOException
     {
         PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
         PDDocument src = PDDocument.load(new File(TARGETPDFDIR, "PDFBOX-3999-GeneralForbearance.pdf"));
@@ -178,31 +179,54 @@ public class PDFMergerUtilityTest extend
 
         PDDocument doc = PDDocument.load(new File(TARGETTESTDIR, "PDFBOX-3999-GovFormPreFlattened-merged.pdf"));
         PDPageTree pageTree = doc.getPages();
-        PDNumberTreeNode parentTree = doc.getDocumentCatalog().getStructureTreeRoot().getParentTree();
-        COSArray numArray = (COSArray) parentTree.getCOSObject().getDictionaryObject(COSName.NUMS);
-        for (COSBase base : numArray)
+
+        // check for orphan pages in the StructTreeRoot/K and StructTreeRoot/ParentTree trees.
+        PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
+        checkElement(pageTree, structureTreeRoot.getParentTree().getCOSObject());
+        checkElement(pageTree, structureTreeRoot.getK());
+    }
+
+    // Each element can be an array, a dictionary or a number.
+    // See PDF specification Table 37 – Entries in a number tree node dictionary
+    // See PDF specification Table 322 – Entries in the structure tree root
+    // example of file with /Kids: 000153.pdf 000208.pdf 000314.pdf 000359.pdf 000671.pdf
+    // from digitalcorpora site
+    private void checkElement(PDPageTree pageTree, COSBase base)
+    {
+        if (base instanceof COSArray)
         {
-            if (base instanceof COSObject)
+            for (COSBase base2 : (COSArray) base)
             {
-                base = ((COSObject) base).getObject();
-            }
-            if (base instanceof COSArray)
-            {
-                for (COSBase base2 : (COSArray) base)
+                if (base2 instanceof COSObject)
                 {
-                    if (base2 instanceof COSObject)
-                    {
-                        base2 = ((COSObject) base2).getObject();
-                    }
-                    PDStructureElement structureElement = new PDStructureElement((COSDictionary) base2);
-                    checkForPage(pageTree, structureElement);
+                    base2 = ((COSObject) base2).getObject();
                 }
+                checkElement(pageTree, base2);
             }
-            else if (base instanceof COSDictionary)
+        }
+        else if (base instanceof COSDictionary)
+        {
+            COSDictionary kdict = (COSDictionary) base;
+            if (kdict.containsKey(COSName.PG))
             {
-                PDStructureElement structureElement = new PDStructureElement((COSDictionary) base);
+                PDStructureElement structureElement = new PDStructureElement(kdict);
                 checkForPage(pageTree, structureElement);
             }
+            if (kdict.containsKey(COSName.K))
+            {
+                checkElement(pageTree, kdict.getDictionaryObject(COSName.K));
+                return;
+            }
+
+            // if we're in a number tree, check /Nums and /Kids
+            if (kdict.containsKey(COSName.KIDS))
+            {
+                checkElement(pageTree, kdict.getDictionaryObject(COSName.KIDS));
+            }
+            else if (kdict.containsKey(COSName.NUMS))
+            {
+                checkElement(pageTree, kdict.getDictionaryObject(COSName.NUMS));
+            }
         }
     }
 



Mime
View raw message