Return-Path: X-Original-To: apmail-pdfbox-commits-archive@www.apache.org Delivered-To: apmail-pdfbox-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 37B2011DE0 for ; Fri, 1 Aug 2014 15:45:27 +0000 (UTC) Received: (qmail 29459 invoked by uid 500); 1 Aug 2014 15:45:27 -0000 Delivered-To: apmail-pdfbox-commits-archive@pdfbox.apache.org Received: (qmail 29437 invoked by uid 500); 1 Aug 2014 15:45:27 -0000 Mailing-List: contact commits-help@pdfbox.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@pdfbox.apache.org Delivered-To: mailing list commits@pdfbox.apache.org Received: (qmail 29428 invoked by uid 99); 1 Aug 2014 15:45:27 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 01 Aug 2014 15:45:27 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 01 Aug 2014 15:45:22 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 39C6A238931B; Fri, 1 Aug 2014 15:44:55 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1615149 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Date: Fri, 01 Aug 2014 15:44:55 -0000 To: commits@pdfbox.apache.org From: lehmi@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140801154455.39C6A238931B@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: lehmi Date: Fri Aug 1 15:44:54 2014 New Revision: 1615149 URL: http://svn.apache.org/r1615149 Log: code reformatted using the template Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1615149&r1=1615148&r2=1615149&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Fri Aug 1 15:44:54 2014 @@ -66,18 +66,14 @@ import org.apache.pdfbox.pdmodel.encrypt import org.apache.pdfbox.persistence.util.COSObjectKey; /** - * PDFParser which first reads startxref and xref tables in order to know valid - * objects and parse only these objects. Thus it is closer to a conforming - * parser than the sequential reading of {@link PDFParser}. + * PDFParser which first reads startxref and xref tables in order to know valid objects and parse only these objects. + * Thus it is closer to a conforming parser than the sequential reading of {@link PDFParser}. * - * This class can be used as a {@link PDFParser} replacement. First - * {@link #parse()} must be called before page objects can be retrieved, e.g. - * {@link #getPDDocument()}. + * This class can be used as a {@link PDFParser} replacement. First {@link #parse()} must be called before page objects + * can be retrieved, e.g. {@link #getPDDocument()}. * - * This class is a much enhanced version of QuickParser presented - * in PDFBOX-1104 by - * Jeremy Villalobos. + * This class is a much enhanced version of QuickParser presented in PDFBOX-1104 by Jeremy Villalobos. */ public class NonSequentialPDFParser extends PDFParser { @@ -100,7 +96,8 @@ public class NonSequentialPDFParser exte /** * StartXRef-marker. */ - protected static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' }; + protected static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x', + 'r', 'e', 'f' }; /** * obj-marker. */ @@ -128,11 +125,9 @@ public class NonSequentialPDFParser exte // EOF marker /** - * If true object references in catalog are not followed; pro: - * page objects will be only parsed when needed; cons: some information of - * catalog might not be available (e.g. outline). Catalog parsing without - * pages is not an option since a number of entries will also refer to page - * objects (like OpenAction). + * If true object references in catalog are not followed; pro: page objects will be only parsed when + * needed; cons: some information of catalog might not be available (e.g. outline). Catalog parsing without pages is + * not an option since a number of entries will also refer to page objects (like OpenAction). */ private boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL)); @@ -142,9 +137,8 @@ public class NonSequentialPDFParser exte private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class); /** - * true if the NonSequentialPDFParser is initialized by a - * InputStream, in this case a temporary file is created. At the end of the - * {@linkplain #parse()} method,the temporary file will be deleted. + * true if the NonSequentialPDFParser is initialized by a InputStream, in this case a temporary file is + * created. At the end of the {@linkplain #parse()} method,the temporary file will be deleted. */ private boolean isTmpPDFFile = false; @@ -164,8 +158,7 @@ public class NonSequentialPDFParser exte } /** - * Constructs parser for given file using given buffer for temporary - * storage. + * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing @@ -173,8 +166,7 @@ public class NonSequentialPDFParser exte * @throws IOException If something went wrong. */ /** - * Constructs parser for given file using given buffer for temporary - * storage. + * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing @@ -187,8 +179,7 @@ public class NonSequentialPDFParser exte } /** - * Constructs parser for given file using given buffer for temporary - * storage. + * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing @@ -196,8 +187,7 @@ public class NonSequentialPDFParser exte * @throws IOException If something went wrong. */ /** - * Constructs parser for given file using given buffer for temporary - * storage. + * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing @@ -205,7 +195,8 @@ public class NonSequentialPDFParser exte * * @throws IOException If something went wrong. */ - public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) throws IOException + public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) + throws IOException { super(EMPTY_INPUT_STREAM, null, false); pdfFile = file; @@ -224,12 +215,13 @@ public class NonSequentialPDFParser exte } catch (NumberFormatException nfe) { - LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '" - + eofLookupRangeStr + "'"); + LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + + " does not contain an integer value, but: '" + eofLookupRangeStr + "'"); } } - setDocument((raBuf == null) ? new COSDocument(new RandomAccessBuffer(), false) : new COSDocument(raBuf, false)); + setDocument((raBuf == null) ? new COSDocument(new RandomAccessBuffer(), false) : new COSDocument( + raBuf, false)); pdfSource = new PushBackInputStream(raStream, 4096); @@ -255,7 +247,8 @@ public class NonSequentialPDFParser exte * @param decryptionPassword password to be used for decryption. * @throws IOException If something went wrong. */ - public NonSequentialPDFParser(InputStream input, RandomAccess raBuf, String decryptionPassword) throws IOException + public NonSequentialPDFParser(InputStream input, RandomAccess raBuf, String decryptionPassword) + throws IOException { super(EMPTY_INPUT_STREAM, null, false); pdfFile = createTmpFile(input); @@ -264,9 +257,8 @@ public class NonSequentialPDFParser exte } /** - * Create a temporary file with the input stream. If the creation succeed, - * the {@linkplain #isTmpPDFFile} is set to true. This Temporary file will - * be deleted at end of the parse method + * Create a temporary file with the input stream. If the creation succeed, the {@linkplain #isTmpPDFFile} is set to + * true. This Temporary file will be deleted at end of the parse method * * @param input * @return the temporary file @@ -293,18 +285,17 @@ public class NonSequentialPDFParser exte // ------------------------------------------------------------------------ /** - * Sets how many trailing bytes of PDF file are searched for EOF marker and - * 'startxref' marker. If not set we use default value - * {@link #DEFAULT_TRAIL_BYTECOUNT}. - * - * - * - *

In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined - * this value will be set on initialization but can be overwritten - * later.

+ * Sets how many trailing bytes of PDF file are searched for EOF marker and 'startxref' marker. If not set we use + * default value {@link #DEFAULT_TRAIL_BYTECOUNT}. + * + * + * + *

+ * In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined this value will be set on initialization but + * can be overwritten later. + *

* * @param byteCount number of trailing bytes */ @@ -318,10 +309,9 @@ public class NonSequentialPDFParser exte // ------------------------------------------------------------------------ /** - * The initial parse will first parse only the trailer, the xrefstart and - * all xref tables to have a pointer (offset) to all the pdf's objects. It - * can handle linearized pdfs, which will have an xref at the end pointing - * to an xref at the beginning of the file. Last the root object is parsed. + * The initial parse will first parse only the trailer, the xrefstart and all xref tables to have a pointer (offset) + * to all the pdf's objects. It can handle linearized pdfs, which will have an xref at the end pointing to an xref + * at the beginning of the file. Last the root object is parsed. * * @throws IOException If something went wrong. */ @@ -333,7 +323,8 @@ public class NonSequentialPDFParser exte long startXrefOffset = document.getStartXref(); // check the startxref offset - if (isLenient) { + if (isLenient) + { startXrefOffset -= calculateFixingOffset(startXrefOffset); document.setStartXref(startXrefOffset); } @@ -355,31 +346,33 @@ public class NonSequentialPDFParser exte parseXrefTable(prev); // parse the last trailer. long trailerOffset = pdfSource.getOffset(); - //PDFBOX-1739 skip extra xref entries in RegisSTAR documents + // PDFBOX-1739 skip extra xref entries in RegisSTAR documents while (isLenient && pdfSource.peek() != 't') { if (pdfSource.getOffset() == trailerOffset) { // warn only the first time - LOG.warn("Expected trailer object at position " + trailerOffset + ", keep trying"); + LOG.warn("Expected trailer object at position " + trailerOffset + + ", keep trying"); } readLine(); } if (!parseTrailer()) { - throw new IOException("Expected trailer object at position: " + pdfSource.getOffset()); + throw new IOException("Expected trailer object at position: " + + pdfSource.getOffset()); } COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); prev = trailer.getInt(COSName.PREV); if (isLenient && prev > -1) { - // check the xref table reference - long fixingOffset = calculateFixingOffset(prev); - if (fixingOffset != 0) - { - prev -= fixingOffset; - trailer.setLong(COSName.PREV, prev); - } + // check the xref table reference + long fixingOffset = calculateFixingOffset(prev); + if (fixingOffset != 0) + { + prev -= fixingOffset; + trailer.setLong(COSName.PREV, prev); + } } } else @@ -388,14 +381,14 @@ public class NonSequentialPDFParser exte prev = parseXrefObjStream(prev); if (isLenient && prev > -1) { - // check the xref table reference - long fixingOffset = calculateFixingOffset(prev); - if (fixingOffset != 0) - { - prev -= fixingOffset; - COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); - trailer.setLong(COSName.PREV, prev); - } + // check the xref table reference + long fixingOffset = calculateFixingOffset(prev); + if (fixingOffset != 0) + { + prev -= fixingOffset; + COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); + trailer.setLong(COSName.PREV, prev); + } } } } @@ -406,10 +399,11 @@ public class NonSequentialPDFParser exte document.setTrailer(trailer); document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType()); // check the offsets of all referenced objects - if (isLenient) { + if (isLenient) + { checkXrefOffsets(); } - + // ---- prepare encryption if necessary COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT); if (trailerEncryptItem != null) @@ -437,22 +431,21 @@ public class NonSequentialPDFParser exte } securityHandler = encryption.getSecurityHandler(); - securityHandler.prepareForDecryption(encryption, document.getDocumentID(), decryptionMaterial); + securityHandler.prepareForDecryption(encryption, document.getDocumentID(), + decryptionMaterial); AccessPermission permission = securityHandler.getCurrentAccessPermission(); if (!permission.canExtractContent()) { - LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content."); + LOG.warn("PDF file '" + pdfFile.getPath() + + "' does not allow extracting content."); } } catch (Exception e) { throw new IOException("Error (" + e.getClass().getSimpleName() - + ") while creating security handler for decryption: " + e.getMessage() /* - * , e TODO: remove - * remark with Java 1.6 - */); + + ") while creating security handler for decryption",e); } } @@ -490,10 +483,10 @@ public class NonSequentialPDFParser exte } } } - + // PDFBOX-1922: read the version again now that all objects have been resolved readVersionInTrailer(trailer); - + initialParseDone = true; } @@ -501,8 +494,7 @@ public class NonSequentialPDFParser exte /** * Parses an xref object stream starting with indirect object id. * - * @return value of PREV item in dictionary or -1 if no such - * item exists + * @return value of PREV item in dictionary or -1 if no such item exists */ private long parseXrefObjStream(long objByteOffset) throws IOException { @@ -548,6 +540,7 @@ public class NonSequentialPDFParser exte /** * Enable handling of alternative pdfSource implementation. + * * @throws IOException If something went wrong. */ protected final void releasePdfSourceInputStream() throws IOException @@ -566,12 +559,11 @@ public class NonSequentialPDFParser exte // ------------------------------------------------------------------------ /** - * Looks for and parses startxref. We first look for last '%%EOF' marker - * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via - * {@link #setEOFLookupRange(int)}) and go back to find + * Looks for and parses startxref. We first look for last '%%EOF' marker (within last + * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link #setEOFLookupRange(int)}) and go back to find * startxref. * - * @return the offset of StartXref + * @return the offset of StartXref * @throws IOException If something went wrong. */ protected final long getStartxrefOffset() throws IOException @@ -600,8 +592,9 @@ public class NonSequentialPDFParser exte // should never happen) if (readBytes < 1) { - throw new IOException("No more bytes to read for trailing buffer, but expected: " - + (trailByteCount - off)); + throw new IOException( + "No more bytes to read for trailing buffer, but expected: " + + (trailByteCount - off)); } off += readBytes; } @@ -639,15 +632,13 @@ public class NonSequentialPDFParser exte // ------------------------------------------------------------------------ /** - * Searches last appearance of pattern within buffer. Lookup before _lastOff - * and goes back until 0. + * Searches last appearance of pattern within buffer. Lookup before _lastOff and goes back until 0. * * @param pattern pattern to search for * @param buf buffer to search pattern in * @param endOff offset (exclusive) where lookup starts at * - * @return start offset of pattern within buffer or -1 if - * pattern could not be found + * @return start offset of pattern within buffer or -1 if pattern could not be found */ protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) { @@ -681,8 +672,7 @@ public class NonSequentialPDFParser exte // ------------------------------------------------------------------------ /** - * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start - * and end. + * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end. * * @param pattern pattern to be skipped * @throws IOException if pattern could not be read @@ -695,8 +685,9 @@ public class NonSequentialPDFParser exte { if (pdfSource.read() != c) { - throw new IOException("Expected pattern '" + new String(pattern) + "' but missed at character '" + c - + "' at offset " + pdfSource.getOffset()); + throw new IOException("Expected pattern '" + new String(pattern) + + "' but missed at character '" + c + "' at offset " + + pdfSource.getOffset()); } } @@ -707,8 +698,7 @@ public class NonSequentialPDFParser exte private COSDictionary pagesDictionary = null; /** - * Returns PAGES {@link COSDictionary} object or throws {@link IOException} - * if PAGES dictionary does not exist. + * Returns PAGES {@link COSDictionary} object or throws {@link IOException} if PAGES dictionary does not exist. */ private COSDictionary getPagesObject() throws IOException { @@ -727,7 +717,8 @@ public class NonSequentialPDFParser exte if (!(object instanceof COSDictionary)) { - throw new IOException("PAGES not a dictionary object, but: " + object.getClass().getSimpleName()); + throw new IOException("PAGES not a dictionary object, but: " + + object.getClass().getSimpleName()); } pagesDictionary = (COSDictionary) object; @@ -751,7 +742,7 @@ public class NonSequentialPDFParser exte // this part copied from the sequential parser parseHeader(); pdfSource.seek(0); - + if (!initialParseDone) { initialParse(); @@ -812,7 +803,8 @@ public class NonSequentialPDFParser exte * * @return true if parser is lenient */ - public boolean isLenient () { + public boolean isLenient() + { return isLenient; } @@ -825,15 +817,17 @@ public class NonSequentialPDFParser exte * * @throws IllegalArgumentException if the method is called after parsing. */ - public void setLenient (boolean lenient) throws IllegalArgumentException { - if (initialParseDone) { + public void setLenient(boolean lenient) throws IllegalArgumentException + { + if (initialParseDone) + { throw new IllegalArgumentException("Cannot change leniency after parsing"); } this.isLenient = lenient; } + /** - * Remove the temporary file. A temporary file is created if this class is - * instantiated with an InputStream + * Remove the temporary file. A temporary file is created if this class is instantiated with an InputStream */ protected void deleteTempFile() { @@ -915,10 +909,9 @@ public class NonSequentialPDFParser exte } /** - * Returns the object for a specific page. The page tree is made up of kids. - * The kids have COSArray with COSObjects inside of them. The COSObject can - * be parsed using the dynamic parsing method We want to only parse the - * minimum COSObjects and still return a complete page. ready to be used. + * Returns the object for a specific page. The page tree is made up of kids. The kids have COSArray with COSObjects + * inside of them. The COSObject can be parsed using the dynamic parsing method We want to only parse the minimum + * COSObjects and still return a complete page. ready to be used. * * @param num the requested page number; numbering starts with 0 * @param startKids Kids array to start with looking up page number @@ -928,7 +921,8 @@ public class NonSequentialPDFParser exte * * @throws IOException */ - private COSObject getPageObject(int num, COSArray startKids, int startPageCount) throws IOException + private COSObject getPageObject(int num, COSArray startKids, int startPageCount) + throws IOException { int curPageCount = startPageCount; Iterator kidsIter = startKids.iterator(); @@ -981,8 +975,7 @@ public class NonSequentialPDFParser exte } /** - * Creates a unique object id using object number and object generation - * number. (requires object number < 2^31)) + * Creates a unique object id using object number and object generation number. (requires object number < 2^31)) */ private final long getObjectId(final COSObject obj) { @@ -990,11 +983,11 @@ public class NonSequentialPDFParser exte } /** - * Adds all from newObjects to toBeParsedList if it is not an COSObject or - * we didn't add this COSObject already (checked via addedObjects). + * Adds all from newObjects to toBeParsedList if it is not an COSObject or we didn't add this COSObject already + * (checked via addedObjects). */ - private final void addNewToList(final Queue toBeParsedList, final Collection newObjects, - final Set addedObjects) + private final void addNewToList(final Queue toBeParsedList, + final Collection newObjects, final Set addedObjects) { for (COSBase newObject : newObjects) { @@ -1011,8 +1004,8 @@ public class NonSequentialPDFParser exte } /** - * Adds newObject to toBeParsedList if it is not an COSObject or we didn't - * add this COSObject already (checked via addedObjects). + * Adds newObject to toBeParsedList if it is not an COSObject or we didn't add this COSObject already (checked via + * addedObjects). */ private final void addNewToList(final Queue toBeParsedList, final COSBase newObject, final Set addedObjects) @@ -1029,13 +1022,11 @@ public class NonSequentialPDFParser exte } /** - * Will parse every object necessary to load a single page from the pdf - * document. We try our best to order objects according to offset in file - * before reading to minimize seek operations. + * Will parse every object necessary to load a single page from the pdf document. We try our best to order objects + * according to offset in file before reading to minimize seek operations. * * @param dict the COSObject from the parent pages. - * @param excludeObjects dictionary object reference entries with these - * names will not be parsed + * @param excludeObjects dictionary object reference entries with these names will not be parsed * * @throws IOException */ @@ -1078,7 +1069,8 @@ public class NonSequentialPDFParser exte } else if (baseObj instanceof COSDictionary) { - addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects); + addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), + addedObjects); } else if (baseObj instanceof COSArray) { @@ -1092,8 +1084,8 @@ public class NonSequentialPDFParser exte { COSObject obj = (COSObject) baseObj; long objId = getObjectId(obj); - COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj.getGenerationNumber() - .intValue()); + COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj + .getGenerationNumber().intValue()); if (!(parsedObjects.contains(objId) /* * || document.hasObjectInPool ( objKey ) @@ -1113,16 +1105,20 @@ public class NonSequentialPDFParser exte // negative offset means we have a compressed // object within object stream; // get offset of object stream - fileOffset = xrefTrailerResolver.getXrefTable().get(new COSObjectKey(-fileOffset, 0)); + fileOffset = xrefTrailerResolver.getXrefTable().get( + new COSObjectKey(-fileOffset, 0)); if ((fileOffset == null) || (fileOffset <= 0)) { - throw new IOException("Invalid object stream xref object reference: " + fileOffset); + throw new IOException( + "Invalid object stream xref object reference: " + + fileOffset); } List stmObjects = objToBeParsed.get(fileOffset); if (stmObjects == null) { - objToBeParsed.put(fileOffset, stmObjects = new ArrayList()); + objToBeParsed.put(fileOffset, + stmObjects = new ArrayList()); } stmObjects.add(obj); } @@ -1157,44 +1153,39 @@ public class NonSequentialPDFParser exte } /** - * This will parse the next object from the stream and add it to the local - * state. This is taken from {@link PDFParser} and reduced to parsing an - * indirect object. - * - * @param obj object to be parsed (we only take object number and generation - * number for lookup start offset) - * @param requireExistingNotCompressedObj if true object to be - * parsed must not be contained within compressed stream + * This will parse the next object from the stream and add it to the local state. This is taken from + * {@link PDFParser} and reduced to parsing an indirect object. + * + * @param obj object to be parsed (we only take object number and generation number for lookup start offset) + * @param requireExistingNotCompressedObj if true object to be parsed must not be contained within + * compressed stream * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ - protected final COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj) - throws IOException + protected final COSBase parseObjectDynamically(COSObject obj, + boolean requireExistingNotCompressedObj) throws IOException { - return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber().intValue(), - requireExistingNotCompressedObj); + return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber() + .intValue(), requireExistingNotCompressedObj); } /** - * This will parse the next object from the stream and add it to the local - * state. This is taken from {@link PDFParser} and reduced to parsing an - * indirect object. + * This will parse the next object from the stream and add it to the local state. This is taken from + * {@link PDFParser} and reduced to parsing an indirect object. * * @param objNr object number of object to be parsed * @param objGenNr object generation number of object to be parsed - * @param requireExistingNotCompressedObj if true the object to - * be parsed must be defined in xref (comment: null objects may - * be missing from xref) and it must not be a compressed object - * within object stream (this is used to circumvent being stuck - * in a loop in a malicious PDF) + * @param requireExistingNotCompressedObj if true the object to be parsed must be defined in xref + * (comment: null objects may be missing from xref) and it must not be a compressed object within object stream + * (this is used to circumvent being stuck in a loop in a malicious PDF) * * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ - protected COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj) - throws IOException + protected COSBase parseObjectDynamically(int objNr, int objGenNr, + boolean requireExistingNotCompressedObj) throws IOException { // ---- create object key and get object (container) from pool final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); @@ -1207,10 +1198,11 @@ public class NonSequentialPDFParser exte Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents - if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) + if (requireExistingNotCompressedObj + && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) { - throw new IOException("Object must be defined and must not be compressed object: " + objKey.getNumber() - + ":" + objKey.getGeneration()); + throw new IOException("Object must be defined and must not be compressed object: " + + objKey.getNumber() + ":" + objKey.getGeneration()); } if (offsetOrObjstmObNr == null) @@ -1232,8 +1224,9 @@ public class NonSequentialPDFParser exte // ---- consistency check if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) { - throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() - + " points to wrong object: " + readObjNr + ":" + readObjGen); + throw new IOException("XREF for " + objKey.getNumber() + ":" + + objKey.getGeneration() + " points to wrong object: " + readObjNr + + ":" + readObjGen); } skipSpaces(); @@ -1246,7 +1239,8 @@ public class NonSequentialPDFParser exte pdfSource.unread(' '); if (pb instanceof COSDictionary) { - COSStream stream = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile()); + COSStream stream = parseCOSStream((COSDictionary) pb, getDocument() + .getScratchFile()); if (securityHandler != null) { @@ -1259,7 +1253,8 @@ public class NonSequentialPDFParser exte // this is not legal // the combination of a dict and the stream/endstream // forms a complete stream object - throw new IOException("Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")."); + throw new IOException("Stream not preceded by dictionary (offset: " + + offsetOrObjstmObNr + ")."); } skipSpaces(); endObjectKey = readLine(); @@ -1318,12 +1313,14 @@ public class NonSequentialPDFParser exte if (endObjectKey.endsWith(" obj") && isLenient) { LOG.warn("Object (" + readObjNr + ":" + readObjGen + ") at offset " - + offsetOrObjstmObNr + " does not end with 'endobj' but with '" + endObjectKey + "'"); + + offsetOrObjstmObNr + " does not end with 'endobj' but with '" + + endObjectKey + "'"); } else { - throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " - + offsetOrObjstmObNr + " does not end with 'endobj' but with '" + endObjectKey + "'"); + throw new IOException("Object (" + readObjNr + ":" + readObjGen + + ") at offset " + offsetOrObjstmObNr + + " does not end with 'endobj' but with '" + endObjectKey + "'"); } } @@ -1341,13 +1338,14 @@ public class NonSequentialPDFParser exte if (objstmBaseObj instanceof COSStream) { // parse object stream - PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document, - forceParsing); + PDFObjectStreamParser parser = new PDFObjectStreamParser( + (COSStream) objstmBaseObj, document, forceParsing); parser.parse(); // get set of object numbers referenced for this object // stream - final Set refObjNrs = xrefTrailerResolver.getContainedObjectNumbers(objstmObjNr); + final Set refObjNrs = xrefTrailerResolver + .getContainedObjectNumbers(objstmObjNr); // register all objects which are referenced to be contained // in object stream @@ -1433,8 +1431,8 @@ public class NonSequentialPDFParser exte if (!(lengthObj.getObject() instanceof COSNumber)) { - throw new IOException("Wrong type of referenced length object " + lengthObj + ": " - + lengthObj.getObject().getClass().getSimpleName()); + throw new IOException("Wrong type of referenced length object " + lengthObj + + ": " + lengthObj.getObject().getClass().getSimpleName()); } retVal = (COSNumber) lengthObj.getObject(); @@ -1442,7 +1440,8 @@ public class NonSequentialPDFParser exte } else { - throw new IOException("Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName()); + throw new IOException("Wrong type of length object: " + + lengthBaseObj.getClass().getSimpleName()); } } finally @@ -1457,21 +1456,18 @@ public class NonSequentialPDFParser exte private final byte[] streamCopyBuf = new byte[streamCopyBufLen]; /** - * This will read a COSStream from the input stream using length attribute - * within dictionary. If length attribute is a indirect reference it is - * first resolved to get the stream length. This means we copy stream data - * without testing for 'endstream' or 'endobj' and thus it is no problem if - * these keywords occur within stream. We require 'endstream' to be found - * after stream data is read. + * This will read a COSStream from the input stream using length attribute within dictionary. If length attribute is + * a indirect reference it is first resolved to get the stream length. This means we copy stream data without + * testing for 'endstream' or 'endobj' and thus it is no problem if these keywords occur within stream. We require + * 'endstream' to be found after stream data is read. * * @param dic dictionary that goes with this stream. * @param file file to write the stream to when reading. * * @return parsed pdf stream. * - * @throws IOException if an error occurred reading the stream, like - * problems with reading length attribute, stream does not end - * with 'endstream' after data read, stream too short etc. + * @throws IOException if an error occurred reading the stream, like problems with reading length attribute, stream + * does not end with 'endstream' after data read, stream too short etc. */ @Override protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException @@ -1531,23 +1527,25 @@ public class NonSequentialPDFParser exte if (validateStreamLength(streamLengthObj.longValue())) { out = stream.createFilteredStream(streamLengthObj); - long remainBytes = streamLengthObj.longValue(); - int bytesRead = 0; - while (remainBytes > 0) - { - final int readBytes = pdfSource.read(streamCopyBuf, 0, - (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes); - if (readBytes <= 0) - { - useReadUntilEnd = true; - out.close(); - pdfSource.unread(bytesRead); - break; - } - out.write(streamCopyBuf, 0, readBytes); - remainBytes -= readBytes; - bytesRead += readBytes; - } + long remainBytes = streamLengthObj.longValue(); + int bytesRead = 0; + while (remainBytes > 0) + { + final int readBytes = pdfSource + .read(streamCopyBuf, + 0, + (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes); + if (readBytes <= 0) + { + useReadUntilEnd = true; + out.close(); + pdfSource.unread(bytesRead); + break; + } + out.write(streamCopyBuf, 0, readBytes); + remainBytes -= readBytes; + bytesRead += readBytes; + } } else { @@ -1561,8 +1559,9 @@ public class NonSequentialPDFParser exte String endStream = readString(); if (!endStream.equals("endstream")) { - throw new IOException("Error reading stream using length value. Expected='endstream' actual='" - + endStream + "' at offset " + pdfSource.getOffset()); + throw new IOException( + "Error reading stream using length value. Expected='endstream' actual='" + + endStream + "' at offset " + pdfSource.getOffset()); } } finally @@ -1577,26 +1576,26 @@ public class NonSequentialPDFParser exte private boolean validateStreamLength(long streamLength) throws IOException { - boolean streamLengthIsValid = true; - long originOffset = pdfSource.getOffset(); - long expectedEndOfStream = originOffset + streamLength; - if (expectedEndOfStream > fileLen) - { - streamLengthIsValid = false; - LOG.error("The end of the stream is out of range, using workaround to read the stream"); - } - else - { - pdfSource.seek(expectedEndOfStream); - skipSpaces(); - if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1"))) - { - streamLengthIsValid = false; - LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream"); - } - pdfSource.seek(originOffset); - } - return streamLengthIsValid; + boolean streamLengthIsValid = true; + long originOffset = pdfSource.getOffset(); + long expectedEndOfStream = originOffset + streamLength; + if (expectedEndOfStream > fileLen) + { + streamLengthIsValid = false; + LOG.error("The end of the stream is out of range, using workaround to read the stream"); + } + else + { + pdfSource.seek(expectedEndOfStream); + skipSpaces(); + if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1"))) + { + streamLengthIsValid = false; + LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream"); + } + pdfSource.seek(originOffset); + } + return streamLengthIsValid; } private void readUntilEndStream(final OutputStream out) throws IOException @@ -1610,7 +1609,8 @@ public class NonSequentialPDFParser exte // read next chunk into buffer; already matched chars are added to // beginning of buffer - while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen - charMatchCount)) > 0) + while ((bufSize = pdfSource.read(streamCopyBuf, charMatchCount, streamCopyBufLen + - charMatchCount)) > 0) { // number of already matching chars int startingMatchCount = charMatchCount; @@ -1628,7 +1628,8 @@ public class NonSequentialPDFParser exte // this shortcut is inspired by Boyer-Moore string search // algorithm // and can reduce parsing time by approx. 20% - if ((charMatchCount == 0) && ((quickTestIdx = bIdx + quickTestOffset) < maxQuicktestIdx)) + if ((charMatchCount == 0) + && ((quickTestIdx = bIdx + quickTestOffset) < maxQuicktestIdx)) { final byte ch = streamCopyBuf[quickTestIdx]; @@ -1690,9 +1691,10 @@ public class NonSequentialPDFParser exte } if (charMatchCount == keyw.length) { - // keyword matched; - // unread matched keyword (endstream/endobj) and following buffered content - pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + startingMatchCount); + // keyword matched; + // unread matched keyword (endstream/endobj) and following buffered content + pdfSource.unread(streamCopyBuf, contentBytes, bufSize - contentBytes - keyw.length + + startingMatchCount); break; } else @@ -1701,10 +1703,10 @@ public class NonSequentialPDFParser exte System.arraycopy(keyw, 0, streamCopyBuf, 0, charMatchCount); } } // while - + out.flush(); // this writes a lonely CR or drops trailing CR LF and LF } - + /** * * @param startXRefOffset @@ -1713,11 +1715,11 @@ public class NonSequentialPDFParser exte */ private long calculateFixingOffset(long startXRefOffset) throws IOException { - setPdfSource(startXRefOffset); - if (pdfSource.peek() == X && calculateFixingOffset(startXRefOffset, XREF) == 0) - { - return 0; - } + setPdfSource(startXRefOffset); + if (pdfSource.peek() == X && calculateFixingOffset(startXRefOffset, XREF) == 0) + { + return 0; + } int nextValue = pdfSource.peek(); // is the next character a digit? if (nextValue > 47 && nextValue < 57) @@ -1743,12 +1745,11 @@ public class NonSequentialPDFParser exte } } // TODO try to repair for XRef streams - return calculateFixingOffset(startXRefOffset, XREF); + return calculateFixingOffset(startXRefOffset, XREF); } /** - * Try to dereference the given object at the given offset and calculate a new - * offset if necessary. + * Try to dereference the given object at the given offset and calculate a new offset if necessary. * * @param objectOffset the offset where to look at * @param objectID the object ID @@ -1756,12 +1757,13 @@ public class NonSequentialPDFParser exte * @return the difference to the origin offset * @throws IOException if something went wrong */ - private long calculateFixingOffset(long objectOffset, long objectID, long genID) throws IOException + private long calculateFixingOffset(long objectOffset, long objectID, long genID) + throws IOException { - String objString = Long.toString(objectID) + " " + Long.toString(genID)+ " obj"; - return calculateFixingOffset(objectOffset, objString.getBytes("ISO-8859-1")); + String objString = Long.toString(objectID) + " " + Long.toString(genID) + " obj"; + return calculateFixingOffset(objectOffset, objString.getBytes("ISO-8859-1")); } - + /** * Check if the given bytes can be found at the current offset. * @@ -1771,35 +1773,33 @@ public class NonSequentialPDFParser exte */ private boolean checkBytesAtOffset(byte[] string) throws IOException { - boolean bytesMatching = false; - if (pdfSource.peek() == string[0]) - { - int length = string.length; - byte[] bytesRead = new byte[length]; - int numberOfBytes = pdfSource.read(bytesRead, 0, length); - while (numberOfBytes < length) - { - int readMore = pdfSource.read(bytesRead, numberOfBytes, length-numberOfBytes); - if (readMore < 0) - { - break; - } - numberOfBytes += readMore; - } - if (Arrays.equals(string, bytesRead)) - { - bytesMatching = true; - } - pdfSource.unread(bytesRead, 0, numberOfBytes); - } - return bytesMatching; - } - - /** - * Check if the given bytes can be found at the given offset. - * The method seeks 200 bytes backward/forward if the given string - * can't be found at the given offset and returns the difference - * of the new offset to the origin one. + boolean bytesMatching = false; + if (pdfSource.peek() == string[0]) + { + int length = string.length; + byte[] bytesRead = new byte[length]; + int numberOfBytes = pdfSource.read(bytesRead, 0, length); + while (numberOfBytes < length) + { + int readMore = pdfSource.read(bytesRead, numberOfBytes, length - numberOfBytes); + if (readMore < 0) + { + break; + } + numberOfBytes += readMore; + } + if (Arrays.equals(string, bytesRead)) + { + bytesMatching = true; + } + pdfSource.unread(bytesRead, 0, numberOfBytes); + } + return bytesMatching; + } + + /** + * Check if the given bytes can be found at the given offset. The method seeks 200 bytes backward/forward if the + * given string can't be found at the given offset and returns the difference of the new offset to the origin one. * * @param objectOffset the given offset where to look at * @param string the bytes to look for @@ -1808,103 +1808,105 @@ public class NonSequentialPDFParser exte */ private long calculateFixingOffset(long objectOffset, byte[] string) throws IOException { - if (objectOffset < 0) - { - LOG.error("Invalid object offset " + objectOffset + " for object " + new String(string)); - return 0; - } - long originOffset = pdfSource.getOffset(); - pdfSource.seek(objectOffset); - // most likely the object can be found at the given offset - if (checkBytesAtOffset(string)) - { - pdfSource.seek(originOffset); - return 0; - } - // the offset seems to be wrong -> seek backward to find the object we are looking for - long currentOffset = objectOffset; - for (int i=1; i<20;i++) - { - currentOffset = objectOffset - (i*10); - if (currentOffset > 0) - { - pdfSource.seek(currentOffset); - for (int j=0; j<10;j++) - { - if (checkBytesAtOffset(string)) - { - pdfSource.seek(originOffset); - LOG.debug("Fixed reference for object "+new String(string)+" "+objectOffset + " -> "+(objectOffset - currentOffset)); - return objectOffset - currentOffset; - } - else - { - currentOffset++; - pdfSource.read(); - } - } - } - } - // no luck by seeking backward -> seek forward to find the object we are looking for - pdfSource.seek(objectOffset); - currentOffset = objectOffset; - do - { - if (checkBytesAtOffset(string)) - { - pdfSource.seek(originOffset); - if (currentOffset != 0) - { - LOG.debug("Fixed reference for object "+new String(string)+" "+objectOffset + " -> "+(objectOffset - currentOffset)); - } - return objectOffset - currentOffset; - } - else - { - // next byte - currentOffset++; - if (pdfSource.read() == -1) - { - throw new IOException("Premature end of file while dereferencing object "+ new String(string) + " at offset " + objectOffset); - } - } - } - while(currentOffset < objectOffset+200); - pdfSource.seek(originOffset); - LOG.error("Can't find the object " + new String(string) + " at offset " + objectOffset); - return 0; + if (objectOffset < 0) + { + LOG.error("Invalid object offset " + objectOffset + " for object " + new String(string)); + return 0; + } + long originOffset = pdfSource.getOffset(); + pdfSource.seek(objectOffset); + // most likely the object can be found at the given offset + if (checkBytesAtOffset(string)) + { + pdfSource.seek(originOffset); + return 0; + } + // the offset seems to be wrong -> seek backward to find the object we are looking for + long currentOffset = objectOffset; + for (int i = 1; i < 20; i++) + { + currentOffset = objectOffset - (i * 10); + if (currentOffset > 0) + { + pdfSource.seek(currentOffset); + for (int j = 0; j < 10; j++) + { + if (checkBytesAtOffset(string)) + { + pdfSource.seek(originOffset); + LOG.debug("Fixed reference for object " + new String(string) + " " + + objectOffset + " -> " + (objectOffset - currentOffset)); + return objectOffset - currentOffset; + } + else + { + currentOffset++; + pdfSource.read(); + } + } + } + } + // no luck by seeking backward -> seek forward to find the object we are looking for + pdfSource.seek(objectOffset); + currentOffset = objectOffset; + do + { + if (checkBytesAtOffset(string)) + { + pdfSource.seek(originOffset); + if (currentOffset != 0) + { + LOG.debug("Fixed reference for object " + new String(string) + " " + + objectOffset + " -> " + (objectOffset - currentOffset)); + } + return objectOffset - currentOffset; + } + else + { + // next byte + currentOffset++; + if (pdfSource.read() == -1) + { + throw new IOException("Premature end of file while dereferencing object " + + new String(string) + " at offset " + objectOffset); + } + } + } while (currentOffset < objectOffset + 200); + pdfSource.seek(originOffset); + LOG.error("Can't find the object " + new String(string) + " at offset " + objectOffset); + return 0; } /** - * Check the XRef table by dereferencing all objects and fixing - * the offset if necessary. + * Check the XRef table by dereferencing all objects and fixing the offset if necessary. * * @throws IOException if something went wrong. */ private void checkXrefOffsets() throws IOException { - MapxrefOffset = xrefTrailerResolver.getXrefTable(); - if (xrefOffset != null) - { - for (COSObjectKey objectKey : xrefOffset.keySet()) - { - Long objectOffset = xrefOffset.get(objectKey); - // a negative offset number represents a object number itself - // see type 2 entry in xref stream - if (objectOffset != null && objectOffset > 0) - { - long objectNr = objectKey.getNumber(); - long objectGen = objectKey.getGeneration(); - long fixingOffset = calculateFixingOffset(objectOffset, objectNr, objectGen); - if (fixingOffset != 0) - { - long newOffset = objectOffset - fixingOffset; - xrefOffset.put(objectKey, newOffset); - LOG.debug("Fixed reference for object "+objectNr+" "+objectGen+" "+objectOffset + " -> "+newOffset); - } - } - } - } + Map xrefOffset = xrefTrailerResolver.getXrefTable(); + if (xrefOffset != null) + { + for (COSObjectKey objectKey : xrefOffset.keySet()) + { + Long objectOffset = xrefOffset.get(objectKey); + // a negative offset number represents a object number itself + // see type 2 entry in xref stream + if (objectOffset != null && objectOffset > 0) + { + long objectNr = objectKey.getNumber(); + long objectGen = objectKey.getGeneration(); + long fixingOffset = calculateFixingOffset(objectOffset, objectNr, objectGen); + if (fixingOffset != 0) + { + long newOffset = objectOffset - fixingOffset; + xrefOffset.put(objectKey, newOffset); + LOG.debug("Fixed reference for object " + objectNr + " " + objectGen + " " + + objectOffset + " -> " + newOffset); + } + } + } + } } }