Return-Path: X-Original-To: apmail-pdfbox-commits-archive@www.apache.org Delivered-To: apmail-pdfbox-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id B2BC6F6A3 for ; Sun, 7 Apr 2013 13:29:49 +0000 (UTC) Received: (qmail 39019 invoked by uid 500); 7 Apr 2013 13:29:49 -0000 Delivered-To: apmail-pdfbox-commits-archive@pdfbox.apache.org Received: (qmail 38983 invoked by uid 500); 7 Apr 2013 13:29:49 -0000 Mailing-List: contact commits-help@pdfbox.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@pdfbox.apache.org Delivered-To: mailing list commits@pdfbox.apache.org Received: (qmail 38896 invoked by uid 99); 7 Apr 2013 13:29:45 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Apr 2013 13:29:45 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 07 Apr 2013 13:29:36 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id B60C423889EB; Sun, 7 Apr 2013 13:29:13 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1465396 [2/3] - in /pdfbox/branches/1.8: ./ fontbox/src/main/java/org/apache/fontbox/ttf/ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ pdfbox/src/main/java/org/apache/pdfbox/pdfviewer/ pdfbox/src/main/java/org/apache/pdfbox/pdfwriter/... Date: Sun, 07 Apr 2013 13:29:13 -0000 To: commits@pdfbox.apache.org From: lehmi@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130407132913.B60C423889EB@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1465396&r1=1465395&r2=1465396&view=diff ============================================================================== --- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original) +++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Apr 7 13:29:12 2013 @@ -16,7 +16,6 @@ */ package org.apache.pdfbox.pdfparser; - import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; @@ -68,341 +67,372 @@ import org.apache.pdfbox.persistence.uti /** * PDFParser which first reads startxref and xref tables in order to know valid - * objects and parse only these objects. Thus it is closer to a conforming parser - * than the sequential reading of {@link PDFParser}. + * objects and parse only these objects. Thus it is closer to a conforming + * parser than the sequential reading of {@link PDFParser}. * - * This class can be used as a {@link PDFParser} replacement. First {@link #parse()} - * must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}. + * This class can be used as a {@link PDFParser} replacement. First + * {@link #parse()} must be called before page objects can be retrieved, e.g. + * {@link #getPDDocument()}. * - * This class is a much enhanced version of QuickParser presented in - * PDFBOX-1104 - * by Jeremy Villalobos. + * This class is a much enhanced version of QuickParser presented + * in PDFBOX-1104 by + * Jeremy Villalobos. */ public class NonSequentialPDFParser extends PDFParser { - - public static final String SYSPROP_PARSEMINIMAL = - "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal"; - public static final String SYSPROP_EOFLOOKUPRANGE = - "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange"; - - private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream( new byte[0] ); - - protected static final int DEFAULT_TRAIL_BYTECOUNT = 2048; - protected static final char[] EOF_MARKER = new char[] { '%','%','E','O','F' }; - protected static final char[] STARTXREF_MARKER = new char[] { 's','t','a','r','t','x','r','e','f' }; - protected static final char[] OBJ_MARKER = new char[] { 'o','b','j' }; + + private static final int E = 'e'; + private static final int N = 'n'; + + public static final String SYSPROP_PARSEMINIMAL = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal"; + public static final String SYSPROP_EOFLOOKUPRANGE = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange"; + + private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream(new byte[0]); + + protected static final int DEFAULT_TRAIL_BYTECOUNT = 2048; + protected static final char[] EOF_MARKER = new char[] + { '%', '%', 'E', 'O', 'F' }; + protected static final char[] STARTXREF_MARKER = new char[] + { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' }; + protected static final char[] OBJ_MARKER = new char[] + { 'o', 'b', 'j' }; private final File pdfFile; private final RandomAccessBufferedFileInputStream raStream; - + protected SecurityHandler securityHandler = null; - + private String keyStoreFilename = null; - private String alias = null; - private String password = ""; - private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing bytes to read for EOF marker - - /** If true object references in catalog are not followed; - * pro: page objects will be only parsed when needed; cons: some information of catalog - * might not be available (e.g. outline). - * Catalog parsing without pages is not an option since a number of entries will - * also refer to page objects (like OpenAction). + private String alias = null; + private String password = ""; + private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing + // bytes to read for + // EOF marker + + /** + * If true object references in catalog are not followed; pro: + * page objects will be only parsed when needed; cons: some information of + * catalog might not be available (e.g. outline). Catalog parsing without + * pages is not an option since a number of entries will also refer to page + * objects (like OpenAction). */ - private boolean parseMinimalCatalog = "true".equals( System.getProperty( SYSPROP_PARSEMINIMAL ) ); - + private boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL)); + private boolean initialParseDone = false; - private boolean allPagesParsed = false; - - private static final Log LOG = LogFactory.getLog( NonSequentialPDFParser.class ); - - /** - * true if the NonSequentialPDFParser is initialized by a InputStream, in this case - * a temporary file is created. At the end of the {@linkplain #parse()} method,the temporary file will - * be deleted. - */ - private boolean isTmpPDFFile = false; - - public static final String TMP_FILE_PREFIX = "tmpPDF"; - - // ------------------------------------------------------------------------ - /** - * Constructs parser for given file using memory buffer. - * - * @param filename the filename of the pdf to be parsed - * - * @throws IOException If something went wrong. - */ - public NonSequentialPDFParser( String filename ) throws IOException - { - this( new File( filename ), null ); - } - - /** - * Constructs parser for given file using given buffer for temporary storage. - * - * @param file the pdf to be parsed - * @param raBuf the buffer to be used for parsing - * - * @throws IOException If something went wrong. - */ - /** - * Constructs parser for given file using given buffer for temporary storage. - * - * @param file the pdf to be parsed - * @param raBuf the buffer to be used for parsing - * - * @throws IOException If something went wrong. - */ - public NonSequentialPDFParser( File file, RandomAccess raBuf ) throws IOException - { - this(file, raBuf, ""); - } - - /** - * Constructs parser for given file using given buffer for temporary storage. - * - * @param file the pdf to be parsed - * @param raBuf the buffer to be used for parsing - * - * @throws IOException If something went wrong. - */ - /** - * Constructs parser for given file using given buffer for temporary storage. - * - * @param file the pdf to be parsed - * @param raBuf the buffer to be used for parsing - * @param decryptionPassword password to be used for decryption - * - * @throws IOException If something went wrong. - */ - public NonSequentialPDFParser( File file, RandomAccess raBuf, String decryptionPassword ) throws IOException - { - super( EMPTY_INPUT_STREAM, null, false ); - pdfFile = file; - raStream = new RandomAccessBufferedFileInputStream( pdfFile ); - init(file, raBuf, decryptionPassword); - } - - private void init(File file, RandomAccess raBuf, String decryptionPassword) throws IOException { - String eofLookupRangeStr = System.getProperty( SYSPROP_EOFLOOKUPRANGE ); - if ( eofLookupRangeStr != null ) - { - try - { - setEOFLookupRange( Integer.parseInt( eofLookupRangeStr ) ); - } - catch ( NumberFormatException nfe ) - { - LOG.warn( "System property " + SYSPROP_EOFLOOKUPRANGE + - " does not contain an integer value, but: '" + eofLookupRangeStr + "'" ); - } - } - - setDocument( ( raBuf == null ) ? new COSDocument( new RandomAccessBuffer(), false ) : new COSDocument( raBuf, false ) ); - - pdfSource = new PushBackInputStream( raStream, 4096 ); - - password = decryptionPassword; - } - - public NonSequentialPDFParser(InputStream input) throws IOException - { - super( EMPTY_INPUT_STREAM, null, false ); - pdfFile = createTmpFile(input); - raStream = new RandomAccessBufferedFileInputStream( pdfFile ); - init(pdfFile, null, ""); - } - - /** - * Create a temporary file with the input stream. - * If the creation succeed, the {@linkplain #isTmpPDFFile} is set to true. - * This Temporary file will be deleted at end of the parse method - * @param input - * @return - * @throws IOException - */ - private File createTmpFile(InputStream input) throws IOException { - File tmpFile = null; - FileOutputStream fos = null; - try { - tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf"); - fos = new FileOutputStream(tmpFile); - IOUtils.copy(input, fos); - isTmpPDFFile = true; - return tmpFile; - } finally { - IOUtils.closeQuietly(input); - IOUtils.closeQuietly(fos); - } - } + private boolean allPagesParsed = false; + + private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class); + + /** + * true if the NonSequentialPDFParser is initialized by a + * InputStream, in this case a temporary file is created. At the end of the + * {@linkplain #parse()} method,the temporary file will be deleted. + */ + private boolean isTmpPDFFile = false; + + public static final String TMP_FILE_PREFIX = "tmpPDF"; // ------------------------------------------------------------------------ - /** - * Sets how many trailing bytes of PDF file are searched for - * EOF marker and 'startxref' marker. - * If not set we use default value {@link #DEFAULT_TRAIL_BYTECOUNT}. - * - * - * - *

In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined - * this value will be set on initialization but can be overwritten later.

- * - * @param byteCount number of trailing bytes + /** + * Constructs parser for given file using memory buffer. + * + * @param filename the filename of the pdf to be parsed + * + * @throws IOException If something went wrong. + */ + public NonSequentialPDFParser(String filename) throws IOException + { + this(new File(filename), null); + } + + /** + * Constructs parser for given file using given buffer for temporary + * storage. + * + * @param file the pdf to be parsed + * @param raBuf the buffer to be used for parsing + * + * @throws IOException If something went wrong. + */ + /** + * Constructs parser for given file using given buffer for temporary + * storage. + * + * @param file the pdf to be parsed + * @param raBuf the buffer to be used for parsing + * + * @throws IOException If something went wrong. + */ + public NonSequentialPDFParser(File file, RandomAccess raBuf) throws IOException + { + this(file, raBuf, ""); + } + + /** + * Constructs parser for given file using given buffer for temporary + * storage. + * + * @param file the pdf to be parsed + * @param raBuf the buffer to be used for parsing + * + * @throws IOException If something went wrong. + */ + /** + * Constructs parser for given file using given buffer for temporary + * storage. + * + * @param file the pdf to be parsed + * @param raBuf the buffer to be used for parsing + * @param decryptionPassword password to be used for decryption + * + * @throws IOException If something went wrong. */ - public void setEOFLookupRange( int byteCount ) + public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) throws IOException { - if ( byteCount > 15 ) + super(EMPTY_INPUT_STREAM, null, false); + pdfFile = file; + raStream = new RandomAccessBufferedFileInputStream(pdfFile); + init(file, raBuf, decryptionPassword); + } + + private void init(File file, RandomAccess raBuf, String decryptionPassword) throws IOException + { + String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE); + if (eofLookupRangeStr != null) + { + try { - readTrailBytes = byteCount; + setEOFLookupRange(Integer.parseInt(eofLookupRangeStr)); } + catch (NumberFormatException nfe) + { + LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '" + + eofLookupRangeStr + "'"); + } + } + + setDocument((raBuf == null) ? new COSDocument(new RandomAccessBuffer(), false) : new COSDocument(raBuf, false)); + + pdfSource = new PushBackInputStream(raStream, 4096); + + password = decryptionPassword; + } + + public NonSequentialPDFParser(InputStream input) throws IOException + { + super(EMPTY_INPUT_STREAM, null, false); + pdfFile = createTmpFile(input); + raStream = new RandomAccessBufferedFileInputStream(pdfFile); + init(pdfFile, null, ""); } - + + /** + * Create a temporary file with the input stream. If the creation succeed, + * the {@linkplain #isTmpPDFFile} is set to true. This Temporary file will + * be deleted at end of the parse method + * + * @param input + * @return + * @throws IOException + */ + private File createTmpFile(InputStream input) throws IOException + { + File tmpFile = null; + FileOutputStream fos = null; + try + { + tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf"); + fos = new FileOutputStream(tmpFile); + IOUtils.copy(input, fos); + isTmpPDFFile = true; + return tmpFile; + } + finally + { + IOUtils.closeQuietly(input); + IOUtils.closeQuietly(fos); + } + } + // ------------------------------------------------------------------------ /** - * The initial parse will first parse only the trailer, the xrefstart and - * all xref tables to have a pointer (offset) to all the pdf's objects. - * It can handle linearized pdfs, which will have an xref at the - * end pointing to an xref at the beginning of the file. - * Last the root object is parsed. + * Sets how many trailing bytes of PDF file are searched for EOF marker and + * 'startxref' marker. If not set we use default value + * {@link #DEFAULT_TRAIL_BYTECOUNT}. + * + * + * + *

In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined + * this value will be set on initialization but can be overwritten + * later.

+ * + * @param byteCount number of trailing bytes + */ + public void setEOFLookupRange(int byteCount) + { + if (byteCount > 15) + { + readTrailBytes = byteCount; + } + } + + // ------------------------------------------------------------------------ + /** + * The initial parse will first parse only the trailer, the xrefstart and + * all xref tables to have a pointer (offset) to all the pdf's objects. It + * can handle linearized pdfs, which will have an xref at the end pointing + * to an xref at the beginning of the file. Last the root object is parsed. * * @throws IOException */ protected void initialParse() throws IOException { final long startxrefOff = getStartxrefOffset(); - + // ---- parse startxref - setPdfSource( startxrefOff ); + setPdfSource(startxrefOff); parseStartXref(); - + final long xrefOffset = document.getStartXref(); - long prev = xrefOffset; - - // ---- parse whole chain of xref tables/object streams using PREV reference - while( prev > -1 ) + long prev = xrefOffset; + + // ---- parse whole chain of xref tables/object streams using PREV + // reference + while (prev > -1) { // seek to xref table - setPdfSource( prev ); - + setPdfSource(prev); + // -- parse xref - if ( pdfSource.peek() == 'x' ) + if (pdfSource.peek() == 'x') { // xref table and trailer // use existing parser to parse xref table - parseXrefTable( prev ); - + parseXrefTable(prev); + // parse the last trailer. - if ( ! parseTrailer() ) + if (!parseTrailer()) { - throw new IOException( "Expected trailer object at position: " + pdfSource.getOffset() ); + throw new IOException("Expected trailer object at position: " + pdfSource.getOffset()); } COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); - prev = trailer.getInt( COSName.PREV ); + prev = trailer.getInt(COSName.PREV); } else { // xref stream - prev = parseXrefObjStream( prev ); + prev = parseXrefObjStream(prev); } } - + // ---- build valid xrefs out of the xref chain - xrefTrailerResolver.setStartxref( xrefOffset ); - document.setTrailer( xrefTrailerResolver.getTrailer() ); - + xrefTrailerResolver.setStartxref(xrefOffset); + COSDictionary trailer = xrefTrailerResolver.getTrailer(); + document.setTrailer(trailer); + + // JIRA-1557 - ensure that all COSObject are loaded in the trailer + for (COSBase trailerEntry : trailer.getValues()) + { + if (trailerEntry instanceof COSObject) + { + COSObject tmpObj = (COSObject) trailerEntry; + parseObjectDynamically(tmpObj, true); + } + } // ---- prepare encryption if necessary - COSBase trailerEncryptItem = document.getTrailer().getItem( COSName.ENCRYPT ); - if ( trailerEncryptItem != null ) + COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT); + if (trailerEncryptItem != null) { - if ( trailerEncryptItem instanceof COSObject ) - { - COSObject trailerEncryptObj = (COSObject) trailerEncryptItem; - parseObjectDynamically( trailerEncryptObj, true ); - } - try - { - PDEncryptionDictionary encParameters = new PDEncryptionDictionary( document.getEncryptionDictionary() ); - - DecryptionMaterial decryptionMaterial = null; - if( keyStoreFilename != null ) - { - KeyStore ks = KeyStore.getInstance( "PKCS12" ); - ks.load( new FileInputStream( keyStoreFilename ), password.toCharArray() ); - - decryptionMaterial = new PublicKeyDecryptionMaterial( ks, alias, password ); - } - else - { - decryptionMaterial = new StandardDecryptionMaterial( password ); - } - - securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler( encParameters.getFilter() ); - securityHandler.prepareForDecryption( encParameters, document.getDocumentID(), decryptionMaterial ); - - AccessPermission permission = securityHandler.getCurrentAccessPermission(); - if ( ! permission.canExtractContent() ) - { - LOG.warn( "PDF file '" + pdfFile.getPath() + "' does not allow extracting content." ); - } - - } - catch ( Exception e ) - { - throw new IOException( "Error (" + e.getClass().getSimpleName() + - ") while creating security handler for decryption: " + - e.getMessage() /*, e // TODO: remove remark with Java 1.6 */); - } + if (trailerEncryptItem instanceof COSObject) + { + COSObject trailerEncryptObj = (COSObject) trailerEncryptItem; + parseObjectDynamically(trailerEncryptObj, true); + } + try + { + PDEncryptionDictionary encParameters = new PDEncryptionDictionary(document.getEncryptionDictionary()); + + DecryptionMaterial decryptionMaterial = null; + if (keyStoreFilename != null) + { + KeyStore ks = KeyStore.getInstance("PKCS12"); + ks.load(new FileInputStream(keyStoreFilename), password.toCharArray()); + + decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password); + } + else + { + decryptionMaterial = new StandardDecryptionMaterial(password); + } + + securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler(encParameters.getFilter()); + securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial); + + AccessPermission permission = securityHandler.getCurrentAccessPermission(); + if (!permission.canExtractContent()) + { + LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content."); + } + + } + catch (Exception e) + { + throw new IOException("Error (" + e.getClass().getSimpleName() + + ") while creating security handler for decryption: " + + e.getMessage() /*, e TODO: remove remark with Java 1.6 */); + } } - + // ---- parse catalog or root object - COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem( COSName.ROOT ); - - if ( root == null ) - { - throw new IOException( "Missing root object specification in trailer." ); - } - - parseObjectDynamically( root, false ); - + COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT); + + if (root == null) + { + throw new IOException("Missing root object specification in trailer."); + } + + parseObjectDynamically(root, false); + // ---- resolve all objects (including pages) - if ( ! parseMinimalCatalog ) + if (!parseMinimalCatalog) { COSObject catalogObj = document.getCatalog(); - if ( catalogObj != null ) + if (catalogObj != null) { - if ( catalogObj.getObject() instanceof COSDictionary ) + if (catalogObj.getObject() instanceof COSDictionary) { - parseDictObjects( (COSDictionary) catalogObj.getObject(), (COSName[]) null ); + parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null); allPagesParsed = true; document.setDecrypted(); } } } initialParseDone = true; - } - + // ------------------------------------------------------------------------ - /** Parses an xref object stream starting with indirect object id. - * - * @return value of PREV item in dictionary or -1 if no such item exists + /** + * Parses an xref object stream starting with indirect object id. + * + * @return value of PREV item in dictionary or -1 if no such + * item exists */ - private long parseXrefObjStream( long objByteOffset ) throws IOException + private long parseXrefObjStream(long objByteOffset) throws IOException { // ---- parse indirect object head readInt(); readInt(); - readPattern( OBJ_MARKER ); - - COSDictionary dict = parseCOSDictionary(); - COSStream xrefStream = parseCOSStream(dict, getDocument().getScratchFile() ); - parseXrefStream( xrefStream, (int) objByteOffset ); - - return dict.getLong( COSName.PREV ); + readPattern(OBJ_MARKER); + + COSDictionary dict = parseCOSDictionary(); + COSStream xrefStream = parseCOSStream(dict, getDocument().getScratchFile()); + parseXrefStream(xrefStream, (int) objByteOffset); + + return dict.getLong(COSName.PREV); } // ------------------------------------------------------------------------ @@ -413,190 +443,200 @@ public class NonSequentialPDFParser exte } /** Sets {@link #pdfSource} to start next parsing at given file offset. */ - protected final void setPdfSource( long fileOffset ) throws IOException + protected final void setPdfSource(long fileOffset) throws IOException { - - pdfSource.seek( fileOffset ); + + pdfSource.seek(fileOffset); // alternative using 'old fashioned' input stream - // if ( pdfSource != null ) - // pdfSource.close(); - // - // pdfSource = new PushBackInputStream( - // new BufferedInputStream( - // new FileInputStream( file ), 16384), 4096); - // pdfSource.skip( _fileOffset ); + // if ( pdfSource != null ) + // pdfSource.close(); + // + // pdfSource = new PushBackInputStream( + // new BufferedInputStream( + // new FileInputStream( file ), 16384), 4096); + // pdfSource.skip( _fileOffset ); } /** Enable handling of alternative pdfSource implementation. */ protected final void releasePdfSourceInputStream() throws IOException { - // if ( pdfSource != null ) - // pdfSource.close(); + // if ( pdfSource != null ) + // pdfSource.close(); } - private final void closeFileStream() throws IOException + private final void closeFileStream() throws IOException { - if ( pdfSource != null ) + if (pdfSource != null) { pdfSource.close(); } } // ------------------------------------------------------------------------ - /** Looks for and parses startxref. We first look for last '%%EOF' marker - * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via - * {@link #setEOFLookupRange(int)}) and go back to find startxref. */ + /** + * Looks for and parses startxref. We first look for last '%%EOF' marker + * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via + * {@link #setEOFLookupRange(int)}) and go back to find + * startxref. + */ protected final long getStartxrefOffset() throws IOException { - byte[] buf; - long skipBytes; - + byte[] buf; + long skipBytes; + // ---- read trailing bytes into buffer final long fileLen = pdfFile.length(); - + FileInputStream fIn = null; - try + try { - fIn = new FileInputStream( pdfFile ); - - final int trailByteCount = ( fileLen < readTrailBytes ) ? (int) fileLen : readTrailBytes; - buf = new byte[ trailByteCount ]; - fIn.skip( skipBytes = fileLen - trailByteCount ); - + fIn = new FileInputStream(pdfFile); + + final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes; + buf = new byte[trailByteCount]; + fIn.skip(skipBytes = fileLen - trailByteCount); + int off = 0; int readBytes; - while ( off < trailByteCount ) + while (off < trailByteCount) { - readBytes = fIn.read( buf, off, trailByteCount - off ); - // in order to not get stuck in a loop we check readBytes (this should never happen) - if ( readBytes < 1 ) + readBytes = fIn.read(buf, off, trailByteCount - off); + // in order to not get stuck in a loop we check readBytes (this + // should never happen) + if (readBytes < 1) { - throw new IOException( "No more bytes to read for trailing buffer, but expected: " + - ( trailByteCount - off ) ); + throw new IOException("No more bytes to read for trailing buffer, but expected: " + + (trailByteCount - off)); } off += readBytes; } } finally { - if ( fIn != null ) + if (fIn != null) { - try - { - fIn.close(); - } - catch ( IOException ioe ) - {} + try + { + fIn.close(); + } + catch (IOException ioe) + { + } } } - + // ---- find last '%%EOF' - int bufOff = lastIndexOf( EOF_MARKER, buf, buf.length ); - - if ( bufOff < 0 ) + int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length); + + if (bufOff < 0) { - throw new IOException( "Missing end of file marker '" + ( new String( EOF_MARKER ) ) + "'" ); - } + throw new IOException("Missing end of file marker '" + (new String(EOF_MARKER)) + "'"); + } // ---- find last startxref preceding EOF marker - bufOff = lastIndexOf( STARTXREF_MARKER, buf, bufOff ); - - if ( bufOff < 0 ) + bufOff = lastIndexOf(STARTXREF_MARKER, buf, bufOff); + + if (bufOff < 0) { - throw new IOException( "Missing 'startxref' marker." ); + throw new IOException("Missing 'startxref' marker."); } return skipBytes + bufOff; } // ------------------------------------------------------------------------ - /** Searches last appearance of pattern within buffer. Lookup before _lastOff - * and goes back until 0. - * - * @param pattern pattern to search for - * @param buf buffer to search pattern in - * @param endOff offset (exclusive) where lookup starts at - * - * @return start offset of pattern within buffer or -1 if pattern could not be found + /** + * Searches last appearance of pattern within buffer. Lookup before _lastOff + * and goes back until 0. + * + * @param pattern pattern to search for + * @param buf buffer to search pattern in + * @param endOff offset (exclusive) where lookup starts at + * + * @return start offset of pattern within buffer or -1 if + * pattern could not be found */ - protected int lastIndexOf( final char[] pattern, final byte[] buf, final int endOff ) + protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) { final int lastPatternChOff = pattern.length - 1; - - int bufOff = endOff; - int patOff = lastPatternChOff; - char lookupCh = pattern[ patOff ]; - - while ( --bufOff >= 0 ) + + int bufOff = endOff; + int patOff = lastPatternChOff; + char lookupCh = pattern[patOff]; + + while (--bufOff >= 0) { - if ( buf[ bufOff ] == lookupCh ) + if (buf[bufOff] == lookupCh) { - if ( --patOff < 0 ) + if (--patOff < 0) { // whole pattern matched return bufOff; } // matched current char, advance to preceding one - lookupCh = pattern[ patOff ]; + lookupCh = pattern[patOff]; } - else if ( patOff < lastPatternChOff ) + else if (patOff < lastPatternChOff) { - // no char match but already matched some chars; reset - lookupCh = pattern[ patOff = lastPatternChOff ]; + // no char match but already matched some chars; reset + lookupCh = pattern[patOff = lastPatternChOff]; } } - + return -1; } // ------------------------------------------------------------------------ - /** Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end. + /** + * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start + * and end. * * @throws IOException if pattern could not be read */ - protected final void readPattern( final char[] pattern ) throws IOException + protected final void readPattern(final char[] pattern) throws IOException { skipSpaces(); - - for ( char c : pattern ) + + for (char c : pattern) { - if ( pdfSource.read() != c ) + if (pdfSource.read() != c) { - throw new IOException( "Expected pattern '" + new String( pattern ) + - " but missed at character '" + c + "'" ); + throw new IOException("Expected pattern '" + new String(pattern) + " but missed at character '" + c + + "'"); } } - + skipSpaces(); } // ------------------------------------------------------------------------ private COSDictionary pagesDictionary = null; - - /** Returns PAGES {@link COSDictionary} object or throws {@link IOException} - * if PAGES dictionary does not exist. */ - private COSDictionary getPagesObject() throws IOException + + /** + * Returns PAGES {@link COSDictionary} object or throws {@link IOException} + * if PAGES dictionary does not exist. + */ + private COSDictionary getPagesObject() throws IOException { - if ( pagesDictionary != null ) + if (pagesDictionary != null) { return pagesDictionary; - } - COSObject pages = (COSObject) document.getCatalog().getItem( COSName.PAGES ); - - if ( pages == null ) - { - throw new IOException( "Missing PAGES entry in document catalog." ); - } - - COSBase object = parseObjectDynamically( pages, false ); - - if ( ! ( object instanceof COSDictionary ) ) + } + COSObject pages = (COSObject) document.getCatalog().getItem(COSName.PAGES); + + if (pages == null) + { + throw new IOException("Missing PAGES entry in document catalog."); + } + + COSBase object = parseObjectDynamically(pages, false); + + if (!(object instanceof COSDictionary)) { - throw new IOException( "PAGES not a dictionary object, but: " + - object.getClass().getSimpleName() ); + throw new IOException("PAGES not a dictionary object, but: " + object.getClass().getSimpleName()); } - + pagesDictionary = (COSDictionary) object; - + return pagesDictionary; } @@ -605,101 +645,111 @@ public class NonSequentialPDFParser exte /** * {@inheritDoc} */ - @Override - public void parse() throws IOException + @Override + public void parse() throws IOException { - boolean exceptionOccurred = true; // set to false if all is processed - + boolean exceptionOccurred = true; // set to false if all is processed + try { - if ( ! initialParseDone ) + if (!initialParseDone) { initialParse(); } - + final int pageCount = getPageNumber(); - - if ( ! allPagesParsed ) + + if (!allPagesParsed) { - for ( int pNr = 0; pNr < pageCount; pNr++ ) + for (int pNr = 0; pNr < pageCount; pNr++) { - getPage( pNr ); + getPage(pNr); } allPagesParsed = true; document.setDecrypted(); } - + exceptionOccurred = false; } finally { - try - { + try + { closeFileStream(); - } - catch ( IOException ioe ) - {} - - deleteTempFile(); - - if ( exceptionOccurred && ( document != null ) ) - { - try - { - document.close(); - } - catch ( IOException ioe ) - {} - } - } - } - - protected File getPdfFile() { - return this.pdfFile; - } - - /** - * Remove the temporary file. - * A temporary file is created if this class is instantiated with an InputStream - */ - protected void deleteTempFile() { - if (isTmpPDFFile) { - try { - if (!pdfFile.delete()) LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted"); - } catch (SecurityException e) { - LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted", e); - } - } - } + } + catch (IOException ioe) + { + } + + deleteTempFile(); + + if (exceptionOccurred && (document != null)) + { + try + { + document.close(); + } + catch (IOException ioe) + { + } + } + } + } + + protected File getPdfFile() + { + return this.pdfFile; + } + + /** + * Remove the temporary file. A temporary file is created if this class is + * instantiated with an InputStream + */ + protected void deleteTempFile() + { + if (isTmpPDFFile) + { + try + { + if (!pdfFile.delete()) + LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted"); + } + catch (SecurityException e) + { + LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted", e); + } + } + } + // ------------------------------------------------------------------------ - /** + /** * Returns security handler of the document or null if document - * is not encrypted or {@link #parse()} wasn't called before. - * + * is not encrypted or {@link #parse()} wasn't called before. + * * @return the security handler. */ - public SecurityHandler getSecurityHandler() + public SecurityHandler getSecurityHandler() { return securityHandler; } // ------------------------------------------------------------------------ /** - * This will get the PD document that was parsed. When you are done with + * This will get the PD document that was parsed. When you are done with * this document you must call close() on it to release resources. - * + * * Overwriting super method was necessary in order to set security handler. - * + * * @return The document at the PD layer. - * + * * @throws IOException If there is an error getting the document. */ @Override public PDDocument getPDDocument() throws IOException { PDDocument pdDocument = super.getPDDocument(); - if ( securityHandler != null ) - pdDocument.setSecurityHandler( securityHandler ); + if (securityHandler != null) + pdDocument.setSecurityHandler(securityHandler); return pdDocument; } @@ -709,16 +759,16 @@ public class NonSequentialPDFParser exte * * @return the number of pages. * - * @throws IOException if PAGES or other needed object is missing + * @throws IOException if PAGES or other needed object is missing */ public int getPageNumber() throws IOException { - int pageCount = getPagesObject().getInt( COSName.COUNT ); - - if ( pageCount < 0 ) + int pageCount = getPagesObject().getInt(COSName.COUNT); + + if (pageCount < 0) { - throw new IOException( "No page number specified." ); - } + throw new IOException("No page number specified."); + } return pageCount; } @@ -730,88 +780,88 @@ public class NonSequentialPDFParser exte * @return the page with the given pagenumber. * @throws IOException If something went wrong. */ - public PDPage getPage( int pageNr ) throws IOException + public PDPage getPage(int pageNr) throws IOException { getPagesObject(); - + // ---- get list of top level pages - COSArray kids = (COSArray) pagesDictionary.getDictionaryObject( COSName.KIDS ); - - if ( kids == null ) + COSArray kids = (COSArray) pagesDictionary.getDictionaryObject(COSName.KIDS); + + if (kids == null) { - throw new IOException( "Missing 'Kids' entry in pages dictionary." ); + throw new IOException("Missing 'Kids' entry in pages dictionary."); } - - // ---- get page we are looking for (possibly going recursively into subpages) - COSObject pageObj = getPageObject( pageNr, kids, 0 ); - - if ( pageObj == null ) + + // ---- get page we are looking for (possibly going recursively into + // subpages) + COSObject pageObj = getPageObject(pageNr, kids, 0); + + if (pageObj == null) { - throw new IOException( "Page " + pageNr + " not found." ); + throw new IOException("Page " + pageNr + " not found."); } - + // ---- parse all objects necessary to load page. COSDictionary pageDict = (COSDictionary) pageObj.getObject(); - - if ( parseMinimalCatalog && ( ! allPagesParsed ) ) + + if (parseMinimalCatalog && (!allPagesParsed)) { // parse page resources since we did not do this on start - COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject( COSName.RESOURCES ); - parseDictObjects( resDict ); + COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject(COSName.RESOURCES); + parseDictObjects(resDict); } - - return new PDPage( pageDict ); + + return new PDPage(pageDict); } /** - * Returns the object for a specific page. - * The page tree is made up of kids. The kids have COSArray with COSObjects - * inside of them. The COSObject can be parsed using the dynamic parsing method - * We want to only parse the minimum COSObjects and still return a complete page. - * ready to be used. + * Returns the object for a specific page. The page tree is made up of kids. + * The kids have COSArray with COSObjects inside of them. The COSObject can + * be parsed using the dynamic parsing method We want to only parse the + * minimum COSObjects and still return a complete page. ready to be used. * - * @param num the requested page number; numbering starts with 0 + * @param num the requested page number; numbering starts with 0 * @param startKids Kids array to start with looking up page number * @param startPageCount * - * @return page object or null if no such page exists + * @return page object or null if no such page exists * * @throws IOException */ - private COSObject getPageObject( int num, COSArray startKids, int startPageCount ) throws IOException + private COSObject getPageObject(int num, COSArray startKids, int startPageCount) throws IOException { - int curPageCount = startPageCount; - Iterator kidsIter = startKids.iterator(); - - while( kidsIter.hasNext() ) - { - COSObject obj = (COSObject) kidsIter.next(); - COSBase base = obj.getObject(); - if( base == null ) - { - base = parseObjectDynamically( obj, false ); - obj.setObject( base ); - } - - COSDictionary dic = (COSDictionary) base; - int count = dic.getInt( COSName.COUNT ); - if ( count >= 0 ) + int curPageCount = startPageCount; + Iterator kidsIter = startKids.iterator(); + + while (kidsIter.hasNext()) + { + COSObject obj = (COSObject) kidsIter.next(); + COSBase base = obj.getObject(); + if (base == null) + { + base = parseObjectDynamically(obj, false); + obj.setObject(base); + } + + COSDictionary dic = (COSDictionary) base; + int count = dic.getInt(COSName.COUNT); + if (count >= 0) { // skip this branch if requested page comes later - if( ( curPageCount + count ) <= num ) + if ((curPageCount + count) <= num) { curPageCount += count; continue; } } - - COSArray kids = (COSArray) dic.getDictionaryObject( COSName.KIDS ); - if( kids != null) + + COSArray kids = (COSArray) dic.getDictionaryObject(COSName.KIDS); + if (kids != null) { // recursively scan subpages - COSObject ans = getPageObject( num, kids, curPageCount ); + COSObject ans = getPageObject(num, kids, curPageCount); // if ans is not null, we got what we were looking for - if( ans != null ) + if (ans != null) { return ans; } @@ -819,290 +869,298 @@ public class NonSequentialPDFParser exte else { // found page? - if( curPageCount == num ) + if (curPageCount == num) { return obj; } - // page has no kids and it is not the page we are looking for + // page has no kids and it is not the page we are looking for curPageCount++; } } return null; } - /** Creates a unique object id using object number and object generation number. - * (requires object number < 2^31)) */ - private final long getObjectId( final COSObject obj ) + /** + * Creates a unique object id using object number and object generation + * number. (requires object number < 2^31)) + */ + private final long getObjectId(final COSObject obj) { - return ( obj.getObjectNumber().longValue() << 32 ) | obj.getGenerationNumber().longValue(); + return (obj.getObjectNumber().longValue() << 32) | obj.getGenerationNumber().longValue(); } - - /** Adds all from newObjects to toBeParsedList if it is not an COSObject - * or we didn't add this COSObject already (checked via addedObjects). */ - private final void addNewToList( final Queue toBeParsedList, - final Collection newObjects, - final Set addedObjects ) + + /** + * Adds all from newObjects to toBeParsedList if it is not an COSObject or + * we didn't add this COSObject already (checked via addedObjects). + */ + private final void addNewToList(final Queue toBeParsedList, final Collection newObjects, + final Set addedObjects) { - for ( COSBase newObject : newObjects ) + for (COSBase newObject : newObjects) { - if ( newObject instanceof COSObject ) + if (newObject instanceof COSObject) { - final long objId = getObjectId( (COSObject) newObject ); - if ( ! addedObjects.add( objId ) ) + final long objId = getObjectId((COSObject) newObject); + if (!addedObjects.add(objId)) { continue; } } - toBeParsedList.add( newObject ); + toBeParsedList.add(newObject); } } - /** Adds newObject to toBeParsedList if it is not an COSObject - * or we didn't add this COSObject already (checked via addedObjects). */ - private final void addNewToList( final Queue toBeParsedList, - final COSBase newObject, - final Set addedObjects ) + /** + * Adds newObject to toBeParsedList if it is not an COSObject or we didn't + * add this COSObject already (checked via addedObjects). + */ + private final void addNewToList(final Queue toBeParsedList, final COSBase newObject, + final Set addedObjects) { - if ( newObject instanceof COSObject ) + if (newObject instanceof COSObject) { - final long objId = getObjectId( (COSObject) newObject ); - if ( ! addedObjects.add( objId ) ) + final long objId = getObjectId((COSObject) newObject); + if (!addedObjects.add(objId)) { return; } } - toBeParsedList.add( newObject ); + toBeParsedList.add(newObject); } /** - * Will parse every object necessary to load a single page from the pdf document. - * We try our best to order objects according to offset in file before reading - * to minimize seek operations. + * Will parse every object necessary to load a single page from the pdf + * document. We try our best to order objects according to offset in file + * before reading to minimize seek operations. * * @param dict the COSObject from the parent pages. - * @param excludeObjects dictionary object reference entries with these names will not be parsed + * @param excludeObjects dictionary object reference entries with these + * names will not be parsed * * @throws IOException */ - private void parseDictObjects( COSDictionary dict, COSName... excludeObjects ) throws IOException + private void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException { // ---- create queue for objects waiting for further parsing - final Queue toBeParsedList = new LinkedList(); + final Queue toBeParsedList = new LinkedList(); // offset ordered object map - final TreeMap> objToBeParsed = new TreeMap>(); + final TreeMap> objToBeParsed = new TreeMap>(); // in case of compressed objects offset points to stmObj - final Set parsedObjects = new HashSet(); - final Set addedObjects = new HashSet(); - + final Set parsedObjects = new HashSet(); + final Set addedObjects = new HashSet(); + // ---- add objects not to be parsed to list of already parsed objects - if ( excludeObjects != null ) + if (excludeObjects != null) { - for ( COSName objName : excludeObjects ) + for (COSName objName : excludeObjects) { - COSBase baseObj = dict.getItem( objName ); - if ( baseObj instanceof COSObject ) + COSBase baseObj = dict.getItem(objName); + if (baseObj instanceof COSObject) { - parsedObjects.add( getObjectId( (COSObject) baseObj ) ); + parsedObjects.add(getObjectId((COSObject) baseObj)); } } } - - addNewToList( toBeParsedList, dict.getValues(), addedObjects ); - + + addNewToList(toBeParsedList, dict.getValues(), addedObjects); + // ---- go through objects to be parsed - while( ! ( toBeParsedList.isEmpty() && objToBeParsed.isEmpty() ) ) + while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) { // -- first get all COSObject from other kind of objects and - // put them in objToBeParsed; afterwards toBeParsedList is empty + // put them in objToBeParsed; afterwards toBeParsedList is empty COSBase baseObj; - while ( ( baseObj = toBeParsedList.poll() ) != null ) + while ((baseObj = toBeParsedList.poll()) != null) { - if ( baseObj instanceof COSStream ) + if (baseObj instanceof COSStream) { - addNewToList( toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects ); + addNewToList(toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects); } - else if ( baseObj instanceof COSDictionary ) + else if (baseObj instanceof COSDictionary) { - addNewToList( toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects ); + addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects); } - else if ( baseObj instanceof COSArray ) + else if (baseObj instanceof COSArray) { - final Iterator arrIter = ( (COSArray) baseObj ).iterator(); - while ( arrIter.hasNext() ) + final Iterator arrIter = ((COSArray) baseObj).iterator(); + while (arrIter.hasNext()) { - addNewToList( toBeParsedList, arrIter.next(), addedObjects ); + addNewToList(toBeParsedList, arrIter.next(), addedObjects); } } - else if ( baseObj instanceof COSObject ) + else if (baseObj instanceof COSObject) { - COSObject obj = (COSObject) baseObj; - long objId = getObjectId( obj ); - COSObjectKey objKey = new COSObjectKey( obj.getObjectNumber().intValue(), - obj.getGenerationNumber().intValue() ); - - if ( ! ( parsedObjects.contains( objId ) /*|| document.hasObjectInPool( objKey ) */ ) ) + COSObject obj = (COSObject) baseObj; + long objId = getObjectId(obj); + COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj.getGenerationNumber() + .intValue()); + + if (!(parsedObjects.contains(objId) /* + * || + * document.hasObjectInPool + * ( objKey ) + */)) { - Long fileOffset = xrefTrailerResolver.getXrefTable().get( objKey ); - // it is allowed that object references point to null, thus we have to test - if ( fileOffset != null ) + Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey); + // it is allowed that object references point to null, + // thus we have to test + if (fileOffset != null) { - if ( fileOffset > 0 ) + if (fileOffset > 0) { - objToBeParsed.put( fileOffset, Collections.singletonList( obj ) ); + objToBeParsed.put(fileOffset, Collections.singletonList(obj)); } - else + else { - // negative offset means we have a compressed object within object stream; + // negative offset means we have a compressed + // object within object stream; // get offset of object stream - fileOffset = xrefTrailerResolver.getXrefTable().get( new COSObjectKey( -fileOffset, 0 ) ); - if ( ( fileOffset == null ) || ( fileOffset <= 0 ) ) + fileOffset = xrefTrailerResolver.getXrefTable().get(new COSObjectKey(-fileOffset, 0)); + if ((fileOffset == null) || (fileOffset <= 0)) { - throw new IOException( "Invalid object stream xref object reference: " + fileOffset ); + throw new IOException("Invalid object stream xref object reference: " + fileOffset); } - - List stmObjects = objToBeParsed.get( fileOffset ); - if ( stmObjects == null ) + + List stmObjects = objToBeParsed.get(fileOffset); + if (stmObjects == null) { - objToBeParsed.put( fileOffset, stmObjects = new ArrayList() ); + objToBeParsed.put(fileOffset, stmObjects = new ArrayList()); } - stmObjects.add( obj ); + stmObjects.add(obj); } } else { // NULL object - COSObject pdfObject = document.getObjectFromPool( objKey ); - pdfObject.setObject( COSNull.NULL ); + COSObject pdfObject = document.getObjectFromPool(objKey); + pdfObject.setObject(COSNull.NULL); } } } } - + // ---- read first COSObject with smallest offset; - // resulting object will be added to toBeParsedList - if ( objToBeParsed.isEmpty() ) + // resulting object will be added to toBeParsedList + if (objToBeParsed.isEmpty()) { break; } - - for ( COSObject obj : objToBeParsed.remove( objToBeParsed.firstKey() ) ) + + for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) { - COSBase parsedObj = parseObjectDynamically( obj, false ); - - obj.setObject( parsedObj ); - addNewToList( toBeParsedList, parsedObj, addedObjects ); - - parsedObjects.add( getObjectId( obj ) ); + COSBase parsedObj = parseObjectDynamically(obj, false); + + obj.setObject(parsedObj); + addNewToList(toBeParsedList, parsedObj, addedObjects); + + parsedObjects.add(getObjectId(obj)); } } } - - /** - * This will parse the next object from the stream and add it to - * the local state. - * This is taken from {@link PDFParser} and reduced to parsing - * an indirect object. - * - * @param obj object to be parsed (we only take object number and generation number for lookup start offset) - * @param requireExistingNotCompressedObj if true object to be parsed must - * not be contained within compressed stream - * @return the parsed object (which is also added to document object) + + /** + * This will parse the next object from the stream and add it to the local + * state. This is taken from {@link PDFParser} and reduced to parsing an + * indirect object. + * + * @param obj object to be parsed (we only take object number and generation + * number for lookup start offset) + * @param requireExistingNotCompressedObj if true object to be + * parsed must not be contained within compressed stream + * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ - protected final COSBase parseObjectDynamically( COSObject obj, boolean requireExistingNotCompressedObj ) - throws IOException + protected final COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj) + throws IOException { - return parseObjectDynamically( obj.getObjectNumber().intValue(), - obj.getGenerationNumber().intValue(), - requireExistingNotCompressedObj ); + return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber().intValue(), + requireExistingNotCompressedObj); } /** - * This will parse the next object from the stream and add it to - * the local state. - * This is taken from {@link PDFParser} and reduced to parsing - * an indirect object. - * - * @param objNr object number of object to be parsed - * @param objGenNr object generation number of object to be parsed - * @param requireExistingNotCompressedObj if true the object to be parsed must be defined - * in xref (comment: null objects may be missing from xref) and - * it must not be a compressed object within object stream - * (this is used to circumvent being stuck in a loop in a malicious PDF) + * This will parse the next object from the stream and add it to the local + * state. This is taken from {@link PDFParser} and reduced to parsing an + * indirect object. * - * @return the parsed object (which is also added to document object) + * @param objNr object number of object to be parsed + * @param objGenNr object generation number of object to be parsed + * @param requireExistingNotCompressedObj if true the object to + * be parsed must be defined in xref (comment: null objects may + * be missing from xref) and it must not be a compressed object + * within object stream (this is used to circumvent being stuck + * in a loop in a malicious PDF) + * + * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ - protected COSBase parseObjectDynamically( int objNr, int objGenNr, boolean requireExistingNotCompressedObj ) - throws IOException + protected COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj) + throws IOException { // ---- create object key and get object (container) from pool - final COSObjectKey objKey = new COSObjectKey( objNr, objGenNr ); - final COSObject pdfObject = document.getObjectFromPool( objKey ); - - if ( pdfObject.getObject() == null ) + final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); + final COSObject pdfObject = document.getObjectFromPool(objKey); + + if (pdfObject.getObject() == null) { // not previously parsed // ---- read offset or object stream object number from xref table - Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get( objKey ); + Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents - if ( requireExistingNotCompressedObj && - ( ( offsetOrObjstmObNr == null ) || ( offsetOrObjstmObNr <= 0 ) ) ) - { - throw new IOException( "Object must be defined and must not be compressed object: " + - objKey.getNumber() + ":" + objKey.getGeneration() ); + if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) + { + throw new IOException("Object must be defined and must not be compressed object: " + objKey.getNumber() + + ":" + objKey.getGeneration()); } - - if ( offsetOrObjstmObNr == null ) + + if (offsetOrObjstmObNr == null) { // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9) - pdfObject.setObject( COSNull.NULL ); + pdfObject.setObject(COSNull.NULL); } - else if ( offsetOrObjstmObNr > 0 ) + else if (offsetOrObjstmObNr > 0) { // offset of indirect object in file // ---- go to object start - setPdfSource( offsetOrObjstmObNr ); - + setPdfSource(offsetOrObjstmObNr); + // ---- we must have an indirect object - final int readObjNr = readInt(); + final int readObjNr = readInt(); final int readObjGen = readInt(); - readPattern( OBJ_MARKER ); - + readPattern(OBJ_MARKER); + // ---- consistency check - if ( ( readObjNr != objKey.getNumber() ) || - ( readObjGen != objKey.getGeneration() ) ) + if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) { - throw new IOException( "XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + - " points to wrong object: " + readObjNr + ":" + readObjGen ); + throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + + " points to wrong object: " + readObjNr + ":" + readObjGen); } - + skipSpaces(); - COSBase pb = parseDirObject(); - String endObjectKey = readString(); - - if ( endObjectKey.equals( "stream" ) ) - { - pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") ); - pdfSource.unread( ' ' ); - if( pb instanceof COSDictionary ) + COSBase pb = parseDirObject(); + String endObjectKey = readString(); + + if (endObjectKey.equals("stream")) + { + pdfSource.unread(endObjectKey.getBytes("ISO-8859-1")); + pdfSource.unread(' '); + if (pb instanceof COSDictionary) { - COSStream stream = parseCOSStream( (COSDictionary)pb, - getDocument().getScratchFile() ); - - if ( securityHandler != null ) + COSStream stream = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile()); + + if (securityHandler != null) { - try + try { - securityHandler.decryptStream(stream, objNr, objGenNr ); - } - catch ( CryptographyException ce ) + securityHandler.decryptStream(stream, objNr, objGenNr); + } + catch (CryptographyException ce) { - throw new IOException( "Error decrypting stream object " + objNr + ": " + ce.getMessage() - /*, ce // TODO: remove remark with Java 1.6 */ ); + throw new IOException("Error decrypting stream object " + objNr + ": " + + ce.getMessage() + /* , ce // TODO: remove remark with Java 1.6 */); } } pb = stream; @@ -1110,179 +1168,185 @@ public class NonSequentialPDFParser exte else { // this is not legal - // the combination of a dict and the stream/endstream forms a complete stream object - throw new IOException( "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")." ); + // the combination of a dict and the stream/endstream + // forms a complete stream object + throw new IOException("Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")."); } skipSpaces(); endObjectKey = readLine(); - + // we have case with a second 'endstream' before endobj - if ( ! endObjectKey.startsWith( "endobj" ) ) + if (!endObjectKey.startsWith("endobj")) { - if ( endObjectKey.startsWith( "endstream" ) ) + if (endObjectKey.startsWith("endstream")) { - endObjectKey = endObjectKey.substring( 9 ).trim(); - if ( endObjectKey.length() == 0 ) + endObjectKey = endObjectKey.substring(9).trim(); + if (endObjectKey.length() == 0) { // no other characters in extra endstream line - endObjectKey = readLine(); // read next line + endObjectKey = readLine(); // read next line } } } } - else if ( securityHandler != null ) + else if (securityHandler != null) { // decrypt - if ( pb instanceof COSString ) + if (pb instanceof COSString) { - decrypt( (COSString) pb, objNr, objGenNr ); + decrypt((COSString) pb, objNr, objGenNr); } - else if ( pb instanceof COSDictionary ) + else if (pb instanceof COSDictionary) { - for( Entry entry : ((COSDictionary) pb).entrySet() ) + for (Entry entry : ((COSDictionary) pb).entrySet()) { - // TODO: specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary - if ( entry.getValue() instanceof COSString ) + // TODO: specially handle 'Contents' entry of + // signature dictionary like in + // SecurityHandler#decryptDictionary + if (entry.getValue() instanceof COSString) { - decrypt( (COSString) entry.getValue(), objNr, objGenNr ); + decrypt((COSString) entry.getValue(), objNr, objGenNr); } } } - else if ( pb instanceof COSArray ) + else if (pb instanceof COSArray) { final COSArray array = (COSArray) pb; - for( int aIdx = 0, len = array.size(); aIdx < len; aIdx++ ) + for (int aIdx = 0, len = array.size(); aIdx < len; aIdx++) { - if ( array.get( aIdx ) instanceof COSString ) + if (array.get(aIdx) instanceof COSString) { - decrypt( (COSString) array.get( aIdx ), objNr, objGenNr ); + decrypt((COSString) array.get(aIdx), objNr, objGenNr); } } } } - - pdfObject.setObject( pb ); - - if ( ! endObjectKey.startsWith( "endobj" ) ) + + pdfObject.setObject(pb); + + if (!endObjectKey.startsWith("endobj")) { - throw new IOException( "Object (" + readObjNr + ":" + readObjGen + - ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'." ); + throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " + + offsetOrObjstmObNr + " does not end with 'endobj'."); } - + releasePdfSourceInputStream(); - + } else { - // xref value is object nr of object stream containing object to be parsed; - // since our object was not found it means object stream was not parsed so far - final int objstmObjNr = (int) ( - offsetOrObjstmObNr ); - final COSBase objstmBaseObj = parseObjectDynamically( objstmObjNr, 0, true ); - if ( objstmBaseObj instanceof COSStream ) + // xref value is object nr of object stream containing object to + // be parsed; + // since our object was not found it means object stream was not + // parsed so far + final int objstmObjNr = (int) (-offsetOrObjstmObNr); + final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true); + if (objstmBaseObj instanceof COSStream) { // parse object stream - PDFObjectStreamParser parser = - new PDFObjectStreamParser( (COSStream) objstmBaseObj, document, forceParsing ); + PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document, + forceParsing); parser.parse(); - - // get set of object numbers referenced for this object stream - final Set refObjNrs = xrefTrailerResolver.getContainedObjectNumbers( objstmObjNr ); - - // register all objects which are referenced to be contained in object stream - for( COSObject next : parser.getObjects() ) + + // get set of object numbers referenced for this object + // stream + final Set refObjNrs = xrefTrailerResolver.getContainedObjectNumbers(objstmObjNr); + + // register all objects which are referenced to be contained + // in object stream + for (COSObject next : parser.getObjects()) { - COSObjectKey stmObjKey = new COSObjectKey( next ); - if ( refObjNrs.contains( stmObjKey.getNumber() ) ) + COSObjectKey stmObjKey = new COSObjectKey(next); + if (refObjNrs.contains(stmObjKey.getNumber())) { - COSObject stmObj = document.getObjectFromPool( stmObjKey ); - stmObj.setObject( next.getObject() ); + COSObject stmObj = document.getObjectFromPool(stmObjKey); + stmObj.setObject(next.getObject()); } } } } - } + } return pdfObject.getObject(); } - + // ------------------------------------------------------------------------ /** Decrypts given COSString. */ - protected final void decrypt( COSString str, long objNr, long objGenNr ) - throws IOException + protected final void decrypt(COSString str, long objNr, long objGenNr) throws IOException { - try + try { - securityHandler.decryptString( str, objNr, objGenNr ); + securityHandler.decryptString(str, objNr, objGenNr); } - catch ( CryptographyException ce ) + catch (CryptographyException ce) { - throw new IOException( "Error decrypting string: " + ce.getMessage() - /*, ce // TODO: remove remark with Java 1.6 */ ); - } + throw new IOException("Error decrypting string: " + ce.getMessage() + /* , ce // TODO: remove remark with Java 1.6 */); + } } - + // ------------------------------------------------------------------------ private boolean inGetLength = false; - + /** Returns length value referred to or defined in given object. */ - private COSNumber getLength( final COSBase lengthBaseObj ) throws IOException + private COSNumber getLength(final COSBase lengthBaseObj) throws IOException { - if ( lengthBaseObj == null ) + if (lengthBaseObj == null) { return null; } - - if ( inGetLength ) + + if (inGetLength) { - throw new IOException( "Loop while reading length from " + lengthBaseObj ); + throw new IOException("Loop while reading length from " + lengthBaseObj); } - + COSNumber retVal = null; - + try { inGetLength = true; - + // ---- maybe length was given directly - if ( lengthBaseObj instanceof COSNumber ) + if (lengthBaseObj instanceof COSNumber) { retVal = (COSNumber) lengthBaseObj; } // ---- length in referenced object - else if ( lengthBaseObj instanceof COSObject ) + else if (lengthBaseObj instanceof COSObject) { COSObject lengthObj = (COSObject) lengthBaseObj; - - if ( lengthObj.getObject() == null ) + + if (lengthObj.getObject() == null) { // not read so far - + // keep current stream position final long curFileOffset = getPdfSourceOffset(); releasePdfSourceInputStream(); - - parseObjectDynamically( lengthObj, true ); - + + parseObjectDynamically(lengthObj, true); + // reset current stream position - setPdfSource( curFileOffset ); - - if ( lengthObj.getObject() == null ) + setPdfSource(curFileOffset); + + if (lengthObj.getObject() == null) { - throw new IOException( "Length object content was not read." ); + throw new IOException("Length object content was not read."); } } - - if ( ! ( lengthObj.getObject() instanceof COSNumber ) ) + + if (!(lengthObj.getObject() instanceof COSNumber)) { - throw new IOException( "Wrong type of referenced length object " + lengthObj + ": " + - lengthObj.getObject().getClass().getSimpleName() ); + throw new IOException("Wrong type of referenced length object " + lengthObj + ": " + + lengthObj.getObject().getClass().getSimpleName()); } - + retVal = (COSNumber) lengthObj.getObject(); - + } else { - throw new IOException( "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName() ); + throw new IOException("Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName()); } } finally @@ -1291,112 +1355,236 @@ public class NonSequentialPDFParser exte } return retVal; } - + // ------------------------------------------------------------------------ - private final int streamCopyBufLen = 8192; - private final byte[] streamCopyBuf = new byte[ streamCopyBufLen ]; - + private final int streamCopyBufLen = 8192; + private final byte[] streamCopyBuf = new byte[streamCopyBufLen]; + /** * This will read a COSStream from the input stream using length attribute - * within dictionary. - * If length attribute is a indirect reference it is first resolved to get - * the stream length. This means we copy stream data without testing for - * 'endstream' or 'endobj' and thus it is no problem if these keywords - * occur within stream. - * We require 'endstream' to be found after stream data is read. - * - * @param dic dictionary that goes with this stream. - * @param file file to write the stream to when reading. - * + * within dictionary. If length attribute is a indirect reference it is + * first resolved to get the stream length. This means we copy stream data + * without testing for 'endstream' or 'endobj' and thus it is no problem if + * these keywords occur within stream. We require 'endstream' to be found + * after stream data is read. + * + * @param dic dictionary that goes with this stream. + * @param file file to write the stream to when reading. + * * @return parsed pdf stream. - * - * @throws IOException if an error occurred reading the stream, like problems - * with reading length attribute, stream does not end with 'endstream' - * after data read, stream too short etc. + * + * @throws IOException if an error occurred reading the stream, like + * problems with reading length attribute, stream does not end + * with 'endstream' after data read, stream too short etc. */ @Override - protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException + protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException { - final COSStream stream = new COSStream( dic, file ); + final COSStream stream = new COSStream(dic, file); OutputStream out = null; try { - readString(); // read 'stream'; this was already tested in parseObjectsDynamically() - + readString(); // read 'stream'; this was already tested in + // parseObjectsDynamically() + // ---- skip whitespaces before start of data - // PDF Ref 1.7, chap. 3.2.7: - // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else. + // PDF Ref 1.7, chap. 3.2.7: + // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF + // but nothing else. { int whitespace = pdfSource.read(); - - //see brother_scan_cover.pdf, it adds whitespaces - //after the stream but before the start of the - //data, so just read those first + + // see brother_scan_cover.pdf, it adds whitespaces + // after the stream but before the start of the + // data, so just read those first while (whitespace == 0x20) { whitespace = pdfSource.read(); } - - if( whitespace == 0x0D ) + + if (whitespace == 0x0D) { whitespace = pdfSource.read(); - if( whitespace != 0x0A ) + if (whitespace != 0x0A) { - // the spec says this is invalid but it happens in the real + // the spec says this is invalid but it happens in the + // real // world so we must support it - pdfSource.unread( whitespace ); + pdfSource.unread(whitespace); } } else if (whitespace != 0x0A) { - // no whitespace after 'stream'; PDF ref. says 'should' so that is ok - pdfSource.unread( whitespace ); + // no whitespace after 'stream'; PDF ref. says 'should' so + // that is ok + pdfSource.unread(whitespace); } - } - - /*This needs to be dic.getItem because when we are parsing, the underlying object - * might still be null. + } + + /* + * This needs to be dic.getItem because when we are parsing, the + * underlying object might still be null. */ - COSNumber streamLengthObj = getLength( dic.getItem( COSName.LENGTH ) ); - if ( streamLengthObj == null ) + COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH)); + if (streamLengthObj == null) { - throw new IOException( "Missing length for stream." ); + throw new IOException("Missing length for stream."); } - + // ---- get output stream to copy data to - out = stream.createFilteredStream( streamLengthObj ); - + out = stream.createFilteredStream(streamLengthObj); + long remainBytes = streamLengthObj.longValue(); - - while ( remainBytes > 0 ) - { - final int readBytes = pdfSource.read( streamCopyBuf, 0, - ( remainBytes > streamCopyBufLen ) ? streamCopyBufLen : (int) remainBytes ); - if ( readBytes <= 0 ) - { [... 168 lines stripped ...]