pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From le...@apache.org
Subject svn commit: r1685643 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BaseParser.java COSParser.java
Date Mon, 15 Jun 2015 17:32:35 GMT
Author: lehmi
Date: Mon Jun 15 17:32:35 2015
New Revision: 1685643

URL: http://svn.apache.org/r1685643
Log:
PDFBOX-2576: move method and make it private

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1685643&r1=1685642&r2=1685643&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Mon Jun 15 17:32:35 2015
@@ -18,7 +18,6 @@ package org.apache.pdfbox.pdfparser;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.io.OutputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -64,34 +63,19 @@ public abstract class BaseParser
      */
     private static final Log LOG = LogFactory.getLog(BaseParser.class);
 
-    private static final int E = 'e';
-    private static final int N = 'n';
-    private static final int D = 'd';
-
-    private static final int S = 's';
-    private static final int T = 't';
-    private static final int R = 'r';
-    private static final int A = 'a';
-    private static final int M = 'm';
-
-    private static final int O = 'o';
-    private static final int B = 'b';
-    private static final int J = 'j';
-
-    private static final int STRMBUFLEN = 2048;
-    private final byte[] strmBuf    = new byte[ STRMBUFLEN ];
-
-    /**
-     * This is a byte array that will be used for comparisons.
-     */
-    public static final byte[] ENDSTREAM =
-        new byte[] { E, N, D, S, T, R, E, A, M };
-
-    /**
-     * This is a byte array that will be used for comparisons.
-     */
-    public static final byte[] ENDOBJ =
-        new byte[] { E, N, D, O, B, J };
+    protected static final int E = 'e';
+    protected static final int N = 'n';
+    protected static final int D = 'd';
+
+    protected static final int S = 's';
+    protected static final int T = 't';
+    protected static final int R = 'r';
+    protected static final int A = 'a';
+    protected static final int M = 'm';
+
+    protected static final int O = 'o';
+    protected static final int B = 'b';
+    protected static final int J = 'j';
 
     /**
      * This is a string constant that will be used for comparisons.
@@ -386,118 +370,6 @@ public abstract class BaseParser
     }
 
     /**
-     * This method will read through the current stream object until
-     * we find the keyword "endstream" meaning we're at the end of this
-     * object. Some pdf files, however, forget to write some endstream tags
-     * and just close off objects with an "endobj" tag so we have to handle
-     * this case as well.
-     * 
-     * This method is optimized using buffered IO and reduced number of
-     * byte compare operations.
-     * 
-     * @param out  stream we write out to.
-     * 
-     * @throws IOException if something went wrong
-     */
-    protected void readUntilEndStream( final OutputStream out ) throws IOException
-    {
-        int bufSize;
-        int charMatchCount = 0;
-        byte[] keyw = ENDSTREAM;
-        
-        // last character position of shortest keyword ('endobj')
-        final int quickTestOffset = 5;
-        
-        // read next chunk into buffer; already matched chars are added to beginning of buffer
-        while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, STRMBUFLEN - charMatchCount ) ) > 0 ) 
-        {
-            bufSize += charMatchCount;
-            
-            int bIdx = charMatchCount;
-            int quickTestIdx;
-        
-            // iterate over buffer, trying to find keyword match
-            for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ ) 
-            {
-                // reduce compare operations by first test last character we would have to
-                // match if current one matches; if it is not a character from keywords
-                // we can move behind the test character;
-                // this shortcut is inspired by the Boyer-Moore string search algorithm
-                // and can reduce parsing time by approx. 20%
-                if ( ( charMatchCount == 0 ) &&
-                         ( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) ) 
-                {
-                    
-                    final byte ch = strmBuf[quickTestIdx];
-                    if ( ( ch > 't' ) || ( ch < 'a' ) ) 
-                    {
-                        // last character we would have to match if current character would match
-                        // is not a character from keywords -> jump behind and start over
-                        bIdx = quickTestIdx;
-                        continue;
-                    }
-                }
-                
-                // could be negative - but we only compare to ASCII
-                final byte ch = strmBuf[bIdx];
-            
-                if ( ch == keyw[ charMatchCount ] ) 
-                {
-                    if ( ++charMatchCount == keyw.length ) 
-                    {
-                        // match found
-                        bIdx++;
-                        break;
-                    }
-                } 
-                else 
-                {
-                    if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) ) 
-                    {
-                        // maybe ENDSTREAM is missing but we could have ENDOBJ
-                        keyw = ENDOBJ;
-                        charMatchCount++;
-                    } 
-                    else 
-                    {
-                        // no match; incrementing match start by 1 would be dumb since we already know matched chars
-                        // depending on current char read we may already have beginning of a new match:
-                        // 'e': first char matched;
-                        // 'n': if we are at match position idx 7 we already read 'e' thus 2 chars matched
-                        // for each other char we have to start matching first keyword char beginning with next 
-                        // read position
-                        charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
-                        // search again for 'endstream'
-                        keyw = ENDSTREAM;
-                    }
-                } 
-            }  // for
-            
-            int contentBytes = Math.max( 0, bIdx - charMatchCount );
-            
-            // write buffer content until first matched char to output stream
-            if ( contentBytes > 0 )
-            {
-                out.write( strmBuf, 0, contentBytes );
-            }
-            if ( charMatchCount == keyw.length ) 
-            {
-                // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
-                pdfSource.rewind( bufSize - contentBytes );
-                break;
-            } 
-            else 
-            {
-                // copy matched chars at start of buffer
-                System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
-            }
-            
-        }
-        // this writes a lonely CR or drops trailing CR LF and LF
-        out.flush();
-    }
-    
-    /**
      * This is really a bug in the Document creators code, but it caused a crash
      * in PDFBox, the first bug was in this format:
      * /Title ( (5)

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1685643&r1=1685642&r2=1685643&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Mon Jun 15 17:32:35 2015
@@ -72,10 +72,17 @@ public class COSParser extends BaseParse
     private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
    private static final char[] STARTXREF = new char[] { 's','t','a','r','t','x','r','e','f' };
 
+    private static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M };
+
+    private static final byte[] ENDOBJ = new byte[] { E, N, D, O, B, J };
+
     private static final long MINIMUM_SEARCH_OFFSET = 6;
     
     private static final int X = 'x';
 
+    private static final int STRMBUFLEN = 2048;
+    private final byte[] strmBuf    = new byte[ STRMBUFLEN ];
+
     /**
      * Only parse the PDF file minimally allowing access to basic information.
      */
@@ -953,6 +960,118 @@ public class COSParser extends BaseParse
         return stream;
     }
 
+    /**
+     * This method will read through the current stream object until
+     * we find the keyword "endstream" meaning we're at the end of this
+     * object. Some pdf files, however, forget to write some endstream tags
+     * and just close off objects with an "endobj" tag so we have to handle
+     * this case as well.
+     * 
+     * This method is optimized using buffered IO and reduced number of
+     * byte compare operations.
+     * 
+     * @param out  stream we write out to.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void readUntilEndStream( final OutputStream out ) throws IOException
+    {
+        int bufSize;
+        int charMatchCount = 0;
+        byte[] keyw = ENDSTREAM;
+        
+        // last character position of shortest keyword ('endobj')
+        final int quickTestOffset = 5;
+        
+        // read next chunk into buffer; already matched chars are added to beginning of buffer
+        while ( ( bufSize = pdfSource.read( strmBuf, charMatchCount, STRMBUFLEN - charMatchCount ) ) > 0 ) 
+        {
+            bufSize += charMatchCount;
+            
+            int bIdx = charMatchCount;
+            int quickTestIdx;
+        
+            // iterate over buffer, trying to find keyword match
+            for ( int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++ ) 
+            {
+                // reduce compare operations by first test last character we would have to
+                // match if current one matches; if it is not a character from keywords
+                // we can move behind the test character;
+                // this shortcut is inspired by the Boyer-Moore string search algorithm
+                // and can reduce parsing time by approx. 20%
+                if ( ( charMatchCount == 0 ) &&
+                         ( ( quickTestIdx = bIdx + quickTestOffset ) < maxQuicktestIdx ) ) 
+                {
+                    
+                    final byte ch = strmBuf[quickTestIdx];
+                    if ( ( ch > 't' ) || ( ch < 'a' ) ) 
+                    {
+                        // last character we would have to match if current character would match
+                        // is not a character from keywords -> jump behind and start over
+                        bIdx = quickTestIdx;
+                        continue;
+                    }
+                }
+                
+                // could be negative - but we only compare to ASCII
+                final byte ch = strmBuf[bIdx];
+            
+                if ( ch == keyw[ charMatchCount ] ) 
+                {
+                    if ( ++charMatchCount == keyw.length ) 
+                    {
+                        // match found
+                        bIdx++;
+                        break;
+                    }
+                } 
+                else 
+                {
+                    if ( ( charMatchCount == 3 ) && ( ch == ENDOBJ[ charMatchCount ] ) ) 
+                    {
+                        // maybe ENDSTREAM is missing but we could have ENDOBJ
+                        keyw = ENDOBJ;
+                        charMatchCount++;
+                    } 
+                    else 
+                    {
+                        // no match; incrementing match start by 1 would be dumb since we already know matched chars
+                        // depending on current char read we may already have beginning of a new match:
+                        // 'e': first char matched;
+                        // 'n': if we are at match position idx 7 we already read 'e' thus 2 chars matched
+                        // for each other char we have to start matching first keyword char beginning with next 
+                        // read position
+                        charMatchCount = ( ch == E ) ? 1 : ( ( ch == N ) && ( charMatchCount == 7 ) ) ? 2 : 0;
+                        // search again for 'endstream'
+                        keyw = ENDSTREAM;
+                    }
+                } 
+            }  // for
+            
+            int contentBytes = Math.max( 0, bIdx - charMatchCount );
+            
+            // write buffer content until first matched char to output stream
+            if ( contentBytes > 0 )
+            {
+                out.write( strmBuf, 0, contentBytes );
+            }
+            if ( charMatchCount == keyw.length ) 
+            {
+                // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
+                pdfSource.rewind( bufSize - contentBytes );
+                break;
+            } 
+            else 
+            {
+                // copy matched chars at start of buffer
+                System.arraycopy( keyw, 0, strmBuf, 0, charMatchCount );
+            }
+            
+        }
+        // this writes a lonely CR or drops trailing CR LF and LF
+        out.flush();
+    }
+
     private void readValidStream(OutputStream out, COSNumber streamLengthObj) throws IOException
     {
         long remainBytes = streamLengthObj.longValue();



Mime
View raw message