pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From le...@apache.org
Subject svn commit: r1654017 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BaseParser.java NonSequentialPDFParser.java
Date Thu, 22 Jan 2015 19:18:27 GMT
Author: lehmi
Date: Thu Jan 22 19:18:27 2015
New Revision: 1654017

URL: http://svn.apache.org/r1654017
Log:
PDFBOX-2610: readLine now treats CR+LF as one EOL

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1654017&r1=1654016&r2=1654017&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Jan 22 19:18:27 2015
@@ -1431,8 +1431,8 @@ public abstract class BaseParser impleme
 
     /**
      * This will read bytes until the first end of line marker occurs.
-     * Note: if you later unread the results of this function, you'll
-     * need to add a newline character to the end of the string.
+     * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes
+     * which is an important detail if one wants to unread the line.
      *
      * @return The characters between the current position and the end of the line.
      *
@@ -1450,12 +1450,18 @@ public abstract class BaseParser impleme
         int c;
         while ((c = pdfSource.read()) != -1)
         {
+            // CR and LF are valid EOLs
             if (isEOL(c))
             {
                 break;
             }
             buffer.append( (char)c );
         }
+        // CR+LF is also a valid EOL 
+        if (isCR(c) && isLF(pdfSource.peek()))
+        {
+            pdfSource.read();
+        }
         return buffer.toString();
     }
 
@@ -1479,9 +1485,19 @@ public abstract class BaseParser impleme
      */
     protected boolean isEOL(int c)
     {
-        return ASCII_LF == c || ASCII_CR == c;
+        return isLF(c) || isCR(c);
     }
 
+    private boolean isLF(int c)
+    {
+        return ASCII_LF == c;
+    }
+
+    private boolean isCR(int c)
+    {
+        return ASCII_CR == c;
+    }
+    
     /**
      * This will tell if the next byte is whitespace or not.
      *

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1654017&r1=1654016&r2=1654017&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Thu Jan 22 19:18:27 2015
@@ -2161,6 +2161,7 @@ public class NonSequentialPDFParser exte
             return false;
         }
         //read "trailer"
+        long currentOffset = pdfSource.getOffset();
         String nextLine = readLine();
         if( !nextLine.trim().equals( "trailer" ) )
         {
@@ -2170,10 +2171,10 @@ public class NonSequentialPDFParser exte
             // Acrobat reader can also deal with this.
             if (nextLine.startsWith("trailer"))
             {
-                byte[] b = nextLine.getBytes(ISO_8859_1);
+                // we can't just unread a portion of the read data as we don't know if the EOL consists of 1 or 2 bytes
                 int len = "trailer".length();
-                pdfSource.unread('\n');
-                pdfSource.unread(b, len, b.length-len);
+                // jump back right after "trailer"
+                pdfSource.seek(currentOffset + len);
             }
             else
             {



Mime
View raw message