pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jahew...@apache.org
Subject svn commit: r1603606 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/text/ main/java/org/apache/pdfbox/util/ test/java/org/apache/pdfbox/util/
Date Wed, 18 Jun 2014 19:53:38 GMT
Author: jahewson
Date: Wed Jun 18 19:53:38 2014
New Revision: 1603606

URL: http://svn.apache.org/r1603606
Log:
PDFBOX-2145: Clean up TextPosition

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Wed Jun 18 19:53:38 2014
@@ -16,7 +16,6 @@
  */
 package org.apache.pdfbox.text;
 
-import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.util.Matrix;
 
@@ -25,89 +24,33 @@ import org.apache.pdfbox.util.Matrix;
  *
  * @author Ben Litchfield
  */
-public class TextPosition
+public final class TextPosition
 {
     // text matrix for the start of the text object, coordinates are in display units
     // and have not been adjusted
-    private Matrix textPos;
+    private final Matrix textPos;
 
     // ending X and Y coordinates in display units
-    private float endX;
-    private float endY;
+    private final float endX;
+    private final float endY;
 
-    private float maxTextHeight; // maximum height of text, in display units
-    private int rot; // 0, 90, 180, 270 degrees of page rotation
-    private float x = Float.NEGATIVE_INFINITY;
-    private float y = Float.NEGATIVE_INFINITY;
-    private float pageHeight;
-    private float pageWidth;
+    private final float maxTextHeight; // maximum height of text, in display units
+    private final int rotation; // 0, 90, 180, 270 degrees of page rotation
+    private final float x = Float.NEGATIVE_INFINITY;
+    private final float y = Float.NEGATIVE_INFINITY;
+    private final float pageHeight;
+    private final float pageWidth;
+
+    private final float widthOfSpace; // width of a space, in display units
+
+    private final int[] unicodeCP;
+    private final PDFont font;
+    private final float fontSize;
+    private final int fontSizePt;
+
+    // mutable
     private float[] widths;
-    private float widthOfSpace; // width of a space, in display units
     private String string;
-    private int[] unicodeCP;
-    private PDFont font;
-    private float fontSize;
-    private int fontSizePt;
-
-    /**
-     * Constructor.
-     *
-     * @deprecated Can this be removed?
-     */
-    @Deprecated
-    protected TextPosition()
-    {
-    }
-
-    /**
-     * Constructor.
-     *
-     * @param page Page that the text is located in
-     * @param textPositionSt TextMatrix for start of text (in display units)
-     * @param textPositionEnd TextMatrix for end of text (in display units)
-     * @param maxFontH Maximum height of text (in display units)
-     * @param individualWidths The width of each individual character. (in ? units)
-     * @param spaceWidth The width of the space character. (in display units)
-     * @param string The character to be displayed.
-     * @param currentFont The current for for this text position.
-     * @param fontSizeValue The new font size.
-     * @param fontSizeInPt The font size in pt units.
-     * @param ws The word spacing parameter (in display units)
-     *
-     * @deprecated Can this be removed?
-     */
-    @Deprecated
-    public TextPosition(PDPage page, Matrix textPositionSt, Matrix textPositionEnd, float maxFontH,
-                        float[] individualWidths, float spaceWidth, String string,
-                        PDFont currentFont, float fontSizeValue, int fontSizeInPt, float ws)
-    {
-        this.textPos = textPositionSt;
-
-        this.endX = textPositionEnd.getXPosition();
-        this.endY = textPositionEnd.getYPosition();
-
-        this.rot = page.findRotation();
-        // make sure it is 0 to 270 and no negative numbers
-        if (this.rot < 0)
-        {
-            rot += 360;
-        }
-        else if (rot >= 360)
-        {
-            rot -= 360;
-        }
-
-        this.maxTextHeight = maxFontH;
-        this.pageHeight = page.findMediaBox().getHeight();
-        this.pageWidth = page.findMediaBox().getWidth();
-
-        this.widths = individualWidths;
-        this.widthOfSpace = spaceWidth;
-        this.string = string;
-        this.font = currentFont;
-        this.fontSize = fontSizeValue;
-        this.fontSizePt = fontSizeInPt;
-    }
 
     /**
      * Constructor.
@@ -137,16 +80,17 @@ public class TextPosition
         this.endX = endXValue;
         this.endY = endYValue;
 
-        this.rot = pageRotation;
+        int rotation = pageRotation;
         // make sure it is 0 to 270 and no negative numbers
-        if (this.rot < 0)
+        if (rotation < 0)
         {
-            rot += 360;
+            rotation += 360;
         }
-        else if (rot >= 360)
+        else if (rotation >= 360)
         {
-            rot -= 360;
+            rotation -= 360;
         }
+        this.rotation = rotation;
 
         this.maxTextHeight = maxFontH;
         this.pageHeight = pageHeightValue;
@@ -176,7 +120,7 @@ public class TextPosition
      *
      * @return an array containing all codepoints.
      */
-    public int[] getCodePoints()
+    public int[] getCodePoints()    // todo: NOT Unicode!!
     {
         return unicodeCP;
     }
@@ -267,7 +211,7 @@ public class TextPosition
     {
         if (x == Float.NEGATIVE_INFINITY)
         {
-            x = getXRot(rot);
+            return getXRot(rotation);
         }
         return x;
     }
@@ -322,13 +266,13 @@ public class TextPosition
     {
         if (y == Float.NEGATIVE_INFINITY)
         {
-            if (rot == 0 || rot == 180)
+            if (rotation == 0 || rotation == 180)
             {
-                y = pageHeight - getYLowerLeftRot(rot);
+                return pageHeight - getYLowerLeftRot(rotation);
             }
             else
             {
-                y = pageWidth - getYLowerLeftRot(rot);
+                return pageWidth - getYLowerLeftRot(rotation);
             }
         }
         return y;
@@ -379,7 +323,7 @@ public class TextPosition
      */
     public float getWidth()
     {
-        return getWidthRot(rot);
+        return getWidthRot(rotation);
     }
 
     /**
@@ -608,6 +552,7 @@ public class TextPosition
             currCharXStart += widths[i];
         }
     }
+
     /**
      * Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
      * array to include the extra character width.

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Wed Jun 18 19:53:38 2014
@@ -269,17 +269,6 @@ public class PDFStreamEngine
     }
 
     /**
-     * A method provided as an event interface to allow a subclass to perform some specific
-     * functionality when text needs to be processed.
-     * 
-     * @param text The text to be processed.
-     */
-    protected void processTextPosition(TextPosition text)
-    {
-        // subclasses can override to provide specific functionality.
-    }
-
-    /**
      * Process encoded text from the PDF Stream. You should override this method if you want to
      * perform an action when encoded text is being processed.
      * 
@@ -469,6 +458,17 @@ public class PDFStreamEngine
     }
 
     /**
+     * A method provided as an event interface to allow a subclass to perform some specific
+     * functionality when text needs to be processed.
+     *
+     * @param text The text to be processed.
+     */
+    protected void processTextPosition(TextPosition text)
+    {
+        // subclasses can override to provide specific functionality.
+    }
+
+    /**
      * This is used to handle an operation.
      * 
      * @param operation The operation to perform.

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Wed Jun 18 19:53:38 2014
@@ -574,7 +574,7 @@ public class PDFTextStripper extends PDF
             // Now cycle through to print the text.
             // We queue up a line at a time before we print so that we can convert
             // the line from presentation form to logical form (if needed).
-            List<TextPosition> line = new ArrayList<TextPosition>();
+            List<LineItem> line = new ArrayList<LineItem>();
 
             textIter = textList.iterator();    // start from the beginning again
             // PDF files don't always store spaces. We will need to guess where we should add
@@ -709,7 +709,7 @@ public class PDFTextStripper extends PDF
                         lastPosition.getTextPosition().getCharacter() != null &&
                         !lastPosition.getTextPosition().getCharacter().endsWith(" "))
                     {
-                        line.add(WordSeparator.getSeparator());
+                        line.add(LineItem.getWordSeparator());
                     }
                 }
                 if (positionY >= maxYForLine)
@@ -727,7 +727,7 @@ public class PDFTextStripper extends PDF
                     {
                         writeParagraphStart();//not sure this is correct for RTL?
                     }
-                    line.add(position);
+                    line.add(new LineItem(position));
                 }
                 maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
                 minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
@@ -1784,7 +1784,7 @@ public class PDFTextStripper extends PDF
      * @param hasRtl determines if lines contains rtl formatted text(parts)
      * @return a list of strings, one string for every word
      */
-    private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant,
+    private List<WordWithTextPositions> normalize(List<LineItem> line, boolean isRtlDominant,
                                                   boolean hasRtl)
     {
         LinkedList<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
@@ -1801,9 +1801,9 @@ public class PDFTextStripper extends PDF
         }
         else
         {
-            for (TextPosition text : line)
+            for (LineItem item : line)
             {
-                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, text);
+                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
             }
         }
         if (lineBuilder.length() > 0) 
@@ -1827,9 +1827,9 @@ public class PDFTextStripper extends PDF
      * @return The StringBuilder that must be used when calling this method.
      */
     private StringBuilder normalizeAdd(LinkedList<WordWithTextPositions> normalized,
-            StringBuilder lineBuilder, List<TextPosition> wordPositions, TextPosition text)
+            StringBuilder lineBuilder, List<TextPosition> wordPositions, LineItem item)
     {
-        if (text instanceof WordSeparator) 
+        if (item.isWordSeparator())
         {
             normalized.add(createWord(lineBuilder.toString(),
                     new ArrayList<TextPosition>(wordPositions)));
@@ -1838,6 +1838,7 @@ public class PDFTextStripper extends PDF
         }
         else 
         {
+            TextPosition text = item.getTextPosition();
             lineBuilder.append(text.getCharacter());
             wordPositions.add(text);
         }
@@ -1847,17 +1848,35 @@ public class PDFTextStripper extends PDF
     /**
      * internal marker class. Used as a place holder in a line of TextPositions.
      */
-    private static final class WordSeparator extends TextPosition
+    private static final class LineItem
     {
-        private static final WordSeparator separator = new WordSeparator();
-        
-        private WordSeparator()
+        public static LineItem WORD_SEPARATOR = new LineItem();
+
+        public static LineItem getWordSeparator()
+        {
+            return WORD_SEPARATOR;
+        }
+
+        private final TextPosition textPosition;
+
+        private LineItem()
+        {
+            textPosition = null;
+        }
+
+        public LineItem(TextPosition textPosition)
+        {
+            this.textPosition = textPosition;
+        }
+
+        public TextPosition getTextPosition()
         {
+            return textPosition;
         }
 
-        public static WordSeparator getSeparator()
+        public boolean isWordSeparator()
         {
-            return separator;
+            return textPosition == null;
         }
     }
 

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Wed Jun 18 19:53:38 2014
@@ -273,9 +273,8 @@ public class TestTextStripper extends Te
             if (!expectedFile.exists())
             {
                 this.bFail = true;
-                log.error(
-                        "FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
-                " did not exist");
+                fail("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
+                        " did not exist");
                 return;
             }
 
@@ -299,12 +298,12 @@ public class TestTextStripper extends Te
                 if (!stringsEqual(expectedLine, actualLine))
                 {
                     this.bFail = true;
-                    log.error("FAILURE: Line mismatch for file " + inFile.getName() +
+                    fail("FAILURE: Line mismatch for file " + inFile.getName() +
                             " ( sort = "+bSort+")" +
                             " at expected line: " + expectedReader.getLineNumber() +
-                            " at actual line: " + actualReader.getLineNumber());
-                    log.error("  expected line was: \"" + expectedLine + "\"");
-                    log.error("  actual line was:   \"" + actualLine + "\"" + "\n");
+                            " at actual line: " + actualReader.getLineNumber() +
+                            "\nexpected line was: \"" + expectedLine + "\"" +
+                            "\nactual line was:   \"" + actualLine + "\"" + "\n");
 
                     //lets report all lines, even though this might produce some verbose logging
                     //break;



Mime
View raw message