pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r719294 - in /incubator/pdfbox/trunk: src/main/java/org/apache/pdfbox/util/ test/input/
Date Thu, 20 Nov 2008 17:17:31 GMT
Author: jukka
Date: Thu Nov 20 09:17:31 2008
New Revision: 719294

URL: http://svn.apache.org/viewvc?rev=719294&view=rev
Log:
PDFBOX-374: text areas not properly being sorted because of page rotation

Fix contributed by Brian Carrier. Thanks, Brian!

Modified:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java
    incubator/pdfbox/trunk/test/input/hexnumberproblem.pdf.txt
    incubator/pdfbox/trunk/test/input/ocalc.pdf.txt
    incubator/pdfbox/trunk/test/input/test_rotate_270.pdf.txt

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Thu Nov 20 09:17:31 2008
@@ -245,34 +245,43 @@
 
     public void showString( byte[] string ) throws IOException
     {
-        float[] individualWidths = new float[2048];
-        float spaceWidth = 0;
-        float spacing = 0;
-        StringBuffer stringResult = new StringBuffer(string.length);
-
-        float characterHorizontalDisplacement = 0;
-        float characterVerticalDisplacement = 0;
-        float spaceDisplacement = 0;
-        float fontSize = graphicsState.getTextState().getFontSize();
-        float horizontalScaling = graphicsState.getTextState().getHorizontalScalingPercent()/100f;
-        float verticalScaling = horizontalScaling;//not sure if this is right but what else
to do???
-        float rise = graphicsState.getTextState().getRise();
-        final float wordSpacing = graphicsState.getTextState().getWordSpacing();
-        final float characterSpacing = graphicsState.getTextState().getCharacterSpacing();
-        float wordSpacingDisplacement = 0;
+    	/* Note on variable names.  There are three different units being used
+    	 * in this code.  Character sizes are given in glyph units, text locations
+    	 * are initially given in text units, and we want to save the data in 
+    	 * display units. The variable names should end with Text or Disp to 
+    	 * represent if the values are in text or disp units (no glyph units are saved).
+    	 */
+        final float fontSizeText = graphicsState.getTextState().getFontSize();
+        final float horizontalScalingText = graphicsState.getTextState().getHorizontalScalingPercent()/100f;
+        //float verticalScalingText = horizontalScaling;//not sure if this is right but what else to do???
+        final float riseText = graphicsState.getTextState().getRise();
+        final float wordSpacingText = graphicsState.getTextState().getWordSpacing();
+        final float characterSpacingText = graphicsState.getTextState().getCharacterSpacing();
+        
         //We won't know the actual number of characters until
         //we process the byte data(could be two bytes each) but
         //it won't ever be more than string.length*2(there are some cases
         //were a single byte will result in two output characters "fi"
-
-
-        PDFont font = graphicsState.getTextState().getFont();
-
+        
+        final PDFont font = graphicsState.getTextState().getFont();
+        
         //This will typically be 1000 but in the case of a type3 font
         //this might be a different number
-        float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 );
-        float averageWidth = font.getAverageFontWidth();
+        final float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0
);
+        
 
+      	// lets see what the space displacement should be
+        float spaceWidthText = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
+        if( spaceWidthText == 0 )
+        {
+            spaceWidthText = (font.getAverageFontWidth()/glyphSpaceToTextSpaceFactor);
+            //The average space width appears to be higher than necessary
+            //so lets make it a little bit smaller.
+            spaceWidthText *= .80f;
+        }
+        
+        
+        /* Convert textMatrix to display units */
         Matrix initialMatrix = new Matrix();
         initialMatrix.setValue(0,0,1);
         initialMatrix.setValue(0,1,0);
@@ -281,53 +290,27 @@
         initialMatrix.setValue(1,1,1);
         initialMatrix.setValue(1,2,0);
         initialMatrix.setValue(2,0,0);
-        initialMatrix.setValue(2,1,rise);
+        initialMatrix.setValue(2,1,riseText);	
         initialMatrix.setValue(2,2,1);
-
-
-        //this
+    
+        final Matrix ctm = graphicsState.getCurrentTransformationMatrix();
+        final Matrix textMatrixStDisp = initialMatrix.multiply( textMatrix ).multiply( ctm
);
+        
+        final float xScaleDisp = textMatrixStDisp.getXScale();
+        final float yScaleDisp = textMatrixStDisp.getYScale(); 
+        
+        final float spaceWidthDisp = spaceWidthText * xScaleDisp * fontSizeText;
+        final float wordSpacingDisp = wordSpacingText * xScaleDisp * fontSizeText; 
+        
+        float maxVerticalDisplacementText = 0;
+        float[] individualWidthsText = new float[2048];
+        StringBuffer stringResult = new StringBuffer(string.length);
+        
         int codeLength = 1;
-        Matrix ctm = graphicsState.getCurrentTransformationMatrix();
-
-        //lets see what the space displacement should be
-        spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
-        if( spaceDisplacement == 0 )
-        {
-            spaceDisplacement = (averageWidth/glyphSpaceToTextSpaceFactor);
-            //The average space width appears to be higher than necessary
-            //so lets make it a little bit smaller.
-            spaceDisplacement *= .80f;
-        }
-        int pageRotation = page.findRotation();
-        Matrix trm = initialMatrix.multiply( textMatrix ).multiply( ctm );
-        float x = trm.getValue(2,0);
-        float y = trm.getValue(2,1);
-        if( pageRotation == 0 )
-        {
-            trm.setValue( 2,1, -y + page.findMediaBox().getHeight() );
-        }
-        else if( pageRotation == 90 || pageRotation == -270 )
-        {
-            trm.setValue( 2,0, y );
-            trm.setValue( 2,1, x );
-        }
-        else if( pageRotation == 270 || pageRotation == -90 )
-        {
-            trm.setValue( 2,0, -y  + page.findMediaBox().getHeight() );
-            trm.setValue( 2,1, x );
-        }
-        float xScale = trm.getXScale();
-        float yScale = trm.getYScale();
-        float xPos = trm.getXPosition();
-        float yPos = trm.getYPosition();
-        spaceWidth = spaceDisplacement * xScale * fontSize;
-        wordSpacingDisplacement = wordSpacing*xScale * fontSize;
-        float totalStringWidth = 0;
         for( int i=0; i<string.length; i+=codeLength )
         {
-
+        	// Decode the value to a Unicode character
             codeLength = 1;
-
             String c = font.encode( string, i, codeLength );
             if( c == null && i+1<string.length)
             {
@@ -337,13 +320,13 @@
             }
 
             //todo, handle horizontal displacement
-            characterHorizontalDisplacement = (font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor);
-            characterVerticalDisplacement =
-                Math.max(
-                    characterVerticalDisplacement,
+            // get the width and height of this character in text units 
+            float characterHorizontalDisplacementText = (font.getFontWidth( string, i, codeLength
)/glyphSpaceToTextSpaceFactor); 
+            maxVerticalDisplacementText = 
+                Math.max( 
+                    maxVerticalDisplacementText, 
                     font.getFontHeight( string, i, codeLength)/glyphSpaceToTextSpaceFactor);
 
-
             // PDF Spec - 5.5.2 Word Spacing
             //
             // Word spacing works the same was as character spacing, but applies
@@ -362,100 +345,98 @@
             // applying word spacing to either the non-32 space or to the character
             // code 32 non-space resulted in errors consistent with this interpretation.
             //
+            float spacingText = characterSpacingText;
             if( (string[i] == 0x20) && c != null && c.equals( " " ) )
             {
-                spacing = wordSpacing + characterSpacing;
-            }
-            else
-            {
-                spacing = characterSpacing;
+                spacingText += wordSpacingText;
             }
 
-            // We want to update the textMatrix using the width, in text space units.
-            //
+            // get the X location before we update the text matrix
+            float xPosBeforeText = textMatrix.getXPosition();
+            
+            /* The text matrix gets updated after each glyph is placed.  The updated
+             * version will have the X and Y coordinates for the next glyph.
+             */
+            
             //The adjustment will always be zero.  The adjustment as shown in the
             //TJ operator will be handled separately.
             float adjustment=0;
-            //todo, need to compute the vertical displacement
-            float ty = 0;
-            float tx = ((characterHorizontalDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize
+ spacing)
-                       *horizontalScaling;
-
-
+            /* todo: tx should be set for horizontal text and ty for vertical text, which
+             * seems to be specified in the font (not the direction in the matrix). 
+             */
+            float tx = ((characterHorizontalDisplacementText-adjustment/glyphSpaceToTextSpaceFactor)*fontSizeText
+ spacingText)
+                       *horizontalScalingText;
+            float ty = 0;              
+            
             Matrix td = new Matrix();
             td.setValue( 2, 0, tx );
-            td.setValue( 2, 1, ty );
-
-            float xPosBefore = textMatrix.getXPosition();
-            float yPosBefore = textMatrix.getYPosition();
+            td.setValue( 2, 1, ty );            
+            
             textMatrix = td.multiply( textMatrix );
 
-            float width = 0;
-            if( pageRotation == 0 )
-            {
-                width = (textMatrix.getXPosition() - xPosBefore);
-            }
-            else if( pageRotation == 90 || pageRotation == -270)
-            {
-                width = (textMatrix.getYPosition() - yPosBefore);
-            }
-            else if( pageRotation == 270 || pageRotation == -90 )
-            {
-                width = (yPosBefore - textMatrix.getYPosition());
-            }
+            // determine the width of this character
+            // XXX: Note that if we handled vertical text, we should be using Y here
+            float widthText = (textMatrix.getXPosition() - xPosBeforeText);
+            
             //there are several cases where one character code will
             //output multiple characters.  For example "fi" or a
             //glyphname that has no mapping like "visiblespace"
             if( c != null )
             {
-                float widthOfEachCharacterForCode = width/c.length();
+            	// assume each character is the same size
+                float widthOfEachCharacterForCode = widthText/c.length();
+            	
                 for( int j=0; j<c.length(); j++)
                 {
-                    if( stringResult.length()+j <individualWidths.length )
+                    if( stringResult.length()+j <individualWidthsText.length )
                     {
                         if( c.equals("-"))
                         {
                             //System.out.println( "stringResult.length()+j=" + (widthOfEachCharacterForCode));
                         }
-                        individualWidths[stringResult.length()+j] = widthOfEachCharacterForCode;
+                        individualWidthsText[stringResult.length()+j] = widthOfEachCharacterForCode;
                     }
                 }
-            } else {
-                // PDFBOX-373: Replace a null entry with "?" so it is
-                // not printed as "(null)" 
-                c = "?";
             }
-
-            totalStringWidth += width;
+            else {
+            	// PDFBOX-373: Replace a null entry with "?" so it is
+            	// not printed as "(null)"
+            	c = "?";
+            }
+            
             stringResult.append( c );
         }
-        float totalStringHeight = characterVerticalDisplacement * fontSize * yScale;
+        
+        
         String resultingString = stringResult.toString();
-
-        if( individualWidths.length != resultingString.length() )
+        
+        if( individualWidthsText.length != resultingString.length() )
         {
             float[] tmp = new float[resultingString.length()];
-            System.arraycopy( individualWidths, 0, tmp, 0, Math.min( individualWidths.length,
resultingString.length() ));
-            individualWidths = tmp;
+            System.arraycopy( individualWidthsText, 0, tmp, 0, Math.min( individualWidthsText.length,
resultingString.length() ));
+            individualWidthsText = tmp;
             if( resultingString.equals( "- " ))
             {
                 //System.out.println( "EQUALS " + individualWidths[0] );
             }
         }
+        
+        float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText
* yScaleDisp;
+        // convert textMatrix at the end of the string to display units
+        Matrix textMatrixEndDisp = initialMatrix.multiply( textMatrix ).multiply( ctm );
+        
         showCharacter(
                 new TextPosition(
-                    xPos,
-                    yPos,
-                    xScale,
-                    yScale,
-                    totalStringWidth,
-                    individualWidths,
-                    totalStringHeight,
-                    spaceWidth,
-                    stringResult.toString(),
-                    font,
-                    fontSize,
-                    wordSpacingDisplacement ));
+                		page,
+                		textMatrixStDisp,
+                		textMatrixEndDisp,
+                		totalVerticalDisplacementDisp,
+                		individualWidthsText,
+                		spaceWidthDisp,
+                		stringResult.toString(),
+                		font,
+                		fontSizeText,
+                		wordSpacingDisp ));
     }
 
     /**

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Thu Nov 20 09:17:31 2008
@@ -383,61 +383,83 @@
      * @throws IOException If there is an error writing the text.
      */
     protected void flushText() throws IOException
-    {
-        float currentY = -1;
-        float lastBaselineFontSize = -1;
+    { 
+        float maxYForLine = -1;
+        float minYTopForLine = Float.MAX_VALUE;
+        //float lastBaselineFontSize = -1;
         float endOfLastTextX = -1;
-        float startOfNextWordX = -1;
+        //float endOfLastTextY = -1;
+        float expectedStartOfNextWordX = -1;
         float lastWordSpacing = -1;
         float maxHeightForLine = -1;
-        TextPosition lastProcessedCharacter = null;
-
+        //float lastHeightForLine = -1;
+        TextPosition lastPosition = null;
         for( int i=0; i<charactersByArticle.size(); i++)
         {
             startParagraph();
-            List textList = (List)charactersByArticle.get( i );
+            List<TextPosition> textList = (List<TextPosition>)charactersByArticle.get(
i );
             if( sortByPosition )
             {
-                TextPositionComparator comparator = new TextPositionComparator( getCurrentPage()
);
+                TextPositionComparator comparator = new TextPositionComparator();
                 Collections.sort( textList, comparator );
             }
-            Iterator textIter = textList.iterator();
+            
+            Iterator<TextPosition> textIter = textList.iterator();
             while( textIter.hasNext() )
             {
-                TextPosition position = (TextPosition)textIter.next();
+                TextPosition position = textIter.next();
                 String characterValue = position.getCharacter();
-
-                //wordSpacing = position.getWordSpacing();
+                
+                float positionX;
+                float positionY;
+                float positionWidth;
+                float positionHeight;
+                
+                /* If we are sorting, then we need to use the text direction 
+                 * adjusted coordinates, because they were used in the sorting. */
+                if (sortByPosition) {
+                	positionX = position.getXDirAdj();
+                	positionY = position.getYDirAdj();
+                	positionWidth = position.getWidthDirAdj();
+                	positionHeight = position.getHeightDir();
+                }
+                else {
+                	positionX = position.getX();
+                	positionY = position.getY();
+                	positionWidth = position.getWidth();
+                	positionHeight = position.getHeight();
+                }
+                
+                
                 float wordSpacing = 0;
-
+                /* float wordSpacing = position.getWordSpacing();	BC: When I re-enabled this for a test, lots of extra spaces were added
                 if( wordSpacing == 0 )
                 {
+                */
                     //try to get width of a space character
                     wordSpacing = position.getWidthOfSpace();
                     //if still zero fall back to getting the width of the current
                     //character
                     if( wordSpacing == 0 )
                     {
-                        wordSpacing = position.getWidth();
+                      wordSpacing = positionWidth;
                     }
-                }
-
-
+                //}
+                
+                
                 // RDD - We add a conservative approximation for space determination.
                 // basically if there is a blank area between two characters that is
                 //equal to some percentage of the word spacing then that will be the
                 //start of the next word
                 if( lastWordSpacing <= 0 )
                 {
-                    startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
+                    expectedStartOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
                 }
                 else
                 {
-                    startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)*
0.50f);
+                    expectedStartOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)*
0.50f);
                 }
-
-                lastWordSpacing = wordSpacing;
-
+    
                 // RDD - We will suppress text that is very close to the current line
                 // and which overwrites previously rendered text on this line.
                 // This is done specifically to handle a reasonably common situation
@@ -458,62 +480,66 @@
                     }
                     continue;
                 }*/
-
+    
                 // RDD - Here we determine whether this text object is on the current
                 // line.  We use the lastBaselineFontSize to handle the superscript
                 // case, and the size of the current font to handle the subscript case.
                 // Text must overlap with the last rendered baseline text by at least
                 // a small amount in order to be considered as being on the same line.
                 //
-                int verticalScaling = 1;
-                if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
+                
+                //int verticalScaling = 1;
+                //if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
+                //{
+                //    verticalScaling = -1;
+                //}
+                
+                if( lastPosition != null )
                 {
-                    verticalScaling = -1;
-                }
-                if( lastProcessedCharacter != null )
-                {
-                    float currentHeight = position.getHeight();
                     //if (currentY != -1 &&
                     //    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f
* verticalScaling))) ||
                     //     (position.getY() > (currentY + (position.getFontSize() * 0.9f
* verticalScaling)))))
                     //{
-                    if( !overlap( position.getY(), currentHeight, currentY, maxHeightForLine
) )
+                    /* XXX BC: In theory, this check should really check if the next char is in full range 
+                     * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
+                     * of regression test failures.  So, I'm leaving it be for now. */
+                    if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine
) )) 
+                    		//maxYForLine - minYTopForLine))) 
                     {
                         processLineSeparator( position );
                         endOfLastTextX = -1;
-                        startOfNextWordX = -1;
-                        currentY = -1;
+                        expectedStartOfNextWordX = -1;
+                        maxYForLine = -1;
                         maxHeightForLine = -1;
-                        lastBaselineFontSize = -1;
+                        //lastBaselineFontSize = -1;
+                        minYTopForLine = Float.MAX_VALUE;
+                        //lastHeightForLine = -1;
                     }
+                
+    
+	                if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX <
positionX &&
+	                   //only bother adding a space if the last character was not a space
+	                   lastPosition.getCharacter() != null &&
+	                   !lastPosition.getCharacter().endsWith( " " ) )
+	                {
+	                    processWordSeparator( lastPosition, position );
+	                }
+	                else
+	                {
+	                    //System.out.println( "Not a word separator " + position.getCharacter()
+  " start=" + startOfNextWordX + " x=" + position.getX() );
+	                }
+                }
+    
+                if (positionY >= maxYForLine) {
+                	maxYForLine = positionY;
+                    //lastBaselineFontSize = position.getFontSize();
                 }
-
-                if (startOfNextWordX != -1 && startOfNextWordX < position.getX()
&&
-                   lastProcessedCharacter != null &&
-                   //only bother adding a space if the last character was not a space
-                   lastProcessedCharacter.getCharacter() != null &&
-                   !lastProcessedCharacter.getCharacter().endsWith( " " ) )
-                {
-                    processWordSeparator( lastProcessedCharacter, position );
-                }
-                else
-                {
-                    //System.out.println( "Not a word separtor " + position.getCharacter()
+  " start=" + startOfNextWordX + " x=" + position.getX() );
-                }
-
-                currentY = Math.max(currentY,position.getY());
-
-                if (currentY == position.getY())
-                {
-                    lastBaselineFontSize = position.getFontSize();
-                }
-
+    
                 // RDD - endX is what PDF considers to be the x coordinate of the
                 // end position of the text.  We use it in computing our metrics below.
-                //
-                endOfLastTextX = position.getX() + position.getWidth();
-
-
+                endOfLastTextX = positionX + positionWidth;
+                //endOfLastTextY = positionY;
+    
                 if (characterValue != null)
                 {
                     writeCharacters( position );
@@ -522,12 +548,15 @@
                 {
                     //Position.getString() is null so not writing anything
                 }
-                maxHeightForLine = Math.max( maxHeightForLine, position.getHeight() );
-                lastProcessedCharacter = position;
+                maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
+                minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); 
+                lastPosition = position;
+                //lastHeightForLine = position.getHeight();
+                lastWordSpacing = wordSpacing;
             }
             endParagraph();
         }
-
+        
 
         // RDD - newline at end of flush - required for end of page (so that the top
         // of the next page starts on its own line.
@@ -622,8 +651,8 @@
                 if( charCharacter != null &&
                     //charCharacter.equals( textCharacter ) &&
                     within( charX, textX, tolerance ) &&
-                    within( charY,
-                            textY,
+                    within( charY, 
+                    		textY,
                             tolerance ) )
                 {
                     suppressCharacter = true;

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java Thu Nov 20 09:17:31 2008
@@ -16,28 +16,35 @@
  */
 package org.apache.pdfbox.util;
 
+import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 
 /**
- * This represents a character and a position on the screen of those characters.
+ * This represents a string and a position on the screen of those characters.
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.12 $
  */
 public class TextPosition
 {
-    private float x;
-    private float y;
-    private float xScale;
-    private float yScale;
-    private float totalWidth;
+	/* TextMatrix for the start of the text object.  Coordinates
+	 * are in display units and have not been adjusted. */
+	private Matrix textPos;
+	
+	// ending X and Y coordinates in display units
+	private float endX;
+	private float endY;
+	
+	private float maxTextHeight; // maximum height of text, in display units
+	private int rot; // 0, 90, 180, 270 degrees of page rotation
+	private float pageHeight;
+	private float pageWidth;
     private float[] widths;
-    private float height;
-    private float widthOfSpace;
-    private String c;
+    private float widthOfSpace; // width of a space, in display units
+    private String str; 
     private PDFont font;
     private float fontSize;
-    private float wordSpacing;
+    private float wordSpacing;	// word spacing value, in display units
 
     protected TextPosition()
     {
@@ -47,86 +54,223 @@
     /**
      * Constructor.
      *
-     * @param xPos The x coordinate of the character.
-     * @param yPos The y coordinate of the character.
-     * @param xScl The x scaling of the character.
-     * @param yScl The y scaling of the character.
-     * @param totalWidthValue The width of all the characters.
-     * @param individualWidths The width of each individual character.
-     * @param heightValue The height of the character.
-     * @param spaceWidth The width of the space character.
+     * @param page Page that the text is located in
+     * @param textPositionSt TextMatrix for start of text (in display units)
+     * @param textPositionEnd TextMatrix for end of text (in display units)
+     * @param maxFontH Maximum height of text (in display units)
+     * @param individualWidths The width of each individual character. (in ? units)
+     * @param spaceWidth The width of the space character. (in display units)
      * @param string The character to be displayed.
      * @param currentFont The current for for this text position.
      * @param fontSizeValue The new font size.
-     * @param ws The word spacing parameter
+     * @param ws The word spacing parameter (in display units)
      */
     public TextPosition(
-        float xPos,
-        float yPos,
-        float xScl,
-        float yScl,
-        float totalWidthValue,
-        float[] individualWidths,
-        float heightValue,
-        float spaceWidth,
-        String string,
-        PDFont currentFont,
-        float fontSizeValue,
-        float ws
-        )
-    {
-        this.x = xPos;
-        this.y = yPos;
-        this.xScale = xScl;
-        this.yScale = yScl;
-        this.totalWidth = totalWidthValue;
+    		PDPage page,
+    		Matrix textPositionSt,
+    		Matrix textPositionEnd,
+    		float maxFontH,
+    		float[] individualWidths,
+    		float spaceWidth,
+    		String string,
+    		PDFont currentFont,
+    		float fontSizeValue,
+    		float ws
+    )
+    {
+    	this.textPos = textPositionSt;
+    	
+    	this.endX = textPositionEnd.getXPosition();
+    	this.endY = textPositionEnd.getYPosition();
+    	
+    	this.rot = page.findRotation();
+    	// make sure it is 0 to 270 and no negative numbers
+    	if(this.rot < 0)
+    		rot += 360;
+    	
+    	this.maxTextHeight = maxFontH;
+    	this.pageHeight = page.findMediaBox().getHeight();
+    	this.pageWidth = page.findMediaBox().getWidth();
+    	        
         this.widths = individualWidths;
-        this.height = heightValue;
         this.widthOfSpace = spaceWidth;
-        this.c = string;
+        this.str = string;
         this.font = currentFont;
         this.fontSize = fontSizeValue;
         this.wordSpacing = ws;
     }
 
     /**
-     * This will the character that will be displayed on the screen.
+     * Return the string of characters stored in this object.
      *
-     * @return The character on the screen.
+     * @return The string on the screen.
      */
     public String getCharacter()
     {
-        return c;
+        return str;
     }
 
     /**
-     * This will get the x position of the character.
+     * Return the direction/orientation of the string in this object
+     * based on its text matrix.
+     * @return The direction of the text (0, 90, 180, or 270)
+     */
+    public float getDir() {
+    	float a = textPos.getValue(0,0);
+    	float b = textPos.getValue(0,1);
+    	float c = textPos.getValue(1,0);
+    	float d = textPos.getValue(1,1);
+    	
+    	// 12 0   left to right
+    	// 0 12 
+    	if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
+    		return 0;
+    	// -12 0   right to left (upside down)
+    	// 0 -12
+    	else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
+    		return 180;
+    	// 0  12	up
+    	// -12 0 
+    	else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
+    		return 90;
+    	// 0  -12	down
+    	// 12 0 
+    	else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
+    		return 270;
+ 
+    	return 0;
+    }
+    
+    /**
+     * Return the X starting coordinate of the text, adjusted by 
+     * the given rotation amount.  The rotation adjusts where the 0,0
+     * location is relative to the text. 
+     *  
+     * @param a_rot Rotation to apply (0, 90, 180, or 270).  0 will perform no adjustments.
+     * @return X coordinate
+     */
+    private float getX_rot(float a_rot)
+    {
+    	if (a_rot == 0)
+    		return textPos.getValue(2,0);
+    	else if (a_rot == 90)
+    		return textPos.getValue(2,1);
+    	else if (a_rot == 180)
+    		return pageWidth - textPos.getValue(2,0);
+    	else if (a_rot == 270)
+    		return pageHeight - textPos.getValue(2,1);
+    	else 
+    		return 0;
+    }
+    
+    /**
+     * This will get the page rotation adjusted x position of the character.
+     * This is adjusted based on page rotation so that the upper 
+     * left is 0,0. 
      *
      * @return The x coordinate of the character.
      */
     public float getX()
     {
-        return x;
+    	return getX_rot(rot);
+    }
+    
+    /**
+     * This will get the text direction adjusted x position of the character.
+     * This is adjusted based on text direction so that the first character
+     * in that direction is in the upper left at 0,0.
+     *
+     * @return The x coordinate of the text.
+     */
+    public float getXDirAdj() {
+    	return getX_rot(getDir());	
     }
 
+    /** 
+     * This will get the y position of the character with 0,0 in lower left. 
+     * This will be adjusted by the given rotation. 
+     * @param a_rot Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
+     * 
+     * @return The y coordinate of the text
+     */
+    private float getY_ll_rot(float a_rot)
+    {
+    	if (a_rot == 0)
+    		return textPos.getValue(2,1);
+    	else if (a_rot == 90)
+    		return pageWidth - textPos.getValue(2,0);
+    	else if (a_rot == 180)
+    		return pageHeight - textPos.getValue(2,1);
+    	else if (a_rot == 270)
+    		return textPos.getValue(2,0);
+    	else 
+    		return 0;
+    }
+    
     /**
-     * This will get the y position of the character.
+     * This will get the y position of the text, adjusted so that 0,0 is upper left and 
+     * it is adjusted based on the page rotation. 
      *
-     * @return The y coordinate of the character.
+     * @return The adjusted y coordinate of the character.
      */
     public float getY()
     {
-        return y;
+    	if ((rot == 0) || (rot == 180))
+    		return pageHeight - getY_ll_rot(rot);
+    	else
+    		return pageWidth - getY_ll_rot(rot);
     }
-
+    
     /**
-     * This will get the width of this character.
+     * This will get the y position of the text, adjusted so that 0,0 is upper left and 
+     * it is adjusted based on the text direction. 
      *
-     * @return The width of this character.
+     * @return The adjusted y coordinate of the character.
      */
-    public float getWidth()
+    public float getYDirAdj()
     {
-        return totalWidth;
+    	float dir = getDir();
+    	// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
+    	if ((dir == 0) || (dir == 180))
+    		return pageHeight - getY_ll_rot(dir);
+    	else
+    		return pageWidth - getY_ll_rot(dir);
+    }
+
+
+    
+    /**
+     * Get the length or width of the text, based on a given rotation. 
+     * 
+     * @param a_rot Rotation that was used to determine coordinates (0,90,180,270)
+     * @return Width of text in display units
+     */
+    private float getWidth_rot(float a_rot)
+    {
+    	if ((a_rot == 90) || (a_rot == 270)) {
+    		return Math.abs(endY - textPos.getYPosition());
+    	}
+    	else {
+    		return Math.abs(endX - textPos.getXPosition());
+    	}
+    }
+    
+    /**
+     * This will get the width of the string when page rotation adjusted coordinates are used.
+     *
+     * @return The width of the text in display units.
+     */
+    public float getWidth() {
+    	return getWidth_rot(rot);
+    }
+    
+    /**
+     * This will get the width of the string when text direction adjusted coordinates are used.
+     *
+     * @return The width of the text in display units.
+     */
+    public float getWidthDirAdj() {
+    	return getWidth_rot(getDir());
     }
 
     /**
@@ -134,9 +278,18 @@
      *
      * @return The maximum height of all characters in this string.
      */
-    public float getHeight()
-    {
-        return height;
+    public float getHeight() {
+    	return maxTextHeight;
+    }
+    
+    /**
+     * This will get the maximum height of all characters in this string.
+     *
+     * @return The maximum height of all characters in this string.
+     */
+    public float getHeightDir() {
+    	// this is not really a rotation-dependent calculation, but this is defined for symmetry.
+    	return maxTextHeight;
     }
 
     /**
@@ -186,28 +339,15 @@
      */
     public float getXScale()
     {
-        return xScale;
-    }
-    /**
-     * @param scale The xScale to set.
-     */
-    public void setXScale(float scale)
-    {
-        xScale = scale;
+        return textPos.getXScale();
     }
+
     /**
      * @return Returns the yScale.
      */
     public float getYScale()
     {
-        return yScale;
-    }
-    /**
-     * @param scale The yScale to set.
-     */
-    public void setYScale(float scale)
-    {
-        yScale = scale;
+        return textPos.getYScale();
     }
 
     /**

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java Thu Nov 20 09:17:31 2008
@@ -18,28 +18,17 @@
 
 import java.util.Comparator;
 
-import org.apache.pdfbox.pdmodel.PDPage;
-
 /**
- * This class is a comparator for TextPosition operators.
+ * This class is a comparator for TextPosition operators.  It handles
+ * pages with text in different directions by grouping the text based
+ * on direction and sorting in that direction. This allows continuous text
+ * in a given direction to be more easily grouped together.  
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.7 $
  */
 public class TextPositionComparator implements Comparator
 {
-    private PDPage thePage = null;
-
-    /**
-     * Constuctor, comparison of TextPosition depends on the rotation
-     * of the page.
-     * @param page The page that the text position is on.
-     */
-    public TextPositionComparator( PDPage page )
-    {
-        thePage = page;
-    }
-
     /**
      * {@inheritDoc}
      */
@@ -48,41 +37,22 @@
         int retval = 0;
         TextPosition pos1 = (TextPosition)o1;
         TextPosition pos2 = (TextPosition)o2;
-        int rotation = thePage.findRotation();
-        float x1 = 0;
-        float x2 = 0;
-        float pos1YBottom = 0;
-        float pos2YBottom = 0;
-        if( rotation == 0 )
-        {
-            x1 = pos1.getX();
-            x2 = pos2.getX();
-            pos1YBottom = pos1.getY();
-            pos2YBottom = pos2.getY();
-        }
-        else if( rotation == 90 )
-        {
-            x1 = pos1.getY();
-            x2 = pos2.getX();
-            pos1YBottom = pos1.getX();
-            pos2YBottom = pos2.getY();
-        }
-        else if( rotation == 180 )
-        {
-            x1 = -pos1.getX();
-            x2 = -pos2.getX();
-            pos1YBottom = -pos1.getY();
-            pos2YBottom = -pos2.getY();
-        }
-        else if( rotation == 270 )
-        {
-            x1 = -pos1.getY();
-            x2 = -pos2.getY();
-            pos1YBottom = -pos1.getX();
-            pos2YBottom = -pos2.getX();
-        }
-        float pos1YTop = pos1YBottom - pos1.getHeight();
-        float pos2YTop = pos2YBottom - pos2.getHeight();
+
+        /* Only compare text that is in the same direction. */
+        if (pos1.getDir() < pos2.getDir())
+        	return -1;
+        else if (pos1.getDir() > pos2.getDir())
+        	return 1;
+        	
+        // Get the text direction adjusted coordinates
+        float x1 = pos1.getXDirAdj();
+        float x2 = pos2.getXDirAdj();
+        
+        float pos1YBottom = pos1.getYDirAdj();
+        float pos2YBottom = pos2.getYDirAdj();
+        // note that the coordinates have been adjusted so 0,0 is in upper left
+        float pos1YTop = pos1YBottom - pos1.getHeightDir();
+        float pos2YTop = pos2YBottom - pos2.getHeightDir();
 
         float yDifference = Math.abs( pos1YBottom-pos2YBottom);
         //we will do a simple tolerance comparison.
@@ -111,8 +81,6 @@
         {
             return 1;
         }
-
         return retval;
     }
-
 }

Modified: incubator/pdfbox/trunk/test/input/hexnumberproblem.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/hexnumberproblem.pdf.txt?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/ocalc.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/ocalc.pdf.txt?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/test_rotate_270.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/test_rotate_270.pdf.txt?rev=719294&r1=719293&r2=719294&view=diff
==============================================================================
Binary files - no diff available.



Mime
View raw message