pdfbox-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Tilman Hausherr <THaush...@t-online.de>
Subject Re: associating text with a PDActionURI?
Date Thu, 07 Jul 2016 16:55:01 GMT
Here's code that works - for some reason, I can't take the rectangle as 
it is, I have to flip the coordinates. I wonder if this is documented. 
The coordinates in the PDF are PDF coordinates (bottom is y = 0), but 
the coordinates I had to use are flipped (top is y = 0).
Tilman

package org.apache.pdfbox.examples.util;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;

/**
 * Example showing how to extract the text found inside one rectangular
 * area of the first page of a PDF document.
 *
 * @author Ben Litchfield
 */
public final class ExtractTextByArea
{
    // Edges of the area of interest, expressed in PDF coordinates
    // (y = 0 at the bottom of the page).
    // NOTE(review): presumably the rectangle of a link annotation in the
    // sample document - confirm against the PDF being processed.
    private static final float AREA_LEFT_X = 69.75f;
    private static final float AREA_RIGHT_X = 153.45f;
    private static final float AREA_LOWER_Y = 351.17f;
    private static final float AREA_UPPER_Y = 376.62f;

    /** Name under which the area is registered with the stripper. */
    private static final String REGION_NAME = "class1";

    private ExtractTextByArea()
    {
        // static entry point only; never instantiated
    }

    /**
     * Prints the text located in a fixed area of the first page of the
     * given PDF, or the usage message when the argument count is wrong.
     *
     * @param args The command line arguments; exactly one is expected,
     *             the path of the PDF file to read.
     *
     * @throws IOException If there is an error parsing the document.
     */
    public static void main( String[] args ) throws IOException
    {
        if( args.length != 1 )
        {
            usage();
            return;
        }

        PDDocument document = PDDocument.load( new File( args[0] ) );
        try
        {
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition( true );

            PDPage firstPage = document.getPage( 0 );
            float pageHeight = firstPage.getCropBox().getHeight();

            // The region is given with y measured from the top of the page,
            // so the upper PDF edge is subtracted from the crop box height
            // instead of being passed through unchanged.
            float x = AREA_LEFT_X;
            float y = pageHeight - AREA_UPPER_Y;
            float width = AREA_RIGHT_X - AREA_LEFT_X;
            float height = AREA_UPPER_Y - AREA_LOWER_Y;
            Rectangle2D rect = new Rectangle2D.Float( x, y, width, height );

            stripper.addRegion( REGION_NAME, rect );
            stripper.extractRegions( firstPage );

            String regionText = stripper.getTextForRegion( REGION_NAME );
            System.out.println( "Text in the area:" + rect );
            System.out.println( regionText );
        }
        finally
        {
            document.close();
        }
    }

    /**
     * Writes the command line usage of this example to standard error.
     */
    private static void usage()
    {
        String className = ExtractTextByArea.class.getName();
        System.err.println( "Usage: java " + className + " <input-pdf>" );
    }

}


---------------------------------------------------------------------
To unsubscribe, e-mail: users-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: users-help@pdfbox.apache.org


Mime
View raw message