pdfbox-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Zach Hawkins <drunnerz...@gmail.com>
Subject Print Text Locations
Date Fri, 28 Jan 2011 22:32:24 GMT
Hello All,

Since this has been such a common question, I thought I should share
the code I used to extend the PDFTextStripper class with two new
classes, PDFTextLocations and PDFTextLocationsByArea. (I built them
against PDFBox 1.2.1 because of extraction issues in the more recent
versions, where line breaks were not being created where they needed to
be.) These two classes should make it much easier to determine text
locations and perform PDF modifications afterwards. I designed them to
be used just like the parent classes they extend. Hopefully this helps
someone, because it took me a while to re-learn enough Java to get
through this. Code below (maybe someone would like to add it to the
project):

/*
 *  Copyright 2011 Administrator.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.pdfbox.util;

/**
 *
 * @author Zach Hawkins
 */
import java.io.IOException;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;

/**
 * Text stripper that reports where each extracted character sits on the page
 * instead of producing plain text.
 *
 * For every {@link TextPosition} delivered during extraction, one line of the
 * form
 * <pre>
 * [XY=x,y fs=fontSize xscale=xScale height=h space=spaceWidth width=w]c
 * </pre>
 * is appended to {@link #outputText}.
 */
public class PDFTextLocations extends PDFTextStripper
{
    /** Buffer that collects the per-character location lines. */
    protected StringWriter outputText = new StringWriter();

    /**
     * Creates a stripper without customizing any of the parent's settings.
     *
     * @throws IOException if the parent constructor fails.
     */
    public PDFTextLocations() throws IOException
    {
    }

    /**
     * Records the location of one piece of text.
     *
     * The text is not forwarded to the parent implementation; it is only
     * written, together with its position and size attributes, as a single
     * line into {@link #outputText}.
     *
     * @param text The text to be processed
     */
    protected void processTextPosition( TextPosition text )
    {
        outputText.append( "[XY=" + text.getXDirAdj() + "," + text.getYDirAdj()
                + " fs=" + text.getFontSize()
                + " xscale=" + text.getXScale()
                + " height=" + text.getHeightDir()
                + " space=" + text.getWidthOfSpace()
                + " width=" + text.getWidthDirAdj()
                + "]" + text.getCharacter() + "\n" );
    }

    /**
     * Extracts the text locations of a document.
     *
     * The buffer is emptied first, so calling this method again on the same
     * instance returns only the lines of the document passed in, not those
     * accumulated by earlier calls.
     *
     * @param doc The document to extract the text locations from.
     * @return The location lines collected while processing the document.
     * @throws IOException If there is an error processing the document.
     */
    public String getText( PDDocument doc ) throws IOException
    {
        // Clear in place (rather than reassigning) so anyone holding a
        // reference to outputText keeps seeing the live buffer.
        outputText.getBuffer().setLength( 0 );
        writeText( doc, outputText );
        return outputText.toString();
    }
}


/*
 *  Copyright 2011 Administrator.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.pdfbox.util;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 *
 * @author Zach Hawkins
 */
public class PDFTextLocationsByArea extends PDFTextLocations
{
    private Map regionArea = new HashMap();
    private List regions = new ArrayList();
    private Map regionCharacterList = new HashMap();
    private Map regionText = new HashMap();
   
   
    public PDFTextLocationsByArea() throws IOException
    {
        super();
        super.setSortByPosition( true );
    }

    public void addRegion(String regionName, Rectangle2D rect)
    {
        regions.add( regionName );
        regionArea.put( regionName, rect );
    }

    public List getRegions()
    {
        return regions;
    }


    public void extractLocations(PDDocument doc, int i) throws IOException
    {
        /** TextStripperByArea:
        Iterator regionIter = regions.iterator();
        while( regionIter.hasNext() )
        {
            setStartPage(getCurrentPageNo());
            setEndPage(getCurrentPageNo());
            //reset the stored text for the region so this class
            //can be reused.
            String regionName = (String)regionIter.next();
            Vector regionCharactersByArticle = new Vector();
            regionCharactersByArticle.add( new ArrayList() );
            regionCharacterList.put( regionName,
regionCharactersByArticle );
            regionText.put( regionName, new StringWriter() );
        }

        PDStream contentStream = page.getContents();
        if( contentStream != null )
        {
            COSStream contents = contentStream.getStream();
            processPage( page, contents );
        }
         */
        Iterator regionIter = regions.iterator();
        while( regionIter.hasNext() )
        {
            setStartPage(i);
            setEndPage(i);
            //reset the stored text for the region so this class
            //can be reused.
            String regionName = (String)regionIter.next();
            Vector regionCharactersByArticle = new Vector();
            regionCharactersByArticle.add( new ArrayList() );
            regionCharacterList.put( regionName,
regionCharactersByArticle );
            regionText.put( regionName, new StringWriter() );
        }
        PDPage page = (PDPage)doc.getDocumentCatalog().getAllPages().get(i);
        PDStream contentStream = page.getContents();
        if( contentStream != null )
        {
            COSStream contents = contentStream.getStream();
            processPage( page, contents );
        }
    }

    protected void processTextPosition( TextPosition text )
    {
        Iterator regionIter = regionArea.keySet().iterator();
        while( regionIter.hasNext() )
        {
            String region = (String)regionIter.next();
            Rectangle2D rect = (Rectangle2D)regionArea.get( region );
            if( rect.contains( text.getX(), text.getY() ) )
            {
                charactersByArticle = (Vector)regionCharacterList.get(
region );
                StringWriter textString = (StringWriter) regionText.get(
region );
                textString.append("[XY=" + text.getXDirAdj() + "," +
                text.getYDirAdj() + " fs=" + text.getFontSize() + "
xscale=" +
                text.getXScale() + " height=" + text.getHeightDir() + "
space=" +
                text.getWidthOfSpace() + " width=" +
                text.getWidthDirAdj() + "]" + text.getCharacter() +"\n");
            }
        }
    }

    /**
     * This will print the processed page text to the output stream.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void writePage() throws IOException
    {
        Iterator regionIter = regionArea.keySet().iterator();
        while( regionIter.hasNext() )
        {
            String region = (String)regionIter.next();
            charactersByArticle = (Vector)regionCharacterList.get( region );
            output = (StringWriter)regionText.get( region );
            super.writePage();
        }
    }
   
       /**
     * Get the text for the region, this should be called after
extractRegions().
     *
     * @param regionName The name of the region to get the text from.
     * @return The text that was identified in that region.
     */
   
    public String getTextForRegion( String regionName )
    {
        StringWriter text = (StringWriter)regionText.get( regionName );
        return text.toString();

    }
  
}



Mime
View raw message