pdfbox-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Tilman Hausherr <THaush...@t-online.de>
Subject Re: PdfBox does not find/replace the text in the PDF document
Date Thu, 10 Aug 2017 19:48:56 GMT
Hi,

Please read
https://pdfbox.apache.org/2.0/migration.html
"Why was the ReplaceText example removed?"

I looked at your file with PDFDebugger; the Tj parameters look like this:

(\000\024\000\030\000\026\000\033\000\(\000\027\000'\000\034\000\024\000\024\000\024\000\031\000%\000&\000%\000&\000\026\000\027\000\031\000\031\000\)\000\024\000\025\000\034\000\026\000\032\000&\000%\000\031\000'\000\032\000\027\000\027\000\034\000\025\000$\000\033\000\024\000\024\000\025)

Tj

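So your task is next to impossible, sadly: the string operands are two-byte character codes that are only meaningful through the font's encoding, not readable text, so searching them for your token pattern cannot match.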

Tilman

Am 10.08.2017 um 21:20 schrieb Muthu Krishnan:
> Hi,
>
> I am trying to use PDF box to find and replace the text in PDF using 
> the following code.
>
> But this does not work with my PDF. I am attaching the input.pdf and 
> this java code. Can anyone please let me know what is wrong here.
>
> Thank you
> Muthu
>
>
>
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileOutputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.OutputStream;
> import java.util.ArrayList;
> import java.util.Collection;
> import java.util.HashSet;
> import java.util.List;
> import java.util.Set;
> import java.util.regex.Matcher;
> import java.util.regex.Pattern;
>
> import org.apache.pdfbox.contentstream.operator.Operator;
> import org.apache.pdfbox.cos.COSName;
> import org.apache.pdfbox.cos.COSString;
> import org.apache.pdfbox.pdfparser.PDFStreamParser;
> import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.pdmodel.PDPage;
> import org.apache.pdfbox.pdmodel.common.PDStream;
>
> public class PdfBoxTester {
>
> private static final String TEST_PDF = "/shn/input.pdf";
>
> private static final Pattern TOKEN_PATTERN = 
> Pattern.compile("[0-9a-f]{64}+|[0-9A-F]{40}+");
>
> public static void main(String[] args) throws Exception {
> substituteTokens();
>
> }
>
> private static void substituteTokens() throws IOException {
> PDDocument document = null;
> try (InputStream inputStream = new FileInputStream(new File(TEST_PDF))) {
> try {
> document = PDDocument.load(inputStream);
> if (document.isEncrypted()) {
> throw new IOException("Error: Encrypted documents are not supported 
> for this example.");
> }
> for (PDPage page : document.getPages()) {
> PDFStreamParser parser = new PDFStreamParser(page);
> parser.parse();
> List<Object> tokens = parser.getTokens();
> List<Object> newTokens = new ArrayList<Object>();
> for (Object token : tokens) {
> if (token instanceof Operator) {
> Operator op = (Operator) token;
> if (op.getName().equals("TJ") || op.getName().equals("Tj")) {
>
> Object argumentToken = newTokens.get(newTokens.size() - 1);
> if (argumentToken instanceof COSString) {
> COSString stringToken = (COSString) argumentToken;
> Collection<String> tokenStrings = collectTokens(stringToken.getString());
> if (!tokenStrings.isEmpty()) {
> String detokenizedString = substituteTokens(stringToken.getString(),
> tokenStrings, "static replacement");
> if (detokenizedString != null) {
> stringToken.setValue(detokenizedString.getBytes());
> }
> }
> }
> }
> }
> newTokens.add(token);
> }
> PDStream newContents = new PDStream(document);
> OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
> ContentStreamWriter writer = new ContentStreamWriter(out);
> writer.writeTokens(newTokens);
> out.close();
> page.setContents(newContents);
> }
> FileOutputStream fileOutputStream = new FileOutputStream(new 
> File("/shn/output.pdf"));
> document.save(fileOutputStream);
> } finally {
> if (document != null) {
> document.close();
> }
> inputStream.close();
> }
> }
> }
>
> public static Collection<String> collectTokens(String tokenizedText) 
> throws IOException {
> Set<String> tokens = new HashSet<>();
> Matcher matcher = TOKEN_PATTERN.matcher(tokenizedText);
> while (matcher.find()) {
> tokens.add(matcher.group());
> }
> return tokens;
> }
>
> public static String substituteTokens(String text, Collection<String> 
> tokens, String staticReplacementText) {
> String result = text;
> for (String token : tokens) {
> result = result.replace(token, staticReplacementText);
> }
> return result;
> }
> }
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: users-unsubscribe@pdfbox.apache.org
> For additional commands, e-mail: users-help@pdfbox.apache.org



Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message