From commits-return-13503-archive-asf-public=cust-asf.ponee.io@pdfbox.apache.org Wed Nov 7 19:30:23 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 1F54718067A for ; Wed, 7 Nov 2018 19:30:22 +0100 (CET) Received: (qmail 69471 invoked by uid 500); 7 Nov 2018 18:30:22 -0000 Mailing-List: contact commits-help@pdfbox.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@pdfbox.apache.org Delivered-To: mailing list commits@pdfbox.apache.org Received: (qmail 69462 invoked by uid 99); 7 Nov 2018 18:30:22 -0000 Received: from Unknown (HELO svn01-us-west.apache.org) (209.188.14.144) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 07 Nov 2018 18:30:22 +0000 Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id 998243A0044 for ; Wed, 7 Nov 2018 18:30:21 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1846065 - /pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Date: Wed, 07 Nov 2018 18:30:21 -0000 To: commits@pdfbox.apache.org From: tilman@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20181107183021.998243A0044@svn01-us-west.apache.org> Author: tilman Date: Wed Nov 7 18:30:21 2018 New Revision: 1846065 URL: http://svn.apache.org/viewvc?rev=1846065&view=rev Log: PDFBOX-4367: run stripper by page as preparation to catch the exception in a later commit; improve usage text Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1846065&r1=1846064&r2=1846065&view=diff ============================================================================== --- pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original) +++ pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Wed Nov 7 18:30:21 2018 @@ -224,18 +224,30 @@ public final class ExtractText } stripper.setSortByPosition( sort ); stripper.setShouldSeparateByBeads( separateBeads ); - stripper.setStartPage( startPage ); - stripper.setEndPage( endPage ); startTime = startProcessing("Starting text extraction"); if (debug) { System.err.println("Writing to "+outputFile); } - + endPage = Math.min(endPage, document.getNumberOfPages()); + // Extract text for main document: - stripper.writeText( document, output ); - + for (int p = startPage; p <= endPage; ++p) + { + try + { + stripper.setStartPage(p); + stripper.setEndPage(p); + stripper.writeText(document, output); + } + catch (IOException ex) + { + //TODO alternatively, log and continue + throw ex; + } + } + // ... also for any embedded PDFs: PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); @@ -273,7 +285,20 @@ public final class ExtractText } try { - stripper.writeText( subDoc, output ); + for (int p = 1; p <= subDoc.getNumberOfPages(); ++p) + { + try + { + stripper.setStartPage(p); + stripper.setEndPage(p); + stripper.writeText(subDoc, output); + } + catch (IOException ex) + { + //TODO alternatively, log and continue + throw ex; + } + } } finally { @@ -320,17 +345,17 @@ public final class ExtractText { String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] [output-text-file]\n" + "\nOptions:\n" - + " -password : Password to decrypt document\n" - + " -encoding : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n" - + " -console : Send text to console instead of file\n" - + " -html : Output in HTML format instead of raw text\n" - + " -sort : Sort the text before writing\n" - + " -ignoreBeads : Disables the separation by beads\n" - + " -debug : Enables debug output about the time consumption of every stage\n" - + " -startPage : The first page to start extraction(1 based)\n" - + " -endPage : The last page to extract(inclusive)\n" - + " : The PDF document to use\n" - + " [output-text-file] : The file to write the text to"; + + " -password : Password to decrypt document\n" + + " -encoding : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n" + + " -console : Send text to console instead of file\n" + + " -html : Output in HTML format instead of raw text\n" + + " -sort : Sort the text before writing\n" + + " -ignoreBeads : Disables the separation by beads\n" + + " -debug : Enables debug output about the time consumption of every stage\n" + + " -startPage : The first page to start extraction (1 based)\n" + + " -endPage : The last page to extract (1 based and inclusive)\n" + + " : The PDF document to use\n" + + " [output-text-file] : The file to write the text to"; System.err.println(message); System.exit( 1 );