pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jahew...@apache.org
Subject svn commit: r1627877 - /pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
Date Fri, 26 Sep 2014 20:17:02 GMT
Author: jahewson
Date: Fri Sep 26 20:17:02 2014
New Revision: 1627877

URL: http://svn.apache.org/r1627877
Log:
PDFBOX-2384: Use UTF-8 as the default encoding for ExtractText

Modified:
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1627877&r1=1627876&r2=1627877&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Fri Sep 26 20:17:02 2014
@@ -102,7 +102,7 @@ public class ExtractText
         boolean separateBeads = true;
         boolean useNonSeqParser = false; 
         String password = "";
-        String encoding = null;
+        String encoding = "UTF-8";
         String pdfFile = null;
         String outputFile = null;
         // Defaults to text files
@@ -227,29 +227,13 @@ public class ExtractText
                 
                 stopProcessing("Time for loading: ", startTime);
 
-
-                if ((encoding == null) && (toHTML))
-                {
-                    encoding = "UTF-8";
-                }
-
                 if( toConsole )
                 {
-                    output = new OutputStreamWriter( System.out );
+                    output = new OutputStreamWriter( System.out, encoding );
                 }
                 else
                 {
-                    if( encoding != null )
-                    {
-                        output = new OutputStreamWriter(
-                                new FileOutputStream( outputFile ), encoding );
-                    }
-                    else
-                    {
-                        //use default encoding
-                        output = new OutputStreamWriter(
-                                new FileOutputStream( outputFile ) );
-                    }
+                    output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
                 }
 
                 PDFTextStripper stripper;
@@ -365,7 +349,7 @@ public class ExtractText
     {
         System.err.println( "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [OPTIONS] <PDF file> [Text File]\n" +
             "  -password  <password>        Password to decrypt document\n" +
-            "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
+            "  -encoding  <output encoding> UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n" +
             "  -console                     Send text to console instead of file\n" +
             "  -html                        Output in HTML format instead of raw text\n" +
             "  -sort                        Sort the text before writing\n" +



Mime
View raw message