pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From le...@apache.org
Subject svn commit: r1740161 - in /pdfbox/branches/2.0: ./ tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
Date Wed, 20 Apr 2016 15:58:55 GMT
Author: lehmi
Date: Wed Apr 20 15:58:55 2016
New Revision: 1740161

URL: http://svn.apache.org/viewvc?rev=1740161&view=rev
Log:
PDFBOX-3281: ignore encoding parameter when writing html output

Modified:
    pdfbox/branches/2.0/   (props changed)
    pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
    pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java

Propchange: pdfbox/branches/2.0/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Wed Apr 20 15:58:55 2016
@@ -1,3 +1,3 @@
 /pdfbox/branches/no-awt:1618517-1621410
 /pdfbox/no-awt:1618514-1618516
-/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755
+/pdfbox/trunk:1736223,1736227,1736615,1737043,1737130,1737599-1737600,1738755,1740160

Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1740161&r1=1740160&r2=1740161&view=diff
==============================================================================
--- pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original)
+++ pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Wed Apr 20 15:58:55 2016
@@ -49,8 +49,9 @@ public final class ExtractText
     private static final String SORT = "-sort";
     private static final String IGNORE_BEADS = "-ignoreBeads";
     private static final String DEBUG = "-debug";
-    // jjb - added simple HTML output
     private static final String HTML = "-html";
+    
+    private static final String STD_ENCODING = "UTF-8";
 
     /*
      * debug flag
@@ -93,7 +94,7 @@ public final class ExtractText
         boolean sort = false;
         boolean separateBeads = true;
         String password = "";
-        String encoding = "UTF-8";
+        String encoding = STD_ENCODING;
         String pdfFile = null;
         String outputFile = null;
         // Defaults to text files
@@ -204,6 +205,11 @@ public final class ExtractText
                 }
                 else
                 {
+                    if (toHTML && !STD_ENCODING.equals(encoding))
+                    {
+                        encoding = STD_ENCODING;
+                        System.out.println("The encoding parameter is ignored when writing html output.");
+                    }
                     output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
                 }
 

Modified: pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java?rev=1740161&r1=1740160&r2=1740161&view=diff
==============================================================================
--- pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java (original)
+++ pdfbox/branches/2.0/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Wed Apr 20 15:58:55 2016
@@ -39,7 +39,6 @@ public class PDFText2HTML extends PDFTex
 {
     private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
 
-    private boolean onFirstPage = true;
     private final FontState fontState = new FontState();
 
     /**
@@ -64,34 +63,26 @@ public class PDFText2HTML extends PDFTex
      *
      * @throws IOException
      *             If there is a problem writing out the header to the document.
+     * @deprecated deprecated, use {@link #startDocument(PDDocument)}
      */
     protected void writeHeader() throws IOException
     {
+    }
+
+    @Override
+    protected void startDocument(PDDocument document) throws IOException
+    {
         StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_HTML_BYTES);
         buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n"
                 + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
         buf.append("<html><head>");
         buf.append("<title>").append(escape(getTitle())).append("</title>\n");
-        buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-16\">\n");
+        buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-8\">\n");
         buf.append("</head>\n");
         buf.append("<body>\n");
         super.writeString(buf.toString());
     }
-
-    /**
-     * {@inheritDoc}
-     */
-    @Override
-    protected void writePage() throws IOException
-    {
-        if (onFirstPage)
-        {
-            writeHeader();
-            onFirstPage = false;
-        }
-        super.writePage();
-    }
-
+    
     /**
      * {@inheritDoc}
      */



Mime
View raw message