pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jahew...@apache.org
Subject svn commit: r1603250 - in /pdfbox/trunk: pdfbox/src/main/java/org/apache/pdfbox/pdmodel/ pdfbox/src/main/java/org/apache/pdfbox/text/ pdfbox/src/main/java/org/apache/pdfbox/util/ tools/src/main/java/org/apache/pdfbox/tools/
Date Tue, 17 Jun 2014 18:29:57 GMT
Author: jahewson
Date: Tue Jun 17 18:29:57 2014
New Revision: 1603250

URL: http://svn.apache.org/r1603250
Log:
PDFBOX-2145: Clean up PDFTextStripper, etc.

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocumentCatalog.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/ICU4JImpl.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPositionComparator.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocumentCatalog.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocumentCatalog.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocumentCatalog.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocumentCatalog.java Tue Jun 17 18:29:57 2014
@@ -203,9 +203,9 @@ public class PDDocumentCatalog implement
      *
      * @return A list of PDPage objects.
      */
-    public List getAllPages()
+    public List<COSObjectable> getAllPages()
     {
-        List retval = new ArrayList();
+        List<COSObjectable> retval = new ArrayList<COSObjectable>();
         PDPageNode rootNode = getPages();
         //old (slower):
         //getPageObjects( rootNode, retval );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/ICU4JImpl.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/ICU4JImpl.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/ICU4JImpl.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/ICU4JImpl.java Tue Jun 17 18:29:57 2014
@@ -49,7 +49,7 @@ public class ICU4JImpl 
 
     /**
      * Takes a line of text in presentation order and converts it to logical order.
-     * @see org.apache.pdfbox.text.TextNormalize#makeLineLogicalOrder(String, boolean)
+     *
      *  
      * @param str String to convert
      * @param isRtlDominant RTL (right-to-left) will be the dominant text direction
@@ -69,7 +69,7 @@ public class ICU4JImpl 
 
     /**
      * Normalize presentation forms of characters to the separate parts. 
-     * @see org.apache.pdfbox.text.TextNormalize#normalizePres(String)
+     * @see org.apache.pdfbox.text.TextNormalize#normalizePresentationForm(String)
      * 
      * @param str String to normalize
      * @return Normalized form

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java Tue Jun 17 18:29:57 2014
@@ -19,23 +19,60 @@ package org.apache.pdfbox.text;
 import java.util.HashMap;
 
 /**
- * This class allows a caller to normalize text in various ways. It will load the ICU4J jar file if it is defined on the
- * classpath.
- * 
- * @author <a href="mailto:carrier@digital-evidence.org">Brian Carrier</a>
+ * This class allows a caller to normalize text in various ways.
+ * It will load the ICU4J jar file if it is defined on the classpath.
  * 
+ * @author Brian Carrier
  */
 public class TextNormalize
 {
-    private ICU4JImpl icu4j = null;
-    private static final HashMap<Integer, String> DIACHASH = new HashMap<Integer, String>();
-    private String outputEncoding;
+    private static final HashMap<Integer, String> DIACRITICS = createDiacritics();
 
-    static
-    {
-        populateDiacHash();
+    // Adds non-decomposing diacritics to the hash with their related combining character.
+    // These are values that the unicode spec claims are equivalent but are not mapped in the form
+    // NFKC normalization method. Determined by going through the Combining Diacritical Marks
+    // section of the Unicode spec and identifying which characters are not  mapped to by the
+    // normalization.
+    private static HashMap<Integer, String> createDiacritics()
+    {
+        HashMap<Integer, String> map = new HashMap<Integer, String>();
+        map.put(0x0060, "\u0300");
+        map.put(0x02CB, "\u0300");
+        map.put(0x0027, "\u0301");
+        map.put(0x02B9, "\u0301");
+        map.put(0x02CA, "\u0301");
+        map.put(0x005e, "\u0302");
+        map.put(0x02C6, "\u0302");
+        map.put(0x007E, "\u0303");
+        map.put(0x02C9, "\u0304");
+        map.put(0x00B0, "\u030A");
+        map.put(0x02BA, "\u030B");
+        map.put(0x02C7, "\u030C");
+        map.put(0x02C8, "\u030D");
+        map.put(0x0022, "\u030E");
+        map.put(0x02BB, "\u0312");
+        map.put(0x02BC, "\u0313");
+        map.put(0x0486, "\u0313");
+        map.put(0x055A, "\u0313");
+        map.put(0x02BD, "\u0314");
+        map.put(0x0485, "\u0314");
+        map.put(0x0559, "\u0314");
+        map.put(0x02D4, "\u031D");
+        map.put(0x02D5, "\u031E");
+        map.put(0x02D6, "\u031F");
+        map.put(0x02D7, "\u0320");
+        map.put(0x02B2, "\u0321");
+        map.put(0x02CC, "\u0329");
+        map.put(0x02B7, "\u032B");
+        map.put(0x02CD, "\u0331");
+        map.put(0x005F, "\u0332");
+        map.put(0x204E, "\u0359");
+        return map;
     }
 
+    private ICU4JImpl icu4j = null;
+    private String outputEncoding;
+
     /**
      * 
      * @param encoding The Encoding that the text will eventually be written as (or null)
@@ -61,52 +98,11 @@ public class TextNormalize
         }
     }
 
-    /*
-     * Adds non-decomposing diacritics to the hash with their related combining character. These are values that the
-     * unicode spec claims are equivalent but are not mapped in the form NFKC normalization method. Determined by going
-     * through the Combining Diacritical Marks section of the Unicode spec and identifying which characters are not
-     * mapped to by the normalization.
-     */
-    private static void populateDiacHash()
-    {
-        DIACHASH.put(new Integer(0x0060), "\u0300");
-        DIACHASH.put(new Integer(0x02CB), "\u0300");
-        DIACHASH.put(new Integer(0x0027), "\u0301");
-        DIACHASH.put(new Integer(0x02B9), "\u0301");
-        DIACHASH.put(new Integer(0x02CA), "\u0301");
-        DIACHASH.put(new Integer(0x005e), "\u0302");
-        DIACHASH.put(new Integer(0x02C6), "\u0302");
-        DIACHASH.put(new Integer(0x007E), "\u0303");
-        DIACHASH.put(new Integer(0x02C9), "\u0304");
-        DIACHASH.put(new Integer(0x00B0), "\u030A");
-        DIACHASH.put(new Integer(0x02BA), "\u030B");
-        DIACHASH.put(new Integer(0x02C7), "\u030C");
-        DIACHASH.put(new Integer(0x02C8), "\u030D");
-        DIACHASH.put(new Integer(0x0022), "\u030E");
-        DIACHASH.put(new Integer(0x02BB), "\u0312");
-        DIACHASH.put(new Integer(0x02BC), "\u0313");
-        DIACHASH.put(new Integer(0x0486), "\u0313");
-        DIACHASH.put(new Integer(0x055A), "\u0313");
-        DIACHASH.put(new Integer(0x02BD), "\u0314");
-        DIACHASH.put(new Integer(0x0485), "\u0314");
-        DIACHASH.put(new Integer(0x0559), "\u0314");
-        DIACHASH.put(new Integer(0x02D4), "\u031D");
-        DIACHASH.put(new Integer(0x02D5), "\u031E");
-        DIACHASH.put(new Integer(0x02D6), "\u031F");
-        DIACHASH.put(new Integer(0x02D7), "\u0320");
-        DIACHASH.put(new Integer(0x02B2), "\u0321");
-        DIACHASH.put(new Integer(0x02CC), "\u0329");
-        DIACHASH.put(new Integer(0x02B7), "\u032B");
-        DIACHASH.put(new Integer(0x02CD), "\u0331");
-        DIACHASH.put(new Integer(0x005F), "\u0332");
-        DIACHASH.put(new Integer(0x204E), "\u0359");
-    }
-
     /**
-     * Takes a line of text in presentation order and converts it to logical order. For most text other than Arabic and
-     * Hebrew, the presentation and logical orders are the same. However, for Arabic and Hebrew, they are different and
-     * if the text involves both RTL and LTR text then the Unicode BIDI algorithm must be used to determine how to map
-     * between them.
+     * Takes a line of text in presentation order and converts it to logical order. For most text
+     * other than Arabic and Hebrew, the presentation and logical orders are the same. However, for
+     * Arabic and Hebrew, they are different and if the text involves both RTL and LTR text then the
+     * Unicode BIDI algorithm must be used to determine how to map  between them.
      * 
      * @param str Presentation form of line to convert (i.e. left most char is first char)
      * @param isRtlDominant true if the PAGE has a dominant right to left ordering
@@ -125,13 +121,13 @@ public class TextNormalize
     }
 
     /**
-     * Normalize the presentation forms of characters in the string. For example, convert the single "fi" ligature to
-     * "f" and "i".
+     * Normalize the presentation forms of characters in the string. For example, convert the
+     * single "fi" ligature to "f" and "i".
      * 
      * @param str String to normalize
      * @return Normalized string (or original string if ICU4J library is not on classpath)
      */
-    public String normalizePres(String str)
+    public String normalizePresentationForm(String str)
     {
         if (icu4j != null)
         {
@@ -144,23 +140,22 @@ public class TextNormalize
     }
 
     /**
-     * Normalize the diacritic, for example, convert non-combining diacritic characters to their combining counterparts.
+     * Normalize the diacritic, for example, convert non-combining diacritic characters to their
+     * combining counterparts.
      * 
      * @param str String to normalize
      * @return Normalized string (or original string if ICU4J library is not on classpath)
      */
-    public String normalizeDiac(String str)
+    public String normalizeDiacritic(String str)
     {
-        /*
-         * Unicode contains special combining forms of the diacritic characters and we want to use these.
-         */
+        // Unicode contains special combining forms of the diacritic characters which we want to use
         if (outputEncoding != null && outputEncoding.toUpperCase().startsWith("UTF"))
         {
-            Integer c = new Integer(str.charAt(0));
+            Integer c = (int) str.charAt(0);
             // convert the characters not defined in the Unicode spec
-            if (DIACHASH.containsKey(c))
+            if (DIACRITICS.containsKey(c))
             {
-                return (String) DIACHASH.get(c);
+                return DIACRITICS.get(c);
             }
             else if (icu4j != null)
             {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Tue Jun 17 18:29:57 2014
@@ -641,7 +641,7 @@ public class TextPosition
             dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING ||
             dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)
         {
-            buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
+            buf.append(normalize.normalizeDiacritic(diacritic.getCharacter()));
             widths2[i] = 0;
             buf.append(str.charAt(i));
             widths2[i+1] = widths[i];
@@ -650,7 +650,7 @@ public class TextPosition
         {
             buf.append(str.charAt(i));
             widths2[i] = widths[i];
-            buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
+            buf.append(normalize.normalizeDiacritic(diacritic.getCharacter()));
             widths2[i+1] = 0;
         }
 

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPositionComparator.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPositionComparator.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPositionComparator.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPositionComparator.java Tue Jun 17 18:29:57 2014
@@ -24,21 +24,14 @@ import java.util.Comparator;
  * on direction and sorting in that direction. This allows continuous text
  * in a given direction to be more easily grouped together.  
  *
- * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.7 $
+ * @author Ben Litchfield
  */
-public class TextPositionComparator implements Comparator
+public class TextPositionComparator implements Comparator<TextPosition>
 {
-    /**
-     * {@inheritDoc}
-     */
-    public int compare(Object o1, Object o2)
+    @Override
+    public int compare(TextPosition pos1, TextPosition pos2)
     {
-        int retval = 0;
-        TextPosition pos1 = (TextPosition)o1;
-        TextPosition pos2 = (TextPosition)o2;
-
-        /* Only compare text that is in the same direction. */
+        // only compare text that is in the same direction
         if (pos1.getDir() < pos2.getDir())
         {
             return -1;
@@ -48,43 +41,44 @@ public class TextPositionComparator impl
             return 1;
         }
         
-        // Get the text direction adjusted coordinates
+        // get the text direction adjusted coordinates
         float x1 = pos1.getXDirAdj();
         float x2 = pos2.getXDirAdj();
         
         float pos1YBottom = pos1.getYDirAdj();
         float pos2YBottom = pos2.getYDirAdj();
+
         // note that the coordinates have been adjusted so 0,0 is in upper left
         float pos1YTop = pos1YBottom - pos1.getHeightDir();
         float pos2YTop = pos2YBottom - pos2.getHeightDir();
 
-        float yDifference = Math.abs( pos1YBottom-pos2YBottom);
-        //we will do a simple tolerance comparison.
-        if( yDifference < .1 ||
-            (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) ||
-            (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom))
+        float yDifference = Math.abs(pos1YBottom - pos2YBottom);
+
+        // we will do a simple tolerance comparison
+        if (yDifference < .1 ||
+            pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
+            pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
         {
-            if( x1 < x2 )
+            if (x1 < x2)
             {
-                retval = -1;
+                return -1;
             }
-            else if( x1 > x2 )
+            else if (x1 > x2)
             {
-                retval = 1;
+                return 1;
             }
             else
             {
-                retval = 0;
+                return 0;
             }
         }
-        else if( pos1YBottom < pos2YBottom )
+        else if (pos1YBottom < pos2YBottom)
         {
-            retval = -1;
+            return - 1;
         }
         else
         {
             return 1;
         }
-        return retval;
     }
 }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Jun 17 18:29:57 2014
@@ -112,15 +112,14 @@ public class PDFTextStripper extends PDF
     /**
      * The platform's line separator.
      */
-    protected final String systemLineSeparator = System.getProperty("line.separator"); 
+    protected final String LINE_SEPARATOR = System.getProperty("line.separator");
 
-    private String lineSeparator = systemLineSeparator;
-    private String pageSeparator = systemLineSeparator;
+    private String lineSeparator = LINE_SEPARATOR;
     private String wordSeparator = " ";
     private String paragraphStart = "";
     private String paragraphEnd = "";
     private String pageStart = "";
-    private String pageEnd = pageSeparator;
+    private String pageEnd = LINE_SEPARATOR;
     private String articleStart = "";
     private String articleEnd = "";
 
@@ -139,7 +138,7 @@ public class PDFTextStripper extends PDF
     private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
     private float dropThreshold = DEFAULT_DROP_THRESHOLD;
 
-    // We will need to estimate where to add spaces. These are used to help guess.
+    // we will need to estimate where to add spaces, these are used to help guess
     private float spacingTolerance = .5f;
     private float averageCharTolerance = .3f;
 
@@ -211,6 +210,7 @@ public class PDFTextStripper extends PDF
         this.outputEncoding = null;
         normalize = new TextNormalize(this.outputEncoding);
     }
+
     /**
      * Instantiate a new PDFTextStripper object. This object will load
      * properties from PDFTextStripper.properties and will apply
@@ -330,10 +330,9 @@ public class PDFTextStripper extends PDF
             startBookmarkPageNumber = 0;
             endBookmarkPageNumber = 0;
         }
-        Iterator<COSObjectable> pageIter = pages.iterator();
-        while (pageIter.hasNext())
+        for (COSObjectable page : pages)
         {
-            PDPage nextPage = (PDPage)pageIter.next();
+            PDPage nextPage = (PDPage) page;
             PDStream contentStream = nextPage.getContents();
             currentPageNo++;
             if (contentStream != null)
@@ -360,10 +359,10 @@ public class PDFTextStripper extends PDF
      * This method is available for subclasses of this class. It will be called before processing
      * of the document start.
      *
-     * @param pdf The PDF document that is being processed.
+     * @param document The PDF document that is being processed.
      * @throws IOException If an IO error occurs.
      */
-    protected void startDocument(PDDocument pdf) throws IOException
+    protected void startDocument(PDDocument document) throws IOException
     {
         // no default implementation, but available for subclasses
     }
@@ -372,10 +371,10 @@ public class PDFTextStripper extends PDF
      * This method is available for subclasses of this class. It will be called after processing
      * of the document finishes.
      *
-     * @param pdf The PDF document that is being processed.
+     * @param document The PDF document that is being processed.
      * @throws IOException If an IO error occurs.
      */
-    protected void endDocument(PDDocument pdf) throws IOException
+    protected void endDocument(PDDocument document) throws IOException
     {
         // no default implementation, but available for subclasses
     }
@@ -403,7 +402,7 @@ public class PDFTextStripper extends PDF
             }
             int originalSize = charactersByArticle.size();
             charactersByArticle.setSize(numberOfArticleSections);
-            for (int i=0; i<numberOfArticleSections; i++)
+            for (int i = 0; i < numberOfArticleSections; i++)
             {
                 if (numberOfArticleSections < originalSize)
                 {
@@ -441,10 +440,10 @@ public class PDFTextStripper extends PDF
      * Default implementation is to do nothing.  Subclasses
      * may provide additional information.
      *
-     * @param isltr true if primary direction of text is left to right.
+     * @param isLTR true if primary direction of text is left to right.
      * @throws IOException If there is any error writing to the stream.
      */
-    protected void startArticle(boolean isltr) throws IOException
+    protected void startArticle(boolean isLTR) throws IOException
     {
         output.write(getArticleStart());
     }
@@ -649,7 +648,7 @@ public class PDFTextStripper extends PDF
                 // with some margin. This calculation does not make a true average (average of
                 // averages) but we found that it gave the best results after numerous experiments.
                 // Based on experiments we also found that .3 worked well.
-                float averageCharWidth = -1;
+                float averageCharWidth;
                 if (previousAveCharWidth < 0)
                 {
                     averageCharWidth = positionWidth / wordCharCount;
@@ -762,33 +761,17 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Write the page separator value to the output stream.
-     * @throws IOException
-     *             If there is a problem writing out the pageseparator to the document.
-     */
-    protected void writePageSeperator() throws IOException
-    {
-        // RDD - newline at end of flush - required for end of page (so that the top
-        // of the next page starts on its own line.
-        output.write(getPageSeparator());
-        output.flush();
-    }
-
-    /**
      * Write the line separator value to the output stream.
-     * @throws IOException
-     *             If there is a problem writing out the lineseparator to the document.
+     * @throws IOException If there is a problem writing out the lineseparator to the document.
      */
     protected void writeLineSeparator() throws IOException
     {
         output.write(getLineSeparator());
     }
 
-
     /**
      * Write the word separator value to the output stream.
-     * @throws IOException
-     *             If there is a problem writing out the wordseparator to the document.
+     * @throws IOException If there is a problem writing out the wordseparator to the document.
      */
     protected void writeWordSeparator() throws IOException
     {
@@ -843,9 +826,8 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will process a TextPosition object and add the
-     * text to the list of characters on a page.  It takes care of
-     * overlapping text.
+     * This will process a TextPosition object and add the text to the list of characters on a page.
+     * It takes care of overlapping text.
      *
      * @param text The text to process.
      */
@@ -913,7 +895,7 @@ public class PDFTextStripper extends PDF
             float y = text.getY();
             if (shouldSeparateByBeads)
             {
-                for (int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++)
+                for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++)
                 {
                     PDThreadBead bead = pageArticles.get(i);
                     if (bead != null)
@@ -921,23 +903,23 @@ public class PDFTextStripper extends PDF
                         PDRectangle rect = bead.getRectangle();
                         if (rect.contains(x, y))
                         {
-                            foundArticleDivisionIndex = i*2+1;
+                            foundArticleDivisionIndex = i * 2 + 1;
                         }
                         else if ((x < rect.getLowerLeftX() ||
                                 y < rect.getUpperRightY()) &&
                                 notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                         {
-                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
+                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                         }
                         else if (x < rect.getLowerLeftX() &&
                                 notFoundButFirstLeftArticleDivisionIndex == -1)
                         {
-                            notFoundButFirstLeftArticleDivisionIndex = i*2;
+                            notFoundButFirstLeftArticleDivisionIndex = i * 2;
                         }
                         else if (y < rect.getUpperRightY() &&
                                 notFoundButFirstAboveArticleDivisionIndex == -1)
                         {
-                            notFoundButFirstAboveArticleDivisionIndex = i*2;
+                            notFoundButFirstAboveArticleDivisionIndex = i * 2;
                         }
                     }
                     else
@@ -950,7 +932,7 @@ public class PDFTextStripper extends PDF
             {
                 foundArticleDivisionIndex = 0;
             }
-            int articleDivisionIndex = -1;
+            int articleDivisionIndex;
             if (foundArticleDivisionIndex != -1)
             {
                 articleDivisionIndex = foundArticleDivisionIndex;
@@ -969,7 +951,7 @@ public class PDFTextStripper extends PDF
             }
             else
             {
-                articleDivisionIndex = charactersByArticle.size()-1;
+                articleDivisionIndex = charactersByArticle.size() - 1;
             }
 
             List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex);
@@ -989,7 +971,7 @@ public class PDFTextStripper extends PDF
                 // Note that we are making an assumption that we need to only look back
                 // one TextPosition to find what we are overlapping.
                 // This may not always be true. */
-                TextPosition previousTextPosition = textList.get(textList.size()-1);
+                TextPosition previousTextPosition = textList.get(textList.size() - 1);
                 if (text.isDiacritic() && previousTextPosition.contains(text))
                 {
                     previousTextPosition.mergeDiacritic(text, normalize);
@@ -1079,18 +1061,6 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * Set the desired page separator for output text.  The line.separator
-     * system property is used if the page separator preference is not set
-     * explicitly using this method.
-     *
-     * @param separator The desired page separator string.
-     */
-    public void setPageSeparator(String separator)
-    {
-        pageSeparator = separator;
-    }
-
-    /**
      * This will get the word separator.
      *
      * @return The desired word separator string.
@@ -1115,15 +1085,6 @@ public class PDFTextStripper extends PDF
     }
 
     /**
-     * This will get the page separator.
-     *
-     * @return The page separator string.
-     */
-    public String getPageSeparator()
-    {
-        return pageSeparator;
-    }
-    /**
      * @return Returns the suppressDuplicateOverlappingText.
      */
     public boolean getSuppressDuplicateOverlappingText()
@@ -1171,8 +1132,7 @@ public class PDFTextStripper extends PDF
      *
      * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
      */
-    public void setSuppressDuplicateOverlappingText(
-            boolean suppressDuplicateOverlappingTextValue)
+    public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue)
     {
         suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
     }
@@ -1709,7 +1669,7 @@ public class PDFTextStripper extends PDF
     /**
      * a list of regular expressions that match commonly used
      * list item formats, i.e. bullets, numbers, letters,
-     * Roman numerals, etc.  Not meant to be
+     * Roman numerals, etc. Not meant to be
      * comprehensive.
      */
     private static final String[] LIST_ITEM_EXPRESSIONS = {
@@ -1786,7 +1746,6 @@ public class PDFTextStripper extends PDF
      */
     protected static Pattern matchPattern(String string, List<Pattern> patterns)
     {
-        Pattern matchedPattern = null;
         for (Pattern p : patterns)
         {
             if (p.matcher(string).matches())
@@ -1794,7 +1753,7 @@ public class PDFTextStripper extends PDF
                 return p;
             }
         }
-        return matchedPattern;
+        return null;
     }
 
     /**
@@ -1807,11 +1766,11 @@ public class PDFTextStripper extends PDF
             throws IOException
     {
         int numberOfStrings = line.size();
-        for (int i=0; i<numberOfStrings; i++)
+        for (int i = 0; i < numberOfStrings; i++)
         {
             WordWithTextPositions word = line.get(i);
             writeString(word.getText(), word.getTextPositions());
-            if (i < numberOfStrings-1)
+            if (i < numberOfStrings - 1)
             {
                 writeWordSeparator();
             }
@@ -1860,7 +1819,7 @@ public class PDFTextStripper extends PDF
      */
     private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
     {
-        return new WordWithTextPositions(normalize.normalizePres(word), wordPositions);
+        return new WordWithTextPositions(normalize.normalizePresentationForm(word), wordPositions);
     }
 
     /**
@@ -1896,7 +1855,7 @@ public class PDFTextStripper extends PDF
         {
         }
 
-        public static final WordSeparator getSeparator()
+        public static WordSeparator getSeparator()
         {
             return separator;
         }

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java?rev=1603250&r1=1603249&r2=1603250&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Tue Jun 17 18:29:57 2014
@@ -50,13 +50,13 @@ public class PDFText2HTML extends PDFTex
     public PDFText2HTML(String encoding) throws IOException
     {
         super(encoding);
-        setLineSeparator(systemLineSeparator);
+        setLineSeparator(LINE_SEPARATOR);
         setParagraphStart("<p>");
-        setParagraphEnd("</p>"+systemLineSeparator);
+        setParagraphEnd("</p>"+ LINE_SEPARATOR);
         setPageStart("<div style=\"page-break-before:always; page-break-after:always\">");
-        setPageEnd("</div>"+systemLineSeparator);
-        setArticleStart(systemLineSeparator);
-        setArticleEnd(systemLineSeparator);
+        setPageEnd("</div>"+ LINE_SEPARATOR);
+        setArticleStart(LINE_SEPARATOR);
+        setArticleEnd(LINE_SEPARATOR);
     }
 
     /**
@@ -99,7 +99,7 @@ public class PDFText2HTML extends PDFTex
     /**
      * {@inheritDoc}
      */
-    public void endDocument(PDDocument pdf) throws IOException
+    public void endDocument(PDDocument document) throws IOException
     {
         super.writeString("</body></html>");
     }
@@ -156,13 +156,13 @@ public class PDFText2HTML extends PDFTex
      * Write out the article separator (div tag) with proper text direction
      * information.
      *
-     * @param isltr true if direction of text is left to right
+     * @param isLTR true if direction of text is left to right
      * @throws IOException
      *             If there is an error writing to the stream.
      */
-    protected void startArticle(boolean isltr) throws IOException
+    protected void startArticle(boolean isLTR) throws IOException
     {
-        if (isltr)
+        if (isLTR)
         {
             super.writeString("<div>");
         }



Mime
View raw message