Return-Path: X-Original-To: apmail-pdfbox-commits-archive@www.apache.org Delivered-To: apmail-pdfbox-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 4CAAA18F7E for ; Fri, 23 Oct 2015 15:48:45 +0000 (UTC) Received: (qmail 42496 invoked by uid 500); 23 Oct 2015 15:48:45 -0000 Delivered-To: apmail-pdfbox-commits-archive@pdfbox.apache.org Received: (qmail 42475 invoked by uid 500); 23 Oct 2015 15:48:45 -0000 Mailing-List: contact commits-help@pdfbox.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@pdfbox.apache.org Delivered-To: mailing list commits@pdfbox.apache.org Received: (qmail 42466 invoked by uid 99); 23 Oct 2015 15:48:45 -0000 Received: from Unknown (HELO spamd3-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 23 Oct 2015 15:48:45 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd3-us-west.apache.org (ASF Mail Server at spamd3-us-west.apache.org) with ESMTP id BDB781809AB for ; Fri, 23 Oct 2015 15:48:44 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd3-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 0.99 X-Spam-Level: X-Spam-Status: No, score=0.99 tagged_above=-999 required=6.31 tests=[KAM_LAZY_DOMAIN_SECURITY=1, T_RP_MATCHES_RCVD=-0.01] autolearn=disabled Received: from mx1-us-west.apache.org ([10.40.0.8]) by localhost (spamd3-us-west.apache.org [10.40.0.10]) (amavisd-new, port 10024) with ESMTP id 2NSf9Ch_v8hH for ; Fri, 23 Oct 2015 15:48:43 +0000 (UTC) Received: from mailrelay1-us-west.apache.org (mailrelay1-us-west.apache.org [209.188.14.139]) by mx1-us-west.apache.org (ASF Mail Server at mx1-us-west.apache.org) with ESMTP id 06A272303B for ; Fri, 23 Oct 2015 15:48:43 +0000 (UTC) Received: from svn01-us-west.apache.org (svn.apache.org [10.41.0.6]) by mailrelay1-us-west.apache.org (ASF Mail Server at mailrelay1-us-west.apache.org) with ESMTP id 6E91EE0183 for ; Fri, 23 Oct 2015 15:48:42 +0000 (UTC) Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id EC3393A0907 for ; Fri, 23 Oct 2015 15:48:41 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1710247 - /pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java Date: Fri, 23 Oct 2015 15:48:41 -0000 To: commits@pdfbox.apache.org From: tilman@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20151023154841.EC3393A0907@svn01-us-west.apache.org> Author: tilman Date: Fri Oct 23 15:48:41 2015 New Revision: 1710247 URL: http://svn.apache.org/viewvc?rev=1710247&view=rev Log: PDFBOX-3044: change encoding to utf8, don't fail immediately; output diff output; use diff library; update test files to utf8 Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java?rev=1710247&r1=1710246&r2=1710247&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java (original) +++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java Fri Oct 23 15:48:41 2015 @@ -16,6 +16,13 @@ */ package org.apache.pdfbox.text; +import difflib.ChangeDelta; +import difflib.DeleteDelta; +import difflib.DiffUtils; +import difflib.InsertDelta; +import difflib.Patch; +import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; @@ -25,8 +32,11 @@ import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.io.PrintStream; import java.io.Writer; import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; import junit.framework.Test; import junit.framework.TestCase; @@ -96,7 +106,7 @@ public class TestTextStripper extends Te private boolean bFail = false; private PDFTextStripper stripper = null; - private final String encoding = "UTF-16LE"; + private static final String ENCODING = "UTF-8"; /** * Test class constructor. @@ -238,26 +248,30 @@ public class TestTextStripper extends Te try { File outFile; + File diffFile; File expectedFile; if(bSort) { outFile = new File(outDir, inFile.getName() + "-sorted.txt"); + diffFile = new File(outDir, inFile.getName() + "-sorted-diff.txt"); expectedFile = new File(inFile.getParentFile(), inFile.getName() + "-sorted.txt"); } else { outFile = new File(outDir, inFile.getName() + ".txt"); + diffFile = new File(outDir, inFile.getName() + "-diff.txt"); expectedFile = new File(inFile.getParentFile(), inFile.getName() + ".txt"); } OutputStream os = new FileOutputStream(outFile); try { - os.write( 0xFF ); - os.write( 0xFE ); + os.write (0xEF); + os.write (0xBB); + os.write (0xBF); - Writer writer = new OutputStreamWriter(os, encoding); + Writer writer = new BufferedWriter(new OutputStreamWriter(os, ENCODING)); try { //Allows for sorted tests @@ -284,15 +298,17 @@ public class TestTextStripper extends Te if (!expectedFile.exists()) { this.bFail = true; - fail("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + + log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist"); return; } + + boolean localFail = false; LineNumberReader expectedReader = - new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile), encoding)); + new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile), ENCODING)); LineNumberReader actualReader = - new LineNumberReader(new InputStreamReader(new FileInputStream(outFile), encoding)); + new LineNumberReader(new InputStreamReader(new FileInputStream(outFile), ENCODING)); while (true) { @@ -309,7 +325,8 @@ public class TestTextStripper extends Te if (!stringsEqual(expectedLine, actualLine)) { this.bFail = true; - fail("FAILURE: Line mismatch for file " + inFile.getName() + + localFail = true; + log.error("FAILURE: Line mismatch for file " + inFile.getName() + " (sort = "+bSort+")" + " at expected line: " + expectedReader.getLineNumber() + " at actual line: " + actualReader.getLineNumber() + @@ -327,6 +344,50 @@ public class TestTextStripper extends Te } expectedReader.close(); actualReader.close(); + if (!localFail) + { + outFile.delete(); + } + else + { + // https://code.google.com/p/java-diff-utils/wiki/SampleUsage + List original = fileToLines(expectedFile); + List revised = fileToLines(outFile); + + // Compute diff. Get the Patch object. Patch is the container for computed deltas. + Patch patch = DiffUtils.diff(original, revised); + + PrintStream diffPS = new PrintStream(diffFile); + for (Object delta : (List) patch.getDeltas()) + { + if (delta instanceof ChangeDelta) + { + ChangeDelta cdelta = (ChangeDelta) delta; + diffPS.println("Org: " + cdelta.getOriginal()); + diffPS.println("New: " + cdelta.getRevised()); + diffPS.println(); + } + else if (delta instanceof DeleteDelta) + { + DeleteDelta ddelta = (DeleteDelta) delta; + diffPS.println("Org: " + ddelta.getOriginal()); + diffPS.println("New: " + ddelta.getRevised()); + diffPS.println(); + } + else if (delta instanceof InsertDelta) + { + InsertDelta idelta = (InsertDelta) delta; + diffPS.println("Org: " + idelta.getOriginal()); + diffPS.println("New: " + idelta.getRevised()); + diffPS.println(); + } + else + { + diffPS.println(delta); + } + } + diffPS.close(); + } } finally { @@ -334,6 +395,27 @@ public class TestTextStripper extends Te } } + // Helper method for get the file content + private static List fileToLines(File file) + { + List lines = new LinkedList(); + String line = ""; + try + { + BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING)); + while ((line = in.readLine()) != null) + { + lines.add(line); + } + in.close(); + } + catch (IOException e) + { + e.printStackTrace(); + } + return lines; + } + private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException { PDPageDestination pageDest = (PDPageDestination) oi.getDestination();