Return-Path: X-Original-To: apmail-pdfbox-commits-archive@www.apache.org Delivered-To: apmail-pdfbox-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id DBBD711882 for ; Wed, 11 Jun 2014 12:09:58 +0000 (UTC) Received: (qmail 24387 invoked by uid 500); 11 Jun 2014 12:09:58 -0000 Delivered-To: apmail-pdfbox-commits-archive@pdfbox.apache.org Received: (qmail 24362 invoked by uid 500); 11 Jun 2014 12:09:58 -0000 Mailing-List: contact commits-help@pdfbox.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@pdfbox.apache.org Delivered-To: mailing list commits@pdfbox.apache.org Received: (qmail 24355 invoked by uid 99); 11 Jun 2014 12:09:58 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 11 Jun 2014 12:09:58 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 11 Jun 2014 12:09:56 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id C1FC723888D2; Wed, 11 Jun 2014 12:09:36 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1601867 - /pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Date: Wed, 11 Jun 2014 12:09:36 -0000 To: commits@pdfbox.apache.org From: tilman@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140611120936.C1FC723888D2@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: tilman Date: Wed Jun 11 12:09:36 2014 New Revision: 1601867 URL: http://svn.apache.org/r1601867 Log: PDFBOX-2128: add parameter directJPEG to force 1:1 extraction of JPEGs Modified: 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java?rev=1601867&r1=1601866&r2=1601867&view=diff ============================================================================== --- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java (original) +++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Wed Jun 11 12:09:36 2014 @@ -17,11 +17,15 @@ package org.apache.pdfbox; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; @@ -30,6 +34,8 @@ import org.apache.pdfbox.pdmodel.encrypt import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; +import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg; +import org.apache.pdfbox.io.IOUtils; /** * This will read a read pdf and extract images.

@@ -47,6 +53,7 @@ public class ExtractImages private static final String PREFIX = "-prefix"; private static final String ADDKEY = "-addkey"; private static final String NONSEQ = "-nonSeq"; + private static final String DIRECTJPEG = "-directJPEG"; private ExtractImages() { @@ -78,6 +85,7 @@ public class ExtractImages String prefix = null; boolean addKey = false; boolean useNonSeqParser = false; + boolean directJPEG = false; for( int i=0; i DCT_FILTERS = new ArrayList(); + DCT_FILTERS.add(COSName.DCT_DECODE.getName()); + DCT_FILTERS.add(COSName.DCT_DECODE_ABBREVIATION.getName()); + + FileOutputStream out = null; + + try + { + out = new FileOutputStream(filename + ".jpg"); + InputStream data = image.getPDStream().getPartiallyFilteredStream(DCT_FILTERS); + byte[] buf = new byte[1024]; + int amountRead; + while ((amountRead = data.read(buf)) != -1) + { + out.write(buf, 0, amountRead); + } + IOUtils.closeQuietly(data); + out.flush(); + } + finally + { + if (out != null) + { + out.close(); + } + } + } - private void processResources(PDResources resources, String prefix, boolean addKey) throws IOException + private void processResources(PDResources resources, String prefix, + boolean addKey, boolean directJPEG) throws IOException { if (resources == null) { @@ -199,7 +242,14 @@ public class ExtractImages name = getUniqueFileName( prefix, image.getSuffix() ); } System.out.println( "Writing image:" + name ); - image.write2file( name ); + if (directJPEG && "jpg".equals(image.getSuffix())) + { + writeJpeg2file((PDJpeg) image, name); + } + else + { + image.write2file(name); + } image.clear(); // PDFBOX-2101 get rid of cache ASAP } // maybe there are more images embedded in a form object @@ -207,7 +257,7 @@ public class ExtractImages { PDXObjectForm xObjectForm = (PDXObjectForm)xobject; PDResources formResources = xObjectForm.getResources(); - processResources(formResources, prefix, addKey); + processResources(formResources, prefix, addKey, directJPEG); } } } @@ -237,6 +287,7 @@ 
public class ExtractImages " -prefix <image-prefix> Image prefix(default to pdf name)\n" + " -addkey add the internal image key to the file name\n" + " -nonSeq Enables the new non-sequential parser\n" + + " -directJPEG Forces the direct extraction of JPEG images regardless of colorspace\n" + " <PDF file> The PDF document to use\n" ); System.exit( 1 );