pdfbox-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From til...@apache.org
Subject svn commit: r1601867 - /pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
Date Wed, 11 Jun 2014 12:09:36 GMT
Author: tilman
Date: Wed Jun 11 12:09:36 2014
New Revision: 1601867

URL: http://svn.apache.org/r1601867
Log:
PDFBOX-2128: add parameter directJPEG to force 1:1 extraction of JPEGs

Modified:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java?rev=1601867&r1=1601866&r2=1601867&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Wed Jun 11 12:09:36 2014
@@ -17,11 +17,15 @@
 package org.apache.pdfbox;
 
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
@@ -30,6 +34,8 @@ import org.apache.pdfbox.pdmodel.encrypt
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg;
+import org.apache.pdfbox.io.IOUtils;
 
 /**
  * This will read a read pdf and extract images. <br/><br/>
@@ -47,6 +53,7 @@ public class ExtractImages
     private static final String PREFIX = "-prefix";
     private static final String ADDKEY = "-addkey";
     private static final String NONSEQ = "-nonSeq";
+    private static final String DIRECTJPEG = "-directJPEG";
 
     private ExtractImages()
     {
@@ -78,6 +85,7 @@ public class ExtractImages
             String prefix = null;
             boolean addKey = false;
             boolean useNonSeqParser = false;
+            boolean directJPEG = false;
             for( int i=0; i<args.length; i++ )
             {
                 if( args[i].equals( PASSWORD ) )
@@ -106,6 +114,10 @@ public class ExtractImages
                 {
                     useNonSeqParser = true;
                 }
+                else if( args[i].equals( DIRECTJPEG ) )
+                {
+                    directJPEG = true;
+                }
                 else
                 {
                     if( pdfFile == null )
@@ -157,7 +169,7 @@ public class ExtractImages
                         PDPage page = (PDPage)iter.next();
                         PDResources resources = page.getResources();
                         // extract all XObjectImages which are part of the page resources
-                        processResources(resources, prefix, addKey);
+                        processResources(resources, prefix, addKey, directJPEG);
                     }
                 }
                 finally
@@ -170,8 +182,39 @@ public class ExtractImages
             }
         }
     }
+    
+    public void writeJpeg2file(PDJpeg image, String filename) throws IOException
+    {
+        final List<String> DCT_FILTERS = new ArrayList<String>();
+        DCT_FILTERS.add(COSName.DCT_DECODE.getName());
+        DCT_FILTERS.add(COSName.DCT_DECODE_ABBREVIATION.getName());
+
+        FileOutputStream out = null;
+        
+        try
+        {
+            out = new FileOutputStream(filename + ".jpg");
+            InputStream data = image.getPDStream().getPartiallyFilteredStream(DCT_FILTERS);
+            byte[] buf = new byte[1024];
+            int amountRead;
+            while ((amountRead = data.read(buf)) != -1)
+            {
+                out.write(buf, 0, amountRead);
+            }
+            IOUtils.closeQuietly(data);
+            out.flush();
+        }
+        finally
+        {
+            if (out != null)
+            {
+                out.close();
+            }
+        }
+    }
 
-    private void processResources(PDResources resources, String prefix, boolean addKey) throws IOException
+    private void processResources(PDResources resources, String prefix, 
+            boolean addKey, boolean directJPEG) throws IOException
     {
         if (resources == null)
         {
@@ -199,7 +242,14 @@ public class ExtractImages
                         name = getUniqueFileName( prefix, image.getSuffix() );
                     }
                     System.out.println( "Writing image:" + name );
-                    image.write2file( name );
+                    if (directJPEG && "jpg".equals(image.getSuffix()))
+                    {
+                        writeJpeg2file((PDJpeg) image, name);
+                    }
+                    else
+                    {
+                        image.write2file(name);
+                    }
                     image.clear(); // PDFBOX-2101 get rid of cache ASAP
                 }
                 // maybe there are more images embedded in a form object
@@ -207,7 +257,7 @@ public class ExtractImages
                 {
                     PDXObjectForm xObjectForm = (PDXObjectForm)xobject;
                     PDResources formResources = xObjectForm.getResources();
-                    processResources(formResources, prefix, addKey);
+                    processResources(formResources, prefix, addKey, directJPEG);
                 }
             }
         }
@@ -237,6 +287,7 @@ public class ExtractImages
             "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n" +
             "  -addkey                      add the internal image key to the file name\n" +
             "  -nonSeq                      Enables the new non-sequential parser\n" +
+            "  -directJPEG                  Forces the direct extraction of JPEG images regardless of colorspace\n" +
             "  <PDF file>                   The PDF document to use\n"
             );
         System.exit( 1 );



Mime
View raw message