Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id EA99B200C82 for ; Sat, 13 May 2017 01:38:30 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id E916F160BCE; Fri, 12 May 2017 23:38:30 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 3C4D5160BC8 for ; Sat, 13 May 2017 01:38:30 +0200 (CEST) Received: (qmail 54162 invoked by uid 500); 12 May 2017 23:38:29 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 54132 invoked by uid 99); 12 May 2017 23:38:29 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 12 May 2017 23:38:29 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id EDCCBE080D; Fri, 12 May 2017 23:38:28 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: tflobbe@apache.org To: commits@lucene.apache.org Date: Fri, 12 May 2017 23:38:28 -0000 Message-Id: <17dbd29730b9429a82dc276586af5223@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [01/58] [abbrv] lucene-solr:jira/solr-10233: squash merge jira/solr-10290 into master archived-at: Fri, 12 May 2017 23:38:31 -0000 Repository: lucene-solr Updated Branches: refs/heads/jira/solr-10233 1c0e2f20d -> 096ed90b3 http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/95968c69/solr/solr-ref-guide/tools/ReducePDFSize.java ---------------------------------------------------------------------- diff --git a/solr/solr-ref-guide/tools/ReducePDFSize.java b/solr/solr-ref-guide/tools/ReducePDFSize.java new file mode 100644 index 0000000..4506cae --- /dev/null +++ b/solr/solr-ref-guide/tools/ReducePDFSize.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObject; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDStream; + +/** + * A simple command line utility for reducing the size of the ref-guide PDF. + *

+ * Currently this script focuses on using {@link COSName#FLATE_DECODE} to compress the (decoded) Objects + * in the source PDF, but other improvements may be possible in the future. + *

+ *

+ * This code is originally based on the WriteDecodedDoc example provided with Apache PDFBox. + *

+ *

+ * NOTE: This class should NOT be considered a general purpose tool for reducing the size of + * any PDF. + * Decisions made in this code can and will be focused explicitly on serving the purpose of reducing the size of the + * Solr Reference Guide PDF, as originally produced by asciidoctor, and may not be generally useful for all PDFs + * "in the wild". + *

+ */ +public class ReducePDFSize { + + public static void main(String[] args) throws IOException { + if (2 != args.length) { + throw new RuntimeException("arg0 must be input file, org1 must be output file"); + } + String in = args[0]; + String out = args[1]; + PDDocument doc = null; + + try { + doc = PDDocument.load(new File(in)); + doc.setAllSecurityToBeRemoved(true); + for (COSObject cosObject : doc.getDocument().getObjects()) { + COSBase base = cosObject.getObject(); + // if it's a stream: decode it, then re-write it using FLATE_DECODE + if (base instanceof COSStream) { + COSStream stream = (COSStream) base; + byte[] bytes; + try { + bytes = new PDStream(stream).toByteArray(); + } catch (IOException ex) { + // NOTE: original example code from PDFBox just logged & "continue;"d here, 'skipping' this stream. + // If this type of failure ever happens, we can (perhaps) consider (re)ignoring this type of failure? + // + // IIUC then that will leave the original (non-decoded / non-flated) stream in place? + throw new RuntimeException("can't serialize byte[] from: " + + cosObject.getObjectNumber() + " " + + cosObject.getGenerationNumber() + " obj: " + + ex.getMessage(), ex); + } + stream.removeItem(COSName.FILTER); + OutputStream streamOut = stream.createOutputStream(COSName.FLATE_DECODE); + streamOut.write(bytes); + streamOut.close(); + } + } + doc.getDocumentCatalog(); + doc.save( out ); + } finally { + if ( doc != null ) { + doc.close(); + } + } + } +}