Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id D7864200C11 for ; Sat, 4 Feb 2017 17:19:42 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id D5F1D160B56; Sat, 4 Feb 2017 16:19:42 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id B7603160B63 for ; Sat, 4 Feb 2017 17:19:40 +0100 (CET) Received: (qmail 29021 invoked by uid 500); 4 Feb 2017 16:19:39 -0000 Mailing-List: contact notifications-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list notifications@commons.apache.org Received: (qmail 29008 invoked by uid 99); 4 Feb 2017 16:19:39 -0000 Received: from Unknown (HELO svn01-us-west.apache.org) (209.188.14.144) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 04 Feb 2017 16:19:39 +0000 Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id 13BDD3A108E for ; Sat, 4 Feb 2017 16:19:39 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1006212 [2/16] - in /websites/production/commons/content/proper/commons-compress: ./ apidocs/org/apache/commons/compress/compressors/lz4/ apidocs/org/apache/commons/compress/compressors/snappy/ apidocs/src-html/org/apache/commons/compress/... Date: Sat, 04 Feb 2017 16:19:38 -0000 To: notifications@commons.apache.org From: bodewig@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20170204161939.13BDD3A108E@svn01-us-west.apache.org> archived-at: Sat, 04 Feb 2017 16:19:43 -0000 Modified: websites/production/commons/content/proper/commons-compress/apidocs/src-html/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.html ============================================================================== --- websites/production/commons/content/proper/commons-compress/apidocs/src-html/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.html (original) +++ websites/production/commons/content/proper/commons-compress/apidocs/src-html/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.html Sat Feb 4 16:19:37 2017 @@ -32,439 +32,442 @@ 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; -027import java.util.List; -028 -029import org.apache.commons.compress.compressors.CompressorOutputStream; -030import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; -031import org.apache.commons.compress.compressors.lz77support.Parameters; -032import org.apache.commons.compress.utils.ByteUtils; -033 -034/** -035 * CompressorOutputStream for the LZ4 block format. -036 * -037 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> -038 * @since 1.14 -039 * @NotThreadSafe -040 */ -041public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { -042 -043 private static final int MIN_BACK_REFERENCE_LENGTH = 4; -044 private static final int MIN_LENGTH_OF_LAST_LITERAL = 5; -045 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; -046 -047 /* -048 -049 The LZ4 block format has a few properties that make it less -050 straight-forward than one would hope: -051 -052 * literal blocks and back-references must come in pairs (except -053 for the very last literal block), so consecutive literal -054 blocks created by the compressor must be merged into a single -055 block. -056 -057 * the start of a literal/back-reference pair contains the length -058 of the back-reference (at least some part of it) so we can't -059 start writing the literal before we know how long the next -060 back-reference is going to be. -061 -062 * there are special rules for the final blocks -063 -064 > There are specific parsing rules to respect in order to remain -065 > compatible with assumptions made by the decoder : -066 > -067 > 1. The last 5 bytes are always literals -068 > -069 > 2. The last match must start at least 12 bytes before end of -070 > block. Consequently, a block with less than 13 bytes cannot be -071 > compressed. -072 -073 which means any back-reference may need to get rewritten as a -074 literal block unless we know the next block is at least of -075 length 5 and the sum of this block's length and offset and the -076 next block's length is at least twelve. -077 -078 */ -079 -080 private final LZ77Compressor compressor; -081 private final OutputStream os; -082 -083 // used in one-arg write method -084 private final byte[] oneByte = new byte[1]; -085 -086 private boolean finished = false; -087 -088 private Deque<Pair> pairs = new LinkedList<>(); -089 // keeps track of the last window-size bytes (64k) in order to be -090 // able to expand back-references when needed -091 private Deque<byte[]> expandedBlocks = new LinkedList<>(); -092 -093 /** -094 * Creates a new LZ4 output stream. -095 * -096 * @param os -097 * An OutputStream to read compressed data from -098 * -099 * @throws IOException if reading fails -100 */ -101 public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException { -102 this.os = os; -103 int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; -104 compressor = new LZ77Compressor(new Parameters(BlockLZ4CompressorInputStream.WINDOW_SIZE, -105 MIN_BACK_REFERENCE_LENGTH, maxLen, maxLen, maxLen), -106 new LZ77Compressor.Callback() { -107 public void accept(LZ77Compressor.Block block) throws IOException { -108 //System.err.println(block); -109 if (block instanceof LZ77Compressor.LiteralBlock) { -110 addLiteralBlock((LZ77Compressor.LiteralBlock) block); -111 } else if (block instanceof LZ77Compressor.BackReference) { -112 addBackReference((LZ77Compressor.BackReference) block); -113 } else if (block instanceof LZ77Compressor.EOD) { -114 writeFinalLiteralBlock(); -115 } -116 } -117 }); -118 } -119 -120 @Override -121 public void write(int b) throws IOException { -122 oneByte[0] = (byte) (b & 0xff); -123 write(oneByte); -124 } -125 -126 @Override -127 public void write(byte[] data, int off, int len) throws IOException { -128 compressor.compress(data, off, len); -129 } -130 -131 @Override -132 public void close() throws IOException { -133 finish(); -134 os.close(); -135 } -136 -137 /** -138 * Compresses all remaining data and writes it to the stream, -139 * doesn't close the underlying stream. -140 * @throws IOException if an error occurs -141 */ -142 public void finish() throws IOException { -143 if (!finished) { -144 compressor.finish(); -145 finished = true; -146 } -147 } -148 -149 private void addLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException { -150 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); -151 recordLiteral(last.addLiteral(block)); -152 clearUnusedBlocksAndPairs(); -153 } -154 -155 private void addBackReference(LZ77Compressor.BackReference block) throws IOException { -156 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); -157 last.setBackReference(block); -158 recordBackReference(block); -159 clearUnusedBlocksAndPairs(); -160 } -161 -162 private Pair writeBlocksAndReturnUnfinishedPair(int length) throws IOException { -163 writeWritablePairs(length); -164 Pair last = pairs.peekLast(); -165 if (last == null || last.hasBackReference()) { -166 last = new Pair(); -167 pairs.addLast(last); -168 } -169 return last; -170 } -171 -172 private void recordLiteral(byte[] b) { -173 expandedBlocks.addFirst(b); -174 } -175 -176 private void clearUnusedBlocksAndPairs() { -177 clearUnusedBlocks(); -178 clearUnusedPairs(); -179 } -180 -181 private void clearUnusedBlocks() { -182 int blockLengths = 0; -183 int blocksToKeep = 0; -184 for (byte[] b : expandedBlocks) { -185 blocksToKeep++; -186 blockLengths += b.length; -187 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { -188 break; -189 } -190 } -191 final int size = expandedBlocks.size(); -192 for (int i = blocksToKeep; i < size; i++) { -193 expandedBlocks.removeLast(); -194 } -195 } -196 -197 private void recordBackReference(LZ77Compressor.BackReference block) { -198 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); -199 } -200 -201 private byte[] expand(final int offset, final int length) { -202 byte[] expanded = new byte[length]; -203 if (offset == 1) { // surprisingly common special case -204 byte[] block = expandedBlocks.peekFirst(); -205 byte b = block[block.length - 1]; -206 if (b != 0) { // the fresh array contains 0s anyway -207 Arrays.fill(expanded, b); -208 } -209 } else { -210 expandFromList(expanded, offset, length); -211 } -212 return expanded; -213 } -214 -215 private void expandFromList(final byte[] expanded, int offset, int length) { -216 int offsetRemaining = offset; -217 int lengthRemaining = length; -218 int writeOffset = 0; -219 while (lengthRemaining > 0) { -220 // find block that contains offsetRemaining -221 byte[] block = null; -222 int copyLen, copyOffset; -223 if (offsetRemaining > 0) { -224 int blockOffset = 0; -225 for (byte[] b : expandedBlocks) { -226 if (b.length + blockOffset >= offsetRemaining) { -227 block = b; -228 break; -229 } -230 blockOffset += b.length; -231 } -232 copyOffset = blockOffset + block.length - offsetRemaining; -233 copyLen = Math.min(lengthRemaining, block.length - copyOffset); -234 } else { -235 // offsetRemaining is negative and points into the expanded bytes -236 block = expanded; -237 copyOffset = writeOffset + offsetRemaining; -238 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); -239 } -240 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); -241 offsetRemaining -= copyLen; -242 lengthRemaining -= copyLen; -243 writeOffset += copyLen; -244 } -245 } -246 -247 private void clearUnusedPairs() { -248 int pairLengths = 0; -249 int pairsToKeep = 0; -250 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { -251 Pair p = it.next(); -252 pairsToKeep++; -253 pairLengths += p.length(); -254 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { -255 break; -256 } -257 } -258 final int size = pairs.size(); -259 for (int i = pairsToKeep; i < size; i++) { -260 Pair p = pairs.peekFirst(); -261 if (p.hasBeenWritten()) { -262 pairs.removeFirst(); -263 } else { -264 break; -265 } -266 } -267 } -268 -269 private void writeFinalLiteralBlock() throws IOException { -270 rewriteLastPairs(); -271 for (Pair p : pairs) { -272 if (!p.hasBeenWritten()) { -273 p.writeTo(os); -274 } -275 } -276 pairs.clear(); -277 } -278 -279 private void writeWritablePairs(int lengthOfBlocksAfterLastPair) throws IOException { -280 int unwrittenLength = lengthOfBlocksAfterLastPair; -281 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { -282 Pair p = it.next(); -283 if (p.hasBeenWritten()) { -284 break; -285 } -286 unwrittenLength += p.length(); -287 } -288 for (Pair p : pairs) { -289 if (p.hasBeenWritten()) { -290 continue; -291 } -292 unwrittenLength -= p.length(); -293 if (p.canBeWritten(unwrittenLength)) { -294 p.writeTo(os); -295 } else { -296 break; -297 } -298 } -299 } -300 -301 private void rewriteLastPairs() { -302 LinkedList<Pair> lastPairs = new LinkedList<>(); -303 LinkedList<Integer> pairLength = new LinkedList<>(); -304 int offset = 0; -305 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { -306 Pair p = it.next(); -307 if (p.hasBeenWritten()) { -308 break; -309 } -310 int len = p.length(); -311 pairLength.addFirst(len); -312 lastPairs.addFirst(p); -313 offset += len; -314 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { -315 break; -316 } -317 } -318 for (Pair p : lastPairs) { -319 pairs.remove(p); +027 +028import org.apache.commons.compress.compressors.CompressorOutputStream; +029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; +030import org.apache.commons.compress.compressors.lz77support.Parameters; +031import org.apache.commons.compress.utils.ByteUtils; +032 +033/** +034 * CompressorOutputStream for the LZ4 block format. +035 * +036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> +037 * @since 1.14 +038 * @NotThreadSafe +039 */ +040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { +041 +042 private static final int MIN_BACK_REFERENCE_LENGTH = 4; +043 private static final int MIN_LENGTH_OF_LAST_LITERAL = 5; +044 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; +045 +046 /* +047 +048 The LZ4 block format has a few properties that make it less +049 straight-forward than one would hope: +050 +051 * literal blocks and back-references must come in pairs (except +052 for the very last literal block), so consecutive literal +053 blocks created by the compressor must be merged into a single +054 block. +055 +056 * the start of a literal/back-reference pair contains the length +057 of the back-reference (at least some part of it) so we can't +058 start writing the literal before we know how long the next +059 back-reference is going to be. +060 +061 * there are special rules for the final blocks +062 +063 > There are specific parsing rules to respect in order to remain +064 > compatible with assumptions made by the decoder : +065 > +066 > 1. The last 5 bytes are always literals +067 > +068 > 2. The last match must start at least 12 bytes before end of +069 > block. Consequently, a block with less than 13 bytes cannot be +070 > compressed. +071 +072 which means any back-reference may need to get rewritten as a +073 literal block unless we know the next block is at least of +074 length 5 and the sum of this block's length and offset and the +075 next block's length is at least twelve. +076 +077 */ +078 +079 private final LZ77Compressor compressor; +080 private final OutputStream os; +081 +082 // used in one-arg write method +083 private final byte[] oneByte = new byte[1]; +084 +085 private boolean finished = false; +086 +087 private Deque<Pair> pairs = new LinkedList<>(); +088 // keeps track of the last window-size bytes (64k) in order to be +089 // able to expand back-references when needed +090 private Deque<byte[]> expandedBlocks = new LinkedList<>(); +091 +092 /** +093 * Creates a new LZ4 output stream. +094 * +095 * @param os +096 * An OutputStream to read compressed data from +097 * +098 * @throws IOException if reading fails +099 */ +100 public BlockLZ4CompressorOutputStream(final OutputStream os) throws IOException { +101 this.os = os; +102 int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; +103 compressor = new LZ77Compressor(new Parameters(BlockLZ4CompressorInputStream.WINDOW_SIZE, +104 MIN_BACK_REFERENCE_LENGTH, maxLen, maxLen, maxLen), +105 new LZ77Compressor.Callback() { +106 public void accept(LZ77Compressor.Block block) throws IOException { +107 //System.err.println(block); +108 if (block instanceof LZ77Compressor.LiteralBlock) { +109 addLiteralBlock((LZ77Compressor.LiteralBlock) block); +110 } else if (block instanceof LZ77Compressor.BackReference) { +111 addBackReference((LZ77Compressor.BackReference) block); +112 } else if (block instanceof LZ77Compressor.EOD) { +113 writeFinalLiteralBlock(); +114 } +115 } +116 }); +117 } +118 +119 @Override +120 public void write(int b) throws IOException { +121 oneByte[0] = (byte) (b & 0xff); +122 write(oneByte); +123 } +124 +125 @Override +126 public void write(byte[] data, int off, int len) throws IOException { +127 compressor.compress(data, off, len); +128 } +129 +130 @Override +131 public void close() throws IOException { +132 finish(); +133 os.close(); +134 } +135 +136 /** +137 * Compresses all remaining data and writes it to the stream, +138 * doesn't close the underlying stream. +139 * @throws IOException if an error occurs +140 */ +141 public void finish() throws IOException { +142 if (!finished) { +143 compressor.finish(); +144 finished = true; +145 } +146 } +147 +148 private void addLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException { +149 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); +150 recordLiteral(last.addLiteral(block)); +151 clearUnusedBlocksAndPairs(); +152 } +153 +154 private void addBackReference(LZ77Compressor.BackReference block) throws IOException { +155 Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); +156 last.setBackReference(block); +157 recordBackReference(block); +158 clearUnusedBlocksAndPairs(); +159 } +160 +161 private Pair writeBlocksAndReturnUnfinishedPair(int length) throws IOException { +162 writeWritablePairs(length); +163 Pair last = pairs.peekLast(); +164 if (last == null || last.hasBackReference()) { +165 last = new Pair(); +166 pairs.addLast(last); +167 } +168 return last; +169 } +170 +171 private void recordLiteral(byte[] b) { +172 expandedBlocks.addFirst(b); +173 } +174 +175 private void clearUnusedBlocksAndPairs() { +176 clearUnusedBlocks(); +177 clearUnusedPairs(); +178 } +179 +180 private void clearUnusedBlocks() { +181 int blockLengths = 0; +182 int blocksToKeep = 0; +183 for (byte[] b : expandedBlocks) { +184 blocksToKeep++; +185 blockLengths += b.length; +186 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { +187 break; +188 } +189 } +190 final int size = expandedBlocks.size(); +191 for (int i = blocksToKeep; i < size; i++) { +192 expandedBlocks.removeLast(); +193 } +194 } +195 +196 private void recordBackReference(LZ77Compressor.BackReference block) { +197 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); +198 } +199 +200 private byte[] expand(final int offset, final int length) { +201 byte[] expanded = new byte[length]; +202 if (offset == 1) { // surprisingly common special case +203 byte[] block = expandedBlocks.peekFirst(); +204 byte b = block[block.length - 1]; +205 if (b != 0) { // the fresh array contains 0s anyway +206 Arrays.fill(expanded, b); +207 } +208 } else { +209 expandFromList(expanded, offset, length); +210 } +211 return expanded; +212 } +213 +214 private void expandFromList(final byte[] expanded, int offset, int length) { +215 int offsetRemaining = offset; +216 int lengthRemaining = length; +217 int writeOffset = 0; +218 while (lengthRemaining > 0) { +219 // find block that contains offsetRemaining +220 byte[] block = null; +221 int copyLen, copyOffset; +222 if (offsetRemaining > 0) { +223 int blockOffset = 0; +224 for (byte[] b : expandedBlocks) { +225 if (b.length + blockOffset >= offsetRemaining) { +226 block = b; +227 break; +228 } +229 blockOffset += b.length; +230 } +231 if (block == null) { +232 // should not be possible +233 throw new IllegalStateException("failed to find a block containing offset " + offset); +234 } +235 copyOffset = blockOffset + block.length - offsetRemaining; +236 copyLen = Math.min(lengthRemaining, block.length - copyOffset); +237 } else { +238 // offsetRemaining is negative and points into the expanded bytes +239 block = expanded; +240 copyOffset = writeOffset + offsetRemaining; +241 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); +242 } +243 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); +244 offsetRemaining -= copyLen; +245 lengthRemaining -= copyLen; +246 writeOffset += copyLen; +247 } +248 } +249 +250 private void clearUnusedPairs() { +251 int pairLengths = 0; +252 int pairsToKeep = 0; +253 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { +254 Pair p = it.next(); +255 pairsToKeep++; +256 pairLengths += p.length(); +257 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { +258 break; +259 } +260 } +261 final int size = pairs.size(); +262 for (int i = pairsToKeep; i < size; i++) { +263 Pair p = pairs.peekFirst(); +264 if (p.hasBeenWritten()) { +265 pairs.removeFirst(); +266 } else { +267 break; +268 } +269 } +270 } +271 +272 private void writeFinalLiteralBlock() throws IOException { +273 rewriteLastPairs(); +274 for (Pair p : pairs) { +275 if (!p.hasBeenWritten()) { +276 p.writeTo(os); +277 } +278 } +279 pairs.clear(); +280 } +281 +282 private void writeWritablePairs(int lengthOfBlocksAfterLastPair) throws IOException { +283 int unwrittenLength = lengthOfBlocksAfterLastPair; +284 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { +285 Pair p = it.next(); +286 if (p.hasBeenWritten()) { +287 break; +288 } +289 unwrittenLength += p.length(); +290 } +291 for (Pair p : pairs) { +292 if (p.hasBeenWritten()) { +293 continue; +294 } +295 unwrittenLength -= p.length(); +296 if (p.canBeWritten(unwrittenLength)) { +297 p.writeTo(os); +298 } else { +299 break; +300 } +301 } +302 } +303 +304 private void rewriteLastPairs() { +305 LinkedList<Pair> lastPairs = new LinkedList<>(); +306 LinkedList<Integer> pairLength = new LinkedList<>(); +307 int offset = 0; +308 for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { +309 Pair p = it.next(); +310 if (p.hasBeenWritten()) { +311 break; +312 } +313 int len = p.length(); +314 pairLength.addFirst(len); +315 lastPairs.addFirst(p); +316 offset += len; +317 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { +318 break; +319 } 320 } -321 // lastPairs may contain between one and four Pairs: -322 // * the last pair may be a one byte literal -323 // * all other Pairs contain a back-reference which must be four bytes long at minimum -324 // we could merge them all into a single literal block but -325 // this may harm compression. For example compressing -326 // "bla.tar" from our tests yields a last block containing a -327 // back-reference of length > 2k and we'd end up with a last -328 // literal of that size rather than a 2k back-reference and a -329 // 12 byte literal at the end. -330 -331 // Instead we merge all but the first of lastPairs into a new -332 // literal-only Pair "replacement" and look at the -333 // back-reference in the first of lastPairs and see if we can -334 // split it. We can split it if it is longer than 16 - -335 // replacement.length (i.e. the minimal length of four is kept -336 // while making sure the last literal is at least twelve bytes -337 // long). If we can't split it, we expand the first of the pairs -338 // as well. -339 -340 // this is not optimal, we could get better compression -341 // results with more complex approaches as the last literal -342 // only needs to be five bytes long if the previous -343 // back-reference has an offset big enough -344 -345 final int lastPairsSize = lastPairs.size(); -346 int toExpand = 0; -347 for (int i = 1; i < lastPairsSize; i++) { -348 toExpand += pairLength.get(i); -349 } -350 Pair replacement = new Pair(); -351 if (toExpand > 0) { -352 replacement.prependLiteral(expand(toExpand, toExpand)); -353 } -354 Pair splitCandidate = lastPairs.get(0); -355 int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; -356 if (splitCandidate.hasBackReference() -357 && splitCandidate.backReferenceLength() >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { -358 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); -359 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(splitCandidate.backReferenceLength() -360 - stillNeeded)); -361 } else { -362 if (splitCandidate.hasBackReference()) { -363 int brLen = splitCandidate.backReferenceLength(); -364 replacement.prependLiteral(expand(toExpand + brLen, brLen)); -365 } -366 splitCandidate.prependTo(replacement); -367 } -368 pairs.add(replacement); -369 } -370 -371 final static class Pair { -372 private final Deque<byte[]> literals = new LinkedList<>(); -373 private int brOffset, brLength; -374 private boolean written; -375 -376 private void prependLiteral(byte[] data) { -377 literals.addFirst(data); -378 } -379 byte[] addLiteral(LZ77Compressor.LiteralBlock block) { -380 byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), -381 block.getOffset() + block.getLength()); -382 literals.add(copy); -383 return copy; -384 } -385 void setBackReference(LZ77Compressor.BackReference block) { -386 if (hasBackReference()) { -387 throw new IllegalStateException(); -388 } -389 brOffset = block.getOffset(); -390 brLength = block.getLength(); -391 } -392 boolean hasBackReference() { -393 return brOffset > 0; +321 for (Pair p : lastPairs) { +322 pairs.remove(p); +323 } +324 // lastPairs may contain between one and four Pairs: +325 // * the last pair may be a one byte literal +326 // * all other Pairs contain a back-reference which must be four bytes long at minimum +327 // we could merge them all into a single literal block but +328 // this may harm compression. For example compressing +329 // "bla.tar" from our tests yields a last block containing a +330 // back-reference of length > 2k and we'd end up with a last +331 // literal of that size rather than a 2k back-reference and a +332 // 12 byte literal at the end. +333 +334 // Instead we merge all but the first of lastPairs into a new +335 // literal-only Pair "replacement" and look at the +336 // back-reference in the first of lastPairs and see if we can +337 // split it. We can split it if it is longer than 16 - +338 // replacement.length (i.e. the minimal length of four is kept +339 // while making sure the last literal is at least twelve bytes +340 // long). If we can't split it, we expand the first of the pairs +341 // as well. +342 +343 // this is not optimal, we could get better compression +344 // results with more complex approaches as the last literal +345 // only needs to be five bytes long if the previous +346 // back-reference has an offset big enough +347 +348 final int lastPairsSize = lastPairs.size(); +349 int toExpand = 0; +350 for (int i = 1; i < lastPairsSize; i++) { +351 toExpand += pairLength.get(i); +352 } +353 Pair replacement = new Pair(); +354 if (toExpand > 0) { +355 replacement.prependLiteral(expand(toExpand, toExpand)); +356 } +357 Pair splitCandidate = lastPairs.get(0); +358 int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; +359 if (splitCandidate.hasBackReference() +360 && splitCandidate.backReferenceLength() >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { +361 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); +362 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(splitCandidate.backReferenceLength() +363 - stillNeeded)); +364 } else { +365 if (splitCandidate.hasBackReference()) { +366 int brLen = splitCandidate.backReferenceLength(); +367 replacement.prependLiteral(expand(toExpand + brLen, brLen)); +368 } +369 splitCandidate.prependTo(replacement); +370 } +371 pairs.add(replacement); +372 } +373 +374 final static class Pair { +375 private final Deque<byte[]> literals = new LinkedList<>(); +376 private int brOffset, brLength; +377 private boolean written; +378 +379 private void prependLiteral(byte[] data) { +380 literals.addFirst(data); +381 } +382 byte[] addLiteral(LZ77Compressor.LiteralBlock block) { +383 byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), +384 block.getOffset() + block.getLength()); +385 literals.add(copy); +386 return copy; +387 } +388 void setBackReference(LZ77Compressor.BackReference block) { +389 if (hasBackReference()) { +390 throw new IllegalStateException(); +391 } +392 brOffset = block.getOffset(); +393 brLength = block.getLength(); 394 } -395 boolean canBeWritten(int lengthOfBlocksAfterThisPair) { -396 return hasBackReference() -397 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; -398 } -399 int length() { -400 return literalLength() + brLength; +395 boolean hasBackReference() { +396 return brOffset > 0; +397 } +398 boolean canBeWritten(int lengthOfBlocksAfterThisPair) { +399 return hasBackReference() +400 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 401 } -402 private boolean hasBeenWritten() { -403 return written; +402 int length() { +403 return literalLength() + brLength; 404 } -405 void writeTo(OutputStream out) throws IOException { -406 int litLength = literalLength(); -407 out.write(lengths(litLength, brLength)); -408 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { -409 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); -410 } -411 for (byte[] b : literals) { -412 out.write(b); +405 private boolean hasBeenWritten() { +406 return written; +407 } +408 void writeTo(OutputStream out) throws IOException { +409 int litLength = literalLength(); +410 out.write(lengths(litLength, brLength)); +411 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { +412 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 413 } -414 if (hasBackReference()) { -415 ByteUtils.toLittleEndian(out, brOffset, 2); -416 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { -417 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH -418 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); -419 } -420 } -421 written = true; -422 } -423 private int literalLength() { -424 int length = 0; -425 for (byte[] b : literals) { -426 length += b.length; -427 } -428 return length; -429 } -430 private static int lengths(int litLength, int brLength) { -431 int l = litLength < 15 ? litLength : 15; -432 int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); -433 return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; -434 } -435 private static void writeLength(int length, OutputStream out) throws IOException { -436 while (length >= 255) { -437 out.write(255); -438 length -= 255; -439 } -440 out.write(length); -441 } -442 private int backReferenceLength() { -443 return brLength; +414 for (byte[] b : literals) { +415 out.write(b); +416 } +417 if (hasBackReference()) { +418 ByteUtils.toLittleEndian(out, brOffset, 2); +419 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { +420 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH +421 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); +422 } +423 } +424 written = true; +425 } +426 private int literalLength() { +427 int length = 0; +428 for (byte[] b : literals) { +429 length += b.length; +430 } +431 return length; +432 } +433 private static int lengths(int litLength, int brLength) { +434 int l = litLength < 15 ? litLength : 15; +435 int br = brLength < 4 ? 0 : (brLength < 19 ? brLength - 4 : 15); +436 return (l << BlockLZ4CompressorInputStream.SIZE_BITS) | br; +437 } +438 private static void writeLength(int length, OutputStream out) throws IOException { +439 while (length >= 255) { +440 out.write(255); +441 length -= 255; +442 } +443 out.write(length); 444 } -445 private void prependTo(Pair other) { -446 Iterator<byte[]> litsBackwards = literals.descendingIterator(); -447 while (litsBackwards.hasNext()) { -448 other.prependLiteral(litsBackwards.next()); -449 } -450 } -451 private Pair splitWithNewBackReferenceLengthOf(int newBackReferenceLength) { -452 Pair p = new Pair(); -453 p.literals.addAll(literals); -454 p.brOffset = brOffset; -455 p.brLength = newBackReferenceLength; -456 return p; -457 } -458 } -459} +445 private int backReferenceLength() { +446 return brLength; +447 } +448 private void prependTo(Pair other) { +449 Iterator<byte[]> litsBackwards = literals.descendingIterator(); +450 while (litsBackwards.hasNext()) { +451 other.prependLiteral(litsBackwards.next()); +452 } +453 } +454 private Pair splitWithNewBackReferenceLengthOf(int newBackReferenceLength) { +455 Pair p = new Pair(); +456 p.literals.addAll(literals); +457 p.brOffset = brOffset; +458 p.brLength = newBackReferenceLength; +459 return p; +460 } +461 } +462}