commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bode...@apache.org
Subject commons-compress git commit: COMPRESS-271 implement write support for LZ4 block format
Date Sat, 21 Jan 2017 20:50:43 GMT
Repository: commons-compress
Updated Branches:
  refs/heads/master 1845f0897 -> 1a40c3b89


COMPRESS-271 implement write support for LZ4 block format

needs more tests to cover the different cases of "rewriting the last
blocks"


Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/1a40c3b8
Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/1a40c3b8
Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/1a40c3b8

Branch: refs/heads/master
Commit: 1a40c3b89a31e17583bc6be42e95dc68aa61381d
Parents: 1845f08
Author: Stefan Bodewig <bodewig@apache.org>
Authored: Sat Jan 21 21:49:41 2017 +0100
Committer: Stefan Bodewig <bodewig@apache.org>
Committed: Sat Jan 21 21:49:41 2017 +0100

----------------------------------------------------------------------
 .../compressors/CompressorStreamFactory.java    |  12 +-
 .../lz4/BlockLZ4CompressorOutputStream.java     | 197 ++++++++++++++++++-
 .../lz4/BlockLZ4CompressorRoundtripTest.java    |  72 +++++++
 3 files changed, 272 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1a40c3b8/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java
b/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java
index 85709af..ba98ff9 100644
--- a/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java
+++ b/src/main/java/org/apache/commons/compress/compressors/CompressorStreamFactory.java
@@ -38,6 +38,7 @@ import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputSt
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
 import org.apache.commons.compress.compressors.lz4.BlockLZ4CompressorInputStream;
+import org.apache.commons.compress.compressors.lz4.BlockLZ4CompressorOutputStream;
 import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
 import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream;
 import org.apache.commons.compress.compressors.lzma.LZMAUtils;
@@ -511,7 +512,8 @@ public class CompressorStreamFactory implements CompressorStreamProvider
{
      * 
      * @param name
      *            the compressor name, i.e. {@value #GZIP}, {@value #BZIP2},
-     *            {@value #XZ}, {@value #PACK200}, {@value SNAPPY_FRAMED}
+     *            {@value #XZ}, {@value #PACK200}, {@value SNAPPY_FRAMED},
+     *            {@value LZ4_BLOCK}
      *            or {@value #DEFLATE}
      * @param out
      *            the output stream
@@ -558,6 +560,10 @@ public class CompressorStreamFactory implements CompressorStreamProvider
{
                 return new FramedSnappyCompressorOutputStream(out);
             }
 
+            if (LZ4_BLOCK.equalsIgnoreCase(name)) {
+                return new BlockLZ4CompressorOutputStream(out);
+            }
+
         } catch (final IOException e) {
             throw new CompressorException("Could not create CompressorOutputStream", e);
         }
@@ -595,12 +601,12 @@ public class CompressorStreamFactory implements CompressorStreamProvider
{
 
     @Override
     public Set<String> getInputStreamCompressorNames() {
-        return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, SNAPPY_RAW, SNAPPY_FRAMED,
Z, DEFLATE);
+        return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, SNAPPY_RAW, SNAPPY_FRAMED,
Z, DEFLATE, LZ4_BLOCK);
     }
 
     @Override
     public Set<String> getOutputStreamCompressorNames() {
-        return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, DEFLATE, SNAPPY_FRAMED);
+        return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, DEFLATE, SNAPPY_FRAMED, LZ4_BLOCK);
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1a40c3b8/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.java
b/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.java
index 856d9b3..c844b24 100644
--- a/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.java
+++ b/src/main/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorOutputStream.java
@@ -85,6 +85,9 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
     private boolean finished = false;
 
     private Deque<Pair> pairs = new LinkedList<>();
+    // keeps track of the last window-size bytes (64k) in order to be
+    // able to expand back-references when needed
+    private Deque<byte[]> expandedBlocks = new LinkedList<>();
 
     /**
      * Creates a new LZ4 output stream.
@@ -144,12 +147,15 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
 
     private void addLiteralBlock(LZ77Compressor.LiteralBlock block) throws IOException {
         Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
-        last.addLiteral(block);
+        recordLiteral(last.addLiteral(block));
+        clearUnusedBlocksAndPairs();
     }
 
     private void addBackReference(LZ77Compressor.BackReference block) throws IOException
{
         Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
         last.setBackReference(block);
+        recordBackReference(block);
+        clearUnusedBlocksAndPairs();
     }
 
     private Pair writeBlocksAndReturnUnfinishedPair(int length) throws IOException {
@@ -162,6 +168,106 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
         return last;
     }
 
+    private void recordLiteral(byte[] b) {
+        expandedBlocks.addFirst(b);
+    }
+
+    private void clearUnusedBlocksAndPairs() {
+        clearUnusedBlocks();
+        clearUnusedPairs();
+    }
+
+    private void clearUnusedBlocks() {
+        int blockLengths = 0;
+        int blocksToKeep = 0;
+        for (byte[] b : expandedBlocks) {
+            blocksToKeep++;
+            blockLengths += b.length;
+            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
+                break;
+            }
+        }
+        final int size = expandedBlocks.size();
+        for (int i = blocksToKeep; i < size; i++) {
+            expandedBlocks.removeLast();
+        }
+    }
+
+    private void recordBackReference(LZ77Compressor.BackReference block) {
+        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
+    }
+
+    private byte[] expand(final int offset, final int length) {
+        byte[] expanded = new byte[length];
+        if (offset == 1) { // surprisingly common special case
+            byte[] block = expandedBlocks.peekFirst();
+            byte b = block[block.length - 1];
+            for (int i = 0; i < expanded.length; i++) {
+                expanded[i] = b;
+            }
+        } else {
+            expandFromList(expanded, offset, length);
+        }
+        return expanded;
+    }
+
+    private void expandFromList(final byte[] expanded, int offset, int length) {
+        int offsetRemaining = offset;
+        int lengthRemaining = length;
+        int writeOffset = 0;
+        while (lengthRemaining > 0) {
+            // find block that contains offsetRemaining
+            byte[] block = null;
+            int copyLen, copyOffset;
+            if (offsetRemaining > 0) {
+                int blockOffset = 0;
+                for (byte[] b : expandedBlocks) {
+                    if (b.length + blockOffset >= offsetRemaining) {
+                        block = b;
+                        break;
+                    }
+                    blockOffset += b.length;
+                }
+                copyOffset = blockOffset + block.length - offsetRemaining;
+                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
+            } else {
+                // offsetRemaining is negative and points into the expanded bytes
+                block = expanded;
+                copyOffset = writeOffset  + offsetRemaining;
+                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
+            }
+            if (copyLen < 1) {
+                throw new IllegalStateException("zero copy");
+            }
+            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
+            offsetRemaining -= copyLen;
+            lengthRemaining -= copyLen;
+            writeOffset += copyLen;
+        }
+    }
+
+    private void clearUnusedPairs() {
+        int pairLengths = 0;
+        int pairsToKeep = 0;
+        for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
+            Pair p = it.next();
+            pairsToKeep++;
+            pairLengths += p.length();
+            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
+                break;
+            }
+        }
+        final int size = pairs.size();
+        for (int i = pairsToKeep; i < size; i++) {
+            Pair p = pairs.peekFirst();
+            if (p.hasBeenWritten()) {
+                pairs.removeFirst();
+            } else {
+                break;
+            }
+        }
+    }
+
     private void writeFinalLiteralBlock() throws IOException {
         rewriteLastPairs();
         for (Pair p : pairs) {
@@ -195,16 +301,85 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
     }
 
     private void rewriteLastPairs() {
+        LinkedList<Pair> lastPairs = new LinkedList<>();
+        LinkedList<Integer> pairLength = new LinkedList<>();
+        int offset = 0;
+        for (Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
+            Pair p = it.next();
+            if (p.hasBeenWritten()) {
+                break;
+            }
+            int len = p.length();
+            pairLength.addFirst(len);
+            lastPairs.addFirst(p);
+            offset += len;
+            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
+                break;
+            }
+        }
+        for (Pair p : lastPairs) {
+            pairs.remove(p);
+        }
+        // lastPairs may contain between one and four Pairs:
+        // * the last pair may be a one byte literal
+        // * all other Pairs contain a back-reference which must be four bytes long at minimum
+        // we could merge them all into a single literal block but
+        // this may harm compression. For example compressing
+        // "bla.tar" from our tests yields a last block containing a
+        // back-reference of length > 2k and we'd end up with a last
+        // literal of that size rather than a 2k back-reference and a
+        // 12 byte literal at the end.
+
+        // Instead we merge all but the first of lastPairs into a new
+        // literal-only Pair "replacement" and look at the
+        // back-reference in the first of lastPairs and see if we can
+        // split it. We can split it if it is longer than 16 -
+        // replacement.length (i.e. the minimal length of four is kept
+        // while making sure the last literal is at least twelve bytes
+        // long). If we can't split it, we expand the first of the pairs
+        // as well.
+
+        // this is not optimal, we could get better compression
+        // results with more complex approaches as the last literal
+        // only needs to be five bytes long if the previous
+        // back-reference has an offset big enough
+
+        final int allButFirstOfLastPairs = lastPairs.size() - 1;
+        int toExpand = 0;
+        for (int i = 1; i < allButFirstOfLastPairs; i++) {
+            toExpand += pairLength.get(i);
+        }
+        Pair replacement = new Pair();
+        if (toExpand > 0) {
+            replacement.prependLiteral(expand(toExpand, toExpand));
+        }
+        Pair splitCandidate = lastPairs.get(0);
+        int splitLen = splitCandidate.length();
+        int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
+        if (splitCandidate.hasBackReference()
+            && splitCandidate.backReferenceLength() > MIN_BACK_REFERENCE_LENGTH
+ stillNeeded) {
+            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
+            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(splitCandidate.backReferenceLength()
+                - stillNeeded));
+        } else {
+            replacement.prependLiteral(expand(toExpand + splitLen, splitLen));
+        }
+        pairs.add(replacement);
     }
 
     final static class Pair {
-        private final List<byte[]> literals = new LinkedList<>();
+        private final Deque<byte[]> literals = new LinkedList<>();
         private int brOffset, brLength;
         private boolean written;
 
-        void addLiteral(LZ77Compressor.LiteralBlock block) {
-            literals.add(Arrays.copyOfRange(block.getData(), block.getOffset(),
-                block.getOffset() + block.getLength()));
+        private void prependLiteral(byte[] data) {
+            literals.addFirst(data);
+        }
+        byte[] addLiteral(LZ77Compressor.LiteralBlock block) {
+            byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
+                block.getOffset() + block.getLength());
+            literals.add(copy);
+            return copy;
         }
         void setBackReference(LZ77Compressor.BackReference block) {
             if (hasBackReference()) {
@@ -224,7 +399,7 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
         int length() {
             return literalLength() + brLength;
         }
-        boolean hasBeenWritten() {
+        private boolean hasBeenWritten() {
             return written;
         }
         void writeTo(OutputStream out) throws IOException {
@@ -264,5 +439,15 @@ public class BlockLZ4CompressorOutputStream extends CompressorOutputStream
{
             }
             out.write(length);
         }
+        private int backReferenceLength() {
+            return brLength;
+        }
+        Pair splitWithNewBackReferenceLengthOf(int newBackReferenceLength) {
+            Pair p = new Pair();
+            p.literals.addAll(literals);
+            p.brOffset = brOffset;
+            p.brLength = newBackReferenceLength;
+            return p;
+        }
     }
 }

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1a40c3b8/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorRoundtripTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorRoundtripTest.java
b/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorRoundtripTest.java
new file mode 100644
index 0000000..c6fd6ce
--- /dev/null
+++ b/src/test/java/org/apache/commons/compress/compressors/lz4/BlockLZ4CompressorRoundtripTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.commons.compress.compressors.lz4;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Random;
+import org.apache.commons.compress.AbstractTestCase;
+import org.apache.commons.compress.compressors.lz77support.Parameters;
+import org.apache.commons.compress.utils.IOUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+public final class BlockLZ4CompressorRoundtripTest extends AbstractTestCase {
+
+    private void roundTripTest(String testFile) throws IOException {
+        File input = getFile(testFile);
+        long start = System.currentTimeMillis();
+        final File outputSz = new File(dir, input.getName() + ".block.lz4");
+        try (FileInputStream is = new FileInputStream(input);
+             FileOutputStream os = new FileOutputStream(outputSz);
+             BlockLZ4CompressorOutputStream los = new BlockLZ4CompressorOutputStream(os))
{
+            IOUtils.copy(is, los);
+        }
+        System.err.println(input.getName() + " written, uncompressed bytes: " + input.length()
+            + ", compressed bytes: " + outputSz.length() + " after " + (System.currentTimeMillis()
- start) + "ms");
+        start = System.currentTimeMillis();
+        try (FileInputStream is = new FileInputStream(input);
+             BlockLZ4CompressorInputStream sis = new BlockLZ4CompressorInputStream(new FileInputStream(outputSz)))
{
+            byte[] expected = IOUtils.toByteArray(is);
+            byte[] actual = IOUtils.toByteArray(sis);
+            Assert.assertArrayEquals(expected, actual);
+        }
+        System.err.println(outputSz.getName() + " read after " + (System.currentTimeMillis()
- start) + "ms");
+    }
+
+    // should yield decent compression
+    @Test
+    public void blaTarRoundtrip() throws IOException {
+        roundTripTest("bla.tar");
+    }
+
+    // yields no compression at all
+    @Test
+    public void gzippedLoremIpsumRoundtrip() throws IOException {
+        roundTripTest("lorem-ipsum.txt.gz");
+    }
+
+    @Test
+    public void biggerFileRoundtrip() throws IOException {
+        roundTripTest("COMPRESS-256.7z");
+    }
+
+}


Mime
View raw message