commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bode...@apache.org
Subject [1/2] commons-compress git commit: implement back-reference detection
Date Sat, 07 Jan 2017 17:10:40 GMT
Repository: commons-compress
Updated Branches:
  refs/heads/master 1c6d40067 -> eba864c6f


implement back-reference detection


Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/aa81e37f
Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/aa81e37f
Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/aa81e37f

Branch: refs/heads/master
Commit: aa81e37f23e03b8aec4bc287bc84778c2ffc86c3
Parents: 1c6d400
Author: Stefan Bodewig <bodewig@apache.org>
Authored: Sat Jan 7 18:05:10 2017 +0100
Committer: Stefan Bodewig <bodewig@apache.org>
Committed: Sat Jan 7 18:05:10 2017 +0100

----------------------------------------------------------------------
 .../compressors/lz77support/LZ77Compressor.java |  68 +++++--
 .../lz77support/LZ77CompressorTest.java         | 195 +++++++++++++++----
 2 files changed, 209 insertions(+), 54 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-compress/blob/aa81e37f/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
index e4afdf1..2a6fe3d 100644
--- a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
+++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
@@ -207,6 +207,8 @@ public class LZ77Compressor {
     private int blockStart = 0;
     // position of the current match
     private int matchStart = NO_MATCH;
+    // number of insertString calls for the up to three last bytes of the last match
+    private int missedInserts = 0;
 
     /**
      * Initializes a compressor with parameters and a callback.
@@ -341,8 +343,9 @@ public class LZ77Compressor {
         final int minMatch = params.getMinMatchSize();
 
         while (lookahead >= minMatch) {
+            catchUpMissedInserts();
             int matchLength = 0;
-            int hashHead = insertString();
+            int hashHead = insertString(currentPosition);
             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset())
{
                 // sets matchStart as a side effect
                 matchLength = longestMatch(hashHead);
@@ -353,14 +356,10 @@ public class LZ77Compressor {
                     flushLiteralBlock();
                     blockStart = NO_MATCH;
                 }
-                lookahead -= matchLength;
-                // inserts strings contained in current match
-                for (int i = 0; i < matchLength - 1; i++) {
-                    currentPosition++;
-                    insertString();
-                }
-                currentPosition++;
                 flushBackReference(matchLength);
+                insertStringsInMatch(matchLength);
+                lookahead -= matchLength;
+                currentPosition += matchLength;
                 blockStart = currentPosition;
             } else {
                 // no match, append to current or start a new literal
@@ -381,23 +380,66 @@ public class LZ77Compressor {
      * <p>Updates <code>insertHash</code> and <code>prev</code>
as a
      * side effect.</p>
      */
-    private int insertString() {
-        insertHash = nextHash(insertHash, window[currentPosition -1 + NUMBER_OF_BYTES_IN_HASH]);
+    private int insertString(int pos) {
+        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
         int hashHead = head[insertHash];
         prev[currentPosition & wMask] = hashHead;
-        head[insertHash] = currentPosition;
+        head[insertHash] = pos;
         return hashHead;
     }
 
+    private void insertStringsInMatch(int matchLength) {
+        // inserts strings contained in current match
+        // insertString inserts the byte 2 bytes after position, which may not yet be available
-> missedInserts
+        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
+        // currentPosition has been inserted already
+        for (int i = 1; i <= stop; i++) {
+            insertString(currentPosition + i);
+        }
+        missedInserts = matchLength - stop - 1;
+    }
+
+    private void catchUpMissedInserts() {
+        while (missedInserts > 0) {
+            insertString(currentPosition - missedInserts--);
+        }
+    }
+
     private void flushBackReference(int matchLength) {
-        callback.accept(new BackReference(matchStart, matchLength));
+        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
     }
 
     private void flushLiteralBlock() {
         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
     }
 
+    /**
+     * Searches the hash chain for real matches and returns the length
+     * of the longest match (0 if none were found) that isn't too far
+     * away (WRT maxOffset).
+     *
+     * <p>Sets matchStart to the index of the start position of the
+     * longest match as a side effect.</p>
+     */
     private int longestMatch(int matchHead) {
-        return 0;
+        final int minLength = params.getMinMatchSize();
+        int longestMatchLength = minLength - 1;
+        final int maxPossibleLength = Math.min(params.getMaxMatchSize(), lookahead);
+        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
+        while (matchHead >= minIndex) {
+            int currentLength = 0;
+            for (int i = 0; i < maxPossibleLength; i++) {
+                if (window[matchHead + i] != window[currentPosition + i]) {
+                    break;
+                }
+                currentLength++;
+            }
+            if (currentLength > longestMatchLength) {
+                longestMatchLength = currentLength;
+                matchStart = matchHead;
+            }
+            matchHead = prev[matchHead & wMask];
+        }
+        return longestMatchLength; // < minLength if no matches have been found, will
be ignored in compress()
     }
 }

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/aa81e37f/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
index 202c5da..049a085 100644
--- a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
+++ b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
@@ -18,6 +18,7 @@
  */
 package org.apache.commons.compress.compressors.lz77support;
 
+import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -29,11 +30,46 @@ import static org.junit.Assert.assertEquals;
 
 public class LZ77CompressorTest {
 
+    private static final byte[] BLA, SAM, ONE_TO_TEN;
+
+    static {
+        try {
+            /*
+             * Example from "An Explanation of the Deflate Algorithm" by "Antaeus Feldspar".
+             * @see "http://zlib.net/feldspar.html"
+             */
+            BLA = "Blah blah blah blah blah!".getBytes("ASCII");
+
+            /*
+             * Example from Wikipedia article about LZSS.
+             * Note the example uses indices instead of offsets.
+             * @see "https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Storer%E2%80%93Szymanski"
+             */
+            SAM = ("I am Sam\n"
+                   + "\n"
+                   + "Sam I am\n"
+                   + "\n"
+                   + "That Sam-I-am!\n"
+                   + "That Sam-I-am!\n"
+                   + "I do not like\n"
+                   + "that Sam-I-am!\n"
+                   + "\n"
+                   + "Do you like green eggs and ham?\n"
+                   + "\n"
+                   + "I do not like them, Sam-I-am.\n"
+                   + "I do not like green eggs and ham.").getBytes("ASCII");
+        } catch (UnsupportedEncodingException ex) {
+            throw new RuntimeException("ASCII not supported");
+        }
+        ONE_TO_TEN = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    }
+
     private List<LZ77Compressor.Block> compress(Parameters params, byte[]... chunks)
{
         final List<LZ77Compressor.Block> blocks = new ArrayList<>();
         LZ77Compressor c = new LZ77Compressor(params, new LZ77Compressor.Callback() {
                 @Override
                 public void accept(LZ77Compressor.Block block) {
+                    //System.err.println(block);
                     if (block instanceof LZ77Compressor.LiteralBlock) {
                         // replace with a real copy of data so tests
                         // can see the results as they've been when
@@ -56,59 +92,136 @@ public class LZ77CompressorTest {
 
     @Test
     public void nonCompressableWithLengthSmallerThanLiteralMax() {
-        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
-        List<LZ77Compressor.Block> blocks = compress(new Parameters(128), data);
-        assertEquals(2, blocks.size());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass());
-        assertArrayEquals(data, ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
-        assertEquals(LZ77Compressor.EOD.class, blocks.get(1).getClass());
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(128), ONE_TO_TEN);
+        assertSize(2, blocks);
+        assertLiteralBlock(ONE_TO_TEN, blocks.get(0));
     }
 
     @Test
     public void nonCompressableWithLengthGreaterThanLiteralMaxButLessThanTwiceWindowSize()
{
-        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
-        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), data);
-        assertEquals(3, blocks.size());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass());
-        assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass());
-        assertArrayEquals(new byte[] { 9, 10 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
-        assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass());
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), ONE_TO_TEN);
+        assertSize(3, blocks);
+        assertLiteralBlock(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, blocks.get(0));
+        assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(1));
     }
 
     @Test
     public void nonCompressableWithLengthThatForcesWindowSlide() {
-        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
-        List<LZ77Compressor.Block> blocks = compress(new Parameters(4), data);
-        assertEquals(4, blocks.size());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass());
-        assertArrayEquals(new byte[] { 1, 2, 3, 4 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass());
-        assertArrayEquals(new byte[] { 5, 6, 7, 8 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(2).getClass());
-        assertArrayEquals(new byte[] { 9, 10 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(2)).getData());
-        assertEquals(LZ77Compressor.EOD.class, blocks.get(3).getClass());
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(4), ONE_TO_TEN);
+        assertSize(4, blocks);
+        assertLiteralBlock(new byte[] { 1, 2, 3, 4, }, blocks.get(0));
+        assertLiteralBlock(new byte[] { 5, 6, 7, 8 }, blocks.get(1));
+        assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(2));
     }
 
     @Test
     public void nonCompressableSentAsSingleBytes() {
-        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), new byte[][]
{
-                { 1 }, { 2 }, { 3 } , { 4 }, { 5 },
-                { 6 }, { 7 }, { 8 } , { 9 }, { 10 },
-            });
-        assertEquals(3, blocks.size());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass());
-        assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
-        assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass());
-        assertArrayEquals(new byte[] { 9, 10 },
-            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
-        assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass());
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), stagger(ONE_TO_TEN));
+        assertSize(3, blocks);
+        assertLiteralBlock(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, blocks.get(0));
+        assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(1));
+    }
+
+    @Test
+    public void blaExampleWithFullArrayAvailableForCompression()
+        throws UnsupportedEncodingException {
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(128), BLA);
+        assertSize(4, blocks);
+        assertLiteralBlock("Blah b", blocks.get(0));
+        assertBackReference(5, 18, blocks.get(1));
+        assertLiteralBlock("!", blocks.get(2));
+    }
+
+    @Test
+    public void blaExampleWithShorterMatchLength() throws UnsupportedEncodingException {
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(128, 3, 5, 0, 0),
BLA);
+        assertSize(7, blocks);
+        assertLiteralBlock("Blah b", blocks.get(0));
+        assertBackReference(5, 5, blocks.get(1));
+        assertBackReference(5, 5, blocks.get(2));
+        assertBackReference(5, 5, blocks.get(3));
+        assertBackReference(5, 3, blocks.get(4));
+        assertLiteralBlock("!", blocks.get(5));
     }
 
+    @Test
+    public void blaExampleSmallerWindowSize() throws UnsupportedEncodingException {
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), BLA);
+        assertSize(5, blocks);
+        assertLiteralBlock("Blah b", blocks.get(0));
+        assertEquals(LZ77Compressor.BackReference.class, blocks.get(1).getClass());
+        assertBackReference(5, 8, blocks.get(1));
+        assertBackReference(5, 8, blocks.get(2));
+        assertLiteralBlock("ah!", blocks.get(3));
+    }
+
+    @Test
+    public void blaExampleWithSingleByteWrites() throws UnsupportedEncodingException {
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(128), stagger(BLA));
+        assertEquals(9, blocks.size());
+        assertLiteralBlock("Blah b", blocks.get(0));
+        assertBackReference(5, 3, blocks.get(1));
+        assertBackReference(5, 3, blocks.get(2));
+        assertBackReference(5, 3, blocks.get(3));
+        assertBackReference(5, 3, blocks.get(4));
+        assertBackReference(5, 3, blocks.get(5));
+        assertBackReference(5, 3, blocks.get(6));
+        assertLiteralBlock("!", blocks.get(7));
+    }
+
+    @Test
+    public void samIAmExampleWithFullArrayAvailableForCompression() throws UnsupportedEncodingException
{
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(1024), SAM);
+        assertEquals(21, blocks.size());
+        assertLiteralBlock("I am Sam\n\n", blocks.get(0));
+        assertBackReference(5, 3, blocks.get(1));
+        assertLiteralBlock(" ", blocks.get(2));
+        assertBackReference(14, 4, blocks.get(3));
+        assertLiteralBlock("\n\nThat", blocks.get(4));
+        assertBackReference(20, 4, blocks.get(5));
+        assertLiteralBlock("-I-am!", blocks.get(6));
+        assertBackReference(15, 16, blocks.get(7));
+        assertLiteralBlock("I do not like\nt", blocks.get(8));
+        assertBackReference(29, 14, blocks.get(9));
+        assertLiteralBlock("\nDo you", blocks.get(10));
+        assertBackReference(28, 5, blocks.get(11));
+        assertLiteralBlock(" green eggs and ham?\n", blocks.get(12));
+        assertBackReference(63, 14, blocks.get(13));
+        assertLiteralBlock(" them,", blocks.get(14));
+        assertBackReference(64, 9, blocks.get(15));
+        assertLiteralBlock(".", blocks.get(16));
+        assertBackReference(30, 15, blocks.get(17));
+        assertBackReference(65, 18, blocks.get(18));
+        assertLiteralBlock(".", blocks.get(19));
+    }
+
+    private static final void assertSize(int expectedSize, List<LZ77Compressor.Block>
blocks) {
+        assertEquals(expectedSize, blocks.size());
+        assertEquals(LZ77Compressor.EOD.class, blocks.get(expectedSize - 1).getClass());
+    }
+
+    private static final void assertLiteralBlock(String expectedContent, LZ77Compressor.Block
block)
+        throws UnsupportedEncodingException {
+        assertLiteralBlock(expectedContent.getBytes("ASCII"), block);
+    }
+
+    private static final void assertLiteralBlock(byte[] expectedContent, LZ77Compressor.Block
block) {
+        assertEquals(LZ77Compressor.LiteralBlock.class, block.getClass());
+        assertArrayEquals(expectedContent, ((LZ77Compressor.LiteralBlock) block).getData());
+    }
+
+    private static final void assertBackReference(int expectedOffset, int expectedLength,
LZ77Compressor.Block block) {
+        assertEquals(LZ77Compressor.BackReference.class, block.getClass());
+        LZ77Compressor.BackReference b = (LZ77Compressor.BackReference) block;
+        assertEquals(expectedOffset, b.getOffset());
+        assertEquals(expectedLength, b.getLength());
+    }
+
+    private static final byte[][] stagger(byte[] data) {
+        byte[][] r = new byte[data.length][1];
+        for (int i = 0; i < data.length; i++) {
+            r[i][0] = data[i];
+        }
+        return r;
+    }
 }


Mime
View raw message