commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bode...@apache.org
Subject svn commit: r1340786 - in /commons/proper/compress/trunk/src: main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
Date Sun, 20 May 2012 18:04:37 GMT
Author: bodewig
Date: Sun May 20 18:04:36 2012
New Revision: 1340786

URL: http://svn.apache.org/viewvc?rev=1340786&view=rev
Log:
Integrate fallback sort into the rest, add some more tests and fix bug in bucket boundary
calculation

Modified:
    commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
    commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java

Modified: commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
URL: http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java?rev=1340786&r1=1340785&r2=1340786&view=diff
==============================================================================
--- commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
(original)
+++ commons/proper/compress/trunk/src/main/java/org/apache/commons/compress/compressors/bzip2/BlockSort.java
Sun May 20 18:04:36 2012
@@ -34,6 +34,7 @@ import java.util.BitSet;
  * Compress" you'd get:</p>
  *
  * <pre>
+ *  CompressCommons
  * Commons Compress
  * CompressCommons 
  * essCommons Compr
@@ -51,9 +52,9 @@ import java.util.BitSet;
  * ssCommons Compre
  * </pre>
  *
- * <p>Which results in a new text "s romooCCmmpnse", in adition the
+ * <p>Which results in a new text "ss romooCCmmpnse", in adition the
  * index of the first line that contained the original text is kept -
- * in this case it is 0.  The idea is that in a long English text all
+ * in this case it is 1.  The idea is that in a long English text all
  * permutations that start with "he" are likely suffixes of a "the" and
  * thus they end in "t" leading to a larger block of "t"s that can
  * better be compressed by the subsequent Move-to-Front, run-length
@@ -101,7 +102,8 @@ class BlockSort {
      * been dropped after 0.9.0 and replaced by a fallback sorting
      * algorithm.
      *
-     * I've added the fallbackSort function of 1.0.6.
+     * I've added the fallbackSort function of 1.0.6 and tried to
+     * integrate it with the existing code without touching too much.
      */
 
     /*
@@ -154,13 +156,16 @@ class BlockSort {
         this.workDone = 0;
         this.blockRandomised = false;
         this.firstAttempt = true;
+
+        if (last + 1 < 10000) {
+            fallbackSort(data, last);
+        } else {
+
         mainSort(data, last);
 
         if (this.firstAttempt && (this.workDone > this.workLimit)) {
-            randomiseBlock(data, last);
-            this.workLimit = this.workDone = 0;
-            this.firstAttempt = false;
-            mainSort(data, last);
+                fallbackSort(data, last);
+        }
         }
 
         final int[] fmap = data.fmap;
@@ -173,7 +178,27 @@ class BlockSort {
         }
 
         // assert (data.origPtr != -1) : data.origPtr;
-        return blockRandomised;
+        return false;
+    }
+
+    /**
+     * Adapt fallbackSort to the expected interface of the rest of the
+     * code, in particular deal with the fact that block starts at
+     * offset 1 (in libbzip2 1.0.6 it starts at 0).
+     */
+    final void fallbackSort(final BZip2CompressorOutputStream.Data data,
+                            final int last) {
+        data.block[0] = data.block[last + 1];
+        fallbackSort(data.fmap, data.block, last + 1);
+        for (int i = 0; i < last + 1; i++) {
+            --data.fmap[i];
+        }
+        for (int i = 0; i < last + 1; i++) {
+            if (data.fmap[i] == -1) {
+                data.fmap[i] = last;
+                break;
+            }
+        }
     }
 
 /*---------------------------------------------*/
@@ -415,12 +440,13 @@ class BlockSort {
     }
 
     /*
-     * The C code uses an array of ints to represents the bucket-start
-     * flags (bhtab).  It also contains optimizations to skip over 32
-     * consecutively set or consecutively unset bits on word
-     * boundaries at once.  For now I've chosen to use the simpler but
-     * potentially slower code using BitSet - also in the hope that
-     * using the BitSet#nextXXX methods may be fast enough.
+     * The C code uses an array of ints (each int holding 32 flags) to
+     * represents the bucket-start flags (bhtab).  It also contains
+     * optimizations to skip over 32 consecutively set or
+     * consecutively unset bits on word boundaries at once.  For now
+     * I've chosen to use the simpler but potentially slower code
+     * using BitSet - also in the hope that using the BitSet#nextXXX
+     * methods may be fast enough.
      */
 
     /**
@@ -432,16 +458,22 @@ class BlockSort {
      * @param off offset of first byte to sort in block
      */
     final void fallbackSort(int[] fmap, byte[] block, int nblock) {
-        int[] ftab = new int[257];
+        final int[] ftab = new int[257];
         int H, i, j, k, l, r, cc, cc1;
         int nNotDone;
         int nBhtab;
+        final int[] eclass = getEclass();
 
+        for (i = 0; i < nblock; i++) {
+            eclass[i] = 0;
+        }
         /*--
           LBZ2: Initial 1-char radix sort to generate
           initial fmap and initial BH bits.
           --*/
-        for (i = 0; i < nblock; i++) ftab[block[i] & 0xff]++;
+        for (i = 0; i < nblock; i++) {
+            ftab[block[i] & 0xff]++;
+        }
         for (i = 1; i < 257;    i++) ftab[i] += ftab[i - 1];
 
         for (i = 0; i < nblock; i++) {
@@ -467,8 +499,6 @@ class BlockSort {
             bhtab.clear(nblock + 2 * i + 1);
         }
 
-        eclass = getEclass();
-
         /*-- LBZ2: the log(N) loop --*/
         H = 1;
         while (true) {
@@ -491,10 +521,10 @@ class BlockSort {
 
                 /*-- LBZ2: find the next non-singleton bucket --*/
                 k = r + 1;
-                k = bhtab.nextSetBit(k);
+                k = bhtab.nextClearBit(k);
                 l = k - 1;
                 if (l >= nblock) break;
-                k = bhtab.nextClearBit(k);
+                k = bhtab.nextSetBit(k + 1);
                 r = k - 1;
                 if (r >= nblock) break;
 
@@ -862,8 +892,8 @@ class BlockSort {
     private static final int SETMASK = (1 << 21);
     private static final int CLEARMASK = (~SETMASK);
 
-    private void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
-                          final int lastShadow) {
+    final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
+                        final int lastShadow) {
         final int[] runningOrder = this.mainSort_runningOrder;
         final int[] copy = this.mainSort_copy;
         final boolean[] bigDone = this.mainSort_bigDone;

Modified: commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
URL: http://svn.apache.org/viewvc/commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java?rev=1340786&r1=1340785&r2=1340786&view=diff
==============================================================================
--- commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
(original)
+++ commons/proper/compress/trunk/src/test/java/org/apache/commons/compress/compressors/bzip2/BlockSortTest.java
Sun May 20 18:04:36 2012
@@ -70,17 +70,56 @@ public class BlockSortTest {
         0, 1, 7, 6, 8, 2, 3, 5, 4
     };
 
+    private static final byte[] FIXTURE2 = { 
+        'C', 'o', 'm', 'm', 'o', 'n', 's', ' ', 'C', 'o', 'm', 'p', 'r', 'e', 's', 's', 
+    };
+
+    private static final byte[] FIXTURE2_BWT = {
+        's', 's', ' ', 'r', 'o', 'm', 'o', 'o', 'C', 'C', 'm', 'm', 'p', 'n', 's', 'e', 
+    };
+
     @Test
     public void testSortFixture() {
-        BZip2CompressorOutputStream.Data data = new BZip2CompressorOutputStream.Data(1);
-        System.arraycopy(FIXTURE, 0, data.block, 1, FIXTURE.length);
-        BlockSort s = new BlockSort(data);
-        assertFalse(s.blockSort(data, FIXTURE.length - 1));
-        assertEquals(FIXTURE[FIXTURE.length - 1], data.block[0]);
-        for (int i = 0; i < FIXTURE.length; i++) {
-            assertEquals(FIXTURE_BWT[i], data.block[data.fmap[i]]);
-        }
-        assertEquals(0, data.origPtr);
+        DS ds = setUpFixture();
+        assertFalse(ds.s.blockSort(ds.data, FIXTURE.length - 1));
+        assertFixtureSorted(ds.data);
+        assertEquals(0, ds.data.origPtr);
+    }
+
+    @Test
+    public void testSortFixtureMainSort() {
+        DS ds = setUpFixture();
+        ds.s.mainSort(ds.data, FIXTURE.length - 1);
+        assertFixtureSorted(ds.data);
+    }
+
+    @Test
+    public void testSortFixtureFallbackSort() {
+        DS ds = setUpFixture();
+        ds.s.fallbackSort(ds.data, FIXTURE.length - 1);
+        assertFixtureSorted(ds.data);
+    }
+
+    @Test
+    public void testSortFixture2() {
+        DS ds = setUpFixture2();
+        assertFalse(ds.s.blockSort(ds.data, FIXTURE2.length - 1));
+        assertFixture2Sorted(ds.data);
+        assertEquals(1, ds.data.origPtr);
+    }
+
+    @Test
+    public void testSortFixture2MainSort() {
+        DS ds = setUpFixture2();
+        ds.s.mainSort(ds.data, FIXTURE2.length - 1);
+        assertFixture2Sorted(ds.data);
+    }
+
+    @Test
+    public void testSortFixture2FallbackSort() {
+        DS ds = setUpFixture2();
+        ds.s.fallbackSort(ds.data, FIXTURE2.length - 1);
+        assertFixture2Sorted(ds.data);
     }
 
     @Test
@@ -91,4 +130,43 @@ public class BlockSortTest {
         s.fallbackSort(fmap, FIXTURE, FIXTURE.length);
         assertArrayEquals(FIXTURE_SORTED, fmap);
     }
+
+    private DS setUpFixture() {
+        return setUpFixture(FIXTURE);
+    }
+
+    private void assertFixtureSorted(BZip2CompressorOutputStream.Data data) {
+        assertFixtureSorted(data, FIXTURE, FIXTURE_BWT);
+    }
+
+    private DS setUpFixture2() {
+        return setUpFixture(FIXTURE2);
+    }
+
+    private void assertFixture2Sorted(BZip2CompressorOutputStream.Data data) {
+        assertFixtureSorted(data, FIXTURE2, FIXTURE2_BWT);
+    }
+
+    private DS setUpFixture(byte[] fixture) {
+        BZip2CompressorOutputStream.Data data = new BZip2CompressorOutputStream.Data(1);
+        System.arraycopy(fixture, 0, data.block, 1, fixture.length);
+        return new DS(data, new BlockSort(data));
+    }
+
+    private void assertFixtureSorted(BZip2CompressorOutputStream.Data data,
+                                     byte[] fixture, byte[] fixtureBwt) {
+        assertEquals(fixture[fixture.length - 1], data.block[0]);
+        for (int i = 0; i < fixture.length; i++) {
+            assertEquals(fixtureBwt[i], data.block[data.fmap[i]]);
+        }
+    }
+
+    private static class DS {
+        private final BZip2CompressorOutputStream.Data data;
+        private final BlockSort s;
+        DS(BZip2CompressorOutputStream.Data data, BlockSort s) {
+            this.data = data;
+            this.s = s;
+        }
+    }
 }
\ No newline at end of file



Mime
View raw message