lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r485463 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/util/BitVector.java src/site/src/documentation/content/xdocs/fileformats.xml src/test/org/apache/lucene/util/TestBitVector.java
Date Mon, 11 Dec 2006 02:03:41 GMT
Author: yonik
Date: Sun Dec 10 18:03:38 2006
New Revision: 485463

URL: http://svn.apache.org/viewvc?view=rev&rev=485463
Log:
read/write .del as d-gaps when the deleted bit vector is sufficiently sparse: LUCENE-738

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
    lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
    lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Dec 10 18:03:38 2006
@@ -143,6 +143,10 @@
     replaced by the correct spelling.
     (Andi Vajda via Daniel Naber)
 
+12. LUCENE-738: Reduce the size of the file that keeps track of which
+    documents are deleted when the number of deleted documents is
+    small.  This changes the index file format and cannot be
+    read by previous versions of Lucene.  (Doron Cohen via Yonik Seeley)
 
 Bug fixes
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java Sun Dec 10 18:03:38 2006
@@ -29,6 +29,7 @@
   <li>a count() method, which efficiently computes the number of one bits;</li>
   <li>optimized read from and write to disk;</li>
   <li>inlinable get() method;</li>
+  <li>store and load, as bit set or d-gaps, depending on sparseness;</li> 
   </ul>
 
   @author Doug Cutting
@@ -111,13 +112,57 @@
   public final void write(Directory d, String name) throws IOException {
     IndexOutput output = d.createOutput(name);
     try {
-      output.writeInt(size());			  // write size
-      output.writeInt(count());			  // write count
-      output.writeBytes(bits, bits.length);	  // write bits
+      if (isSparse()) { 
+        writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
+      } else {
+        writeBits(output);
+      }
     } finally {
       output.close();
     }
   }
+     
+  /** Write as a bit set */
+  private void writeBits(IndexOutput output) throws IOException {
+    output.writeInt(size());        // write size
+    output.writeInt(count());       // write count
+    output.writeBytes(bits, bits.length);
+  }
+  
+  /** Write as a d-gaps list */
+  private void writeDgaps(IndexOutput output) throws IOException {
+    output.writeInt(-1);            // mark using d-gaps                         
+    output.writeInt(size());        // write size
+    output.writeInt(count());       // write count
+    int last=0;
+    int n = count();
+    int m = bits.length;
+    for (int i=0; i<m && n>0; i++) {
+      if (bits[i]!=0) {
+        output.writeVInt(i-last);
+        output.writeByte(bits[i]);
+        last = i;
+        n -= BYTE_COUNTS[bits[i] & 0xFF];
+      }
+    }
+  }
+
+  /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or desnse,
and should be saved as a bit set. */
+  private boolean isSparse() {
+    // note: order of comparisons below set to favor smaller values (no binary range search.)
+    // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
+    // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
+    //       multiplying count by (8+8) or (8+16) or (8+24) etc.:
+    //       - first 8 for writing bits[i] (1 byte vs. 1 bit), and 
+    //       - second part for writing the byte-number d-gap as vint. 
+    // note: factor is for read/write of byte-arrays being faster than vints.  
+    int factor = 10;  
+    if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
+    if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size();
+    if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size();
+    if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size();
+    return                            factor * (4 + (8+40)*count()) < size();
+  }
 
   /** Constructs a bit vector from the file <code>name</code> in Directory
     <code>d</code>, as written by the {@link #write} method.
@@ -125,13 +170,36 @@
   public BitVector(Directory d, String name) throws IOException {
     IndexInput input = d.openInput(name);
     try {
-      size = input.readInt();			  // read size
-      count = input.readInt();			  // read count
-      bits = new byte[(size >> 3) + 1];		  // allocate bits
-      input.readBytes(bits, 0, bits.length);	  // read bits
+      size = input.readInt();       // read size
+      if (size == -1) {
+        readDgaps(input);
+      } else {
+        readBits(input);
+      }
     } finally {
       input.close();
     }
   }
 
+  /** Read as a bit set */
+  private void readBits(IndexInput input) throws IOException {
+    count = input.readInt();        // read count
+    bits = new byte[(size >> 3) + 1];     // allocate bits
+    input.readBytes(bits, 0, bits.length);
+  }
+
+  /** read as a d-gaps list */ 
+  private void readDgaps(IndexInput input) throws IOException {
+    size = input.readInt();       // (re)read size
+    count = input.readInt();        // read count
+    bits = new byte[(size >> 3) + 1];     // allocate bits
+    int last=0;
+    int n = count();
+    while (n>0) {
+      last += input.readVInt();
+      bits[last] = input.readByte();
+      n -= BYTE_COUNTS[bits[last] & 0xFF];
+    }          
+  }
+  
 }

Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Sun Dec 10
18:03:38 2006
@@ -926,7 +926,8 @@
             <section id="Compound Files"><title>Compound Files</title>
 
                 <p>Starting with Lucene 1.4 the compound file format became default.
This
-                    is simply a container for all files described in the next section.</p>
+                    is simply a container for all files described in the next section
+					(except for the .del file).</p>
 
                 <p>Compound (.cfs) --&gt; FileCount, &lt;DataOffset, FileName&gt;
                     <sup>FileCount</sup>
@@ -1511,14 +1512,25 @@
             <section id="Deleted Documents"><title>Deleted Documents</title>
 
                 <p>The .del file is
-                    optional, and only exists when a segment contains deletions:
+                    optional, and only exists when a segment contains deletions.
                 </p>
 
-                <p>Deletions
+                <p>Although per-segment, this file is maintained exterior to compound
segment files.
+                </p>
+				
+                <p>
+                <b>Pre-2.1:</b>
+                Deletions
                     (.del) --&gt; ByteCount,BitCount,Bits
                 </p>
 
-                <p>ByteSize,BitCount --&gt;
+                <p>
+				<b>2.1 and above:</b>
+                Deletions
+                    (.del) --&gt; [Format],ByteCount,BitCount, Bits | DGaps (depending
on Format)
+                </p>
+
+                <p>Format,ByteSize,BitCount --&gt;
                     Uint32
                 </p>
 
@@ -1527,6 +1539,23 @@
                     <sup>ByteCount</sup>
                 </p>
 
+				<p>DGaps --&gt;
+                    &lt;DGap,NonzeroByte&gt;
+                    <sup>NonzeroBytesCount</sup>
+                </p>
+
+                <p>DGap --&gt;
+                    VInt
+                </p>
+
+                <p>NonzeroByte --&gt;
+                    Byte
+                </p>
+				
+                <p>Format
+                    is Optional. -1 indicates DGaps. Non-negative value indicates Bits, and
that Format is excluded.
+                </p>
+
                 <p>ByteCount
                     indicates the number of bytes in Bits. It is typically
                     (SegSize/8)+1.
@@ -1543,6 +1572,20 @@
                     deleted. Bit ordering is from least to most significant. Thus, if
                     Bits contains two bytes, 0x00 and 0x02, then document 9 is marked as
                     deleted.
+                </p>
+
+				<p>DGaps
+                    represents sparse bit-vectors more efficiently than Bits.
+                    It is made of DGaps on indexes of nonzero bytes in Bits,
+                    and the nonzero bytes themselves. The number of nonzero bytes
+                    in Bits (NonzeroBytesCount) is not stored.
+                </p>
+                <p>For example,
+                    if there are 8000 bits and only bits 10,12,32 are set,
+                    DGaps would be used:
+                </p>
+                <p>
+                    (VInt) 1 , (byte) 20 , (VInt) 3 , (Byte) 1
                 </p>
             </section>
         </section>

Modified: lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java Sun Dec 10 18:03:38
2006
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import junit.framework.TestCase;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
@@ -158,6 +160,47 @@
         }
     }
 
+    /**
+     * Test r/w when size/count cause switching between bit-set and d-gaps file formats.
 
+     * @throws Exception
+     */
+    public void testDgaps() throws IOException {
+      doTestDgaps(1,0,1);
+      doTestDgaps(10,0,1);
+      doTestDgaps(100,0,1);
+      doTestDgaps(1000,4,7);
+      doTestDgaps(10000,40,43);
+      doTestDgaps(100000,415,418);
+      doTestDgaps(1000000,3123,3126);
+    }
+    
+    private void doTestDgaps(int size, int count1, int count2) throws IOException {
+      Directory d = new  RAMDirectory();
+      BitVector bv = new BitVector(size);
+      for (int i=0; i<count1; i++) {
+        bv.set(i);
+        assertEquals(i+1,bv.count());
+      }
+      bv.write(d, "TESTBV");
+      // gradually increase number of set bits
+      for (int i=count1; i<count2; i++) {
+        BitVector bv2 = new BitVector(d, "TESTBV");
+        assertTrue(doCompare(bv,bv2));
+        bv = bv2;
+        bv.set(i);
+        assertEquals(i+1,bv.count());
+        bv.write(d, "TESTBV");
+      }
+      // now start decreasing number of set bits
+      for (int i=count2-1; i>=count1; i--) {
+        BitVector bv2 = new BitVector(d, "TESTBV");
+        assertTrue(doCompare(bv,bv2));
+        bv = bv2;
+        bv.clear(i);
+        assertEquals(i,bv.count());
+        bv.write(d, "TESTBV");
+      }
+    }
     /**
      * Compare two BitVectors.
      * This should really be an equals method on the BitVector itself.



Mime
View raw message