Return-Path:
Delivered-To: apmail-lucene-java-commits-archive@www.apache.org
Received: (qmail 3988 invoked from network); 11 Dec 2006 02:04:28 -0000
Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2)
by minotaur.apache.org with SMTP; 11 Dec 2006 02:04:28 -0000
Received: (qmail 15963 invoked by uid 500); 11 Dec 2006 02:04:36 -0000
Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org
Received: (qmail 15943 invoked by uid 500); 11 Dec 2006 02:04:35 -0000
Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm
Precedence: bulk
List-Help:
List-Unsubscribe:
List-Post:
List-Id:
Reply-To: java-dev@lucene.apache.org
Delivered-To: mailing list java-commits@lucene.apache.org
Received: (qmail 15932 invoked by uid 99); 11 Dec 2006 02:04:35 -0000
Received: from herse.apache.org (HELO herse.apache.org) (140.211.11.133)
by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 10 Dec 2006 18:04:35 -0800
X-ASF-Spam-Status: No, hits=-9.4 required=10.0
tests=ALL_TRUSTED,NO_REAL_NAME
X-Spam-Check-By: apache.org
Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3)
by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 10 Dec 2006 18:04:26 -0800
Received: by eris.apache.org (Postfix, from userid 65534)
id B4FE91A9846; Sun, 10 Dec 2006 18:03:42 -0800 (PST)
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: svn commit: r485463 - in /lucene/java/trunk: CHANGES.txt
src/java/org/apache/lucene/util/BitVector.java
src/site/src/documentation/content/xdocs/fileformats.xml
src/test/org/apache/lucene/util/TestBitVector.java
Date: Mon, 11 Dec 2006 02:03:41 -0000
To: java-commits@lucene.apache.org
From: yonik@apache.org
X-Mailer: svnmailer-1.1.0
Message-Id: <20061211020342.B4FE91A9846@eris.apache.org>
X-Virus-Checked: Checked by ClamAV on apache.org
Author: yonik
Date: Sun Dec 10 18:03:38 2006
New Revision: 485463
URL: http://svn.apache.org/viewvc?view=rev&rev=485463
Log:
read/write .del as d-gaps when the deleted bit vector is sufficiently sparse: LUCENE-738
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Dec 10 18:03:38 2006
@@ -143,6 +143,10 @@
replaced by the correct spelling.
(Andi Vajda via Daniel Naber)
+12. LUCENE-738: Reduce the size of the file that keeps track of which
+ documents are deleted when the number of deleted documents is
+ small. This changes the index file format and cannot be
+ read by previous versions of Lucene. (Doron Cohen via Yonik Seeley)
Bug fixes
Modified: lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java Sun Dec 10 18:03:38 2006
@@ -29,6 +29,7 @@
a count() method, which efficiently computes the number of one bits;
optimized read from and write to disk;
inlinable get() method;
+ store and load, as bit set or d-gaps, depending on sparseness;
@author Doug Cutting
@@ -111,13 +112,57 @@
public final void write(Directory d, String name) throws IOException {
IndexOutput output = d.createOutput(name);
try {
- output.writeInt(size()); // write size
- output.writeInt(count()); // write count
- output.writeBytes(bits, bits.length); // write bits
+ if (isSparse()) {
+ writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
+ } else {
+ writeBits(output);
+ }
} finally {
output.close();
}
}
+
+ /** Write as a bit set */
+ private void writeBits(IndexOutput output) throws IOException {
+ output.writeInt(size()); // write size
+ output.writeInt(count()); // write count
+ output.writeBytes(bits, bits.length);
+ }
+
+ /** Write as a d-gaps list */
+ private void writeDgaps(IndexOutput output) throws IOException {
+ output.writeInt(-1); // mark using d-gaps
+ output.writeInt(size()); // write size
+ output.writeInt(count()); // write count
+ int last=0;
+ int n = count();
+ int m = bits.length;
+ for (int i=0; i0; i++) {
+ if (bits[i]!=0) {
+ output.writeVInt(i-last);
+ output.writeByte(bits[i]);
+ last = i;
+ n -= BYTE_COUNTS[bits[i] & 0xFF];
+ }
+ }
+ }
+
+ /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or desnse, and should be saved as a bit set. */
+ private boolean isSparse() {
+ // note: order of comparisons below set to favor smaller values (no binary range search.)
+ // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
+ // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
+ // multiplying count by (8+8) or (8+16) or (8+24) etc.:
+ // - first 8 for writing bits[i] (1 byte vs. 1 bit), and
+ // - second part for writing the byte-number d-gap as vint.
+ // note: factor is for read/write of byte-arrays being faster than vints.
+ int factor = 10;
+ if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
+ if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size();
+ if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size();
+ if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size();
+ return factor * (4 + (8+40)*count()) < size();
+ }
/** Constructs a bit vector from the file name
in Directory
d
, as written by the {@link #write} method.
@@ -125,13 +170,36 @@
public BitVector(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
try {
- size = input.readInt(); // read size
- count = input.readInt(); // read count
- bits = new byte[(size >> 3) + 1]; // allocate bits
- input.readBytes(bits, 0, bits.length); // read bits
+ size = input.readInt(); // read size
+ if (size == -1) {
+ readDgaps(input);
+ } else {
+ readBits(input);
+ }
} finally {
input.close();
}
}
+ /** Read as a bit set */
+ private void readBits(IndexInput input) throws IOException {
+ count = input.readInt(); // read count
+ bits = new byte[(size >> 3) + 1]; // allocate bits
+ input.readBytes(bits, 0, bits.length);
+ }
+
+ /** read as a d-gaps list */
+ private void readDgaps(IndexInput input) throws IOException {
+ size = input.readInt(); // (re)read size
+ count = input.readInt(); // read count
+ bits = new byte[(size >> 3) + 1]; // allocate bits
+ int last=0;
+ int n = count();
+ while (n>0) {
+ last += input.readVInt();
+ bits[last] = input.readByte();
+ n -= BYTE_COUNTS[bits[last] & 0xFF];
+ }
+ }
+
}
Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Sun Dec 10 18:03:38 2006
@@ -926,7 +926,8 @@
Compound Files
Starting with Lucene 1.4 the compound file format became default. This
- is simply a container for all files described in the next section.
+ is simply a container for all files described in the next section
+ (except for the .del file).
Compound (.cfs) --> FileCount, <DataOffset, FileName>
FileCount
@@ -1511,14 +1512,25 @@
Deleted Documents
The .del file is
- optional, and only exists when a segment contains deletions:
+ optional, and only exists when a segment contains deletions.
- Deletions
+
Although per-segment, this file is maintained exterior to compound segment files.
+
+
+
+ Pre-2.1:
+ Deletions
(.del) --> ByteCount,BitCount,Bits
- ByteSize,BitCount -->
+
+ 2.1 and above:
+ Deletions
+ (.del) --> [Format],ByteCount,BitCount, Bits | DGaps (depending on Format)
+
+
+ Format,ByteSize,BitCount -->
Uint32
@@ -1527,6 +1539,23 @@
ByteCount
+ DGaps -->
+ <DGap,NonzeroByte>
+ NonzeroBytesCount
+
+
+ DGap -->
+ VInt
+
+
+ NonzeroByte -->
+ Byte
+
+
+ Format
+ is Optional. -1 indicates DGaps. Non-negative value indicates Bits, and that Format is excluded.
+
+
ByteCount
indicates the number of bytes in Bits. It is typically
(SegSize/8)+1.
@@ -1543,6 +1572,20 @@
deleted. Bit ordering is from least to most significant. Thus, if
Bits contains two bytes, 0x00 and 0x02, then document 9 is marked as
deleted.
+
+
+ DGaps
+ represents sparse bit-vectors more efficiently than Bits.
+ It is made of DGaps on indexes of nonzero bytes in Bits,
+ and the nonzero bytes themselves. The number of nonzero bytes
+ in Bits (NonzeroBytesCount) is not stored.
+
+ For example,
+ if there are 8000 bits and only bits 10,12,32 are set,
+ DGaps would be used:
+
+
+ (VInt) 1 , (byte) 20 , (VInt) 3 , (Byte) 1
Modified: lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java?view=diff&rev=485463&r1=485462&r2=485463
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/util/TestBitVector.java Sun Dec 10 18:03:38 2006
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import java.io.IOException;
+
import junit.framework.TestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
@@ -158,6 +160,47 @@
}
}
+ /**
+ * Test r/w when size/count cause switching between bit-set and d-gaps file formats.
+ * @throws Exception
+ */
+ public void testDgaps() throws IOException {
+ doTestDgaps(1,0,1);
+ doTestDgaps(10,0,1);
+ doTestDgaps(100,0,1);
+ doTestDgaps(1000,4,7);
+ doTestDgaps(10000,40,43);
+ doTestDgaps(100000,415,418);
+ doTestDgaps(1000000,3123,3126);
+ }
+
+ private void doTestDgaps(int size, int count1, int count2) throws IOException {
+ Directory d = new RAMDirectory();
+ BitVector bv = new BitVector(size);
+ for (int i=0; i=count1; i--) {
+ BitVector bv2 = new BitVector(d, "TESTBV");
+ assertTrue(doCompare(bv,bv2));
+ bv = bv2;
+ bv.clear(i);
+ assertEquals(i,bv.count());
+ bv.write(d, "TESTBV");
+ }
+ }
/**
* Compare two BitVectors.
* This should really be an equals method on the BitVector itself.