lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r651919 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/util/ src/test/org/apache/lucene/index/
Date Sun, 27 Apr 2008 11:14:23 GMT
Author: mikemccand
Date: Sun Apr 27 04:14:10 2008
New Revision: 651919

URL: http://svn.apache.org/viewvc?rev=651919&view=rev
Log:
LUCENE-1267: record per-segment deletion count in segments file; add maxDoc() & numDocs()
in IW; deprecate docCount() in favor of maxDoc()

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/CheckIndex.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestCheckIndex.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Apr 27 04:14:10 2008
@@ -94,6 +94,9 @@
     hitting an exception in readInternal, the buffer is incorrectly
     filled with stale bytes such that subsequent calls to readByte()
     return incorrect results.  (Trejkaz via Mike McCandless)
+
+ 8. LUCENE-1267: Added numDocs() and maxDoc() to IndexWriter;
+    deprecated docCount().  (Mike McCandless)
 	
 New features
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/CheckIndex.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/CheckIndex.java Sun Apr 27 04:14:10 2008
@@ -119,9 +119,11 @@
       // able to create position=-1 when the very first
       // Token has positionIncrement 0
       allowMinusOnePosition = false;
-      if (format == SegmentInfos.FORMAT_CHECKSUM) {
+      if (format == SegmentInfos.FORMAT_CHECKSUM)
         sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
-      } else if (format < SegmentInfos.FORMAT_CHECKSUM) {
+      else if (format == SegmentInfos.FORMAT_DEL_COUNT)
+          sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
+      else if (format < SegmentInfos.CURRENT_FORMAT) {
         sFormat = "int=" + format + " [newer version of Lucene than this tool]";
         skip = true;
       } else {
@@ -178,10 +180,15 @@
         reader = SegmentReader.get(info);
         final int numDocs = reader.numDocs();
         toLoseDocCount = numDocs;
-        if (reader.hasDeletions())
+        if (reader.hasDeletions()) {
+          if (info.docCount - numDocs != info.getDelCount())
+            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
           out.println("OK [" + (info.docCount - numDocs) + " deleted docs]");
-        else
+        } else {
+          if (info.getDelCount() != 0)
+            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
           out.println("OK");
+        }
 
         out.print("    test: fields, norms.......");
         Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Sun Apr 27 04:14:10 2008
@@ -1798,17 +1798,48 @@
     return analyzer;
   }
 
-  /** Returns the number of documents currently in this index. */
+  /** Returns the number of documents currently in this
+   *  index, not counting deletions.
+   * @deprecated Please use {@link #maxDoc()} (same as this
+   * method) or {@link #numDocs()} (also takes deletions
+   * into account), instead. */
   public synchronized int docCount() {
     ensureOpen();
+    return maxDoc();
+  }
+
+  /** Returns total number of docs in this index, including
+   *  docs not yet flushed (still in the RAM buffer),
+   *  not counting deletions.
+   *  @see #numDocs */
+  public synchronized int maxDoc() {
     int count;
     if (docWriter != null)
       count = docWriter.getNumDocsInRAM();
     else
       count = 0;
+
+    for (int i = 0; i < segmentInfos.size(); i++)
+      count += segmentInfos.info(i).docCount;
+    return count;
+  }
+
+  /** Returns total number of docs in this index, including
+   *  docs not yet flushed (still in the RAM buffer), and
+   *  including deletions.  <b>NOTE:</b> buffered deletions
+   *  are not counted.  If you really need these to be
+   *  counted you should call {@link #commit()} first.
+   *  @see #numDocs */
+  public synchronized int numDocs() throws IOException {
+    int count;
+    if (docWriter != null)
+      count = docWriter.getNumDocsInRAM();
+    else
+      count = 0;
+
     for (int i = 0; i < segmentInfos.size(); i++) {
-      SegmentInfo si = segmentInfos.info(i);
-      count += si.docCount;
+      final SegmentInfo info = segmentInfos.info(i);
+      count += info.docCount - info.getDelCount();
     }
     return count;
   }
@@ -3354,6 +3385,7 @@
 
     BitVector deletes = null;
     int docUpto = 0;
+    int delCount = 0;
 
     final int numSegmentsToMerge = sourceSegments.size();
     for(int i=0;i<numSegmentsToMerge;i++) {
@@ -3390,8 +3422,10 @@
             if (previousDeletes.get(j))
               assert currentDeletes.get(j);
             else {
-              if (currentDeletes.get(j))
+              if (currentDeletes.get(j)) {
                 deletes.set(docUpto);
+                delCount++;
+              }
               docUpto++;
             }
           }
@@ -3406,8 +3440,10 @@
         BitVector currentDeletes = new BitVector(directory, currentInfo.getDelFileName());
 
         for(int j=0;j<docCount;j++) {
-          if (currentDeletes.get(j))
+          if (currentDeletes.get(j)) {
             deletes.set(docUpto);
+            delCount++;
+          }
           docUpto++;
         }
             
@@ -3420,6 +3456,8 @@
       merge.info.advanceDelGen();
       message("commit merge deletes to " + merge.info.getDelFileName());
       deletes.write(directory, merge.info.getDelFileName());
+      merge.info.setDelCount(delCount);
+      assert delCount == deletes.count();
     }
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java Sun Apr 27 04:14:10 2008
@@ -20,6 +20,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BitVector;
 import java.io.IOException;
 import java.util.List;
 import java.util.ArrayList;
@@ -73,6 +74,9 @@
                                                   // other segments
   private boolean docStoreIsCompoundFile;         // whether doc store files are stored in compound file (*.cfx)
 
+  private int delCount;                           // How many deleted docs in this segment, or -1 if not yet known
+                                                  // (if it's an older index)
+
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
     this.docCount = docCount;
@@ -84,6 +88,7 @@
     docStoreOffset = -1;
     docStoreSegment = name;
     docStoreIsCompoundFile = false;
+    delCount = 0;
   }
 
   public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { 
@@ -99,6 +104,7 @@
     this.docStoreOffset = docStoreOffset;
     this.docStoreSegment = docStoreSegment;
     this.docStoreIsCompoundFile = docStoreIsCompoundFile;
+    delCount = 0;
     assert docStoreOffset == -1 || docStoreSegment != null;
   }
 
@@ -122,6 +128,7 @@
     }
     isCompoundFile = src.isCompoundFile;
     hasSingleNormFile = src.hasSingleNormFile;
+    delCount = src.delCount;
   }
 
   /**
@@ -168,6 +175,11 @@
       }
       isCompoundFile = input.readByte();
       preLockless = (isCompoundFile == CHECK_DIR);
+      if (format <= SegmentInfos.FORMAT_DEL_COUNT) {
+        delCount = input.readInt();
+        assert delCount <= docCount;
+      } else
+        delCount = -1;
     } else {
       delGen = CHECK_DIR;
       normGen = null;
@@ -177,6 +189,7 @@
       docStoreOffset = -1;
       docStoreIsCompoundFile = false;
       docStoreSegment = null;
+      delCount = -1;
     }
   }
   
@@ -263,6 +276,7 @@
     SegmentInfo si = new SegmentInfo(name, docCount, dir);
     si.isCompoundFile = isCompoundFile;
     si.delGen = delGen;
+    si.delCount = delCount;
     si.preLockless = preLockless;
     si.hasSingleNormFile = hasSingleNormFile;
     if (normGen != null) {
@@ -429,6 +443,23 @@
     }
   }
 
+  int getDelCount() throws IOException {
+    if (delCount == -1) {
+      if (hasDeletions()) {
+        final String delFileName = getDelFileName();
+        delCount = new BitVector(dir, delFileName).count();
+      } else
+        delCount = 0;
+    }
+    assert delCount <= docCount;
+    return delCount;
+  }
+
+  void setDelCount(int delCount) {
+    this.delCount = delCount;
+    assert delCount <= docCount;
+  }
+
   int getDocStoreOffset() {
     return docStoreOffset;
   }
@@ -475,6 +506,7 @@
       }
     }
     output.writeByte(isCompoundFile);
+    output.writeInt(delCount);
   }
 
   private void addIfExists(List files, String fileName) throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java Sun Apr 27 04:14:10 2008
@@ -61,8 +61,12 @@
    *  ensure all bytes were successfully written. */
   public static final int FORMAT_CHECKSUM = -5;
 
+  /** This format adds the deletion count for each segment.
+   *  This way IndexWriter can efficiently report numDocs(). */
+  public static final int FORMAT_DEL_COUNT = -6;
+
   /* This must always point to the most recent file format. */
-  private static final int CURRENT_FORMAT = FORMAT_CHECKSUM;
+  static final int CURRENT_FORMAT = FORMAT_DEL_COUNT;
   
   public int counter = 0;    // used to name new segments
   /**

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Sun Apr 27 04:14:10 2008
@@ -55,10 +55,12 @@
   private boolean deletedDocsDirty = false;
   private boolean normsDirty = false;
   private boolean undeleteAll = false;
+  private int pendingDeleteCount;
 
   private boolean rollbackDeletedDocsDirty = false;
   private boolean rollbackNormsDirty = false;
   private boolean rollbackUndeleteAll = false;
+  private int rollbackPendingDeleteCount;
 
   IndexInput freqStream;
   IndexInput proxStream;
@@ -351,11 +353,16 @@
     if (hasDeletions(si)) {
       deletedDocs = new BitVector(directory(), si.getDelFileName());
      
-      // Verify # deletes does not exceed maxDoc for this segment:
-      if (deletedDocs.count() > maxDoc()) {
-        throw new CorruptIndexException("number of deletes (" + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name);
-      }
-    }
+      assert si.getDelCount() == deletedDocs.count() : 
+        "delete count mismatch: info=" + si.getDelCount() + " vs BitVector=" + deletedDocs.count();
+
+      // Verify # deletes does not exceed maxDoc for this
+      // segment:
+      assert si.getDelCount() <= maxDoc() : 
+        "delete count mismatch: " + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name;
+
+    } else
+      assert si.getDelCount() == 0;
   }
   
   protected synchronized DirectoryIndexReader doReopen(SegmentInfos infos) throws CorruptIndexException, IOException {
@@ -525,9 +532,12 @@
       // .tmp & renaming it) because the file is not live
       // until segments file is written:
       deletedDocs.write(directory(), si.getDelFileName());
+      
+      si.setDelCount(si.getDelCount()+pendingDeleteCount);
     }
     if (undeleteAll && si.hasDeletions()) {
       si.clearDelGen();
+      si.setDelCount(0);
     }
     if (normsDirty) {               // re-write norms
       si.setNumFields(fieldInfos.size());
@@ -620,7 +630,8 @@
       deletedDocs = new BitVector(maxDoc());
     deletedDocsDirty = true;
     undeleteAll = false;
-    deletedDocs.set(docNum);
+    if (!deletedDocs.getAndSet(docNum))
+      pendingDeleteCount++;
   }
 
   protected void doUndeleteAll() {
@@ -1009,6 +1020,7 @@
     rollbackDeletedDocsDirty = deletedDocsDirty;
     rollbackNormsDirty = normsDirty;
     rollbackUndeleteAll = undeleteAll;
+    rollbackPendingDeleteCount = pendingDeleteCount;
     Iterator it = norms.values().iterator();
     while (it.hasNext()) {
       Norm norm = (Norm) it.next();
@@ -1021,6 +1033,7 @@
     deletedDocsDirty = rollbackDeletedDocsDirty;
     normsDirty = rollbackNormsDirty;
     undeleteAll = rollbackUndeleteAll;
+    pendingDeleteCount = rollbackPendingDeleteCount;
     Iterator it = norms.values().iterator();
     while (it.hasNext()) {
       Norm norm = (Norm) it.next();

Modified: lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/BitVector.java Sun Apr 27 04:14:10 2008
@@ -56,6 +56,25 @@
     count = -1;
   }
 
+  /** Sets the value of <code>bit</code> to true, and
+   *  returns true if bit was already set */
+  public final boolean getAndSet(int bit) {
+    if (bit >= size) {
+      throw new ArrayIndexOutOfBoundsException(bit);
+    }
+    final int pos = bit >> 3;
+    final int v = bits[pos];
+    final int flag = 1 << (bit & 7);
+    if ((flag & v) != 0)
+      return true;
+    else {
+      bits[pos] = (byte) (v | flag);
+      if (count != -1)
+        count++;
+      return false;
+    }
+  }
+
   /** Sets the value of <code>bit</code> to zero. */
   public final void clear(int bit) {
     if (bit >= size) {

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestCheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestCheckIndex.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestCheckIndex.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestCheckIndex.java Sun Apr 27 04:14:10 2008
@@ -49,7 +49,11 @@
     ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
 
     CheckIndex.out = new PrintStream(bos);
-    assertTrue(CheckIndex.check(dir, false, null));
+    if (!CheckIndex.check(dir, false, null)) {
+      System.out.println("CheckIndex failed");
+      System.out.println(bos.toString());
+      fail();
+    }
     final List onlySegments = new ArrayList();
     onlySegments.add("_0");
     assertTrue(CheckIndex.check(dir, false, onlySegments));

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=651919&r1=651918&r2=651919&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Sun Apr 27 04:14:10 2008
@@ -104,8 +104,11 @@
 
         // optimize the index and check that the new doc count is correct
         writer = new IndexWriter(dir, true, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
+        assertEquals(100, writer.maxDoc());
+        assertEquals(60, writer.numDocs());
         writer.optimize();
-        assertEquals(60, writer.docCount());
+        assertEquals(60, writer.maxDoc());
+        assertEquals(60, writer.numDocs());
         writer.close();
 
         // check that the index reader gives the same numbers.
@@ -117,7 +120,8 @@
         // make sure opening a new index for create over
         // this existing one works correctly:
         writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
-        assertEquals(0, writer.docCount());
+        assertEquals(0, writer.maxDoc());
+        assertEquals(0, writer.numDocs());
         writer.close();
     }
 
@@ -3030,7 +3034,10 @@
     writer = new IndexWriter(dir,
                              false, new StandardAnalyzer(),
                              IndexWriter.MaxFieldLength.LIMITED);
+    assertEquals(8, writer.numDocs());
+    assertEquals(10, writer.maxDoc());
     writer.expungeDeletes();
+    assertEquals(8, writer.numDocs());
     writer.close();
     ir = IndexReader.open(dir);
     assertEquals(8, ir.maxDoc());
@@ -3075,6 +3082,7 @@
                              false, new StandardAnalyzer(),
                              IndexWriter.MaxFieldLength.LIMITED);
     writer.setMergeFactor(3);
+    assertEquals(49, writer.numDocs());
     writer.expungeDeletes();
     writer.close();
     ir = IndexReader.open(dir);



Mime
View raw message