lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r620604 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Date Mon, 11 Feb 2008 20:34:54 GMT
Author: mikemccand
Date: Mon Feb 11 12:34:49 2008
New Revision: 620604

URL: http://svn.apache.org/viewvc?rev=620604&view=rev
Log:
LUCENE-325: add expungeDeletes methods to IndexWriter

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MergeScheduler.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Feb 11 12:34:49 2008
@@ -49,6 +49,12 @@
     see the changes.  Deprecate IndexWriter.flush() in favor of
     IndexWriter.commit().  (Mike McCandless)
 
+ 5. LUCENE-325: Added IndexWriter.expungeDeletes methods, which
+    consult the MergePolicy to find merges necessary to merge away all
+    deletes from the index.  This should be a somewhat lower cost
+    operation than optimize.  (John Wang via Mike McCandless)
+
+
 Bug fixes
     
  1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimze a single 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Mon Feb 11 12:34:49 2008
@@ -26,7 +26,6 @@
 import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.util.BitVector;
-import org.apache.lucene.util.Parameter;
 import org.apache.lucene.util.Constants;
 
 import java.io.File;
@@ -2163,6 +2162,7 @@
           try {
             wait();
           } catch (InterruptedException ie) {
+            Thread.currentThread().interrupt();
           }
 
           if (mergeExceptions.size() > 0) {
@@ -2203,6 +2203,87 @@
         return true;
 
     return false;
+  }
+
+  /** Just like {@link #expungeDeletes()}, except you can
+   *  specify whether the call should block until the
+   *  operation completes.  This is only meaningful with a
+   *  {@link MergeScheduler} that is able to run merges in
+   *  background threads. */
+  public void expungeDeletes(boolean doWait)
+    throws CorruptIndexException, IOException {
+    ensureOpen();
+
+    if (infoStream != null)
+      message("expungeDeletes: index now " + segString());
+
+    MergePolicy.MergeSpecification spec;
+
+    synchronized(this) {
+      spec = mergePolicy.findMergesToExpungeDeletes(segmentInfos, this);
+      if (spec != null) {
+        final int numMerges = spec.merges.size();
+        for(int i=0;i<numMerges;i++)
+          registerMerge((MergePolicy.OneMerge) spec.merges.get(i));
+      }
+    }
+
+    mergeScheduler.merge(this);
+
+    if (spec != null && doWait) {
+      final int numMerges = spec.merges.size();
+      synchronized(this) {
+        boolean running = true;
+        while(running) {
+
+          running = false;
+          for(int i=0;i<numMerges;i++) {
+            final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) spec.merges.get(i);
+            if (pendingMerges.contains(merge) || runningMerges.contains(merge))
+              running = true;
+            Throwable t = merge.getException();
+            if (t != null) {
+              IOException ioe = new IOException("background merge hit exception: " + merge.segString(directory));
+              ioe.initCause(t);
+              throw ioe;
+            }
+          }
+
+          if (running) {
+            try {
+              wait();
+            } catch (InterruptedException ie) {
+              Thread.currentThread().interrupt();
+            }
+          }
+        }
+      }
+    }
+
+    // NOTE: in the ConcurrentMergeScheduler case, when
+    // doWait is false, we can return immediately while
+    // background threads carry out the expunge merges
+  }
+
+
+  /** Expunges all deletes from the index.  When an index
+   *  has many document deletions (or updates to existing
+   *  documents), it's best to either call optimize or
+   *  expungeDeletes to remove all unused data in the index
+   *  associated with the deleted documents.  To see how
+   *  many deletions you have pending in your index, compute
+   *  {@link IndexReader#maxDoc} - {@link IndexReader#numDocs}.
+   *  This saves disk space and memory usage while
+   *  searching.  expungeDeletes should be somewhat faster
+   *  than optimize since it does not insist on reducing the
+   *  index to a single segment (though, this depends on the
+   *  {@link MergePolicy}; see {@link
+   *  MergePolicy#findMergesToExpungeDeletes}.). Note that
+   *  this call does not first commit any buffered
+   *  documents, so you must do so yourself if necessary.
+   *  See also {@link #expungeDeletes(boolean)}*/
+  public void expungeDeletes() throws CorruptIndexException, IOException {
+    expungeDeletes(true);
   }
 
   /**

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java Mon Feb 11 12:34:49 2008
@@ -245,6 +245,54 @@
     return spec;
   }
 
+  /**
+   * Finds merges necessary to expunge all deletes from the
+   * index.  We simply merge adjacent segments that have
+   * deletes, up to mergeFactor at a time.
+   */ 
+  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
+                                                       IndexWriter writer)
+    throws CorruptIndexException, IOException
+  {
+    this.writer = writer;
+
+    final int numSegments = segmentInfos.size();
+
+    message("findMergesToExpungeDeletes: " + numSegments + " segments");
+
+    MergeSpecification spec = new MergeSpecification();
+    int firstSegmentWithDeletions = -1;
+    for(int i=0;i<numSegments;i++) {
+      final SegmentInfo info = segmentInfos.info(i);
+      if (info.hasDeletions()) {
+        message("  segment " + info.name + " has deletions");
+        if (firstSegmentWithDeletions == -1)
+          firstSegmentWithDeletions = i;
+        else if (i - firstSegmentWithDeletions == mergeFactor) {
+          // We've seen mergeFactor segments in a row with
+          // deletions, so force a merge now:
+          message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
+          spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+          firstSegmentWithDeletions = i;
+        }
+      } else if (firstSegmentWithDeletions != -1) {
+        // End of a sequence of segments with deletions, so,
+        // merge those past segments even if it's fewer than
+        // mergeFactor segments
+        message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
+        spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+        firstSegmentWithDeletions = -1;
+      }
+    }
+
+    if (firstSegmentWithDeletions != -1) {
+      message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
+      spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments), useCompoundFile));
+    }
+
+    return spec;
+  }
+
   /** Checks if any merges are now necessary and returns a
    *  {@link MergePolicy.MergeSpecification} if so.  A merge
    *  is necessary when there are more than {@link

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java Mon Feb 11 12:34:49 2008
@@ -50,6 +50,8 @@
  * 
  * <p>The default MergePolicy is {@link
  * LogByteSizeMergePolicy}.</p>
+ * <p><b>NOTE:</b> This API is new and still experimental
+ * (subject to change suddenly in the next release)</p>
  */
 
 public abstract class MergePolicy {
@@ -209,7 +211,7 @@
     throws CorruptIndexException, IOException;
 
   /**
-   * Determine what set of merge operations are necessary in
+   * Determine what set of merge operations is necessary in
    * order to optimize the index.  The IndexWriter calls
    * this when its optimize() method is called.  This call
    * is always synchronized on the IndexWriter instance so
@@ -228,6 +230,19 @@
                                                     int maxSegmentCount,
                                                     Set segmentsToOptimize)
     throws CorruptIndexException, IOException;
+
+  /**
+   * Determine what set of merge operations is necessary in
+   * order to expunge all deletes from the index.
+   * @param segmentInfos the total set of segments in the index
+   * @param writer IndexWriter instance
+   */
+  MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
+                                                 IndexWriter writer)
+    throws CorruptIndexException, IOException
+  {
+    throw new RuntimeException("not implemented");
+  }
 
   /**
    * Release all resources for the policy.

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MergeScheduler.java?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MergeScheduler.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MergeScheduler.java Mon Feb 11 12:34:49 2008
@@ -19,10 +19,13 @@
 
 import java.io.IOException;
 
-/** Expert: {@link IndexWriter} uses an instance
+/** <p>Expert: {@link IndexWriter} uses an instance
  *  implementing this interface to execute the merges
  *  selected by a {@link MergePolicy}.  The default
- *  MergeScheduler is {@link ConcurrentMergeScheduler}. */
+ *  MergeScheduler is {@link ConcurrentMergeScheduler}.</p>
+ * <p><b>NOTE:</b> This API is new and still experimental
+ * (subject to change suddenly in the next release)</p>
+*/
 
 public abstract class MergeScheduler {
 

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=620604&r1=620603&r2=620604&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Mon Feb 11 12:34:49 2008
@@ -2977,4 +2977,140 @@
     reader.close();
     dir.close();
   }
+
+  // LUCENE-325: test expungeDeletes, when 2 singular merges
+  // are required
+  public void testExpungeDeletes() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<10;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(10, ir.maxDoc());
+    assertEquals(10, ir.numDocs());
+    ir.deleteDocument(0);
+    ir.deleteDocument(7);
+    assertEquals(8, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    writer.expungeDeletes();
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(8, ir.maxDoc());
+    assertEquals(8, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
+
+  // LUCENE-325: test expungeDeletes, when many adjacent merges are required
+  public void testExpungeDeletes2() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setMergeFactor(50);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<98;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(98, ir.maxDoc());
+    assertEquals(98, ir.numDocs());
+    for(int i=0;i<98;i+=2)
+      ir.deleteDocument(i);
+    assertEquals(49, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMergeFactor(3);
+    writer.expungeDeletes();
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(49, ir.maxDoc());
+    assertEquals(49, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
+
+  // LUCENE-325: test expungeDeletes without waiting, when
+  // many adjacent merges are required
+  public void testExpungeDeletes3() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setMergeFactor(50);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<98;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(98, ir.maxDoc());
+    assertEquals(98, ir.numDocs());
+    for(int i=0;i<98;i+=2)
+      ir.deleteDocument(i);
+    assertEquals(49, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    // Force many merges to happen
+    writer.setMergeFactor(3);
+    writer.expungeDeletes(false);
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(49, ir.maxDoc());
+    assertEquals(49, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
 }



Mime
View raw message