lucene-java-commits mailing list archives

From mikemcc...@apache.org
Subject svn commit: r502191 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/index/IndexWriter.java src/java/org/apache/lucene/index/NewIndexModifier.java src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java
Date Thu, 01 Feb 2007 10:55:13 GMT
Author: mikemccand
Date: Thu Feb  1 02:55:12 2007
New Revision: 502191

URL: http://svn.apache.org/viewvc?view=rev&rev=502191
Log:
LUCENE-565: added NewIndexModifier (IndexWriter subclass) to more efficiently handle updating documents (delete then add)

Added:
    lucene/java/trunk/src/java/org/apache/lucene/index/NewIndexModifier.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=502191&r1=502190&r2=502191
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Feb  1 02:55:12 2007
@@ -102,6 +102,13 @@
     their passing unit tests.
     (Otis Gospodnetic)
 
+13. LUCENE-565: Added NewIndexModifier (subclass of IndexWriter) to
+    more efficiently handle updating documents (the "delete then add"
+    use case).  This is intended to be an eventual replacement for the
+    existing IndexModifier.  Added IndexWriter.flush() (renamed from
+    flushRamSegments()) to flush all pending updates (held in RAM) to
+    the Directory.  (Ning Li via Mike McCandless)
+
 API Changes
 
  1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow

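For context, a minimal usage sketch of the new "delete then add" update path, assuming a RAMDirectory; the class name UpdateExample and the field names and values are illustrative, not taken from the patch. updateDocument() buffers a delete for the given term together with the new document, so the caller no longer has to close the writer, delete through an IndexReader, and reopen:

    import java.io.IOException;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.NewIndexModifier;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.RAMDirectory;

    public class UpdateExample {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();
        NewIndexModifier modifier =
            new NewIndexModifier(dir, new WhitespaceAnalyzer(), true);

        // initial version of the document
        Document doc = new Document();
        doc.add(new Field("id", "42", Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("city", "Amsterdam", Field.Store.YES, Field.Index.TOKENIZED));
        modifier.addDocument(doc);

        // "delete then add" in one call: buffers a delete on id:42 plus the new document
        Document newDoc = new Document();
        newDoc.add(new Field("id", "42", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("city", "Venice", Field.Store.YES, Field.Index.TOKENIZED));
        modifier.updateDocument(new Term("id", "42"), newDoc);

        modifier.flush();   // push all buffered adds and deletes to the Directory
        modifier.close();
      }
    }
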
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?view=diff&rev=502191&r1=502190&r2=502191
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Thu Feb  1 02:55:12 2007
@@ -108,8 +108,8 @@
   private HashSet protectedSegments; // segment names that should not be deleted until commit
   private SegmentInfos rollbackSegmentInfos;      // segmentInfos we will fallback to if the commit fails
 
-  private SegmentInfos segmentInfos = new SegmentInfos();       // the segments
-  private SegmentInfos ramSegmentInfos = new SegmentInfos();    // the segments in ramDirectory
+  protected SegmentInfos segmentInfos = new SegmentInfos();       // the segments
+  protected SegmentInfos ramSegmentInfos = new SegmentInfos();    // the segments in ramDirectory
   private final RAMDirectory ramDirectory = new RAMDirectory(); // for temp segs
   private IndexFileDeleter deleter;
 
@@ -125,6 +125,10 @@
 
   private boolean closeDir;
 
+  protected IndexFileDeleter getDeleter() {
+    return deleter;
+  }
+
   /** Get the current setting of whether to use the compound file format.
    *  Note that this just returns the value you set with setUseCompoundFile(boolean)
    *  or the default. You cannot use this to query the status of an existing index.
@@ -642,23 +646,28 @@
    * flushing/merging temporary free space requirements.</p>
    */
   public void addDocument(Document doc, Analyzer analyzer) throws IOException {
-    DocumentWriter dw =
-      new DocumentWriter(ramDirectory, analyzer, this);
-    dw.setInfoStream(infoStream);
-    String segmentName = newRAMSegmentName();
-    dw.addDocument(segmentName, doc);
+    SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
     synchronized (this) {
-      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
+      ramSegmentInfos.addElement(newSegmentInfo);
       maybeFlushRamSegments();
     }
   }
 
+  final SegmentInfo buildSingleDocSegment(Document doc, Analyzer analyzer)
+      throws IOException {
+    DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, this);
+    dw.setInfoStream(infoStream);
+    String segmentName = newRAMSegmentName();
+    dw.addDocument(segmentName, doc);
+    return new SegmentInfo(segmentName, 1, ramDirectory, false, false);
+  }
+
   // for test purpose
   final synchronized int getRAMSegmentCount() {
     return ramSegmentInfos.size();
   }
 
-  private final synchronized String newRAMSegmentName() {
+  final synchronized String newRAMSegmentName() {
     return "_ram_" + Integer.toString(ramSegmentInfos.counter++, Character.MAX_RADIX);
   }
 
@@ -676,7 +685,7 @@
     }
   }
 
-  private final synchronized String newSegmentName() {
+  final synchronized String newSegmentName() {
     return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
   }
 
@@ -1219,20 +1228,46 @@
   //         counts x and y, then f(x) >= f(y).
   //      2: The number of committed segments on the same level (f(n)) <= M.
 
-  private final void maybeFlushRamSegments() throws IOException {
-    if (ramSegmentInfos.size() >= minMergeDocs) {
+  protected boolean timeToFlushRam() {
+    return ramSegmentInfos.size() >= minMergeDocs;
+  }
+
+  protected boolean anythingToFlushRam() {
+    return ramSegmentInfos.size() > 0;
+  }
+
+  // true if there are only buffered inserts and no buffered deletes
+  protected boolean onlyRamDocsToFlush() {
+    return true;
+  }
+
+  // flushedRamSegments indicates whether the latest segment is the flushed merge of ram segments
+  protected void doAfterFlushRamSegments(boolean flushedRamSegments)
+      throws IOException {
+  }
+
+  protected final void maybeFlushRamSegments() throws IOException {
+    if (timeToFlushRam()) {
       flushRamSegments();
     }
   }
 
   /** Expert:  Flushes all RAM-resident segments (buffered documents), then may merge segments. */
-  public final synchronized void flushRamSegments() throws IOException {
-    if (ramSegmentInfos.size() > 0) {
+  private final synchronized void flushRamSegments() throws IOException {
+    if (anythingToFlushRam()) {
       mergeSegments(ramSegmentInfos, 0, ramSegmentInfos.size());
       maybeMergeSegments(minMergeDocs);
     }
   }
 
+  /**
+   * Flush all in-memory buffered updates to the Directory.
+   * @throws IOException
+   */
+  public final synchronized void flush() throws IOException {
+    flushRamSegments();
+  }
+
   /** Expert:  Return the total size of all index files currently cached in memory.
    * Useful for size management with flushRamDocs()
    */
@@ -1315,10 +1350,10 @@
   private final int mergeSegments(SegmentInfos sourceSegments, int minSegment, int end)
     throws IOException {
 
+    boolean mergeFlag = end > 0;
     final String mergedName = newSegmentName();
-    if (infoStream != null) infoStream.print("merging segments");
-    SegmentMerger merger = new SegmentMerger(this, mergedName);
-    
+    SegmentMerger merger = null;
+
     final Vector segmentsToDelete = new Vector();
 
     String segmentsInfosFileName = segmentInfos.getCurrentSegmentFileName();
@@ -1326,21 +1361,26 @@
 
     SegmentInfo newSegment = null;
 
-    int mergedDocCount;
+    int mergedDocCount = 0;
 
     // This is try/finally to make sure merger's readers are closed:
     try {
 
+     if (mergeFlag) {
+      if (infoStream != null) infoStream.print("merging segments");
+      merger = new SegmentMerger(this, mergedName);
+
       for (int i = minSegment; i < end; i++) {
         SegmentInfo si = sourceSegments.info(i);
         if (infoStream != null)
           infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
-        IndexReader reader = SegmentReader.get(si);
+        IndexReader reader = SegmentReader.get(si); // no need to set deleter (yet)
         merger.add(reader);
         if ((reader.directory() == this.directory) || // if we own the directory
             (reader.directory() == this.ramDirectory))
           segmentsToDelete.addElement(reader);   // queue segment for deletion
       }
+     }
 
       SegmentInfos rollback = null;
       boolean success = false;
@@ -1349,6 +1389,7 @@
       // if we hit exception when doing the merge:
       try {
 
+       if (mergeFlag) {
         mergedDocCount = merger.merge();
 
         if (infoStream != null) {
@@ -1357,23 +1398,32 @@
 
         newSegment = new SegmentInfo(mergedName, mergedDocCount,
                                      directory, false, true);
+       }
 
+        if (!inTransaction
+            && (sourceSegments != ramSegmentInfos || !onlyRamDocsToFlush())) {
+          // Now save the SegmentInfo instances that
+          // we are replacing:
+          rollback = (SegmentInfos) segmentInfos.clone();
+        }
 
+       if (mergeFlag) {
         if (sourceSegments == ramSegmentInfos) {
           segmentInfos.addElement(newSegment);
         } else {
-
-          if (!inTransaction) {
-            // Now save the SegmentInfo instances that
-            // we are replacing:
-            rollback = (SegmentInfos) segmentInfos.clone();
-          }
-
           for (int i = end-1; i > minSegment; i--)     // remove old infos & add new
             sourceSegments.remove(i);
 
           segmentInfos.set(minSegment, newSegment);
         }
+       }
+
+        if (sourceSegments == ramSegmentInfos) {
+          // Should not be necessary: no prior commit should
+          // have left pending files, so just defensive:
+          deleter.clearPendingFiles();
+          doAfterFlushRamSegments(mergeFlag);
+        }
 
         if (!inTransaction) {
           segmentInfos.write(directory);     // commit before deleting
@@ -1396,7 +1446,7 @@
 
           // Must rollback so our state matches index:
 
-          if (sourceSegments == ramSegmentInfos) {
+          if (sourceSegments == ramSegmentInfos && onlyRamDocsToFlush()) {
             // Simple case: newSegment may or may not have
             // been added to the end of our segment infos,
             // so just check & remove if so:
@@ -1414,6 +1464,10 @@
             segmentInfos.addAll(rollback);
           }
 
+          // Erase any pending files that we were going to delete:
+          // i.e. old del files added by SegmentReader.doCommit() 
+          deleter.clearPendingFiles();
+
           // Delete any partially created files:
           deleter.deleteFile(nextSegmentsFileName);
           deleter.findDeletableFiles();
@@ -1422,18 +1476,21 @@
       }
     } finally {
       // close readers before we attempt to delete now-obsolete segments
-      merger.closeReaders();
+      if (mergeFlag) merger.closeReaders();
     }
 
     if (!inTransaction) {
+      // Attempt to delete all files we just obsoleted:
       deleter.deleteFile(segmentsInfosFileName);    // delete old segments_N file
       deleter.deleteSegments(segmentsToDelete);     // delete now-unused segments
+      // including the old del files
+      deleter.commitPendingFiles();
     } else {
       deleter.addPendingFile(segmentsInfosFileName);    // delete old segments_N file
       deleter.deleteSegments(segmentsToDelete, protectedSegments);     // delete now-unused segments
     }
 
-    if (useCompoundFile) {
+    if (useCompoundFile && mergeFlag) {
 
       segmentsInfosFileName = nextSegmentsFileName;
       nextSegmentsFileName = segmentInfos.getNextSegmentFileName();

Added: lucene/java/trunk/src/java/org/apache/lucene/index/NewIndexModifier.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NewIndexModifier.java?view=auto&rev=502191
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NewIndexModifier.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NewIndexModifier.java Thu Feb  1 02:55:12 2007
@@ -0,0 +1,294 @@
+package org.apache.lucene.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+/**
+ * NewIndexModifier extends {@link IndexWriter} so that you can not only insert
+ * documents but also delete documents through a single interface. Internally,
+ * inserts and deletes are buffered before they are flushed to disk.
+ * <p>
+ * Design Overview
+ * <p>
+ * The deleteDocuments() method works by buffering the terms to be deleted. Deletes
+ * are deferred until RAM is flushed to disk, either because enough new documents or
+ * delete terms are buffered, or because close() or flush() is called. Using
+ * Java synchronization, care is taken to ensure that an interleaved sequence of
+ * inserts and deletes for the same document is properly serialized.
+ */
+
+public class NewIndexModifier extends IndexWriter {
+  // number of ram segments a delete term applies to
+  private class Num {
+    private int num;
+
+    Num(int num) {
+      this.num = num;
+    }
+
+    int getNum() {
+      return num;
+    }
+
+    void setNum(int num) {
+      this.num = num;
+    }
+  }
+
+  /**
+   * Default value is 10. Change using {@link #setMaxBufferedDeleteTerms(int)}.
+   */
+  public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = 10;
+  // the max number of delete terms that can be buffered before
+  // they must be flushed to disk
+  private int maxBufferedDeleteTerms = DEFAULT_MAX_BUFFERED_DELETE_TERMS;
+
+  // to buffer delete terms in ram before they are applied
+  // key is delete term, value is number of ram segments the term applies to
+  private HashMap bufferedDeleteTerms = new HashMap();
+  private int numBufferedDeleteTerms = 0;
+
+  /**
+   * @see IndexWriter#IndexWriter(String, Analyzer, boolean)
+   */
+  public NewIndexModifier(String path, Analyzer a, boolean create)
+      throws IOException {
+    super(path, a, create);
+  }
+
+  /**
+   * @see IndexWriter#IndexWriter(File, Analyzer, boolean)
+   */
+  public NewIndexModifier(File path, Analyzer a, boolean create)
+      throws IOException {
+    super(path, a, create);
+  }
+
+  /**
+   * @see IndexWriter#IndexWriter(Directory, Analyzer, boolean)
+   */
+  public NewIndexModifier(Directory d, Analyzer a, boolean create)
+      throws IOException {
+    super(d, a, create);
+  }
+
+  /**
+   * @see IndexWriter#IndexWriter(String, Analyzer)
+   */
+  public NewIndexModifier(String path, Analyzer a) throws IOException {
+    super(path, a);
+  }
+
+  /**
+   * @see IndexWriter#IndexWriter(File, Analyzer)
+   */
+  public NewIndexModifier(File path, Analyzer a) throws IOException {
+    super(path, a);
+  }
+
+  /**
+   * @see IndexWriter#IndexWriter(Directory, Analyzer)
+   */
+  public NewIndexModifier(Directory d, Analyzer a) throws IOException {
+    super(d, a);
+  }
+
+  /**
+   * Determines the minimal number of delete terms required before the buffered
+   * in-memory delete terms are applied and flushed. If there are documents
+   * buffered in memory at the time, they are merged and a new Segment is
+   * created. The delete terms are applied appropriately.
+   * <p>
+   * The default value is 10.
+   * @throws IllegalArgumentException if maxBufferedDeleteTerms is smaller than
+   *         1
+   */
+  public void setMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) {
+    if (maxBufferedDeleteTerms < 1)
+      throw new IllegalArgumentException("maxBufferedDeleteTerms must be at least 1");
+    this.maxBufferedDeleteTerms = maxBufferedDeleteTerms;
+  }
+
+  /**
+   * @see #setMaxBufferedDeleteTerms
+   */
+  public int getMaxBufferedDeleteTerms() {
+    return maxBufferedDeleteTerms;
+  }
+
+  // for test purpose
+  final synchronized int getBufferedDeleteTermsSize() {
+    return bufferedDeleteTerms.size();
+  }
+
+  // for test purpose
+  final synchronized int getNumBufferedDeleteTerms() {
+    return numBufferedDeleteTerms;
+  }
+
+  /**
+   * Updates a document by first deleting all documents containing
+   * <code>term</code> and then adding the new document.
+   */
+  public void updateDocument(Term term, Document doc) throws IOException {
+    updateDocument(term, doc, getAnalyzer());
+  }
+
+  /**
+   * Updates a document by first deleting all documents containing
+   * <code>term</code> and then adding the new document.
+   */
+  public void updateDocument(Term term, Document doc, Analyzer analyzer)
+      throws IOException {
+    SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
+    synchronized (this) {
+      bufferDeleteTerm(term);
+      ramSegmentInfos.addElement(newSegmentInfo);
+      maybeFlushRamSegments();
+    }
+  }
+
+  /**
+   * Deletes all documents containing <code>term</code>.
+   */
+  public synchronized void deleteDocuments(Term term) throws IOException {
+    bufferDeleteTerm(term);
+    maybeFlushRamSegments();
+  }
+
+  /**
+   * Deletes all documents containing any of the terms. All deletes are flushed
+   * at the same time.
+   */
+  public synchronized void deleteDocuments(Term[] terms) throws IOException {
+    for (int i = 0; i < terms.length; i++) {
+      bufferDeleteTerm(terms[i]);
+    }
+    maybeFlushRamSegments();
+  }
+
+  // buffer a term in bufferedDeleteTerms. bufferedDeleteTerms also records
+  // the current number of documents buffered in ram so that the delete term
+  // will be applied to those ram segments as well as the disk segments
+  private void bufferDeleteTerm(Term term) {
+    Num num = (Num)bufferedDeleteTerms.get(term);
+    if (num == null) {
+      bufferedDeleteTerms.put(term, new Num(getRAMSegmentCount()));
+    } else {
+      num.setNum(getRAMSegmentCount());
+    }
+    numBufferedDeleteTerms++;
+  }
+
+  // a flush is triggered if enough new documents are buffered or
+  // if enough delete terms are buffered
+  protected boolean timeToFlushRam() {
+    return super.timeToFlushRam()
+        || numBufferedDeleteTerms >= maxBufferedDeleteTerms;
+  }
+
+  protected boolean anythingToFlushRam() {
+    return super.anythingToFlushRam() || bufferedDeleteTerms.size() > 0;
+  }
+
+  protected boolean onlyRamDocsToFlush() {
+    return super.onlyRamDocsToFlush() && bufferedDeleteTerms.size() == 0;
+  }
+
+  protected void doAfterFlushRamSegments(boolean flushedRamSegments)
+      throws IOException {
+    if (bufferedDeleteTerms.size() > 0) {
+      if (getInfoStream() != null)
+        getInfoStream().println(
+            "flush " + numBufferedDeleteTerms + " buffered terms on "
+                + segmentInfos.size() + " segments.");
+
+      if (flushedRamSegments) {
+        IndexReader reader = null;
+        try {
+          reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1));
+          reader.setDeleter(getDeleter());
+
+          // apply delete terms to the segment just flushed from ram
+          // apply appropriately so that a delete term is only applied to
+          // the documents buffered before it, not those buffered after it
+          applyDeletesSelectively(bufferedDeleteTerms, reader);
+        } finally {
+          if (reader != null)
+            reader.close();
+        }
+      }
+
+      int infosEnd = segmentInfos.size();
+      if (flushedRamSegments) {
+        infosEnd--;
+      }
+
+      for (int i = 0; i < infosEnd; i++) {
+        IndexReader reader = null;
+        try {
+          reader = SegmentReader.get(segmentInfos.info(i));
+          reader.setDeleter(getDeleter());
+
+          // apply delete terms to disk segments
+          // except the one just flushed from ram
+          applyDeletes(bufferedDeleteTerms, reader);
+        } finally {
+          if (reader != null)
+            reader.close();
+        }
+      }
+
+      // clean up bufferedDeleteTerms
+      bufferedDeleteTerms.clear();
+      numBufferedDeleteTerms = 0;
+    }
+  }
+
+  // apply buffered delete terms to the segment just flushed from ram
+  // apply appropriately so that a delete term is only applied to
+  // the documents buffered before it, not those buffered after it
+  private final void applyDeletesSelectively(HashMap deleteTerms,
+      IndexReader reader) throws IOException {
+    Iterator iter = deleteTerms.entrySet().iterator();
+    while (iter.hasNext()) {
+      Entry entry = (Entry)iter.next();
+      Term term = (Term)entry.getKey();
+
+      TermDocs docs = reader.termDocs(term);
+      if (docs != null) {
+        int num = ((Num)entry.getValue()).getNum();
+        try {
+          while (docs.next()) {
+            int doc = docs.doc();
+            if (doc >= num) {
+              break;
+            }
+            reader.deleteDocument(doc);
+          }
+        } finally {
+          docs.close();
+        }
+      }
+    }
+  }
+
+  // apply buffered delete terms to disk segments
+  // except the one just flushed from ram
+  private final void applyDeletes(HashMap deleteTerms, IndexReader reader)
+      throws IOException {
+    Iterator iter = deleteTerms.entrySet().iterator();
+    while (iter.hasNext()) {
+      Entry entry = (Entry)iter.next();
+      Term term = (Term)entry.getKey();
+      reader.deleteDocuments(term);
+    }
+  }
+}

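
For each buffered delete term, NewIndexModifier records how many RAM segments (i.e., buffered documents) existed when the term was buffered, and applyDeletesSelectively() later deletes only documents buffered before that point. A small sketch of the resulting interleaving semantics, reusing the dir, analyzer, and field-name assumptions from the earlier example:

    NewIndexModifier modifier =
        new NewIndexModifier(dir, new WhitespaceAnalyzer(), true);

    Document first = new Document();
    first.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
    modifier.addDocument(first);                    // buffered before the delete term

    modifier.deleteDocuments(new Term("id", "1"));  // buffered delete on id:1

    Document second = new Document();
    second.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
    modifier.addDocument(second);                   // buffered after the delete term

    modifier.flush();   // merge the RAM segments, then apply the buffered deletes
    modifier.close();

    // Only "second" survives: the delete term id:1 is applied selectively to the
    // document buffered before it, not to the one buffered after it.
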
Added: lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java?view=auto&rev=502191
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestNewIndexModifierDelete.java Thu Feb  1 02:55:12 2007
@@ -0,0 +1,446 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestNewIndexModifierDelete extends TestCase {
+
+  // test the simple case
+  public void testSimpleCase() throws IOException {
+    String[] keywords = { "1", "2" };
+    String[] unindexed = { "Netherlands", "Italy" };
+    String[] unstored = { "Amsterdam has lots of bridges",
+        "Venice has lots of canals" };
+    String[] text = { "Amsterdam", "Venice" };
+
+    Directory dir = new RAMDirectory();
+    NewIndexModifier modifier = new NewIndexModifier(dir,
+        new WhitespaceAnalyzer(), true);
+    modifier.setUseCompoundFile(true);
+    modifier.setMaxBufferedDeleteTerms(1);
+
+    for (int i = 0; i < keywords.length; i++) {
+      Document doc = new Document();
+      doc.add(new Field("id", keywords[i], Field.Store.YES,
+          Field.Index.UN_TOKENIZED));
+      doc.add(new Field("country", unindexed[i], Field.Store.YES,
+          Field.Index.NO));
+      doc.add(new Field("contents", unstored[i], Field.Store.NO,
+          Field.Index.TOKENIZED));
+      doc
+          .add(new Field("city", text[i], Field.Store.YES,
+              Field.Index.TOKENIZED));
+      modifier.addDocument(doc);
+    }
+    modifier.optimize();
+
+    Term term = new Term("city", "Amsterdam");
+    int hitCount = getHitCount(dir, term);
+    assertEquals(1, hitCount);
+    modifier.deleteDocuments(term);
+    hitCount = getHitCount(dir, term);
+    assertEquals(0, hitCount);
+
+    modifier.close();
+  }
+
+  // test when delete terms only apply to disk segments
+  public void testNonRAMDelete() throws IOException {
+    Directory dir = new RAMDirectory();
+    NewIndexModifier modifier = new NewIndexModifier(dir,
+        new WhitespaceAnalyzer(), true);
+    modifier.setMaxBufferedDocs(2);
+    modifier.setMaxBufferedDeleteTerms(2);
+
+    int id = 0;
+    int value = 100;
+
+    for (int i = 0; i < 7; i++) {
+      addDoc(modifier, ++id, value);
+    }
+    modifier.flush();
+
+    assertEquals(0, modifier.getRAMSegmentCount());
+    assertTrue(0 < modifier.getSegmentCount());
+
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals(7, reader.numDocs());
+    reader.close();
+
+    modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+    modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+
+    reader = IndexReader.open(dir);
+    assertEquals(0, reader.numDocs());
+    reader.close();
+
+    modifier.close();
+  }
+
+  // test when delete terms only apply to ram segments
+  public void testRAMDeletes() throws IOException {
+    Directory dir = new RAMDirectory();
+    NewIndexModifier modifier = new NewIndexModifier(dir,
+        new WhitespaceAnalyzer(), true);
+    modifier.setMaxBufferedDocs(4);
+    modifier.setMaxBufferedDeleteTerms(4);
+
+    int id = 0;
+    int value = 100;
+
+    addDoc(modifier, ++id, value);
+    modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+    addDoc(modifier, ++id, value);
+    modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+
+    assertEquals(2, modifier.getNumBufferedDeleteTerms());
+    assertEquals(1, modifier.getBufferedDeleteTermsSize());
+
+    addDoc(modifier, ++id, value);
+    assertEquals(0, modifier.getSegmentCount());
+    modifier.flush();
+
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals(1, reader.numDocs());
+
+    int hitCount = getHitCount(dir, new Term("id", String.valueOf(id)));
+    assertEquals(1, hitCount);
+    reader.close();
+
+    modifier.close();
+  }
+
+  // test when delete terms apply to both disk and ram segments
+  public void testBothDeletes() throws IOException {
+    Directory dir = new RAMDirectory();
+    NewIndexModifier modifier = new NewIndexModifier(dir,
+        new WhitespaceAnalyzer(), true);
+    modifier.setMaxBufferedDocs(100);
+    modifier.setMaxBufferedDeleteTerms(100);
+
+    int id = 0;
+    int value = 100;
+
+    for (int i = 0; i < 5; i++) {
+      addDoc(modifier, ++id, value);
+    }
+
+    value = 200;
+    for (int i = 0; i < 5; i++) {
+      addDoc(modifier, ++id, value);
+    }
+    modifier.flush();
+
+    for (int i = 0; i < 5; i++) {
+      addDoc(modifier, ++id, value);
+    }
+    modifier.deleteDocuments(new Term("value", String.valueOf(value)));
+    modifier.flush();
+
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals(5, reader.numDocs());
+
+    modifier.close();
+  }
+
+  // test that batched delete terms are flushed together
+  public void testBatchDeletes() throws IOException {
+    Directory dir = new RAMDirectory();
+    NewIndexModifier modifier = new NewIndexModifier(dir,
+        new WhitespaceAnalyzer(), true);
+    modifier.setMaxBufferedDocs(2);
+    modifier.setMaxBufferedDeleteTerms(2);
+
+    int id = 0;
+    int value = 100;
+
+    for (int i = 0; i < 7; i++) {
+      addDoc(modifier, ++id, value);
+    }
+    modifier.flush();
+
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals(7, reader.numDocs());
+    reader.close();
+
+    id = 0;
+    modifier.deleteDocuments(new Term("id", String.valueOf(++id)));
+    modifier.deleteDocuments(new Term("id", String.valueOf(++id)));
+
+    reader = IndexReader.open(dir);
+    assertEquals(5, reader.numDocs());
+    reader.close();
+
+    Term[] terms = new Term[3];
+    for (int i = 0; i < terms.length; i++) {
+      terms[i] = new Term("id", String.valueOf(++id));
+    }
+    modifier.deleteDocuments(terms);
+
+    reader = IndexReader.open(dir);
+    assertEquals(2, reader.numDocs());
+    reader.close();
+
+    modifier.close();
+  }
+
+  private void addDoc(NewIndexModifier modifier, int id, int value)
+      throws IOException {
+    Document doc = new Document();
+    doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
+    doc.add(new Field("id", String.valueOf(id), Field.Store.YES,
+        Field.Index.UN_TOKENIZED));
+    doc.add(new Field("value", String.valueOf(value), Field.Store.NO,
+        Field.Index.UN_TOKENIZED));
+    modifier.addDocument(doc);
+  }
+
+  private int getHitCount(Directory dir, Term term) throws IOException {
+    IndexSearcher searcher = new IndexSearcher(dir);
+    int hitCount = searcher.search(new TermQuery(term)).length();
+    searcher.close();
+    return hitCount;
+  }
+
+  public void testDeletesOnDiskFull() throws IOException {
+    testOperationsOnDiskFull(false);
+  }
+
+  public void testUpdatesOnDiskFull() throws IOException {
+    testOperationsOnDiskFull(true);
+  }
+
+  /**
+   * Make sure if modifier tries to commit but hits disk full that modifier
+   * remains consistent and usable. Similar to TestIndexReader.testDiskFull().
+   */
+  private void testOperationsOnDiskFull(boolean updates) throws IOException {
+
+    boolean debug = false;
+    Term searchTerm = new Term("content", "aaa");
+    int START_COUNT = 157;
+    int END_COUNT = 144;
+
+    // First build up a starting index:
+    RAMDirectory startDir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(startDir, new WhitespaceAnalyzer(),
+        true);
+    for (int i = 0; i < 157; i++) {
+      Document d = new Document();
+      d.add(new Field("id", Integer.toString(i), Field.Store.YES,
+          Field.Index.UN_TOKENIZED));
+      d.add(new Field("content", "aaa " + i, Field.Store.NO,
+          Field.Index.TOKENIZED));
+      writer.addDocument(d);
+    }
+    writer.close();
+
+    long diskUsage = startDir.sizeInBytes();
+    long diskFree = diskUsage + 10;
+
+    IOException err = null;
+
+    boolean done = false;
+
+    // Iterate w/ ever increasing free disk space:
+    while (!done) {
+      MockRAMDirectory dir = new MockRAMDirectory(startDir);
+      NewIndexModifier modifier = new NewIndexModifier(dir,
+          new WhitespaceAnalyzer(), false);
+
+      modifier.setMaxBufferedDocs(1000); // use flush or close
+      modifier.setMaxBufferedDeleteTerms(1000); // use flush or close
+
+      // For each disk size, first try to commit against
+      // dir that will hit random IOExceptions & disk
+      // full; after, give it infinite disk space & turn
+      // off random IOExceptions & retry w/ same reader:
+      boolean success = false;
+
+      for (int x = 0; x < 2; x++) {
+
+        double rate = 0.1;
+        double diskRatio = ((double)diskFree) / diskUsage;
+        long thisDiskFree;
+        String testName;
+
+        if (0 == x) {
+          thisDiskFree = diskFree;
+          if (diskRatio >= 2.0) {
+            rate /= 2;
+          }
+          if (diskRatio >= 4.0) {
+            rate /= 2;
+          }
+          if (diskRatio >= 6.0) {
+            rate = 0.0;
+          }
+          if (debug) {
+            System.out.println("\ncycle: " + diskFree + " bytes");
+          }
+          testName = "disk full during reader.close() @ " + thisDiskFree
+              + " bytes";
+        } else {
+          thisDiskFree = 0;
+          rate = 0.0;
+          if (debug) {
+            System.out.println("\ncycle: same writer: unlimited disk space");
+          }
+          testName = "reader re-use after disk full";
+        }
+
+        dir.setMaxSizeInBytes(thisDiskFree);
+        dir.setRandomIOExceptionRate(rate, diskFree);
+
+        try {
+          if (0 == x) {
+            int docId = 12;
+            for (int i = 0; i < 13; i++) {
+              if (updates) {
+                Document d = new Document();
+                d.add(new Field("id", Integer.toString(i), Field.Store.YES,
+                    Field.Index.UN_TOKENIZED));
+                d.add(new Field("content", "bbb " + i, Field.Store.NO,
+                    Field.Index.TOKENIZED));
+                modifier.updateDocument(
+                    new Term("id", Integer.toString(docId)), d);
+              } else { // deletes
+                modifier
+                    .deleteDocuments(new Term("id", Integer.toString(docId)));
+                // modifier.setNorm(docId, "contents", (float)2.0);
+              }
+              docId += 12;
+            }
+          }
+          modifier.close();
+          success = true;
+          if (0 == x) {
+            done = true;
+          }
+        }
+        catch (IOException e) {
+          if (debug) {
+            System.out.println("  hit IOException: " + e);
+          }
+          err = e;
+          if (1 == x) {
+            e.printStackTrace();
+            fail(testName + " hit IOException after disk space was freed up");
+          }
+        }
+
+        // Whether we succeeded or failed, check that all
+        // un-referenced files were in fact deleted (ie,
+        // we did not create garbage). Just create a
+        // new IndexFileDeleter, have it delete
+        // unreferenced files, then verify that in fact
+        // no files were deleted:
+        String[] startFiles = dir.list();
+        SegmentInfos infos = new SegmentInfos();
+        infos.read(dir);
+        IndexFileDeleter d = new IndexFileDeleter(infos, dir);
+        d.findDeletableFiles();
+        d.deleteFiles();
+        String[] endFiles = dir.list();
+
+        Arrays.sort(startFiles);
+        Arrays.sort(endFiles);
+
+        // for(int i=0;i<startFiles.length;i++) {
+        // System.out.println(" startFiles: " + i + ": " + startFiles[i]);
+        // }
+
+        if (!Arrays.equals(startFiles, endFiles)) {
+          String successStr;
+          if (success) {
+            successStr = "success";
+          } else {
+            successStr = "IOException";
+            err.printStackTrace();
+          }
+          fail("reader.close() failed to delete unreferenced files after "
+              + successStr + " (" + diskFree + " bytes): before delete:\n    "
+              + arrayToString(startFiles) + "\n  after delete:\n    "
+              + arrayToString(endFiles));
+        }
+
+        // Finally, verify index is not corrupt, and, if
+        // we succeeded, we see all docs changed, and if
+        // we failed, we see either all docs or no docs
+        // changed (transactional semantics):
+        IndexReader newReader = null;
+        try {
+          newReader = IndexReader.open(dir);
+        }
+        catch (IOException e) {
+          e.printStackTrace();
+          fail(testName
+              + ":exception when creating IndexReader after disk full during close: "
+              + e);
+        }
+
+        IndexSearcher searcher = new IndexSearcher(newReader);
+        Hits hits = null;
+        try {
+          hits = searcher.search(new TermQuery(searchTerm));
+        }
+        catch (IOException e) {
+          e.printStackTrace();
+          fail(testName + ": exception when searching: " + e);
+        }
+        int result2 = hits.length();
+        if (success) {
+          if (result2 != END_COUNT) {
+            fail(testName
+                + ": method did not throw exception but hits.length for search on term 'aaa' is "
+                + result2 + " instead of expected " + END_COUNT);
+          }
+        } else {
+          // On hitting exception we still may have added
+          // all docs:
+          if (result2 != START_COUNT && result2 != END_COUNT) {
+            err.printStackTrace();
+            fail(testName
+                + ": method did throw exception but hits.length for search on term 'aaa' is "
+                + result2 + " instead of expected " + START_COUNT);
+          }
+        }
+
+        searcher.close();
+        newReader.close();
+
+        if (result2 == END_COUNT) {
+          break;
+        }
+      }
+
+      dir.close();
+
+      // Try again with 10 more bytes of free space:
+      diskFree += 10;
+    }
+  }
+
+  private String arrayToString(String[] l) {
+    String s = "";
+    for (int i = 0; i < l.length; i++) {
+      if (i > 0) {
+        s += "\n    ";
+      }
+      s += l[i];
+    }
+    return s;
+  }
+}


