lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1099830 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/ lucene/backwards/src/test/org/apache/lucene/index/ lucene/src/java/org/apache/lucene/index/ lucene/src/test/org/apache/lucene/index/ solr/
Date Thu, 05 May 2011 14:52:23 GMT
Author: mikemccand
Date: Thu May  5 14:52:22 2011
New Revision: 1099830

URL: http://svn.apache.org/viewvc?rev=1099830&view=rev
Log:
LUCENE-2897: apply delete-by-term on flushed segment while we flush (still buffer delete-by-terms
for past segments) (merged from trunk)

Added:
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
      - copied unchanged from r1065855, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FrozenBufferedDeletes.java
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Thu May  5 14:52:22 2011
@@ -20,6 +20,9 @@ Optimizations
 * LUCENE-2990: ArrayUtil/CollectionUtil.*Sort() methods now exit early
   on empty or one-element lists/arrays.  (Uwe Schindler)
 
+* LUCENE-2897: Apply deleted terms while flushing a segment.  We still
+  buffer deleted terms to later apply to past segments.  (Mike McCandless)
+
 Bug fixes
 
 * LUCENE-2996: addIndexes(IndexReader) did not flush before adding the new 

Modified: lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriter.java Thu May  5 14:52:22 2011
@@ -18,7 +18,6 @@ package org.apache.lucene.index;
  */
 
 import java.io.ByteArrayOutputStream;
-import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Reader;
@@ -2715,7 +2714,7 @@ public class TestIndexWriter extends Luc
           count++;
         }
       }
-      assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 2500);
+      assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting") + " count=" + count, count > 1500);
     }
     w.close();
     dir.close();

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java Thu May  5 14:52:22 2011
@@ -18,21 +18,23 @@ package org.apache.lucene.index;
  */
 
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;
 
-/** Holds buffered deletes, by docID, term or query for a
- *  single segment. This is used to hold buffered pending
- *  deletes against the to-be-flushed segment as well as
- *  per-segment deletes for each segment in the index. */
+/* Holds buffered deletes, by docID, term or query for a
+ * single segment. This is used to hold buffered pending
+ * deletes against the to-be-flushed segment.  Once the
+ * deletes are pushed (on flush in DocumentsWriter), these
+ * deletes are converted to a FrozenDeletes instance. */
 
 // NOTE: we are sync'd by BufferedDeletes, ie, all access to
 // instances of this class is via sync'd methods on
@@ -62,13 +64,8 @@ class BufferedDeletes {
      undercount (say 24 bytes).  Integer is OBJ_HEADER + INT. */
   final static int BYTES_PER_DEL_QUERY = 5*RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2*RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2*RamUsageEstimator.NUM_BYTES_INT + 24;
 
-  // TODO: many of the deletes stored here will map to
-  // Integer.MAX_VALUE; we could be more efficient for this
-  // case ie use a SortedSet not a SortedMap.  But: Java's
-  // SortedSet impls are simply backed by a Map so we won't
-  // save anything unless we do something custom...
   final AtomicInteger numTermDeletes = new AtomicInteger();
-  final SortedMap<Term,Integer> terms = new TreeMap<Term,Integer>();
+  final Map<Term,Integer> terms;
   final Map<Query,Integer> queries = new HashMap<Query,Integer>();
   final List<Integer> docIDs = new ArrayList<Integer>();
 
@@ -80,6 +77,14 @@ class BufferedDeletes {
 
   long gen;
 
+  public BufferedDeletes(boolean sortTerms) {
+    if (sortTerms) {
+      terms = new TreeMap<Term,Integer>();
+    } else {
+      terms = new HashMap<Term,Integer>();
+    }
+  }
+
   @Override
   public String toString() {
     if (VERBOSE_DELETES) {
@@ -129,6 +134,26 @@ class BufferedDeletes {
     // should already be cleared
   }
 
+  void update(FrozenBufferedDeletes in) {
+    numTermDeletes.addAndGet(in.numTermDeletes);
+    for(Term term : in.terms) {
+      if (!terms.containsKey(term)) {
+        // only incr bytesUsed if this term wasn't already buffered:
+        bytesUsed.addAndGet(BYTES_PER_DEL_TERM);
+      }
+      terms.put(term, MAX_INT);
+    }
+
+    for(int queryIdx=0;queryIdx<in.queries.length;queryIdx++) {
+      final Query query = in.queries[queryIdx];
+      if (!queries.containsKey(query)) {
+        // only incr bytesUsed if this query wasn't already buffered:
+        bytesUsed.addAndGet(BYTES_PER_DEL_QUERY);
+      }
+      queries.put(query, MAX_INT);
+    }
+  }
+
   public void addQuery(Query query, int docIDUpto) {
     Integer current = queries.put(query, docIDUpto);
     // increment bytes used only if the query wasn't added so far.
@@ -161,6 +186,43 @@ class BufferedDeletes {
       bytesUsed.addAndGet(BYTES_PER_DEL_TERM + term.text.length() * RamUsageEstimator.NUM_BYTES_CHAR);
     }
   }
+
+  public Iterable<Term> termsIterable() {
+    return new Iterable<Term>() {
+      // @Override -- not until Java 1.6
+      public Iterator<Term> iterator() {
+        return terms.keySet().iterator();
+      }
+    };
+  }
+
+  public Iterable<QueryAndLimit> queriesIterable() {
+    return new Iterable<QueryAndLimit>() {
+      
+      // @Override -- not until Java 1.6
+      public Iterator<QueryAndLimit> iterator() {
+        return new Iterator<QueryAndLimit>() {
+          private final Iterator<Map.Entry<Query,Integer>> iter = queries.entrySet().iterator();
+
+          // @Override -- not until Java 1.6
+          public boolean hasNext() {
+            return iter.hasNext();
+          }
+
+          // @Override -- not until Java 1.6
+          public QueryAndLimit next() {
+            final Map.Entry<Query,Integer> ent = iter.next();
+            return new QueryAndLimit(ent.getKey(), ent.getValue());
+          }
+
+          // @Override -- not until Java 1.6
+          public void remove() {
+            throw new UnsupportedOperationException();
+          }
+        };
+      }
+    };
+  }    
     
   void clear() {
     terms.clear();

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java Thu May  5 14:52:22 2011
@@ -22,7 +22,6 @@ import java.io.PrintStream;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Date;
-import java.util.Map.Entry;
 import java.util.Comparator;
 import java.util.Collections;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -51,7 +50,7 @@ import org.apache.lucene.search.Weight;
 class BufferedDeletesStream {
 
   // TODO: maybe linked list?
-  private final List<BufferedDeletes> deletes = new ArrayList<BufferedDeletes>();
+  private final List<FrozenBufferedDeletes> deletes = new ArrayList<FrozenBufferedDeletes>();
 
   // Starts at 1 so that SegmentInfos that have never had
   // deletes applied (whose bufferedDelGen defaults to 0)
@@ -82,13 +81,13 @@ class BufferedDeletesStream {
 
   // Appends a new packet of buffered deletes to the stream,
   // setting its generation:
-  public synchronized void push(BufferedDeletes packet) {
+  public synchronized void push(FrozenBufferedDeletes packet) {
     assert packet.any();
     assert checkDeleteStats();    
-    packet.gen = nextGen++;
+    assert packet.gen < nextGen;
     deletes.add(packet);
-    numTerms.addAndGet(packet.numTermDeletes.get());
-    bytesUsed.addAndGet(packet.bytesUsed.get());
+    numTerms.addAndGet(packet.numTermDeletes);
+    bytesUsed.addAndGet(packet.bytesUsed);
     if (infoStream != null) {
       message("push deletes " + packet + " delGen=" + packet.gen + " packetCount=" + deletes.size());
     }
@@ -181,14 +180,14 @@ class BufferedDeletesStream {
     while (infosIDX >= 0) {
       //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);
 
-      final BufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
+      final FrozenBufferedDeletes packet = delIDX >= 0 ? deletes.get(delIDX) : null;
       final SegmentInfo info = infos2.get(infosIDX);
       final long segGen = info.getBufferedDeletesGen();
 
       if (packet != null && segGen < packet.gen) {
         //System.out.println("  coalesce");
         if (coalescedDeletes == null) {
-          coalescedDeletes = new BufferedDeletes();
+          coalescedDeletes = new BufferedDeletes(true);
         }
         coalescedDeletes.update(packet);
         delIDX--;
@@ -201,25 +200,25 @@ class BufferedDeletesStream {
         int delCount = 0;
         try {
           if (coalescedDeletes != null) {
-            delCount += applyDeletes(coalescedDeletes, reader);
+            //System.out.println("    del coalesced");
+            delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+            delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
           }
-          delCount += applyDeletes(packet, reader);
+          //System.out.println("    del exact");
+          // Don't delete by Term here; DocumentsWriter
+          // already did that on flush:
+          delCount += applyQueryDeletes(packet.queriesIterable(), reader);
         } finally {
           readerPool.release(reader);
         }
         anyNewDeletes |= delCount > 0;
 
-        // We've applied doc ids, and they're only applied
-        // on the current segment
-        bytesUsed.addAndGet(-packet.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
-        packet.clearDocIDs();
-
         if (infoStream != null) {
          message("seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet + "]; coalesced deletes=[" + (coalescedDeletes == null ? "null" : coalescedDeletes) + "] delCount=" + delCount);
         }
 
         if (coalescedDeletes == null) {
-          coalescedDeletes = new BufferedDeletes();
+          coalescedDeletes = new BufferedDeletes(true);
         }
         coalescedDeletes.update(packet);
         delIDX--;
@@ -235,7 +234,8 @@ class BufferedDeletesStream {
           SegmentReader reader = readerPool.get(info, false);
           int delCount = 0;
           try {
-            delCount += applyDeletes(coalescedDeletes, reader);
+            delCount += applyTermDeletes(coalescedDeletes.termsIterable(), reader);
+            delCount += applyQueryDeletes(coalescedDeletes.queriesIterable(), reader);
           } finally {
             readerPool.release(reader);
           }
@@ -300,91 +300,91 @@ class BufferedDeletesStream {
        message("pruneDeletes: prune " + count + " packets; " + (deletes.size() - count) + " packets remain");
       }
       for(int delIDX=0;delIDX<count;delIDX++) {
-        final BufferedDeletes packet = deletes.get(delIDX);
-        numTerms.addAndGet(-packet.numTermDeletes.get());
+        final FrozenBufferedDeletes packet = deletes.get(delIDX);
+        numTerms.addAndGet(-packet.numTermDeletes);
         assert numTerms.get() >= 0;
-        bytesUsed.addAndGet(-packet.bytesUsed.get());
+        bytesUsed.addAndGet(-packet.bytesUsed);
         assert bytesUsed.get() >= 0;
       }
       deletes.subList(0, count).clear();
     }
   }
 
-  private synchronized long applyDeletes(BufferedDeletes deletes, SegmentReader reader) throws IOException {
-
+  // Delete by Term
+  private synchronized long applyTermDeletes(Iterable<Term> termsIter, SegmentReader reader) throws IOException {
     long delCount = 0;
-
+        
     assert checkDeleteTerm(null);
     
-    if (deletes.terms.size() > 0) {
-      final TermDocs docs = reader.termDocs();
+    final TermDocs docs = reader.termDocs();
+
+    for (Term term : termsIter) {
         
-      for (Entry<Term,Integer> entry: deletes.terms.entrySet()) {
-        Term term = entry.getKey();
-        // Since we visit terms sorted, we gain performance
-        // by re-using the same TermsEnum and seeking only
-        // forwards
-        assert checkDeleteTerm(term);
-        docs.seek(term);
+      // Since we visit terms sorted, we gain performance
+      // by re-using the same TermsEnum and seeking only
+      // forwards
+      assert checkDeleteTerm(term);
+      docs.seek(term);
           
-        final int limit = entry.getValue();
-        while (docs.next()) {
-          final int docID = docs.doc();
-          if (docID >= limit) {
-            break;
-          }
-          // TODO: we could/should change
-          // reader.deleteDocument to return boolean
-          // true if it did in fact delete, because here
-          // we could be deleting an already-deleted doc
-          // which makes this an upper bound:
-          delCount++;
+      while (docs.next()) {
+        final int docID = docs.doc();
+        reader.deleteDocument(docID);
+        // TODO: we could/should change
+        // reader.deleteDocument to return boolean
+        // true if it did in fact delete, because here
+        // we could be deleting an already-deleted doc
+        // which makes this an upper bound:
+        delCount++;
 
-          reader.deleteDocument(docID);
-        }
+        reader.deleteDocument(docID);
       }
     }
 
-    // Delete by docID
-    for (Integer docIdInt : deletes.docIDs) {
-      int docID = docIdInt.intValue();
-      reader.deleteDocument(docID);
-      delCount++;
-    }
-
-    // Delete by query
-    if (deletes.queries.size() > 0) {
-      IndexSearcher searcher = new IndexSearcher(reader);
-      try {
-        for (Entry<Query, Integer> entry : deletes.queries.entrySet()) {
-          Query query = entry.getKey();
-          int limit = entry.getValue().intValue();
-          Weight weight = query.weight(searcher);
-          Scorer scorer = weight.scorer(reader, true, false);
-          if (scorer != null) {
-            while(true)  {
-              int doc = scorer.nextDoc();
-              if (doc >= limit)
-                break;
-
-              reader.deleteDocument(doc);
-              // TODO: we could/should change
-              // reader.deleteDocument to return boolean
-              // true if it did in fact delete, because here
-              // we could be deleting an already-deleted doc
-              // which makes this an upper bound:
-              delCount++;
-            }
+    return delCount;
+  }
+
+  public static class QueryAndLimit {
+    public final Query query;
+    public final int limit;
+    public QueryAndLimit(Query query, int limit) {
+      this.query = query;       
+      this.limit = limit;
+    }
+  }
+
+  // Delete by query
+  private synchronized long applyQueryDeletes(Iterable<QueryAndLimit> queriesIter, SegmentReader reader) throws IOException {
+    long delCount = 0;
+    IndexSearcher searcher = new IndexSearcher(reader);
+    try {
+      for (QueryAndLimit ent : queriesIter) {
+        Query query = ent.query;
+        int limit = ent.limit;
+        Weight weight = query.weight(searcher);
+        Scorer scorer = weight.scorer(reader, true, false);
+        if (scorer != null) {
+          while(true)  {
+            int doc = scorer.nextDoc();
+            if (doc >= limit)
+              break;
+
+            reader.deleteDocument(doc);
+            // TODO: we could/should change
+            // reader.deleteDocument to return boolean
+            // true if it did in fact delete, because here
+            // we could be deleting an already-deleted doc
+            // which makes this an upper bound:
+            delCount++;
           }
         }
-      } finally {
-        searcher.close();
       }
+    } finally {
+      searcher.close();
     }
 
     return delCount;
   }
-  
+
   // used only by assert
   private boolean checkDeleteTerm(Term term) {
     if (term != null) {
@@ -398,9 +398,9 @@ class BufferedDeletesStream {
   private boolean checkDeleteStats() {
     int numTerms2 = 0;
     long bytesUsed2 = 0;
-    for(BufferedDeletes packet : deletes) {
-      numTerms2 += packet.numTermDeletes.get();
-      bytesUsed2 += packet.bytesUsed.get();
+    for(FrozenBufferedDeletes packet : deletes) {
+      numTerms2 += packet.numTermDeletes;
+      bytesUsed2 += packet.bytesUsed;
     }
     assert numTerms2 == numTerms.get(): "numTerms2=" + numTerms2 + " vs " + numTerms.get();
     assert bytesUsed2 == bytesUsed.get(): "bytesUsed2=" + bytesUsed2 + " vs " + bytesUsed;

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Thu May  5 14:52:22 2011
@@ -35,8 +35,10 @@ import org.apache.lucene.store.AlreadyCl
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMFile;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.BitVector;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.ThreadInterruptedException;
+
 
 /**
  * This class accepts multiple added documents and directly
@@ -132,7 +134,7 @@ final class DocumentsWriter {
   private final int maxThreadStates;
 
   // Deletes for our still-in-RAM (to be flushed next) segment
-  private BufferedDeletes pendingDeletes = new BufferedDeletes();
+  private BufferedDeletes pendingDeletes = new BufferedDeletes(false);
   
   static class DocState {
     DocumentsWriter docWriter;
@@ -323,6 +325,9 @@ final class DocumentsWriter {
     return doFlush;
   }
 
+  // TODO: we could check w/ FreqProxTermsWriter: if the
+  // term doesn't exist, don't bother buffering into the
+  // per-DWPT map (but still must go into the global map)
   boolean deleteTerm(Term term, boolean skipWait) {
     final boolean doFlush = flushControl.waitUpdate(0, 1, skipWait);
     synchronized(this) {
@@ -469,17 +474,19 @@ final class DocumentsWriter {
 
   private void pushDeletes(SegmentInfo newSegment, SegmentInfos segmentInfos) {
     // Lock order: DW -> BD
+    final long delGen = bufferedDeletesStream.getNextGen();
     if (pendingDeletes.any()) {
       if (segmentInfos.size() > 0 || newSegment != null) {
+        final FrozenBufferedDeletes packet = new FrozenBufferedDeletes(pendingDeletes, delGen);
         if (infoStream != null) {
           message("flush: push buffered deletes");
         }
-        bufferedDeletesStream.push(pendingDeletes);
+        bufferedDeletesStream.push(packet);
         if (infoStream != null) {
-          message("flush: delGen=" + pendingDeletes.gen);
+          message("flush: delGen=" + packet.gen);
         }
         if (newSegment != null) {
-          newSegment.setBufferedDeletesGen(pendingDeletes.gen);
+          newSegment.setBufferedDeletesGen(packet.gen);
         }
       } else {
         if (infoStream != null) {
@@ -489,9 +496,9 @@ final class DocumentsWriter {
         // there are no segments, the deletions cannot
         // affect anything.
       }
-      pendingDeletes = new BufferedDeletes();
+      pendingDeletes.clear();
     } else if (newSegment != null) {
-      newSegment.setBufferedDeletesGen(bufferedDeletesStream.getNextGen());
+      newSegment.setBufferedDeletesGen(delGen);
     }
   }
 
@@ -541,7 +548,19 @@ final class DocumentsWriter {
       }
 
       final SegmentWriteState flushState = new SegmentWriteState(infoStream, directory, segment, fieldInfos,
-                                                                 numDocs, writer.getConfig().getTermIndexInterval());
+                                                                 numDocs, writer.getConfig().getTermIndexInterval(),
+                                                                 pendingDeletes);
+      // Apply delete-by-docID now (delete-byDocID only
+      // happens when an exception is hit processing that
+      // doc, eg if analyzer has some problem w/ the text):
+      if (pendingDeletes.docIDs.size() > 0) {
+        flushState.deletedDocs = new BitVector(numDocs);
+        for(int delDocID : pendingDeletes.docIDs) {
+          flushState.deletedDocs.set(delDocID);
+        }
+        pendingDeletes.bytesUsed.addAndGet(-pendingDeletes.docIDs.size() * BufferedDeletes.BYTES_PER_DEL_DOCID);
+        pendingDeletes.docIDs.clear();
+      }
 
       newSegment = new SegmentInfo(segment, numDocs, directory, false, true, fieldInfos.hasProx(), false);
 
@@ -553,10 +572,14 @@ final class DocumentsWriter {
       double startMBUsed = bytesUsed()/1024./1024.;
 
       consumer.flush(threads, flushState);
+
       newSegment.setHasVectors(flushState.hasVectors);
 
       if (infoStream != null) {
         message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors"));
+        if (flushState.deletedDocs != null) {
+          message("new segment has " + flushState.deletedDocs.count() + " deleted docs");
+        }
         message("flushedFiles=" + newSegment.files());
       }
 
@@ -576,6 +599,30 @@ final class DocumentsWriter {
         newSegment.setUseCompoundFile(true);
       }
 
+      // Must write deleted docs after the CFS so we don't
+      // slurp the del file into CFS:
+      if (flushState.deletedDocs != null) {
+        final int delCount = flushState.deletedDocs.count();
+        assert delCount > 0;
+        newSegment.setDelCount(delCount);
+        newSegment.advanceDelGen();
+        final String delFileName = newSegment.getDelFileName();
+        boolean success2 = false;
+        try {
+          flushState.deletedDocs.write(directory, delFileName);
+          success2 = true;
+        } finally {
+          if (!success2) {
+            try {
+              directory.deleteFile(delFileName);
+            } catch (Throwable t) {
+              // suppress this so we keep throwing the
+              // original exception
+            }
+          }
+        }
+      }
+
       if (infoStream != null) {
         message("flush: segment=" + newSegment);
         final double newSegmentSizeNoStore = newSegment.sizeInBytes(false)/1024./1024.;

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Thu May  5 14:52:22 2011
@@ -78,6 +78,14 @@ final class FreqProxFieldMergeState {
     return true;
   }
 
+  public String termText() {
+    int upto = textOffset;
+    while(text[upto] != 0xffff) {
+      upto++;
+    }
+    return new String(text, textOffset, upto-textOffset);
+  }
+
   public boolean nextDoc() throws IOException {
     if (freq.eof()) {
       if (postings.lastDocCodes[currentTermID] != -1) {

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Thu May  5 14:52:22 2011
@@ -24,6 +24,7 @@ import java.util.Collection;
 import java.util.Map;
 import java.util.ArrayList;
 import java.util.List;
+import org.apache.lucene.util.BitVector;
 import org.apache.lucene.util.CollectionUtil;
 
 final class FreqProxTermsWriter extends TermsHashConsumer {
@@ -116,7 +117,7 @@ final class FreqProxTermsWriter extends 
 
       // If this field has postings then add them to the
       // segment
-      appendPostings(fields, consumer);
+      appendPostings(fieldName, state, fields, consumer);
 
       for(int i=0;i<fields.length;i++) {
         TermsHashPerField perField = fields[i].termsHashPerField;
@@ -142,7 +143,8 @@ final class FreqProxTermsWriter extends 
   /* Walk through all unique text tokens (Posting
    * instances) found in this field and serialize them
    * into a single RAM segment. */
-  void appendPostings(FreqProxTermsWriterPerField[] fields,
+  void appendPostings(String fieldName, SegmentWriteState state,
+                      FreqProxTermsWriterPerField[] fields,
                       FormatPostingsFieldsConsumer consumer)
     throws CorruptIndexException, IOException {
 
@@ -161,17 +163,31 @@ final class FreqProxTermsWriter extends 
     }
 
     final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
+    final Term protoTerm = new Term(fieldName);
 
     FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
 
     final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
 
+    final Map<Term,Integer> segDeletes;
+    if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
+      segDeletes = state.segDeletes.terms;
+    } else {
+      segDeletes = null;
+    }
+
+    // TODO: really TermsHashPerField should take over most
+    // of this loop, including merge sort of terms from
+    // multiple threads and interacting with the
+    // TermsConsumer, only calling out to us (passing us the
+    // DocsConsumer) to handle delivery of docs/positions
     while(numFields > 0) {
 
       // Get the next term to merge
       termStates[0] = mergeStates[0];
       int numToMerge = 1;
 
+      // TODO: pqueue
       for(int i=1;i<numFields;i++) {
         final char[] text = mergeStates[i].text;
         final int textOffset = mergeStates[i].textOffset;
@@ -186,6 +202,18 @@ final class FreqProxTermsWriter extends 
 
       final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);
 
+      final int delDocLimit;
+      if (segDeletes != null) {
+        final Integer docIDUpto = segDeletes.get(protoTerm.createTerm(termStates[0].termText()));
+        if (docIDUpto != null) {
+          delDocLimit = docIDUpto;
+        } else {
+          delDocLimit = 0;
+        }
+      } else {
+        delDocLimit = 0;
+      }
+
       // Now termStates has numToMerge FieldMergeStates
       // which all share the same term.  Now we must
       // interleave the docID streams.
@@ -200,6 +228,28 @@ final class FreqProxTermsWriter extends 
 
        final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
 
+        // NOTE: we could check here if the docID was
+        // deleted, and skip it.  However, this is somewhat
+        // dangerous because it can yield non-deterministic
+        // behavior since we may see the docID before we see
+        // the term that caused it to be deleted.  This
+        // would mean some (but not all) of its postings may
+        // make it into the index, which'd alter the docFreq
+        // for those terms.  We could fix this by doing two
+        // passes, ie first sweep marks all del docs, and
+        // 2nd sweep does the real flush, but I suspect
+        // that'd add too much time to flush.
+
+        if (minState.docID < delDocLimit) {
+          // Mark it deleted.  TODO: we could also skip
+          // writing its postings; this would be
+          // deterministic (just for this Term's docs).
+          if (state.deletedDocs == null) {
+            state.deletedDocs = new BitVector(state.numDocs);
+          }
+          state.deletedDocs.set(minState.docID);
+        }
+
         final ByteSliceReader prox = minState.prox;
 
         // Carefully copy over the prox + payload info,

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/IndexWriter.java Thu May  5 14:52:22 2011
@@ -434,7 +434,7 @@ public class IndexWriter implements Clos
 
   IndexReader getReader(int termInfosIndexDivisor, boolean applyAllDeletes) throws IOException
{
     ensureOpen();
-
+    
     final long tStart = System.currentTimeMillis();
 
     if (infoStream != null) {

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
Thu May  5 14:52:22 2011
@@ -643,7 +643,7 @@ public abstract class LogMergePolicy ext
     sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
     sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
     sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
-    sb.append("useCompoundFile=").append(useCompoundFile);
+    sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
     sb.append("requireContiguousMerge=").append(requireContiguousMerge);
     sb.append("]");
     return sb.toString();

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
Thu May  5 14:52:22 2011
@@ -256,8 +256,7 @@ final class SegmentMerger {
       // details.
       throw new RuntimeException("mergeFields produced an invalid result: docCount is " +
docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?="
+ directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
 
-    segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount,
termIndexInterval);
-
+    segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount,
termIndexInterval, null);
     return docCount;
   }
 

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
Thu May  5 14:52:22 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
 import java.io.PrintStream;
 
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
 
 /**
  * @lucene.experimental
@@ -32,6 +33,16 @@ public class SegmentWriteState {
   public final int numDocs;
   public boolean hasVectors;
 
+  // Deletes to apply while we are flushing the segment.  A
+  // Term is enrolled in here if it was deleted at one
+  // point, and it's mapped to the docIDUpto, meaning any
+  // docID < docIDUpto containing this term should be
+  // deleted.
+  public final BufferedDeletes segDeletes;
+
+  // Lazily created:
+  public BitVector deletedDocs;
+
   /** Expert: The fraction of terms in the "dictionary" which should be stored
    * in RAM.  Smaller values use more memory, but make searching slightly
    * faster, while larger values use less memory and make searching slightly
@@ -52,8 +63,9 @@ public class SegmentWriteState {
   public final int maxSkipLevels = 10;
 
   public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName,
FieldInfos fieldInfos,
-                           int numDocs, int termIndexInterval) {
+                           int numDocs, int termIndexInterval, BufferedDeletes segDeletes)
{
     this.infoStream = infoStream;
+    this.segDeletes = segDeletes;
     this.directory = directory;
     this.segmentName = segmentName;
     this.fieldInfos = fieldInfos;

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
Thu May  5 14:52:22 2011
@@ -24,7 +24,6 @@ import java.util.List;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
@@ -152,6 +151,7 @@ public class TestAddIndexes extends Luce
     setUpDirs(dir, aux);
     IndexWriter writer = newWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
     writer.setInfoStream(VERBOSE ? System.out : null);
+    writer.setInfoStream(VERBOSE ? System.out : null);
     writer.addIndexes(aux);
 
     // Adds 10 docs, then replaces them with another 10

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
Thu May  5 14:52:22 2011
@@ -2777,7 +2777,7 @@ public class TestIndexWriter extends Luc
           count++;
         }
       }
-      assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting")
+ " count=" + count, count > 2500);
+      assertTrue("flush happened too quickly during " + (doIndexing ? "indexing" : "deleting")
+ " count=" + count, count > 1500);
     }
     w.close();
     dir.close();

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java?rev=1099830&r1=1099829&r2=1099830&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
(original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
Thu May  5 14:52:22 2011
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.search.IndexSearcher;
@@ -158,8 +157,6 @@ public class TestIndexWriterDelete exten
       assertEquals(0, modifier.getSegmentCount());
       modifier.commit();
 
-      modifier.commit();
-
       IndexReader reader = IndexReader.open(dir, true);
       assertEquals(1, reader.numDocs());
 



Mime
View raw message