lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r792532 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/index/BufferedDeletes.java src/java/org/apache/lucene/index/DocumentsWriter.java src/java/org/apache/lucene/index/IndexWriter.java
Date Thu, 09 Jul 2009 12:44:58 GMT
Author: mikemccand
Date: Thu Jul  9 12:44:57 2009
New Revision: 792532

URL: http://svn.apache.org/viewvc?rev=792532&view=rev
Log:
LUCENE-1717: properly account for RAM used by buffered deletes

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Jul  9 12:44:57 2009
@@ -127,6 +127,9 @@
     is failing to close reader/writers.  (Brian Groose via Mike
     McCandless)
 
+ 9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+    buffered deletions.  (Mike McCandless)
+
 API Changes
 
 1. LUCENE-1419: Add expert API to set custom indexing chain. This API is 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/BufferedDeletes.java Thu Jul  9 12:44:57
2009
@@ -35,6 +35,7 @@
   HashMap terms = new HashMap();
   HashMap queries = new HashMap();
   List docIDs = new ArrayList();
+  long bytesUsed;
 
   // Number of documents a delete term applies to.
   final static class Num {
@@ -60,17 +61,21 @@
     }
   }
 
-
+  int size() {
+    // We use numTerms not terms.size() intentionally, so
+    // that deletes by the same term multiple times "count",
+    // ie if you ask to flush every 1000 deletes then even
+    // dup'd terms are counted towards that 1000
+    return numTerms + queries.size() + docIDs.size();
+  }
 
   void update(BufferedDeletes in) {
     numTerms += in.numTerms;
+    bytesUsed += in.bytesUsed;
     terms.putAll(in.terms);
     queries.putAll(in.queries);
     docIDs.addAll(in.docIDs);
-    in.terms.clear();
-    in.numTerms = 0;
-    in.queries.clear();
-    in.docIDs.clear();
+    in.clear();
   }
     
   void clear() {
@@ -78,6 +83,11 @@
     queries.clear();
     docIDs.clear();
     numTerms = 0;
+    bytesUsed = 0;
+  }
+
+  void addBytesUsed(long b) {
+    bytesUsed += b;
   }
 
   boolean any() {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Jul  9 12:44:57
2009
@@ -38,6 +38,7 @@
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
 
 /**
  * This class accepts multiple added documents and directly
@@ -887,8 +888,25 @@
   }
 
   synchronized boolean deletesFull() {
-    return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
-      && ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size())
>= maxBufferedDeleteTerms);
+    return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize)
||
+      (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+       ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+  }
+
+  synchronized boolean doApplyDeletes() {
+    // Very similar to deletesFull(), except we don't count
+    // numBytesAlloc, because we are checking whether
+    // deletes (alone) are consuming too many resources now
+    // and thus should be applied.  We apply deletes if RAM
+    // usage is > 1/2 of our allowed RAM buffer, to prevent
+    // too-frequent flushing of a long tail of tiny segments
+    // when merges (which always apply deletes) are
+    // infrequent.
+    return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+      (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+       ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
   }
 
   synchronized private boolean timeToFlushDeletes() {
@@ -1015,20 +1033,24 @@
     else
       num.setNum(docIDUpto);
     deletesInRAM.numTerms++;
+
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
   }
 
   // Buffer a specific docID for deletion.  Currently only
  // used when we hit an exception when adding a document
   synchronized private void addDeleteDocID(int docID) {
     deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
   }
 
   synchronized private void addDeleteQuery(Query query, int docID) {
     deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
   }
 
   synchronized boolean doBalanceRAM() {
-    return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull &&
(numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+    return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull &&
(numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc
>= freeTrigger);
   }
 
   /** Does the synchronized work to finish/flush the
@@ -1044,7 +1066,6 @@
 
       assert docWriter == null || docWriter.docID == perThread.docState.docID;
 
-
       if (aborting) {
 
         // We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
   final SkipDocWriter skipDocWriter = new SkipDocWriter();
 
   long getRAMUsed() {
-    return numBytesUsed;
+    return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
   }
 
   long numBytesAlloc;
@@ -1137,10 +1158,34 @@
 
   // Coarse estimates used to measure RAM usage of buffered deletes
   final static int OBJECT_HEADER_BYTES = 8;
-  final static int POINTER_NUM_BYTE = 4;
+  final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
   final static int INT_NUM_BYTE = 4;
   final static int CHAR_NUM_BYTE = 2;
 
+  /* Rough logic: HashMap has an array[Entry] w/ varying
+     load factor (say 2 * POINTER).  Entry is object w/ Term
+     key, BufferedDeletes.Num val, int hash, Entry next
+     (OBJ_HEADER + 3*POINTER + INT).  Term is object w/
+     String field and String text (OBJ_HEADER + 2*POINTER).
+     We don't count Term's field since it's interned.
+     Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+     OBJ_HEADER + string.length*CHAR).  BufferedDeletes.num is
+     OBJ_HEADER + INT. */
+ 
+  final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+
+  /* Rough logic: del docIDs are List<Integer>.  Say list
+     allocates ~2X size (2*POINTER).  Integer is OBJ_HEADER
+     + int */
+  final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+
+  /* Rough logic: HashMap has an array[Entry] w/ varying
+     load factor (say 2 * POINTER).  Entry is object w/
+     Query key, Integer val, int hash, Entry next
+     (OBJ_HEADER + 3*POINTER + INT).  Query we often
+     undercount (say 24 bytes).  Integer is OBJ_HEADER + INT. */
+  final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE
+ 24;
+
   /* Initial chunks size of the shared byte[] blocks used to
      store postings data */
   final static int BYTE_BLOCK_SHIFT = 15;
@@ -1285,17 +1330,20 @@
     // We flush when we've used our target usage
     final long flushTrigger = ramBufferSize;
 
-    if (numBytesAlloc > freeTrigger) {
+    final long deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
+
+    if (numBytesAlloc+deletesRAMUsed > freeTrigger) {
 
       if (infoStream != null)
         message("  RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
                 " vs trigger=" + toMB(flushTrigger) +
                 " allocMB=" + toMB(numBytesAlloc) +
+                " deletesMB=" + toMB(deletesRAMUsed) +
                 " vs trigger=" + toMB(freeTrigger) +
                 " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE)
+
                 " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
 
-      final long startBytesAlloc = numBytesAlloc;
+      final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
 
       int iter = 0;
 
@@ -1305,12 +1353,12 @@
 
       boolean any = true;
 
-      while(numBytesAlloc > freeLevel) {
+      while(numBytesAlloc+deletesRAMUsed > freeLevel) {
       
         synchronized(this) {
           if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size()
&& 0 == freeIntBlocks.size() && !any) {
             // Nothing else to free -- must flush now.
-            bufferIsFull = numBytesUsed > flushTrigger;
+            bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
             if (infoStream != null) {
               if (numBytesUsed > flushTrigger)
                 message("    nothing to free; now set bufferIsFull");
@@ -1345,7 +1393,7 @@
       }
 
       if (infoStream != null)
-        message("    after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.)
+ " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
+        message("    after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.)
+ " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
       
     } else {
       // If we have not crossed the 100% mark, but have
@@ -1355,10 +1403,11 @@
       // flush.
       synchronized(this) {
 
-        if (numBytesUsed > flushTrigger) {
+        if (numBytesUsed+deletesRAMUsed > flushTrigger) {
           if (infoStream != null)
             message("  RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) +
                     " allocMB=" + nf.format(numBytesAlloc/1024./1024.) +
+                    " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) +
                     " triggerMB=" + nf.format(flushTrigger/1024./1024.));
 
           bufferIsFull = true;

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=792532&r1=792531&r2=792532&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Thu Jul  9 12:44:57
2009
@@ -1729,17 +1729,28 @@
   }
 
   /** Determines the amount of RAM that may be used for
-   * buffering added documents before they are flushed as a
-   * new Segment.  Generally for faster indexing performance
-   * it's best to flush by RAM usage instead of document
-   * count and use as large a RAM buffer as you can.
+   * buffering added documents and deletions before they are
+   * flushed to the Directory.  Generally for faster
+   * indexing performance it's best to flush by RAM usage
+   * instead of document count and use as large a RAM buffer
+   * as you can.
    *
    * <p>When this is set, the writer will flush whenever
-   * buffered documents use this much RAM.  Pass in {@link
-   * #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
-   * to RAM usage.  Note that if flushing by document count
-   * is also enabled, then the flush will be triggered by
-   * whichever comes first.</p>
+   * buffered documents and deletions use this much RAM.
+   * Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+   * triggering a flush due to RAM usage.  Note that if
+   * flushing by document count is also enabled, then the
+   * flush will be triggered by whichever comes first.</p>
+   *
+   * <p> <b>NOTE</b>: the account of RAM usage for pending
+   * deletions is only approximate.  Specifically, if you
+   * delete by Query, Lucene currently has no way to measure
+   * the RAM usage of individual Queries so the accounting
+   * will under-estimate and you should compensate by either
+   * calling commit() periodically yourself, or by using
+   * {@link #setMaxBufferedDeleteTerms} to flush by count
+   * instead of RAM usage (each buffered delete Query counts
+   * as one).
    *
    * <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
    * 
@@ -4089,7 +4100,10 @@
 
     flushCount++;
 
-    flushDeletes |= docWriter.deletesFull();
+    // If we are flushing because too many deletes
+    // accumulated, then we should apply the deletes to free
+    // RAM:
+    flushDeletes |= docWriter.doApplyDeletes();
 
     // When autoCommit=true we must always flush deletes
     // when flushing a segment; otherwise deletes may become



Mime
View raw message