lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1038784 - in /lucene/java/branches/lucene_2_9: ./ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Date Wed, 24 Nov 2010 19:47:50 GMT
Author: mikemccand
Date: Wed Nov 24 19:47:49 2010
New Revision: 1038784

URL: http://svn.apache.org/viewvc?rev=1038784&view=rev
Log:
LUCENE-2773: don't build compound files for large merged segments (by default)

Modified:
    lucene/java/branches/lucene_2_9/CHANGES.txt
    lucene/java/branches/lucene_2_9/common-build.xml
    lucene/java/branches/lucene_2_9/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
    lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/LogMergePolicy.java
    lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java

Modified: lucene/java/branches/lucene_2_9/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/CHANGES.txt?rev=1038784&r1=1038783&r2=1038784&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/CHANGES.txt Wed Nov 24 19:47:49 2010
@@ -15,6 +15,13 @@ Changes in runtime behavior
   worst-case free disk space required during optimize is now 3X the
   index size, when compound file is enabled (else 2X).  (Mike
   McCandless)
+
+* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default =
+  0.1), which means any time a merged segment is greater than 10% of
+  the index size, it will be left in non-compound format even if
+  compound format is on.  This change was made to reduce peak
+  transient disk usage during optimize which increased due to
+  LUCENE-2762.  (Mike McCandless)
   
 Bug fixes
 
@@ -110,6 +117,15 @@ Bug fixes
 * LUCENE-2216: OpenBitSet.hashCode returned different hash codes for
   sets that only differed by trailing zeros. (Dawid Weiss, yonik)
 
+API Changes
+
+* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default =
+  0.1), which means any time a merged segment is greater than 10% of
+  the index size, it will be left in non-compound format even if
+  compound format is on.  This change was made to reduce peak
+  transient disk usage during optimize which increased due to
+  LUCENE-2762.  (Mike McCandless)
+
 Optimizations
 
 * LUCENE-2556: Improve memory usage after cloning TermAttribute.

Modified: lucene/java/branches/lucene_2_9/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/common-build.xml?rev=1038784&r1=1038783&r2=1038784&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/common-build.xml (original)
+++ lucene/java/branches/lucene_2_9/common-build.xml Wed Nov 24 19:47:49 2010
@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9.4-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20101123"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20101124"/>
   <property name="spec.version" value="${version}"/>	
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>

Modified: lucene/java/branches/lucene_2_9/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1038784&r1=1038783&r2=1038784&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/java/branches/lucene_2_9/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Wed Nov 24 19:47:49 2010
@@ -17,33 +17,34 @@
 
 package org.apache.lucene.benchmark.byTask;
 
-import java.io.IOException;
-import java.io.StringReader;
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
-import java.io.BufferedReader;
-import java.util.List;
+import java.io.IOException;
+import java.io.StringReader;
 import java.util.Iterator;
+import java.util.List;
+
+import junit.framework.TestCase;
 
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
-import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
-import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.SerialMergeScheduler;
 import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.index.SerialMergeScheduler;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.search.FieldCache.StringIndex;
 import org.apache.lucene.search.FieldCache;
-
-import junit.framework.TestCase;
+import org.apache.lucene.store.Directory;
 
 /**
  * Test very simply that perf tasks - simple algorithms - are doing what they should.
@@ -775,12 +776,9 @@ public class TestPerfTasksLogic extends 
     ir.close();
 
     // Make sure we have 3 segments:
-    final String[] files = benchmark.getRunData().getDirectory().listAll();
-    int cfsCount = 0;
-    for(int i=0;i<files.length;i++)
-      if (files[i].endsWith(".cfs"))
-        cfsCount++;
-    assertEquals(3, cfsCount);
+    SegmentInfos infos = new SegmentInfos();
+    infos.read(benchmark.getRunData().getDirectory());
+    assertEquals(3, infos.size());
   }
   
   /**

Modified: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1038784&r1=1038783&r2=1038784&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/LogMergePolicy.java Wed Nov 24 19:47:49 2010
@@ -54,12 +54,19 @@ public abstract class LogMergePolicy ext
    *  or larger will never be merged.  @see setMaxMergeDocs */
   public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
 
+  /** Default noCFSRatio.  If a merge's size is >= 10% of
+   *  the index, then we disable compound file for it.
+   *  @see setNoCFSRatio */
+  public static final double DEFAULT_NO_CFS_RATIO = 0.1;
+
   private int mergeFactor = DEFAULT_MERGE_FACTOR;
 
   long minMergeSize;
   long maxMergeSize;
   int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
 
+  protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
+
   /* TODO 3.0: change this default to true */
   protected boolean calibrateSizeByDeletes = false;
   
@@ -73,6 +80,23 @@ public abstract class LogMergePolicy ext
   protected boolean verbose() {
     return writer != null && writer.verbose();
   }
+
+  /** @see setNoCFSRatio */
+  public double getNoCFSRatio() {
+    return noCFSRatio;
+  }
+
+  /** If a merged segment will be more than this percentage
+   *  of the total size of the index, leave the segment as
+   *  non-compound file even if compound file is enabled.
+   *  Set to 1.0 to always use CFS regardless of merge
+   *  size. */
+  public void setNoCFSRatio(double noCFSRatio) {
+    if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
+      throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
+    }
+    this.noCFSRatio = noCFSRatio;
+  }
   
   private void message(String message) {
     if (verbose())
@@ -200,7 +224,7 @@ public abstract class LogMergePolicy ext
     return !hasDeletions &&
       !info.hasSeparateNorms() &&
       info.dir == writer.getDirectory() &&
-      info.getUseCompoundFile() == useCompoundFile;
+      (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
   }
 
   /** Returns the merges necessary to optimize the index.
@@ -238,7 +262,7 @@ public abstract class LogMergePolicy ext
         // First, enroll all "full" merges (size
         // mergeFactor) to potentially be run concurrently:
         while (last - maxNumSegments + 1 >= mergeFactor) {
-          spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
+          spec.add(makeOneMerge(infos, infos.range(last-mergeFactor, last)));
           last -= mergeFactor;
         }
 
@@ -250,7 +274,7 @@ public abstract class LogMergePolicy ext
             // Since we must optimize down to 1 segment, the
             // choice is simple:
             if (last > 1 || !isOptimized(infos.info(0)))
-              spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+              spec.add(makeOneMerge(infos, infos.range(0, last)));
           } else if (last > maxNumSegments) {
 
             // Take care to pick a partial merge that is
@@ -278,7 +302,7 @@ public abstract class LogMergePolicy ext
               }
             }
 
-            spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
+            spec.add(makeOneMerge(infos, infos.range(bestStart, bestStart+finalMergeSize)));
           }
         }
         
@@ -317,7 +341,7 @@ public abstract class LogMergePolicy ext
           // deletions, so force a merge now:
           if (verbose())
             message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
-          spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+          spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i)));
           firstSegmentWithDeletions = i;
         }
       } else if (firstSegmentWithDeletions != -1) {
@@ -326,7 +350,7 @@ public abstract class LogMergePolicy ext
         // mergeFactor segments
         if (verbose())
           message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
-        spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+        spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i)));
         firstSegmentWithDeletions = -1;
       }
     }
@@ -334,7 +358,7 @@ public abstract class LogMergePolicy ext
     if (firstSegmentWithDeletions != -1) {
       if (verbose())
         message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + "
inclusive");
-      spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments), useCompoundFile));
+      spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, numSegments)));
     }
 
     return spec;
@@ -433,7 +457,7 @@ public abstract class LogMergePolicy ext
             spec = new MergeSpecification();
           if (verbose())
             message("    " + start + " to " + end + ": add this merge");
-          spec.add(new OneMerge(infos.range(start, end), useCompoundFile));
+          spec.add(makeOneMerge(infos, infos.range(start, end)));
         } else if (verbose())
           message("    " + start + " to " + end + ": contains segment over maxMergeSize or
maxMergeDocs; skipping");
 
@@ -447,6 +471,29 @@ public abstract class LogMergePolicy ext
     return spec;
   }
 
+  protected OneMerge makeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge) throws IOException {
+    final boolean doCFS;
+    if (!useCompoundFile) {
+      doCFS = false;
+    } else if (noCFSRatio == 1.0) {
+      doCFS = true;
+    } else {
+      
+      long totSize = 0;
+      for(int i=0;i<infos.size();i++) {
+        totSize += size(infos.info(i));
+      }
+      long mergeSize = 0;
+      for(int i=0;i<infosToMerge.size();i++) {
+        mergeSize += size(infosToMerge.info(i));
+      }
+
+      doCFS = mergeSize <= noCFSRatio * totSize;
+    }
+
+    return new OneMerge(infosToMerge, doCFS);
+  }
+
   /** <p>Determines the largest segment (measured by
    * document count) that may be merged with other segments.
    * Small values (e.g., less than 10,000) are best for

Modified: lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java?rev=1038784&r1=1038783&r2=1038784&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (original)
+++ lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java Wed Nov 24 19:47:49 2010
@@ -238,25 +238,5 @@ public class TestIndexWriterMergePolicy 
     if (upperBound * mergeFactor <= maxMergeDocs) {
       assertTrue(numSegments < mergeFactor);
     }
-
-    String[] files = writer.getDirectory().listAll();
-    int segmentCfsCount = 0;
-    for (int i = 0; i < files.length; i++) {
-      if (files[i].endsWith(".cfs")) {
-        segmentCfsCount++;
-      }
-    }
-    assertEquals("index=" + writer.segString(), segmentCount, segmentCfsCount);
-  }
-
-  /*
-  private void printSegmentDocCounts(IndexWriter writer) {
-    int segmentCount = writer.getSegmentCount();
-    System.out.println("" + segmentCount + " segments total");
-    for (int i = 0; i < segmentCount; i++) {
-      System.out.println("  segment " + i + " has " + writer.getDocCount(i)
-          + " docs");
-    }
   }
-  */
 }



Mime
View raw message