lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r820627 - in /lucene/java/trunk/contrib: CHANGES.txt misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
Date Thu, 01 Oct 2009 12:23:04 GMT
Author: mikemccand
Date: Thu Oct  1 12:23:03 2009
New Revision: 820627

URL: http://svn.apache.org/viewvc?rev=820627&view=rev
Log:
LUCENE-1924: add BalancedSegmentMergePolicy

Added:
    lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
  (with props)
Modified:
    lucene/java/trunk/contrib/CHANGES.txt

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=820627&r1=820626&r2=820627&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Oct  1 12:23:03 2009
@@ -14,6 +14,11 @@
 
 New features
 
+ * LUCENE-1924: Added BalancedSegmentMergePolicy to contrib/misc,
+   which is a merge policy that tries to avoid doing very large
+   segment merges to give better search performance in a mixed
+   indexing/searching environment.  (John Wang via Mike McCandless)
+
 Optimizations
 
 Documentation

Added: lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java?rev=820627&view=auto
==============================================================================
--- lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
(added)
+++ lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
Thu Oct  1 12:23:03 2009
@@ -0,0 +1,443 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * Merge policy that tries to balance not doing large
+ * segment merges with not accumulating too many segments in
+ * the index, to provide for better performance in near
+ * real-time setting.
+ *
+ * <p>This is based on code from zoie, described in more detail
+ * at http://code.google.com/p/zoie/wiki/ZoieMergePolicy.</p>
+ */
+public class BalancedSegmentMergePolicy extends LogByteSizeMergePolicy
+{
+  public static final int DEFAULT_NUM_LARGE_SEGMENTS = 10;
+  
+  private boolean _partialExpunge = false;
+  private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
+  private int _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
+  private int _maxSegments = _numLargeSegments + _maxSmallSegments;
+  
+  public BalancedSegmentMergePolicy(IndexWriter writer) {
+    super(writer);
+  }
+  
+  public void setMergePolicyParams(MergePolicyParams params) {
+    if (params!=null) {
+      setPartialExpunge(params._doPartialExpunge);
+      setNumLargeSegments(params._numLargeSegments);
+      setMaxSmallSegments(params._maxSmallSegments);
+      setPartialExpunge(params._doPartialExpunge);
+      setMergeFactor(params._mergeFactor);
+      setUseCompoundFile(params._useCompoundFile);
+      setMaxMergeDocs(params._maxMergeDocs);
+    }
+  }
+  
+  protected long size(SegmentInfo info) throws IOException {
+    long byteSize = info.sizeInBytes();
+    float delRatio = (info.docCount <= 0 ? 0.0f : ((float)info.getDelCount() / (float)info.docCount));
+    return (info.docCount <= 0 ?  byteSize : (long)((float)byteSize * (1.0f - delRatio)));
+  }
+  
+  public void setPartialExpunge(boolean doPartialExpunge) {
+    _partialExpunge = doPartialExpunge;
+  }
+  
+  public boolean getPartialExpunge() {
+    return _partialExpunge;
+  }
+  
+  public void setNumLargeSegments(int numLargeSegments) {
+    if (numLargeSegments < 2) {
+      throw new IllegalArgumentException("numLargeSegments cannot be less than 2");
+    }
+    
+    _numLargeSegments = numLargeSegments;
+    _maxSegments = _numLargeSegments + 2 * getMergeFactor();
+  }
+  
+  public int getNumLargeSegments() {
+    return _numLargeSegments;
+  }
+  
+  public void setMaxSmallSegments(int maxSmallSegments) {
+    if (maxSmallSegments < getMergeFactor()) {
+      throw new IllegalArgumentException("maxSmallSegments cannot be less than mergeFactor");
+    }    
+    _maxSmallSegments = maxSmallSegments;
+    _maxSegments = _numLargeSegments + _maxSmallSegments;
+  }
+  
+  public int getMaxSmallSegments() {
+    return _maxSmallSegments;
+  }
+  
+  @Override
+  public void setMergeFactor(int mergeFactor) {
+    super.setMergeFactor(mergeFactor);
+    if (_maxSmallSegments < getMergeFactor()) {
+      _maxSmallSegments = getMergeFactor();
+      _maxSegments = _numLargeSegments + _maxSmallSegments;
+    }
+  }
+  
+  private boolean isOptimized(SegmentInfos infos, IndexWriter writer, int maxNumSegments,
Set segmentsToOptimize) throws IOException {
+    final int numSegments = infos.size();
+    int numToOptimize = 0;
+    SegmentInfo optimizeInfo = null;
+    for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
+      final SegmentInfo info = infos.info(i);
+      if (segmentsToOptimize.contains(info)) {
+        numToOptimize++;
+        optimizeInfo = info;
+      }
+    }
+
+    return numToOptimize <= maxNumSegments &&
+      (numToOptimize != 1 || isOptimized(writer, optimizeInfo));
+  }
+  
+  private boolean isOptimized(IndexWriter writer, SegmentInfo info)
+    throws IOException {
+    return !info.hasDeletions() &&
+      !info.hasSeparateNorms() &&
+      info.dir == writer.getDirectory() &&
+      info.getUseCompoundFile() == getUseCompoundFile();
+  }
+
+  @Override
+  public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments,
Set segmentsToOptimize) throws IOException {
+    
+    assert maxNumSegments > 0;
+
+    MergeSpecification spec = null;
+
+    if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize)) {
+
+      // Find the newest (rightmost) segment that needs to
+      // be optimized (other segments may have been flushed
+      // since optimize started):
+      int last = infos.size();
+      while(last > 0) {
+
+        final SegmentInfo info = infos.info(--last);
+        if (segmentsToOptimize.contains(info)) {
+
+          last++;
+          break;
+        }
+      }
+
+      if (last > 0) {
+
+        if (maxNumSegments == 1) {
+
+          // Since we must optimize down to 1 segment, the
+          // choice is simple:
+          boolean useCompoundFile = getUseCompoundFile();
+          if (last > 1 || !isOptimized(writer, infos.info(0))) {
+
+            spec = new MergeSpecification();
+            spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+          }
+        } else if (last > maxNumSegments) {
+
+          // find most balanced merges
+          spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
+        }
+      }
+    }
+    return spec;
+  }
+  
+  private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen, int maxNumSegments,
boolean partialExpunge)
+    throws IOException {
+    if (infoLen <= maxNumSegments) return null;
+    
+    MergeSpecification spec = new MergeSpecification();
+    boolean useCompoundFile = getUseCompoundFile();
+
+    // use Viterbi algorithm to find the best segmentation.
+    // we will try to minimize the size variance of resulting segments.
+    
+    double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);
+    
+    final int maxMergeSegments = infoLen - maxNumSegments + 1;
+    double[] sumVariance = new double[maxMergeSegments];
+    int[][] backLink = new int[maxNumSegments][maxMergeSegments];
+    
+    for(int i = (maxMergeSegments - 1); i >= 0; i--) {
+      sumVariance[i] = variance[0][i];
+      backLink[0][i] = 0;
+    }
+    for(int i = 1; i < maxNumSegments; i++) {
+      for(int j = (maxMergeSegments - 1); j >= 0; j--) {
+        double minV = Double.MAX_VALUE;
+        int minK = 0;
+        for(int k = j; k >= 0; k--) {
+          double v = sumVariance[k] + variance[i + k][j - k];
+          if(v < minV) {
+            minV = v;
+            minK = k;
+          }
+        }
+        sumVariance[j] = minV;
+        backLink[i][j] = minK;
+      }
+    }
+    
+    // now, trace back the back links to find all merges,
+    // also find a candidate for partial expunge if requested
+    int mergeEnd = infoLen;
+    int prev = maxMergeSegments - 1;
+    int expungeCandidate = -1;
+    int maxDelCount = 0;
+    for(int i = maxNumSegments - 1; i >= 0; i--) {
+      prev = backLink[i][prev];
+      int mergeStart = i + prev;
+      if((mergeEnd - mergeStart) > 1) {
+        spec.add(new OneMerge(infos.range(mergeStart, mergeEnd), useCompoundFile));
+      } else {
+        if(partialExpunge) {
+          SegmentInfo info = infos.info(mergeStart);
+          int delCount = info.getDelCount();
+          if(delCount > maxDelCount) {
+            expungeCandidate = mergeStart;
+            maxDelCount = delCount;
+          }
+        }
+      }
+      mergeEnd = mergeStart;
+    }
+    
+    if(partialExpunge && maxDelCount > 0) {
+      // expunge deletes
+      spec.add(new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), useCompoundFile));
+    }
+    
+    return spec;
+  }
+  
+  private double[][] createVarianceTable(SegmentInfos infos, int last, int maxNumSegments)
throws IOException {
+    int maxMergeSegments = last - maxNumSegments + 1;
+    double[][] variance = new double[last][maxMergeSegments];
+    
+    // compute the optimal segment size
+    long optSize = 0;
+    long[] sizeArr = new long[last];
+    for(int i = 0; i < sizeArr.length; i++) {
+      sizeArr[i] = size(infos.info(i));
+      optSize += sizeArr[i];
+    }
+    optSize = (optSize / maxNumSegments);
+    
+    for(int i = 0; i < last; i++) {
+      long size = 0;
+      for(int j = 0; j < maxMergeSegments; j++) {
+        if((i + j) < last) {
+          size += sizeArr[i + j];
+          double residual = ((double)size/(double)optSize) - 1.0d;
+          variance[i][j] = residual * residual;
+        } else {
+          variance[i][j] = Double.NaN;
+        }
+      }
+    }
+    return variance;
+  }
+  
+  @Override
+  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
+    throws CorruptIndexException, IOException {
+    final int numSegs = infos.size();
+    final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
+    MergeSpecification spec = null;
+    
+    if(numLargeSegs < numSegs) {
+      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
+      spec = super.findMergesToExpungeDeletes(smallSegments);
+    }
+    
+    if(spec == null) spec = new MergeSpecification();
+    for(int i = 0; i < numLargeSegs; i++) {
+      SegmentInfo info = infos.info(i);
+      if(info.hasDeletions()) {
+        spec.add(new OneMerge(infos.range(i, i + 1), getUseCompoundFile()));        
+      }
+    }
+    return spec;
+  }
+  
+  @Override
+  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
+    final int numSegs = infos.size();
+    final int numLargeSegs = _numLargeSegments;
+    
+    if (numSegs <= numLargeSegs) {
+      return null;
+    }
+    
+    long totalLargeSegSize = 0;
+    long totalSmallSegSize = 0;
+    SegmentInfo info;
+    
+    // compute the total size of large segments
+    for(int i = 0; i < numLargeSegs; i++) {
+      info = infos.info(i);
+      totalLargeSegSize += size(info);
+    }
+    // compute the total size of small segments
+    for(int i = numLargeSegs; i < numSegs; i++) {
+      info = infos.info(i);
+      totalSmallSegSize += size(info);
+    }
+    
+    long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
+    if(targetSegSize <= totalSmallSegSize) {
+      // the total size of small segments is big enough,
+      // promote the small segments to a large segment and do balanced merge,
+      
+      if(totalSmallSegSize < targetSegSize * 2) {
+        MergeSpecification spec = findBalancedMerges(infos, numLargeSegs, (numLargeSegs -
1), _partialExpunge);
+        if(spec == null) spec = new MergeSpecification(); // should not happen
+        spec.add(new OneMerge(infos.range(numLargeSegs, numSegs), getUseCompoundFile()));
+        return spec;
+      } else {
+        return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
+      }      
+    } else if (_maxSegments < numSegs) {
+      // we have more than _maxSegments, merge small segments smaller than targetSegSize/4
+      MergeSpecification spec = new MergeSpecification();
+      int startSeg = numLargeSegs;
+      long sizeThreshold = (targetSegSize / 4);
+      while(startSeg < numSegs) {
+        info = infos.info(startSeg);
+        if(size(info) < sizeThreshold) break;
+        startSeg++;
+      }
+      spec.add(new OneMerge(infos.range(startSeg, numSegs), getUseCompoundFile()));
+      return spec;
+    } else {
+      // apply the log merge policy to small segments.
+      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
+      MergeSpecification spec = super.findMerges(smallSegments);
+      
+      if(_partialExpunge) {
+        OneMerge expunge  = findOneSegmentToExpunge(infos, numLargeSegs);
+        if(expunge != null) {
+          if(spec == null) spec = new MergeSpecification();
+          spec.add(expunge);
+        }
+      }
+      return spec;
+    }      
+  }
+  
+  private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments) throws
IOException {
+    int expungeCandidate = -1;
+    int maxDelCount = 0;
+    
+    for(int i = maxNumSegments - 1; i >= 0; i--) {
+      SegmentInfo info = infos.info(i);
+      int delCount = info.getDelCount();
+      if (delCount > maxDelCount) {
+        expungeCandidate = i;
+        maxDelCount = delCount;
+      }
+    }
+    if (maxDelCount > 0) {
+      return new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), getUseCompoundFile());
+    }
+    return null;
+  }
+  
+
+  public static class MergePolicyParams {
+    private int _numLargeSegments;
+    private int _maxSmallSegments;
+    private boolean _doPartialExpunge;
+    private int _mergeFactor;
+    private boolean _useCompoundFile;
+    private int _maxMergeDocs;
+	  
+    public MergePolicyParams() {
+      _useCompoundFile = true;
+      _doPartialExpunge = false;
+      _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
+      _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
+      _maxSmallSegments = _numLargeSegments + _maxSmallSegments;
+      _mergeFactor = LogMergePolicy.DEFAULT_MERGE_FACTOR;
+      _maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
+    }
+	  
+    public void setNumLargeSegments(int numLargeSegments) {
+      _numLargeSegments = numLargeSegments;
+    }
+      
+    public int getNumLargeSegments() {
+      return _numLargeSegments;
+    }
+      
+    public void setMaxSmallSegments(int maxSmallSegments) {
+      _maxSmallSegments = maxSmallSegments;
+    }
+      
+    public int getMaxSmallSegments() {
+      return _maxSmallSegments;
+    }
+      
+    public void setPartialExpunge(boolean doPartialExpunge) {
+      _doPartialExpunge = doPartialExpunge;
+    }
+      
+    public boolean getPartialExpunge() {
+      return _doPartialExpunge;
+    }
+      
+    public void setMergeFactor(int mergeFactor) {
+      _mergeFactor = mergeFactor;
+    }
+	  
+    public int getMergeFactor() {
+      return _mergeFactor;
+    }
+		
+    public void setMaxMergeDocs(int maxMergeDocs) {
+      _maxMergeDocs = maxMergeDocs;
+    }
+		
+    public int getMaxMergeDocs() {
+      return _maxMergeDocs;
+    }
+	  
+    public void setUseCompoundFile(boolean useCompoundFile) {
+      _useCompoundFile = useCompoundFile;
+    }
+	  
+    public boolean isUseCompoundFile() {
+      return _useCompoundFile;
+    }
+  }
+}

Propchange: lucene/java/trunk/contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message