lucene-dev mailing list archives

From Mark Miller <markrmil...@gmail.com>
Subject Re: Controlling the maximum size of a segment during indexing
Date Fri, 09 Apr 2010 12:57:14 GMT
Setting maxMergeMB does not limit the size of the segments you will see;
it only limits which segments will be merged: segments over maxMergeMB
will not be merged with other segments. You can still buffer up a ton of
docs in RAM and flush a segment larger than maxMergeMB, or merge n
segments that are each smaller than maxMergeMB into a single segment
larger than maxMergeMB.


-- 
- Mark

http://www.lucidimagination.com



On 04/09/2010 01:01 AM, Lance Norskog wrote:
> Here is a Java unit test that uses the LogByteSizeMergePolicy to
> control the maximum size of segment files during indexing. That is, it
> tries. It does not succeed. Will someone who truly understands the
> merge policy code please examine it? There is probably one tiny
> parameter missing.
>
> It adds 50 documents that are each 100k in size.
>
> It creates an index in a RAMDirectory which should have one segment
> that's a tad over 1mb, and then a set of segments that are a tad over
> 500k. Instead, the data does not flush until it commits, writing one
> 5m segment.
>
>
> -------------------------------------------------------------
> org.apache.lucene.index.TestIndexWriterMergeMB
> -------------------------------------------------------------------------------
>
> package org.apache.lucene.index;
>
> /**
>   * Licensed to the Apache Software Foundation (ASF) under one or more
>   * contributor license agreements.  See the NOTICE file distributed with
>   * this work for additional information regarding copyright ownership.
>   * The ASF licenses this file to You under the Apache License, Version 2.0
>   * (the "License"); you may not use this file except in compliance with
>   * the License.  You may obtain a copy of the License at
>   *
>   *     http://www.apache.org/licenses/LICENSE-2.0
>   *
>   * Unless required by applicable law or agreed to in writing, software
>   * distributed under the License is distributed on an "AS IS" BASIS,
>   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>   * See the License for the specific language governing permissions and
>   * limitations under the License.
>   */
>
> import java.io.IOException;
>
> import org.apache.lucene.analysis.WhitespaceAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.FieldSelectorResult;
> import org.apache.lucene.document.Field.Index;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.LuceneTestCase;
>
> /*
>   * Verify that segment sizes are limited to # of bytes.
>   *
>   * Sizing:
>   *  Max MB is 0.5m. Verify against this plus 100k slop. (1.2x)
>   *  Min MB is 10k.
>   *  Each document is 100k.
>   *  mergeSegments=2
>   *  MaxRAMBuffer=1m. Verify against this plus 200k slop. (1.2x)
>   *
>   *  This test should cause the ram buffer to flush after 10 documents,
>   *  and create a CFS a little over 1meg.
>   *  The later documents should be flushed to disk every 5-6 documents,
>   *  and create CFS files a little over 0.5meg.
>   */
>
>
> public class TestIndexWriterMergeMB extends LuceneTestCase {
>    private static final int MERGE_FACTOR = 2;
>    private static final double RAMBUFFER_MB = 1.0;
>    static final double MIN_MB = 0.01d;
>    static final double MAX_MB = 0.5d;
>    static final double SLOP_FACTOR = 1.2d;
>    static final double MB = 1000*1000;
>    static String VALUE_100k = null;
>
>    // Test controlling the merge policy for max segment size in MB
>    public void testMaxMergeMB() throws IOException {
>      Directory dir = new RAMDirectory();
>      IndexWriterConfig config = new IndexWriterConfig(
>          TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
>
>      LogByteSizeMergePolicy mergeMB = new LogByteSizeMergePolicy();
>      config.setMergePolicy(mergeMB);
>      mergeMB.setMinMergeMB(MIN_MB);
>      mergeMB.setMaxMergeMB(MAX_MB);
>      mergeMB.setUseCompoundFile(true);
>      mergeMB.setMergeFactor(MERGE_FACTOR);
>      config.setMaxBufferedDocs(100);  // irrelevant, but the next line fails without this
>      config.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
>      MergeScheduler scheduler = new SerialMergeScheduler();
>      config.setMergeScheduler(scheduler);
>      IndexWriter writer = new IndexWriter(dir, config);
>
>      System.out.println("Start indexing");
>      for (int i = 0; i < 50; i++) {
>        addDoc(writer, i);
>        printSegmentSizes(dir);
>      }
>      checkSegmentSizes(dir);
>      System.out.println("Commit");
>      writer.commit();
>      printSegmentSizes(dir);
>      checkSegmentSizes(dir);
>      writer.close();
>    }
>
>    // document that takes up 100k of RAM
>    private void addDoc(IndexWriter writer, int i) throws IOException {
>      if (VALUE_100k == null) {
>        StringBuilder value = new StringBuilder(100000);
>        for (int fill = 0; fill < 100000; fill++) {
>          value.append('a');
>        }
>        VALUE_100k = value.toString();
>      }
>      Document doc = new Document();
>      doc.add(new Field("id", i + "", Field.Store.YES, Field.Index.NOT_ANALYZED));
>      doc.add(new Field("content", VALUE_100k, Field.Store.YES,
> Field.Index.NOT_ANALYZED));
>      writer.addDocument(doc);
>    }
>
>
>    private void checkSegmentSizes(Directory dir) {
>      try {
>        String[] files = dir.listAll();
>        for (String file : files) {
>          if (file.equals("_0.cfs")) {
>            long length = dir.fileLength(file);
>            assertTrue("First segment: " + file + " size = " + length + "<  "
>                + (int) ((SLOP_FACTOR * RAMBUFFER_MB) * MB), length<
> (SLOP_FACTOR * RAMBUFFER_MB) * MB);
>          } else if (file.endsWith(".cfs")) {
>            long length = dir.fileLength(file);
>            assertTrue("Later segment: " + file + " size = " + length +
> " should be<  "
>                + (int) ((SLOP_FACTOR * MAX_MB) * MB), length<
> (SLOP_FACTOR * MAX_MB) * MB);
>          }
>        }
>      } catch (IOException e) {
>        System.err.println("Impossible: " + e.getMessage());
>      }
>    }
>
>    private void printSegmentSizes(Directory dir) {
>      try {
>        String[] files = dir.listAll();
>        System.out.println("Print index");
>        for(String file: files) {
>          if (file.endsWith(".cfs")) {
>            long length = dir.fileLength(file);
>            System.out.println("  file: " + file + " has " + length + " bytes");
>          }
>        }
>      } catch (IOException e) {
>        System.err.println("Impossible: " + e.getMessage());
>      }
>    }
> }
>


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org

