lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Shai Erera <ser...@gmail.com>
Subject Re: Controlling the maximum size of a segment during indexing
Date Fri, 09 Apr 2010 05:20:35 GMT
I'm not sure .. but did you set the RAMBufferSizeMB on IWC? Doesn't look
like it, and the default is 16 MB, which can explain why it doesn't flush
before that.

Shai

On Fri, Apr 9, 2010 at 8:01 AM, Lance Norskog <goksron@gmail.com> wrote:

> Here is a Java unit test that uses the LogByteSizeMergePolicy to
> control the maximum size of segment files during indexing. That is, it
> tries. It does not succeed. Will someone who truly understands the
> merge policy code please examine it. There is probably one tiny
> parameter missing.
>
> It adds 20 documents that each are 100k in size.
>
> It creates an index in a RAMDirectory which should have one segment
> that's a tad over 1mb, and then a set of segments that are a tad over
> 500k. Instead, the data does not flush until it commits, writing one
> 5m segment.
>
>
> -------------------------------------------------------------
> org.apache.lucene.index.TestIndexWriterMergeMB
>
> -------------------------------------------------------------------------------
>
> package org.apache.lucene.index;
>
> /**
>  * Licensed to the Apache Software Foundation (ASF) under one or more
>  * contributor license agreements.  See the NOTICE file distributed with
>  * this work for additional information regarding copyright ownership.
>  * The ASF licenses this file to You under the Apache License, Version 2.0
>  * (the "License"); you may not use this file except in compliance with
>  * the License.  You may obtain a copy of the License at
>  *
>  *     http://www.apache.org/licenses/LICENSE-2.0
>  *
>  * Unless required by applicable law or agreed to in writing, software
>  * distributed under the License is distributed on an "AS IS" BASIS,
>  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>  * See the License for the specific language governing permissions and
>  * limitations under the License.
>  */
>
> import java.io.IOException;
>
> import org.apache.lucene.analysis.WhitespaceAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.FieldSelectorResult;
> import org.apache.lucene.document.Field.Index;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.LuceneTestCase;
>
> /*
>  * Verify that segment sizes are limited to # of bytes.
>  *
>  * Sizing:
>  *  Max MB is 0.5m. Verify against thiAs plus 100k slop. (1.2x)
>  *  Min MB is 10k.
>  *  Each document is 100k.
>  *  mergeSegments=2
>  *  MaxRAMBuffer=1m. Verify against this plus 200k slop. (1.2x)
>  *
>  *  This test should cause the ram buffer to flush after 10 documents,
> and create a CFS a little over 1meg.
>  *  The later documents should be flushed to disk every 5-6 documents,
> and create CFS files a little over 0.5meg.
>  */
>
>
> public class TestIndexWriterMergeMB extends LuceneTestCase {
>  private static final int MERGE_FACTOR = 2;
>  private static final double RAMBUFFER_MB = 1.0;
>  static final double MIN_MB = 0.01d;
>  static final double MAX_MB = 0.5d;
>  static final double SLOP_FACTOR = 1.2d;
>  static final double MB = 1000*1000;
>  static String VALUE_100k = null;
>
>  // Test controlling the mergePolicy for max # of docs
>  public void testMaxMergeMB() throws IOException {
>    Directory dir = new RAMDirectory();
>    IndexWriterConfig config = new IndexWriterConfig(
>        TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
>
>    LogByteSizeMergePolicy mergeMB = new LogByteSizeMergePolicy();
>    config.setMergePolicy(mergeMB);
>    mergeMB.setMinMergeMB(MIN_MB);
>    mergeMB.setMaxMergeMB(MAX_MB);
>    mergeMB.setUseCompoundFile(true);
>    mergeMB.setMergeFactor(MERGE_FACTOR);
>    config.setMaxBufferedDocs(100);                    // irrelevant
> but the next line fails without this.
>    config.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
>    MergeScheduler scheduler = new SerialMergeScheduler();
>    config.setMergeScheduler(scheduler);
>    IndexWriter writer = new IndexWriter(dir, config);
>
>    System.out.println("Start indexing");
>    for (int i = 0; i < 50; i++) {
>      addDoc(writer, i);
>      printSegmentSizes(dir);
>    }
>    checkSegmentSizes(dir);
>    System.out.println("Commit");
>    writer.commit();
>    printSegmentSizes(dir);
>    checkSegmentSizes(dir);
>    writer.close();
>  }
>
>  // document that takes of 100k of RAM
>  private void addDoc(IndexWriter writer, int i) throws IOException {
>    if (VALUE_100k == null) {
>      StringBuilder value = new StringBuilder(100000);
>      for(int fill = 0; fill < 100000; fill ++) {
>        value.append('a');
>      }
>      VALUE_100k = value.toString();
>    }
>    Document doc = new Document();
>    doc.add(new Field("id", i + "", Field.Store.YES,
> Field.Index.NOT_ANALYZED));
>    doc.add(new Field("content", VALUE_100k, Field.Store.YES,
> Field.Index.NOT_ANALYZED));
>    writer.addDocument(doc);
>  }
>
>
>  private void checkSegmentSizes(Directory dir) {
>    try {
>      String[] files = dir.listAll();
>      for (String file : files) {
>        if (file.equals("_0.cfs")) {
>          long length = dir.fileLength(file);
>          assertTrue("First segment: " + file + " size = " + length + " < "
>              + (int) ((SLOP_FACTOR * RAMBUFFER_MB) * MB), length <
> (SLOP_FACTOR * RAMBUFFER_MB) * MB);
>        } else if (file.endsWith(".cfs")) {
>          long length = dir.fileLength(file);
>          assertTrue("Later segment: " + file + " size = " + length +
> " should be < "
>              + (int) ((SLOP_FACTOR * MAX_MB) * MB), length <
> (SLOP_FACTOR * MAX_MB) * MB);
>        }
>      }
>    } catch (IOException e) {
>      System.err.println("Impossible: " + e.getMessage());
>    }
>  }
>
>  private void printSegmentSizes(Directory dir) {
>    try {
>      String[] files = dir.listAll();
>      System.out.println("Print index");
>      for(String file: files) {
>        if (file.endsWith(".cfs")) {
>          long length = dir.fileLength(file);
>          System.out.println("  file: " + file + " has " + length + "
> bytes");
>        }
>      }
>    } catch (IOException e) {
>      System.err.println("Impossible: " + e.getMessage());
>    }
>  }
> }
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>

Mime
View raw message