Return-Path: Delivered-To: apmail-jakarta-lucene-dev-archive@www.apache.org Received: (qmail 81257 invoked from network); 8 Oct 2004 15:59:10 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur-2.apache.org with SMTP; 8 Oct 2004 15:59:10 -0000 Received: (qmail 74773 invoked by uid 500); 8 Oct 2004 15:58:59 -0000 Delivered-To: apmail-jakarta-lucene-dev-archive@jakarta.apache.org Received: (qmail 74681 invoked by uid 500); 8 Oct 2004 15:58:57 -0000 Mailing-List: contact lucene-dev-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Lucene Developers List" Reply-To: "Lucene Developers List" Delivered-To: mailing list lucene-dev@jakarta.apache.org Received: (qmail 74626 invoked by uid 500); 8 Oct 2004 15:58:56 -0000 Received: (qmail 74595 invoked by uid 99); 8 Oct 2004 15:58:56 -0000 X-ASF-Spam-Status: No, hits=-10.0 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [209.237.227.194] (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.28) with SMTP; Fri, 08 Oct 2004 08:58:54 -0700 Received: (qmail 81105 invoked by uid 1209); 8 Oct 2004 15:58:49 -0000 Date: 8 Oct 2004 15:58:49 -0000 Message-ID: <20041008155849.81104.qmail@minotaur.apache.org> From: cutting@apache.org To: jakarta-lucene-cvs@apache.org Subject: cvs commit: jakarta-lucene/src/java/org/apache/lucene/index TermBuffer.java SegmentTermEnum.java TermInfosReader.java X-Virus-Checked: Checked X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N cutting 2004/10/08 08:58:49 Modified: . CHANGES.txt src/java/org/apache/lucene/index SegmentTermEnum.java TermInfosReader.java Added: src/java/org/apache/lucene/index TermBuffer.java Log: Optimize term dictionary lookup to allocate fewer terms. Revision Changes Path 1.118 +6 -1 jakarta-lucene/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v retrieving revision 1.117 retrieving revision 1.118 diff -u -r1.117 -r1.118 --- CHANGES.txt 6 Oct 2004 12:15:05 -0000 1.117 +++ CHANGES.txt 8 Oct 2004 15:58:49 -0000 1.118 @@ -97,6 +97,11 @@ 21. Add a serializable Parameter Class to standardize parameter enum classes in BooleanClause and Field. (Christoph) +22. Optimize term-dictionary lookup to allocate far fewer terms when + scanning for the matching term. This speeds searches involving + low-frequency terms, where the cost of dictionary lookup can be + significant. (cutting) + 1.4.1 1.9 +25 -28 jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java Index: SegmentTermEnum.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- SegmentTermEnum.java 16 Sep 2004 21:13:37 -0000 1.8 +++ SegmentTermEnum.java 8 Oct 2004 15:58:49 -0000 1.9 @@ -25,7 +25,10 @@ long size; long position = -1; - private Term term = new Term("", ""); + private TermBuffer termBuffer = new TermBuffer(); + private TermBuffer prevBuffer = new TermBuffer(); + private TermBuffer scratch; // used for scanning + private TermInfo termInfo = new TermInfo(); private int format; @@ -34,9 +37,6 @@ int indexInterval; int skipInterval; private int formatM1SkipInterval; - Term prev; - - private char[] buffer = {}; SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) throws IOException { @@ -89,7 +89,10 @@ clone.input = (IndexInput) input.clone(); clone.termInfo = new TermInfo(termInfo); - if (term != null) clone.growBuffer(term.text.length()); + + clone.termBuffer = (TermBuffer)termBuffer.clone(); + clone.prevBuffer = (TermBuffer)prevBuffer.clone(); + clone.scratch = null; return clone; } @@ -98,21 +101,20 @@ throws IOException { input.seek(pointer); position = p; - term = t; - prev = null; + termBuffer.set(t); + prevBuffer.reset(); termInfo.set(ti); - growBuffer(term.text.length()); // copy term text into buffer } /** Increments the enumeration to the next element. True if one exists.*/ public final boolean next() throws IOException { if (position++ >= size - 1) { - term = null; + termBuffer.reset(); return false; } - prev = term; - term = readTerm(); + prevBuffer.set(termBuffer); + termBuffer.read(input, fieldInfos); termInfo.docFreq = input.readVInt(); // read doc freq termInfo.freqPointer += input.readVLong(); // read freq pointer @@ -138,28 +140,23 @@ return true; } - private final Term readTerm() throws IOException { - int start = input.readVInt(); - int length = input.readVInt(); - int totalLength = start + length; - if (buffer.length < totalLength) - growBuffer(totalLength); - - input.readChars(buffer, start, length); - return new Term(fieldInfos.fieldName(input.readVInt()), - new String(buffer, 0, totalLength), false); - } - - private final void growBuffer(int length) { - buffer = new char[length]; - for (int i = 0; i < term.text.length(); i++) // copy contents - buffer[i] = term.text.charAt(i); + /** Optimized scan, without allocating new terms. */ + final void scanTo(Term term) throws IOException { + if (scratch == null) + scratch = new TermBuffer(); + scratch.set(term); + while (scratch.compareTo(termBuffer) > 0 && next()) {} } /** Returns the current Term in the enumeration. Initially invalid, valid after next() called for the first time.*/ public final Term term() { - return term; + return termBuffer.toTerm(); + } + + /** Returns the previous Term enumerated. Initially null.*/ + final Term prev() { + return prevBuffer.toTerm(); } /** Returns the current TermInfo in the enumeration. 1.11 +2 -2 jakarta-lucene/src/java/org/apache/lucene/index/TermInfosReader.java Index: TermInfosReader.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/TermInfosReader.java,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- TermInfosReader.java 16 Sep 2004 21:13:37 -0000 1.10 +++ TermInfosReader.java 8 Oct 2004 15:58:49 -0000 1.11 @@ -129,7 +129,7 @@ // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum enumerator = getEnum(); if (enumerator.term() != null // term is at or past current - && ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0) + && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1; if (indexTerms.length == enumOffset // but before end of block @@ -145,7 +145,7 @@ /** Scans within block for matching term. */ private final TermInfo scanEnum(Term term) throws IOException { SegmentTermEnum enumerator = getEnum(); - while (term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} + enumerator.scanTo(term); if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) return enumerator.termInfo(); else 1.1 jakarta-lucene/src/java/org/apache/lucene/index/TermBuffer.java Index: TermBuffer.java =================================================================== package org.apache.lucene.index; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.IndexInput; final class TermBuffer implements Cloneable { private static final char[] NO_CHARS = new char[0]; private String field; private char[] text = NO_CHARS; private int textLength; private Term term; // cached public final int compareTo(TermBuffer other) { if (field == other.field) // fields are interned return compareChars(text, textLength, other.text, other.textLength); else return field.compareTo(other.field); } private static final int compareChars(char[] v1, int len1, char[] v2, int len2) { int end = Math.min(len1, len2); for (int k = 0; k < end; k++) { char c1 = v1[k]; char c2 = v2[k]; if (c1 != c2) { return c1 - c2; } } return len1 - len2; } private final void setTextLength(int newLength) { if (text.length < newLength) { char[] newText = new char[newLength]; System.arraycopy(text, 0, newText, 0, textLength); text = newText; } textLength = newLength; } public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; setTextLength(totalLength); input.readChars(this.text, start, length); this.field = fieldInfos.fieldName(input.readVInt()); } public final void set(Term term) { if (term == null) { reset(); return; } // copy text into the buffer setTextLength(term.text().length()); term.text().getChars(0, term.text().length(), text, 0); this.field = term.field(); this.term = term; } public final void set(TermBuffer other) { setTextLength(other.textLength); System.arraycopy(other.text, 0, text, 0, textLength); this.field = other.field; this.term = other.term; } public void reset() { this.field = null; this.textLength = 0; this.term = null; } public Term toTerm() { if (field == null) // unset return null; if (term == null) term = new Term(field, new String(text, 0, textLength), false); return term; } protected Object clone() { TermBuffer clone = null; try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} clone.text = new char[text.length]; System.arraycopy(text, 0, clone.text, 0, textLength); return clone; } } --------------------------------------------------------------------- To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org For additional commands, e-mail: lucene-dev-help@jakarta.apache.org