From lucene-dev-return-4823-apmail-jakarta-lucene-dev-archive=jakarta.apache.org@jakarta.apache.org Tue Dec 09 19:42:51 2003 Return-Path: Delivered-To: apmail-jakarta-lucene-dev-archive@www.apache.org Received: (qmail 46921 invoked from network); 9 Dec 2003 19:42:51 -0000 Received: from daedalus.apache.org (HELO mail.apache.org) (208.185.179.12) by minotaur-2.apache.org with SMTP; 9 Dec 2003 19:42:51 -0000 Received: (qmail 52675 invoked by uid 500); 9 Dec 2003 19:42:39 -0000 Delivered-To: apmail-jakarta-lucene-dev-archive@jakarta.apache.org Received: (qmail 52656 invoked by uid 500); 9 Dec 2003 19:42:38 -0000 Mailing-List: contact lucene-dev-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Lucene Developers List" Reply-To: "Lucene Developers List" Delivered-To: mailing list lucene-dev@jakarta.apache.org Received: (qmail 52520 invoked from network); 9 Dec 2003 19:42:25 -0000 Received: from unknown (HELO salix.caltha.pl) (212.87.7.182) by daedalus.apache.org with SMTP; 9 Dec 2003 19:42:25 -0000 Received: from [62.121.110.244] (244-mo3-7.acn.waw.pl [::ffff:62.121.110.244]) (AUTH: LOGIN zwierzem, ) by salix.caltha.pl with esmtp; Tue, 09 Dec 2003 20:42:20 +0100 Subject: Re: Revival of Dmitry's Term Vector patches From: Damian Gajda To: Lucene Developers List In-Reply-To: <20031117213217.61249.qmail@web12704.mail.yahoo.com> References: <20031117213217.61249.qmail@web12704.mail.yahoo.com> Organization: Caltha Sp. j. 
Message-Id: <1070999039.7063.1.camel@localhost.localdomain> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="=_salix-19937-1070998941-0001-2" X-Mailer: Ximian Evolution 1.4.5 (1.4.5-7) Date: Tue, 09 Dec 2003 20:44:00 +0100 X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N --=_salix-19937-1070998941-0001-2 Content-Type: text/plain; charset=iso-8859-1 Content-Transfer-Encoding: 7bit Hello Otis, Here is a patch with documentation from Dmitry. I used cvs diff -uN Hope it is OK now. -- Damian --=_salix-19937-1070998941-0001-2 Content-Type: text/x-patch; name="patch.diff"; charset=iso-8859-2 Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename=patch.diff Index: src/java/org/apache/lucene/document/Field.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v retrieving revision 1.11 diff -u -r1.11 Field.java --- src/java/org/apache/lucene/document/Field.java 20 Mar 2003 18:28:13 -0000 1.11 +++ src/java/org/apache/lucene/document/Field.java 9 Dec 2003 19:39:05 -0000 @@ -162,6 +162,8 @@ is used. Exactly one of stringValue() and readerValue() must be set. */ public Reader readerValue() { return readerValue; } + /** Create a field by specifying all parameters. 
+ */ public Field(String name, String string, boolean store, boolean index, boolean token) { if (name == null) Index: src/java/org/apache/lucene/index/FieldInfos.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v retrieving revision 1.4 diff -u -r1.4 FieldInfos.java --- src/java/org/apache/lucene/index/FieldInfos.java 21 Oct 2003 17:59:16 -0000 1.4 +++ src/java/org/apache/lucene/index/FieldInfos.java 9 Dec 2003 19:39:05 -0000 @@ -68,6 +68,12 @@ import org.apache.lucene.store.OutputStream; import org.apache.lucene.store.InputStream; +/** Access to the Field Info file that describes document fields and whether or + * not they are indexed. Each segment has a separate Field Info file. Objects + * of this class are thread-safe for multiple readers, but only one thread can + * be adding documents at a time, with no other reader or writer threads + * accessing this object. + */ final class FieldInfos { private Vector byNumber = new Vector(); private Hashtable byName = new Hashtable(); @@ -94,6 +100,10 @@ } } + /** Adds in information for a set of FieldInfos. + * Returns an array mapping each field number in the names + * collection to the field numbers in this one. + */ final void add(Collection names, boolean isIndexed) { Iterator i = names.iterator(); while (i.hasNext()) { @@ -101,6 +111,10 @@ } } + /** If the field is not yet known, adds it. If it is known, checks + * to make sure that the isIndexed flag is the same as was given + * previously for this field. If not - throws IllegalStateException. 
+ */ final void add(String name, boolean isIndexed) { FieldInfo fi = fieldInfo(name); if (fi == null) Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java,v retrieving revision 1.2 diff -u -r1.2 SegmentMergeInfo.java --- src/java/org/apache/lucene/index/SegmentMergeInfo.java 21 Oct 2003 17:59:16 -0000 1.2 +++ src/java/org/apache/lucene/index/SegmentMergeInfo.java 9 Dec 2003 19:39:06 -0000 @@ -57,14 +57,38 @@ import java.io.IOException; import org.apache.lucene.util.BitVector; +/** Data container to work with SegmentMergeQueue. Represents a single segment + * to be merged. Maintains the segment reader, TermEnum, and TermPositions + * for this segment. + */ final class SegmentMergeInfo { + /** The current term of this segment, or null if none. */ Term term; + + /** Index of the 0th document from this segment in the merged document numbering. */ int base; + + /** This segment's term enum. Do not use directly. */ TermEnum termEnum; + + /** This segment's reader. Do not use directly. */ IndexReader reader; + + /** Postings for the current term. */ TermPositions postings; + + + /** Maps around deleted docs. Contains a slot for each document in the + * reader. Slots corresponding to deleted docs have the value of -1. The + * rest have their new document numbers that start at 0. This value + * added to base is the document number in the merged numbering. + */ int[] docMap = null; // maps around deleted docs + /** Create a new merge info. Base b is a starting + * number for documents from this segment in the merged document + * numbering. + */ SegmentMergeInfo(int b, TermEnum te, IndexReader r) throws IOException { base = b; @@ -87,6 +111,12 @@ } } + + /** Shift to the next term on this segment's TermEnum. 
The new + * term becomes the current term for this segment, affecting the + * ordering of the SegmentMergeQueue. If no more terms remain + * in this segment, returns false and resets the current term to null. + */ final boolean next() throws IOException { if (termEnum.next()) { term = termEnum.term(); Index: src/java/org/apache/lucene/index/SegmentMergeQueue.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java,v retrieving revision 1.1.1.1 diff -u -r1.1.1.1 SegmentMergeQueue.java --- src/java/org/apache/lucene/index/SegmentMergeQueue.java 18 Sep 2001 16:29:53 -0000 1.1.1.1 +++ src/java/org/apache/lucene/index/SegmentMergeQueue.java 9 Dec 2003 19:39:06 -0000 @@ -57,6 +57,10 @@ import java.io.IOException; import org.apache.lucene.util.PriorityQueue; +/** Priority queue of SegmentMergeInfo objects. The queue sorts the + * info objects by their current term, and if the terms are equal, + * by their base offset. + */ final class SegmentMergeQueue extends PriorityQueue { SegmentMergeQueue(int size) { initialize(size); Index: src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v retrieving revision 1.6 diff -u -r1.6 SegmentMerger.java --- src/java/org/apache/lucene/index/SegmentMerger.java 31 Oct 2003 09:28:44 -0000 1.6 +++ src/java/org/apache/lucene/index/SegmentMerger.java 9 Dec 2003 19:39:07 -0000 @@ -77,20 +77,33 @@ "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis" }; + /** Create a segment merger that will merge a number of segments (specified + * as SegmentReaders added to this object with calls to add) into a + * single segment with the specified name. 
+ */ SegmentMerger(Directory dir, String name, boolean compoundFile) { directory = dir; segment = name; useCompoundFile = compoundFile; } + /** Add segment reader to be merged. + * + */ final void add(IndexReader reader) { readers.addElement(reader); } + /** Return one of the segment readers being merged. + * + */ final IndexReader segmentReader(int i) { return (IndexReader)readers.elementAt(i); } + /** Start the merge. All segment readers to be merged must have been added + * prior to this call. + */ final int merge() throws IOException { int value; try { @@ -148,6 +161,9 @@ } + /** Merge the field information from the segment readers. + * Called from merge. + */ private final int mergeFields() throws IOException { fieldInfos = new FieldInfos(); // merge field names int docCount = 0; @@ -181,6 +197,9 @@ private TermInfosWriter termInfosWriter = null; private SegmentMergeQueue queue = null; + /** Merge the term index, frequency and proximity information + * from specified segment readers. Called from merge. + */ private final void mergeTerms() throws IOException { try { freqOutput = directory.createFile(segment + ".frq"); @@ -198,7 +217,11 @@ } } + /** Merge the term index information. Called from mergeTerms. + */ private final void mergeTermInfos() throws IOException { + // Create and populate a priority queue of segments to be merged. + // Segments are sorted by their top term and the base doc number in the merged segment. 
queue = new SegmentMergeQueue(readers.size()); int base = 0; for (int i = 0; i < readers.size(); i++) { @@ -220,13 +243,19 @@ Term term = match[0].term; SegmentMergeInfo top = (SegmentMergeInfo)queue.top(); + // pop off the queue and put into match[] all segments + // that have the same term at the top while (top != null && term.compareTo(top.term) == 0) { match[matchSize++] = (SegmentMergeInfo)queue.pop(); top = (SegmentMergeInfo)queue.top(); } + // perform the merge for all segments that are positioned on + // the same term mergeTermInfo(match, matchSize); // add new TermInfo + // advance the matched segments to the next term and, if one exists, put + // the segment back onto the queue (priority queue takes care of sorting them) while (matchSize > 0) { SegmentMergeInfo smi = match[--matchSize]; if (smi.next()) @@ -239,6 +268,14 @@ private final TermInfo termInfo = new TermInfo(); // minimize consing + + /** Merge one term found in one or more segments. The array smis + * contains segments that are positioned at the same term. N + * is the number of cells in the array actually occupied. + * + * @param smis array of segments + * @param n number of cells in the array actually occupied + */ private final void mergeTermInfo(SegmentMergeInfo[] smis, int n) throws IOException { long freqPointer = freqOutput.getFilePointer(); @@ -253,6 +290,14 @@ } } + /** Process postings from multiple segments all positioned on the + * same term. Writes out merged entries into freqOutput and + * the proxOutput streams. + * + * @param smis array of segments + * @param n number of cells in the array actually occupied + * @return number of documents across all segments where this term was found + */ private final int appendPostings(SegmentMergeInfo[] smis, int n) throws IOException { int lastDoc = 0; @@ -295,6 +340,10 @@ } return df; } + + /** Merge field normalization factors for the specified segment readers. + * Called from merge. 
+ */ private final void mergeNorms() throws IOException { for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); --=_salix-19937-1070998941-0001-2 Content-Type: text/plain; charset=us-ascii --------------------------------------------------------------------- To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org For additional commands, e-mail: lucene-dev-help@jakarta.apache.org --=_salix-19937-1070998941-0001-2--