lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r882962 - in /lucene/java/branches/flex_1458: contrib/misc/src/java/org/apache/lucene/index/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/index/codecs/ src/java/org/apache/lucene/index/codecs/standard/ src/java/org/apache/lu...
Date Sat, 21 Nov 2009 18:36:22 GMT
Author: mikemccand
Date: Sat Nov 21 18:36:17 2009
New Revision: 882962

URL: http://svn.apache.org/viewvc?rev=882962&view=rev
Log:
LUCENE-1458 (on flex branch): allow codecs to override merging

Added:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java
  (with props)
Removed:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentFieldMergeQueue.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMergeQueue.java
Modified:
    lucene/java/branches/flex_1458/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/DocsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/PositionsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/OpenBitSet.java

Modified: lucene/java/branches/flex_1458/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java (original)
+++ lucene/java/branches/flex_1458/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java Sat Nov 21 18:36:17 2009
@@ -26,6 +26,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.Bits;
 
 /**
  * This tool splits input index into multiple equal parts. The method employed
@@ -212,6 +213,11 @@
     }
 
     @Override
+    public Bits getDeletedDocs() {
+      return dels;
+    }
+
+    @Override
     public boolean isDeleted(int n) {
       return dels.get(n);
     }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java Sat Nov 21 18:36:17 2009
@@ -989,6 +989,8 @@
         // by re-using the same TermsEnum and seeking only
         // forwards
         if (term.field() != currentField) {
+          // nocommit -- once we sync up branch again, add
+          // assert that this field is always > last one
           currentField = term.field();
           Terms terms = fields.terms(currentField);
           if (terms != null) {

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java Sat Nov 21 18:36:17 2009
@@ -28,13 +28,12 @@
 import org.apache.lucene.index.MergePolicy.MergeAbortedException;
 import org.apache.lucene.index.codecs.Codecs;
 import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.MergeState;
 import org.apache.lucene.index.codecs.FieldsConsumer;
-import org.apache.lucene.index.codecs.TermsConsumer;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.index.codecs.DocsConsumer;
-import org.apache.lucene.index.codecs.PositionsConsumer;
+import org.apache.lucene.util.Bits;
 
 /**
  * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add},
@@ -575,9 +574,6 @@
     }
   }
 
-  private SegmentFieldMergeQueue fieldsQueue;
-  private SegmentMergeQueue termsQueue;
-  
   Codec getCodec() {
     return codec;
   }
@@ -587,204 +583,68 @@
     SegmentWriteState state = new SegmentWriteState(null, directory, segment, fieldInfos, null, mergedDocs, 0, termIndexInterval, codecs);
 
     // Let Codecs decide which codec will be used to write
-    // this segment:
+    // the new segment:
     codec = codecs.getWriter(state);
     
+    mergeState = new MergeState();
+    mergeState.readers = readers;
+    mergeState.fieldInfos = fieldInfos;
+    mergeState.readerCount = readers.size();
+    mergeState.mergedDocCount = mergedDocs;
+    
+    // Remap docIDs
+    mergeState.delCounts = new int[mergeState.readerCount];
+    mergeState.docMaps = new int[mergeState.readerCount][];
+    mergeState.docBase = new int[mergeState.readerCount];
+
+    int docBase = 0;
+    for(int i=0;i<mergeState.readerCount;i++) {
+      final IndexReader reader = readers.get(i);
+      mergeState.delCounts[i] = reader.numDeletedDocs();
+      mergeState.docBase[i] = docBase;
+      docBase += reader.numDocs();
+      if (mergeState.delCounts[i] != 0) {
+        int delCount = 0;
+        Bits deletedDocs = reader.getDeletedDocs();
+        final int maxDoc = reader.maxDoc();
+        final int[] docMap = mergeState.docMaps[i] = new int[maxDoc];
+        int newDocID = 0;
+        for(int j=0;j<maxDoc;j++) {
+          if (deletedDocs.get(j)) {
+            docMap[j] = -1;
+            delCount++;  // only for assert
+          } else {
+            docMap[j] = newDocID++;
+          }
+        }
+        assert delCount == mergeState.delCounts[i]: "reader delCount=" + mergeState.delCounts[i] + " vs recomputed delCount=" + delCount;
+      }
+    }
+
+    Fields[] fields = new Fields[mergeState.readerCount];
+    for(int i=0;i<mergeState.readerCount;i++) {
+      fields[i] = readers.get(i).fields();
+    }
+
     final FieldsConsumer consumer = codec.fieldsConsumer(state);
 
     try {
-      fieldsQueue = new SegmentFieldMergeQueue(readers.size());
-      termsQueue = new SegmentMergeQueue(readers.size());
-      mergeTermInfos(consumer);
+      consumer.merge(mergeState, fields);
     } finally {
       consumer.close();
     }
   }
 
-  boolean omitTermFreqAndPositions;
-
-  private final void mergeTermInfos(final FieldsConsumer consumer) throws CorruptIndexException, IOException {
-    int base = 0;
-    final int readerCount = readers.size();
-    for (int i = 0; i < readerCount; i++) {
-      IndexReader reader = readers.get(i);
-      SegmentMergeInfo smi = new SegmentMergeInfo(base, reader);
-      int[] docMap  = smi.getDocMap();
-      if (docMap != null) {
-        if (docMaps == null) {
-          docMaps = new int[readerCount][];
-          delCounts = new int[readerCount];
-        }
-        docMaps[i] = docMap;
-        delCounts[i] = smi.reader.maxDoc() - smi.reader.numDocs();
-      }
-      
-      base += reader.numDocs();
-
-      assert reader.numDocs() == reader.maxDoc() - smi.delCount;
-
-      if (smi.nextField()) {
-        fieldsQueue.add(smi);				  // initialize queue
-      } else {
-        // segment is done: it has no fields
-      }
-    }
-
-    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
-
-    while (fieldsQueue.size() > 0) {
-
-      while(true) {
-        SegmentMergeInfo smi = fieldsQueue.pop();
-        if (smi.nextTerm()) {
-          termsQueue.add(smi);
-        } else if (smi.nextField()) {
-          // field had no terms
-          fieldsQueue.add(smi);
-        } else {
-          // done with a segment
-        }
-        SegmentMergeInfo top = fieldsQueue.top();
-        if (top == null || (termsQueue.size() > 0 && ((SegmentMergeInfo) termsQueue.top()).field != top.field)) {
-          break;
-        }
-      }
-        
-      if (termsQueue.size() > 0) {          
-        // merge one field
-
-        final String field  = termsQueue.top().field;
-        if (Codec.DEBUG) {
-          System.out.println("merge field=" + field + " segCount=" + termsQueue.size());
-        }
-        final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-        final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
-        omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
-
-        while(termsQueue.size() > 0) {
-          // pop matching terms
-          int matchSize = 0;
-          while(true) {
-            match[matchSize++] = termsQueue.pop();
-            SegmentMergeInfo top = termsQueue.top();
-            if (top == null || !top.term.termEquals(match[0].term)) {
-              break;
-            }
-          }
-
-          if (Codec.DEBUG) {
-            System.out.println("merge field=" + field + " term=" + match[0].term + " numReaders=" + matchSize);
-          }
-
-          int df = appendPostings(termsConsumer, match, matchSize);
-
-          checkAbort.work(df/3.0);
-
-          // put SegmentMergeInfos back into repsective queues
-          while (matchSize > 0) {
-            SegmentMergeInfo smi = match[--matchSize];
-            if (smi.nextTerm()) {
-              termsQueue.add(smi);
-            } else if (smi.nextField()) {
-              fieldsQueue.add(smi);
-            } else {
-              // done with a segment
-            }
-          }
-        }
-        termsConsumer.finish();
-      }
-    }
-  }
+  private MergeState mergeState;
 
-  private byte[] payloadBuffer;
-  private int[][] docMaps;
   int[][] getDocMaps() {
-    return docMaps;
+    return mergeState.docMaps;
   }
-  private int[] delCounts;
+
   int[] getDelCounts() {
-    return delCounts;
+    return mergeState.delCounts;
   }
   
-  /** Process postings from multiple segments all positioned on the
-   *  same term. Writes out merged entries into freqOutput and
-   *  the proxOutput streams.
-   *
-   * @param smis array of segments
-   * @param n number of cells in the array actually occupied
-   * @return number of documents across all segments where this term was found
-   * @throws CorruptIndexException if the index is corrupt
-   * @throws IOException if there is a low-level IO error
-   */
-  private final int appendPostings(final TermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
-        throws CorruptIndexException, IOException {
-
-    final TermRef text = smis[0].term;
-
-    final DocsConsumer docConsumer = termsConsumer.startTerm(text);
-
-    int df = 0;
-    for (int i = 0; i < n; i++) {
-      if (Codec.DEBUG) {
-        System.out.println("    merge reader " + (i+1) + " of " + n + ": term=" + text);
-      }
-
-      SegmentMergeInfo smi = smis[i];
-      DocsEnum docs = smi.terms.docs(smi.reader.getDeletedDocs());
-      int base = smi.base;
-      int[] docMap = smi.getDocMap();
-
-      while (true) {
-        int startDoc = docs.next();
-        if (startDoc == DocsEnum.NO_MORE_DOCS) {
-          break;
-        }
-        if (Codec.DEBUG) {
-          System.out.println("      merge read doc=" + startDoc);
-        }
-
-        df++;
-        int doc;
-        if (docMap != null) {
-          // map around deletions
-          doc = docMap[startDoc];
-          assert doc != -1: "postings enum returned deleted docID " + startDoc + " freq=" + docs.freq() + " df=" + df;
-        } else {
-          doc = startDoc;
-        }
-
-        doc += base;                              // convert to merged space
-        assert doc < mergedDocs: "doc=" + doc + " maxDoc=" + mergedDocs;
-
-        final int freq = docs.freq();
-        final PositionsConsumer posConsumer = docConsumer.addDoc(doc, freq);
-        final PositionsEnum positions = docs.positions();
-
-        // nocommit -- omitTF should be "private", and this
-        // code (and FreqProxTermsWriter) should instead
-        // check if posConsumer is null?
-        
-        if (!omitTermFreqAndPositions) {
-          for (int j = 0; j < freq; j++) {
-            final int position = positions.next();
-            final int payloadLength = positions.getPayloadLength();
-            if (payloadLength > 0) {
-              if (payloadBuffer == null || payloadBuffer.length < payloadLength)
-                payloadBuffer = new byte[payloadLength];
-              positions.getPayload(payloadBuffer, 0);
-            }
-            posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
-          }
-          posConsumer.finishDoc();
-        }
-      }
-    }
-    termsConsumer.finishTerm(text, df);
-
-    return df;
-  }
-
   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
     IndexOutput output = null;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/DocsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/DocsConsumer.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/DocsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/DocsConsumer.java Sat Nov 21 18:36:17 2009
@@ -19,15 +19,12 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.DocsEnum;
 
 /**
  * NOTE: this API is experimental and will likely change
  */
 
-// nocommit -- name this "StandardDocsConsumer"?  eg the
-// RAMCodec doesn't need most of these methods...
 public abstract class DocsConsumer {
 
   // nocommit
@@ -43,4 +40,55 @@
    *  consumer doesn't need to see the positions for this
    *  doc. */
   public abstract PositionsConsumer addDoc(int docID, int termDocFreq) throws IOException;
+
+  public static class DocsMergeState {
+    DocsEnum docsEnum;
+    int[] docMap;
+    int docBase;
+  }
+
+  /** Default merge impl: append documents, mapping around
+   *  deletes */
+  public int merge(MergeState mergeState, DocsMergeState[] toMerge, int count) throws IOException {
+
+    int df = 0;
+    // Append docs in order:
+    for(int i=0;i<count;i++) {
+      final DocsEnum docs = toMerge[i].docsEnum;
+      final int[] docMap = toMerge[i].docMap;
+      final int base = toMerge[i].docBase;
+
+      while(true) {
+        final int startDoc = docs.next();
+        if (startDoc == DocsEnum.NO_MORE_DOCS) {
+          break;
+        }
+        df++;
+
+        int doc;
+        if (docMap != null) {
+          // map around deletions
+          doc = docMap[startDoc];
+          assert doc != -1: "postings enum returned deleted docID " + startDoc + " freq=" + docs.freq() + " df=" + df;
+        } else {
+          doc = startDoc;
+        }
+
+        doc += base;                              // convert to merged space
+        assert doc < mergeState.mergedDocCount: "doc=" + doc + " maxDoc=" + mergeState.mergedDocCount;
+
+        final int freq = docs.freq();
+        final PositionsConsumer posConsumer = addDoc(doc, freq);
+
+        // nocommit -- omitTF should be "private", and this
+        // code (and FreqProxTermsWriter) should instead
+        // check if posConsumer is null?
+        if (!mergeState.omitTermFreqAndPositions) {
+          posConsumer.merge(mergeState, docs.positions(), freq);
+        }
+      }
+    }
+
+    return df;
+  }
 }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java Sat Nov 21 18:36:17 2009
@@ -18,6 +18,10 @@
  */
 
 import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.PriorityQueue;
 
 import java.io.IOException;
 
@@ -35,4 +39,99 @@
 
   /** Called when we are done adding everything. */
   public abstract void close() throws IOException;
+
+  private final static class FieldMergeState {
+    String current;
+    FieldsEnum fieldsEnum;
+    int readerIndex;
+  }
+
+  // Used for merge-sorting by field
+  private final static class MergeQueue extends PriorityQueue<FieldMergeState> {
+    public MergeQueue(int size) {
+      initialize(size);
+    }
+
+    @Override
+    protected final boolean lessThan(FieldMergeState a, FieldMergeState b) {
+      final int cmp = a.current.compareTo(b.current);
+      if (cmp != 0) {
+        return cmp < 0;
+      } else {
+        // nocommit -- technically not required to break
+        // ties, since the terms merging will do so?
+        return a.readerIndex < b.readerIndex;
+      }
+    }
+  }
+
+  public void merge(MergeState mergeState, Fields[] fields) throws IOException {
+
+    MergeQueue queue = new MergeQueue(fields.length);
+
+    for(int i=0;i<fields.length;i++) {
+      FieldsEnum fieldsEnum = fields[i].iterator();
+      String field = fieldsEnum.next();
+      if (field != null) {
+        FieldMergeState state = new FieldMergeState();
+        state.current = field;
+        state.fieldsEnum = fieldsEnum;
+        state.readerIndex = i;
+        queue.add(state);
+      } else {
+        // no fields at all -- nothing to do
+      }
+    }
+
+    final FieldMergeState[] pending = new FieldMergeState[mergeState.readerCount];
+    final TermsConsumer.TermMergeState[] match = new TermsConsumer.TermMergeState[mergeState.readerCount];
+    for(int i=0;i<mergeState.readerCount;i++) {
+      match[i] = new TermsConsumer.TermMergeState();
+    }
+
+    // Merge sort by field name, calling terms.merge on all
+    // fields sharing same field name:
+    while(queue.size() != 0) {
+
+      int matchCount = 0;
+      int pendingCount = 0;
+
+      while(true) {
+        FieldMergeState state = pending[pendingCount++] = queue.pop();
+        TermsEnum termsEnum = state.fieldsEnum.terms();
+        if (termsEnum != null) {
+          match[matchCount].termsEnum = termsEnum;
+          match[matchCount].readerIndex = state.readerIndex;
+          matchCount++;
+        }
+        FieldMergeState top = queue.top();
+        if (top == null || top.current != pending[0].current) {
+          break;
+        }
+      }
+
+      if (matchCount > 0) {
+        // Merge one field
+        final String field = pending[0].current;
+        mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field);
+        mergeState.omitTermFreqAndPositions = mergeState.fieldInfo.omitTermFreqAndPositions;
+        final TermsConsumer termsConsumer = addField(mergeState.fieldInfo);
+        termsConsumer.merge(mergeState, match, matchCount);
+      }
+
+      // Put fields back into queue
+      for(int i=0;i<pendingCount;i++) {
+        FieldMergeState state = pending[i];
+        
+        state.current = state.fieldsEnum.next();
+        if (state.current != null) {
+          // More fields to merge
+          queue.add(state);
+        } else {
+          // Done
+        }
+      }
+    }
+
+  }
 }

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java?rev=882962&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java Sat Nov 21 18:36:17 2009
@@ -0,0 +1,42 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexReader;
+import java.util.List;
+
+/** Holds common state used during segment merging
+ *
+ * <p>This API is experimental and subject to suddenly
+ * change.</p> */
+public class MergeState {
+  public FieldInfos fieldInfos;
+  public List<IndexReader> readers;               // Readers being merged
+  public int readerCount;                         // Number of readers being merged
+  public int[][] docMaps;                         // Maps docIDs around deletions
+  public int[] delCounts;                         // Deletion count per reader
+  public int[] docBase;                           // New docID base per reader
+  public int mergedDocCount;                      // Total # merged docs
+
+  // Updated per field;
+  public FieldInfo fieldInfo;
+  public boolean omitTermFreqAndPositions;
+}
+

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/MergeState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/PositionsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/PositionsConsumer.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/PositionsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/PositionsConsumer.java Sat Nov 21 18:36:17 2009
@@ -19,7 +19,7 @@
 
 import java.io.IOException;
 
-import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.index.PositionsEnum;
 
 public abstract class PositionsConsumer {
 
@@ -32,4 +32,23 @@
   /** Called when we are done adding positions & payloads
    * for each doc */
   public abstract void finishDoc() throws IOException;
+
+  private byte[] payloadBuffer;
+
+  /** Default merge impl, just copies positions & payloads
+   *  from the input. */
+  public void merge(MergeState mergeState, PositionsEnum positions, int freq) throws IOException {
+    for(int i=0;i<freq;i++) {
+      final int position = positions.next();
+      final int payloadLength = positions.getPayloadLength();
+      if (payloadLength > 0) {
+        if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
+          payloadBuffer = new byte[payloadLength];
+        }
+        positions.getPayload(payloadBuffer, 0);
+      }
+      addPosition(position, payloadBuffer, 0, payloadLength);
+    }
+    finishDoc();
+  }
 }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Sat Nov 21 18:36:17 2009
@@ -20,6 +20,9 @@
 import java.io.IOException;
 
 import org.apache.lucene.index.TermRef;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.util.PriorityQueue;
 
 /**
  * NOTE: this API is experimental and will likely change
@@ -35,4 +38,96 @@
 
   /** Called when we are done adding terms to this field */
   public abstract void finish() throws IOException;
+
+  // For default merge impl
+  public static class TermMergeState {
+    TermRef current;
+    TermsEnum termsEnum;
+    int readerIndex;
+  }
+
+  private final static class MergeQueue extends PriorityQueue<TermMergeState> {
+    public MergeQueue(int size) {
+      initialize(size);
+    }
+
+    @Override
+    protected final boolean lessThan(TermMergeState a, TermMergeState b) {
+      final int cmp = a.current.compareTerm(b.current);
+      if (cmp != 0) {
+        return cmp < 0;
+      } else {
+        return a.readerIndex < b.readerIndex;
+      }
+    }
+  }
+
+  private MergeQueue queue;
+  private DocsConsumer.DocsMergeState[] match;
+  private TermMergeState[] pending;
+
+  /** Default merge impl */
+  public void merge(MergeState mergeState, TermMergeState[] termsStates, int count) throws IOException {
+    if (queue == null) {
+      queue = new MergeQueue(mergeState.readerCount);
+      match = new DocsConsumer.DocsMergeState[mergeState.readerCount];
+      for(int i=0;i<mergeState.readerCount;i++) {
+        match[i] = new DocsConsumer.DocsMergeState();
+      }
+      pending = new TermMergeState[mergeState.readerCount];
+    }
+
+    // Init queue
+    for(int i=0;i<count;i++) {
+      TermMergeState state = termsStates[i];
+      state.current = state.termsEnum.next();
+      if (state.current != null) {
+        queue.add(state);
+      } else {
+        // no terms at all in this field
+      }
+    }
+
+    while(queue.size() != 0) {
+
+      int matchCount = 0;
+      int pendingCount = 0;
+
+      while(true) {
+        TermMergeState state = pending[pendingCount++] = queue.pop();
+        DocsEnum docsEnum = state.termsEnum.docs(mergeState.readers.get(state.readerIndex).getDeletedDocs());
+        if (docsEnum != null) {
+          match[matchCount].docsEnum = docsEnum;
+          match[matchCount].docMap = mergeState.docMaps[state.readerIndex];
+          match[matchCount].docBase = mergeState.docBase[state.readerIndex];
+          matchCount++;
+        }
+        TermMergeState top = queue.top();
+        if (top == null || !top.current.termEquals(pending[0].current)) {
+          break;
+        }
+      }
+
+      if (matchCount > 0) {
+        // Merge one term
+        final TermRef term = pending[0].current;
+        final DocsConsumer docsConsumer = startTerm(term);
+        final int numDocs = docsConsumer.merge(mergeState, match, matchCount);
+        finishTerm(term, numDocs);
+      }
+
+      // Put terms back into queue
+      for(int i=0;i<pendingCount;i++) {
+        TermMergeState state = pending[i];
+        
+        state.current = state.termsEnum.next();
+        if (state.current != null) {
+          // More terms to merge
+          queue.add(state);
+        } else {
+          // Done
+        }
+      }
+    }
+  }
 }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java Sat Nov 21 18:36:17 2009
@@ -367,9 +367,9 @@
 
         //int indexCount = 0;
         //int lastIndexCount = 0;
-        int scanCnt = 0;
+        //int scanCnt = 0;
         while(next() != null) {
-          scanCnt++;
+          //scanCnt++;
           final int cmp = bytesReader.term.compareTerm(term);
           if (cmp == 0) {
             // mxx
@@ -518,13 +518,13 @@
   // nocommit -- scrutinize API
   public static class CacheEntry {
     int termUpTo;
-    TermRef term;
+    TermRef term; // nocommit -- really needed?
     long filePointer;
-    // nocommit
+
+    // nocommit -- belongs in Pulsing's CacheEntry class:
     public int freq;
     public Document docs[];
     public boolean pendingIndexTerm;
-
   }
 
   private static final int MAX_CACHE_SIZE = 1024;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/OpenBitSet.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/OpenBitSet.java?rev=882962&r1=882961&r2=882962&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/OpenBitSet.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/OpenBitSet.java Sat Nov 21 18:36:17 2009
@@ -77,7 +77,7 @@
  * @version $Id$
  */
 
-public class OpenBitSet extends DocIdSet implements Cloneable, Serializable {
+public class OpenBitSet extends DocIdSet implements Bits, Cloneable, Serializable {
   protected long[] bits;
   protected int wlen;   // number of words (elements) used in the array
 



Mime
View raw message