lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1061480 [2/5] - in /lucene/dev/branches/bulkpostings: ./ dev-tools/idea/.idea/copyright/ lucene/ lucene/contrib/ lucene/contrib/demo/src/java/org/apache/lucene/demo/ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/ l...
Date Thu, 20 Jan 2011 19:52:08 GMT
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java Thu Jan 20 19:52:03 2011
@@ -23,6 +23,7 @@ import java.nio.CharBuffer;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.AttributeReflector;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.UnicodeUtil;
 
@@ -244,6 +245,14 @@ public class CharTermAttributeImpl exten
   }
   
   @Override
+  public void reflectWith(AttributeReflector reflector) {
+    reflector.reflect(CharTermAttribute.class, "term", toString());
+    final BytesRef bytes = new BytesRef();
+    toBytesRef(bytes);
+    reflector.reflect(TermToBytesRefAttribute.class, "bytes", bytes);
+  }
+  
+  @Override
   public void copyTo(AttributeImpl target) {
     CharTermAttribute t = (CharTermAttribute) target;
     t.copyBuffer(termBuffer, 0, termLength);

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BufferedDeletes.java Thu Jan 20 19:52:03 2011
@@ -229,7 +229,9 @@ class BufferedDeletes {
           if (mergedDeletes == null) {
             mergedDeletes = getDeletes(segmentInfos.info(firstIdx-1));
             numTerms.addAndGet(-mergedDeletes.numTermDeletes.get());
+            assert numTerms.get() >= 0;
             bytesUsed.addAndGet(-mergedDeletes.bytesUsed.get());
+            assert bytesUsed.get() >= 0;
           }
 
           mergedDeletes.update(deletes, true);

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/CheckIndex.java Thu Jan 20 19:52:03 2011
@@ -610,6 +610,8 @@ public class CheckIndex {
 
         Comparator<BytesRef> termComp = terms.getComparator();
 
+        long sumTotalTermFreq = 0;
+
         while(true) {
 
           final BytesRef term = terms.next();
@@ -660,6 +662,8 @@ public class CheckIndex {
           }
 
           int lastDoc = -1;
+          int docCount = 0;
+          long totalTermFreq = 0;
           while(true) {
             final int doc = docs2.nextDoc();
             if (doc == DocIdSetIterator.NO_MORE_DOCS) {
@@ -667,6 +671,8 @@ public class CheckIndex {
             }
             final int freq = docs2.freq();
             status.totPos += freq;
+            totalTermFreq += freq;
+            docCount++;
 
             if (doc <= lastDoc) {
               throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
@@ -697,22 +703,39 @@ public class CheckIndex {
               }
             }
           }
+          
+          final long totalTermFreq2 = terms.totalTermFreq();
+          final boolean hasTotalTermFreq = postings != null && totalTermFreq2 != -1;
 
-          // Now count how many deleted docs occurred in
-          // this term:
-
+          // Re-count if there are deleted docs:
           if (reader.hasDeletions()) {
             final DocsEnum docsNoDel = terms.docs(null, docs);
-            int count = 0;
+            docCount = 0;
+            totalTermFreq = 0;
             while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-              count++;
+              docCount++;
+              totalTermFreq += docsNoDel.freq();
             }
-            if (count != docFreq) {
-              throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + count);
+          }
+
+          if (docCount != docFreq) {
+            throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != tot docs w/o deletions " + docCount);
+          }
+          if (hasTotalTermFreq) {
+            sumTotalTermFreq += totalTermFreq;
+            if (totalTermFreq != totalTermFreq2) {
+              throw new RuntimeException("term " + term + " totalTermFreq=" + totalTermFreq2 + " != recomputed totalTermFreq=" + totalTermFreq);
             }
           }
         }
 
+        if (sumTotalTermFreq != 0) {
+          final long v = fields.terms(field).getSumTotalTermFreq();
+          if (v != -1 && sumTotalTermFreq != v) {
+            throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq);
+          }
+        }
+
         // Test seek to last term:
         if (lastTerm != null) {
           if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java Thu Jan 20 19:52:03 2011
@@ -142,8 +142,12 @@ public class ConcurrentMergeScheduler ex
     }
   };
 
-  /** Called whenever the running merges have changed, to
-   *  pause & unpause threads. */
+  /**
+   * Called whenever the running merges have changed, to pause & unpause
+   * threads. This method sorts the merge threads by their merge size in
+   * descending order and then pauses/unpauses threads from first to lsat --
+   * that way, smaller merges are guaranteed to run before larger ones.
+   */
   protected synchronized void updateMergeThreads() {
 
     // Only look at threads that are alive & not in the
@@ -164,6 +168,7 @@ public class ConcurrentMergeScheduler ex
       threadIdx++;
     }
 
+    // Sort the merge threads in descending order.
     CollectionUtil.mergeSort(activeMerges, compareByMergeDocCount);
     
     int pri = mergeThreadPriority;
@@ -175,12 +180,8 @@ public class ConcurrentMergeScheduler ex
         continue;
       }
 
-      final boolean doPause;
-      if (threadIdx < activeMergeCount-maxThreadCount) {
-        doPause = true;
-      } else {
-        doPause = false;
-      }
+      // pause the thread if maxThreadCount is smaller than the number of merge threads.
+      final boolean doPause = threadIdx < activeMergeCount - maxThreadCount;
 
       if (verbose()) {
         if (doPause != merge.getPause()) {
@@ -205,13 +206,26 @@ public class ConcurrentMergeScheduler ex
     }
   }
 
-  private boolean verbose() {
+  /**
+   * Returns true if verbosing is enabled. This method is usually used in
+   * conjunction with {@link #message(String)}, like that:
+   * 
+   * <pre>
+   * if (verbose()) {
+   *   message(&quot;your message&quot;);
+   * }
+   * </pre>
+   */
+  protected boolean verbose() {
     return writer != null && writer.verbose();
   }
   
-  private void message(String message) {
-    if (verbose())
-      writer.message("CMS: " + message);
+  /**
+   * Outputs the given message - this method assumes {@link #verbose()} was
+   * called and returned true.
+   */
+  protected void message(String message) {
+    writer.message("CMS: " + message);
   }
 
   private synchronized void initMergeThreadPriority() {
@@ -231,10 +245,10 @@ public class ConcurrentMergeScheduler ex
 
   /** Wait for any running merge threads to finish */
   public void sync() {
-    while(true) {
+    while (true) {
       MergeThread toSync = null;
-      synchronized(this) {
-        for(MergeThread t : mergeThreads) {
+      synchronized (this) {
+        for (MergeThread t : mergeThreads) {
           if (t.isAlive()) {
             toSync = t;
             break;
@@ -253,12 +267,14 @@ public class ConcurrentMergeScheduler ex
     }
   }
 
-  private synchronized int mergeThreadCount() {
+  /**
+   * Returns the number of merge threads that are alive. Note that this number
+   * is &le; {@link #mergeThreads} size.
+   */
+  protected synchronized int mergeThreadCount() {
     int count = 0;
-    final int numThreads = mergeThreads.size();
-    for(int i=0;i<numThreads;i++) {
-      final MergeThread t = mergeThreads.get(i);
-      if (t.isAlive() && t.getCurrentMerge() != null) {
+    for (MergeThread mt : mergeThreads) {
+      if (mt.isAlive() && mt.getCurrentMerge() != null) {
         count++;
       }
     }
@@ -266,8 +282,7 @@ public class ConcurrentMergeScheduler ex
   }
 
   @Override
-  public void merge(IndexWriter writer)
-    throws CorruptIndexException, IOException {
+  public void merge(IndexWriter writer) throws IOException {
 
     assert !Thread.holdsLock(writer);
 
@@ -291,7 +306,7 @@ public class ConcurrentMergeScheduler ex
     
     // Iterate, pulling from the IndexWriter's queue of
     // pending merges, until it's empty:
-    while(true) {
+    while (true) {
 
       // TODO: we could be careful about which merges to do in
       // the BG (eg maybe the "biggest" ones) vs FG, which
@@ -360,8 +375,7 @@ public class ConcurrentMergeScheduler ex
   }
 
   /** Does the actual merge, by calling {@link IndexWriter#merge} */
-  protected void doMerge(MergePolicy.OneMerge merge)
-    throws IOException {
+  protected void doMerge(MergePolicy.OneMerge merge) throws IOException {
     writer.merge(merge);
   }
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java Thu Jan 20 19:52:03 2011
@@ -63,8 +63,6 @@ final class DocInverterPerField extends 
 
     fieldState.reset(docState.doc.getBoost());
 
-    final int maxFieldLength = docState.maxFieldLength;
-
     final boolean doInvert = consumer.start(fields, count);
 
     for(int i=0;i<count;i++) {
@@ -171,12 +169,8 @@ final class DocInverterPerField extends 
                 if (!success)
                   docState.docWriter.setAborting();
               }
+              fieldState.length++;
               fieldState.position++;
-              if (++fieldState.length >= maxFieldLength) {
-                if (docState.infoStream != null)
-                  docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
-                break;
-              }
 
               hasMoreTokens = stream.incrementToken();
             }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Jan 20 19:52:03 2011
@@ -127,7 +127,6 @@ final class DocumentsWriter {
   private boolean aborting;               // True if an abort is pending
 
   PrintStream infoStream;
-  int maxFieldLength = IndexWriterConfig.UNLIMITED_FIELD_LENGTH;
   Similarity similarity;
 
   // max # simultaneous threads; if there are more than
@@ -140,7 +139,6 @@ final class DocumentsWriter {
   static class DocState {
     DocumentsWriter docWriter;
     Analyzer analyzer;
-    int maxFieldLength;
     PrintStream infoStream;
     Similarity similarity;
     int docID;
@@ -191,6 +189,7 @@ final class DocumentsWriter {
     /**
      * Allocate bytes used from shared pool.
      */
+    @Override
     protected byte[] newBuffer(int size) {
       assert size == PER_DOC_BLOCK_SIZE;
       return perDocAllocator.getByteBlock();
@@ -358,13 +357,6 @@ final class DocumentsWriter {
     }
   }
 
-  synchronized void setMaxFieldLength(int maxFieldLength) {
-    this.maxFieldLength = maxFieldLength;
-    for(int i=0;i<threadStates.length;i++) {
-      threadStates[i].docState.maxFieldLength = maxFieldLength;
-    }
-  }
-
   synchronized void setSimilarity(Similarity similarity) {
     this.similarity = similarity;
     for(int i=0;i<threadStates.length;i++) {
@@ -546,6 +538,8 @@ final class DocumentsWriter {
   // Lock order: IW -> DW
   synchronized SegmentInfo flush(IndexWriter writer, IndexFileDeleter deleter, MergePolicy mergePolicy, SegmentInfos segmentInfos) throws IOException {
 
+    final long startTime = System.currentTimeMillis();
+
     // We change writer's segmentInfos:
     assert Thread.holdsLock(writer);
 
@@ -646,6 +640,10 @@ final class DocumentsWriter {
     // Lock order: IW -> DW -> BD
     pushDeletes(newSegment, segmentInfos);
 
+    if (infoStream != null) {
+      message("flush time " + (System.currentTimeMillis()-startTime) + " msec");
+    }
+
     return newSegment;
   }
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java Thu Jan 20 19:52:03 2011
@@ -35,7 +35,6 @@ final class DocumentsWriterThreadState {
   public DocumentsWriterThreadState(DocumentsWriter docWriter) throws IOException {
     this.docWriter = docWriter;
     docState = new DocumentsWriter.DocState();
-    docState.maxFieldLength = docWriter.maxFieldLength;
     docState.infoStream = docWriter.infoStream;
     docState.similarity = docWriter.similarity;
     docState.docWriter = docWriter;

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java Thu Jan 20 19:52:03 2011
@@ -99,6 +99,11 @@ public class FilterIndexReader extends I
     public long getUniqueTermCount() throws IOException {
       return in.getUniqueTermCount();
     }
+
+    @Override
+    public long getSumTotalTermFreq() throws IOException {
+      return in.getSumTotalTermFreq();
+    }
   }
 
   /** Base class for filtering {@link TermsEnum} implementations. */
@@ -156,6 +161,11 @@ public class FilterIndexReader extends I
     }
 
     @Override
+    public long totalTermFreq() {
+      return in.totalTermFreq();
+    }
+
+    @Override
     public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
       return in.docs(skipDocs, reuse);
     }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Thu Jan 20 19:52:03 2011
@@ -20,13 +20,14 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
-import java.util.Comparator;
 
-import org.apache.lucene.index.codecs.PostingsConsumer;
 import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.PostingsConsumer;
 import org.apache.lucene.index.codecs.TermsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
 
@@ -165,6 +166,7 @@ final class FreqProxTermsWriter extends 
     // multiple threads and interacting with the
     // TermsConsumer, only calling out to us (passing us the
     // DocsConsumer) to handle delivery of docs/positions
+    long sumTotalTermFreq = 0;
     while(numFields > 0) {
 
       // Get the next term to merge
@@ -197,6 +199,7 @@ final class FreqProxTermsWriter extends 
       // which all share the same term.  Now we must
       // interleave the docID streams.
       int numDocs = 0;
+      long totTF = 0;
       while(numToMerge > 0) {
         
         FreqProxFieldMergeState minState = termStates[0];
@@ -222,6 +225,7 @@ final class FreqProxTermsWriter extends 
           // omitTermFreqAndPositions == false so we do write positions &
           // payload          
           int position = 0;
+          totTF += termDocFreq;
           for(int j=0;j<termDocFreq;j++) {
             final int code = prox.readVInt();
             position += code >> 1;
@@ -286,9 +290,10 @@ final class FreqProxTermsWriter extends 
       }
 
       assert numDocs > 0;
-      termsConsumer.finishTerm(text, numDocs);
+      termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
+      sumTotalTermFreq += totTF;
     }
 
-    termsConsumer.finish();
+    termsConsumer.finish(sumTotalTermFreq);
   }
 }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexReader.java Thu Jan 20 19:52:03 2011
@@ -997,6 +997,23 @@ public abstract class IndexReader implem
     return terms.docFreq(term);
   }
 
+  /** Returns the number of documents containing the term
+   * <code>t</code>.  This method returns 0 if the term or
+   * field does not exists.  This method does not take into
+   * account deleted documents that have not yet been merged
+   * away. */
+  public long totalTermFreq(String field, BytesRef term) throws IOException {
+    final Fields fields = fields();
+    if (fields == null) {
+      return 0;
+    }
+    final Terms terms = fields.terms(field);
+    if (terms == null) {
+      return 0;
+    }
+    return terms.totalTermFreq(term);
+  }
+
   /** This may return null if the field does not exist.*/
   public Terms terms(String field) throws IOException {
     final Fields fields = fields();

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriter.java Thu Jan 20 19:52:03 2011
@@ -662,9 +662,6 @@ public class IndexWriter implements Clos
    * IndexWriter. Additionally, calling {@link #getConfig()} and changing the
    * parameters does not affect that IndexWriter instance.
    * <p>
-   * <b>NOTE:</b> by default, {@link IndexWriterConfig#getMaxFieldLength()}
-   * returns {@link IndexWriterConfig#UNLIMITED_FIELD_LENGTH}. Pay attention to
-   * whether this setting fits your application.
    * 
    * @param d
    *          the index directory. The index is either created or appended
@@ -689,7 +686,6 @@ public class IndexWriter implements Clos
     directory = d;
     analyzer = conf.getAnalyzer();
     infoStream = defaultInfoStream;
-    maxFieldLength = conf.getMaxFieldLength();
     termIndexInterval = conf.getTermIndexInterval();
     mergePolicy = conf.getMergePolicy();
     mergePolicy.setIndexWriter(this);
@@ -768,7 +764,6 @@ public class IndexWriter implements Clos
 
       docWriter = new DocumentsWriter(directory, this, conf.getIndexingChain(), conf.getMaxThreadStates(), getCurrentFieldInfos(), bufferedDeletes);
       docWriter.setInfoStream(infoStream);
-      docWriter.setMaxFieldLength(maxFieldLength);
 
       // Default deleter (for backwards compatibility) is
       // KeepOnlyLastCommitDeleter:
@@ -1177,25 +1172,7 @@ public class IndexWriter implements Clos
   }
 
   /**
-   * The maximum number of terms that will be indexed for a single field in a
-   * document.  This limits the amount of memory required for indexing, so that
-   * collections with very large files will not crash the indexing process by
-   * running out of memory.<p/>
-   * Note that this effectively truncates large documents, excluding from the
-   * index terms that occur further in the document.  If you know your source
-   * documents are large, be sure to set this value high enough to accommodate
-   * the expected size.  If you set it to Integer.MAX_VALUE, then the only limit
-   * is your memory, but you should anticipate an OutOfMemoryError.<p/>
-   * By default, no more than 10,000 terms will be indexed for a field.
-   *
-   * @see MaxFieldLength
-   */
-  private int maxFieldLength;
-
-  /**
-   * Adds a document to this index.  If the document contains more than
-   * {@link IndexWriterConfig#setMaxFieldLength(int)} terms for a given field, 
-   * the remainder are discarded.
+   * Adds a document to this index.
    *
    * <p> Note that if an Exception is hit (for example disk full)
    * then the index will be consistent, but this document
@@ -1242,9 +1219,7 @@ public class IndexWriter implements Clos
 
   /**
    * Adds a document to this index, using the provided analyzer instead of the
-   * value of {@link #getAnalyzer()}.  If the document contains more than
-   * {@link IndexWriterConfig#setMaxFieldLength(int)} terms for a given field, the remainder are
-   * discarded.
+   * value of {@link #getAnalyzer()}.
    *
    * <p>See {@link #addDocument(Document)} for details on
    * index and IndexWriter state after an Exception, and
@@ -3280,7 +3255,7 @@ public class IndexWriter implements Clos
     // NOTE: the callers of this method should in theory
     // be able to do simply wait(), but, as a defense
     // against thread timing hazards where notifyAll()
-    // falls to be called, we wait for at most 1 second
+    // fails to be called, we wait for at most 1 second
     // and then return so caller can check if wait
     // conditions are satisfied:
     try {

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java Thu Jan 20 19:52:03 2011
@@ -41,8 +41,6 @@ import org.apache.lucene.util.Version;
  */
 public final class IndexWriterConfig implements Cloneable {
 
-  public static final int UNLIMITED_FIELD_LENGTH = Integer.MAX_VALUE;
-
   /**
    * Specifies the open mode for {@link IndexWriter}:
    * <ul>
@@ -55,7 +53,7 @@ public final class IndexWriterConfig imp
   public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
   
   /** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
-  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;                   // TODO: this should be private to the codec, not settable here
+  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32; // TODO: this should be private to the codec, not settable here
 
   /** Denotes a flush trigger is disabled. */
   public final static int DISABLE_AUTO_FLUSH = -1;
@@ -113,7 +111,6 @@ public final class IndexWriterConfig imp
   private IndexDeletionPolicy delPolicy;
   private IndexCommit commit;
   private OpenMode openMode;
-  private int maxFieldLength;
   private Similarity similarity;
   private int termIndexInterval; // TODO: this should be private to the codec, not settable here
   private MergeScheduler mergeScheduler;
@@ -145,7 +142,6 @@ public final class IndexWriterConfig imp
     delPolicy = new KeepOnlyLastCommitDeletionPolicy();
     commit = null;
     openMode = OpenMode.CREATE_OR_APPEND;
-    maxFieldLength = UNLIMITED_FIELD_LENGTH;
     similarity = Similarity.getDefault();
     termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; // TODO: this should be private to the codec, not settable here
     mergeScheduler = new ConcurrentMergeScheduler();
@@ -220,37 +216,6 @@ public final class IndexWriterConfig imp
   }
 
   /**
-   * The maximum number of terms that will be indexed for a single field in a
-   * document. This limits the amount of memory required for indexing, so that
-   * collections with very large files will not crash the indexing process by
-   * running out of memory. This setting refers to the number of running terms,
-   * not to the number of different terms.
-   * <p>
-   * <b>NOTE:</b> this silently truncates large documents, excluding from the
-   * index all terms that occur further in the document. If you know your source
-   * documents are large, be sure to set this value high enough to accomodate
-   * the expected size. If you set it to {@link #UNLIMITED_FIELD_LENGTH}, then
-   * the only limit is your memory, but you should anticipate an
-   * OutOfMemoryError.
-   * <p>
-   * By default it is set to {@link #UNLIMITED_FIELD_LENGTH}.
-   */
-  public IndexWriterConfig setMaxFieldLength(int maxFieldLength) {
-    this.maxFieldLength = maxFieldLength;
-    return this;
-  }
-
-  /**
-   * Returns the maximum number of terms that will be indexed for a single field
-   * in a document.
-   * 
-   * @see #setMaxFieldLength(int)
-   */
-  public int getMaxFieldLength() {
-    return maxFieldLength;
-  }
-
-  /**
    * Expert: allows to open a certain commit point. The default is null which
    * opens the latest commit point.
    */
@@ -611,7 +576,6 @@ public final class IndexWriterConfig imp
     sb.append("delPolicy=").append(delPolicy.getClass().getName()).append("\n");
     sb.append("commit=").append(commit == null ? "null" : commit).append("\n");
     sb.append("openMode=").append(openMode).append("\n");
-    sb.append("maxFieldLength=").append(maxFieldLength).append("\n");
     sb.append("similarity=").append(similarity.getClass().getName()).append("\n");
     sb.append("termIndexInterval=").append(termIndexInterval).append("\n"); // TODO: this should be private to the codec, not settable here
     sb.append("mergeScheduler=").append(mergeScheduler.getClass().getName()).append("\n");

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -30,9 +30,14 @@ public class LogByteSizeMergePolicy exte
    *  or larger will never be merged.  @see setMaxMergeMB */
   public static final double DEFAULT_MAX_MERGE_MB = 2048;
 
+  /** Default maximum segment size.  A segment of this size
+   *  or larger will never be merged during optimize.  @see setMaxMergeMBForOptimize */
+  public static final double DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE = Long.MAX_VALUE;
+
   public LogByteSizeMergePolicy() {
     minMergeSize = (long) (DEFAULT_MIN_MERGE_MB*1024*1024);
     maxMergeSize = (long) (DEFAULT_MAX_MERGE_MB*1024*1024);
+    maxMergeSizeForOptimize = (long) (DEFAULT_MAX_MERGE_MB_FOR_OPTIMIZE*1024*1024);
   }
   
   @Override
@@ -63,6 +68,23 @@ public class LogByteSizeMergePolicy exte
     return ((double) maxMergeSize)/1024/1024;
   }
 
+  /** <p>Determines the largest segment (measured by total
+   *  byte size of the segment's files, in MB) that may be
+   *  merged with other segments during optimize. Setting
+   *  it low will leave the index with more than 1 segment,
+   *  even if {@link IndexWriter#optimize()} is called.*/
+  public void setMaxMergeMBForOptimize(double mb) {
+    maxMergeSizeForOptimize = (long) (mb*1024*1024);
+  }
+
+  /** Returns the largest segment (measured by total byte
+   *  size of the segment's files, in MB) that may be merged
+   *  with other segments during optimize.
+   *  @see #setMaxMergeMBForOptimize */
+  public double getMaxMergeMBForOptimize() {
+    return ((double) maxMergeSizeForOptimize)/1024/1024;
+  }
+
   /** Sets the minimum size for the lowest level segments.
    * Any segments below this size are considered to be on
    * the same level (even if they vary drastically in size)

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogDocMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -31,9 +31,10 @@ public class LogDocMergePolicy extends L
   public LogDocMergePolicy() {
     minMergeSize = DEFAULT_MIN_MERGE_DOCS;
     
-    // maxMergeSize is never used by LogDocMergePolicy; set
+    // maxMergeSize(ForOptimize) are never used by LogDocMergePolicy; set
     // it to Long.MAX_VALUE to disable it
     maxMergeSize = Long.MAX_VALUE;
+    maxMergeSizeForOptimize = Long.MAX_VALUE;
   }
 
   @Override

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java Thu Jan 20 19:52:03 2011
@@ -63,6 +63,9 @@ public abstract class LogMergePolicy ext
 
   protected long minMergeSize;
   protected long maxMergeSize;
+  // Although the core MPs set it explicitly, we must default in case someone
+  // out there wrote his own LMP ...
+  protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
   protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
 
   protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
@@ -240,9 +243,9 @@ public abstract class LogMergePolicy ext
     int start = last - 1;
     while (start >= 0) {
       SegmentInfo info = infos.info(start);
-      if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+      if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
         if (verbose()) {
-          message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
+          message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
         }
         // need to skip that segment + add a merge for the 'right' segments,
         // unless there is only 1 which is optimized.
@@ -326,9 +329,12 @@ public abstract class LogMergePolicy ext
   }
   
   /** Returns the merges necessary to optimize the index.
-   *  This merge policy defines "optimized" to mean only one
-   *  segment in the index, where that segment has no
-   *  deletions pending nor separate norms, and it is in
+   *  This merge policy defines "optimized" to mean only the
+   *  requested number of segments is left in the index, and
+   *  respects the {@link #maxMergeSizeForOptimize} setting.
+   *  By default, and assuming {@code maxNumSegments=1}, only
+   *  one segment will be left in the index, where that segment
+   *  has no deletions pending nor separate norms, and it is in
    *  compound file format if the current useCompoundFile
    *  setting is true.  This method returns multiple merges
    *  (mergeFactor at a time) so the {@link MergeScheduler}
@@ -382,7 +388,7 @@ public abstract class LogMergePolicy ext
     boolean anyTooLarge = false;
     for (int i = 0; i < last; i++) {
       SegmentInfo info = infos.info(i);
-      if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+      if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
         anyTooLarge = true;
         break;
       }
@@ -588,6 +594,7 @@ public abstract class LogMergePolicy ext
     sb.append("minMergeSize=").append(minMergeSize).append(", ");
     sb.append("mergeFactor=").append(mergeFactor).append(", ");
     sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
+    sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
     sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
     sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
     sb.append("useCompoundFile=").append(useCompoundFile);

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTerms.java Thu Jan 20 19:52:03 2011
@@ -77,6 +77,19 @@ public final class MultiTerms extends Te
   }
 
   @Override
+  public long getSumTotalTermFreq() throws IOException {
+    long sum = 0;
+    for(Terms terms : subs) {
+      final long v = terms.getSumTotalTermFreq();
+      if (v == -1) {
+        return -1;
+      }
+      sum += v;
+    }
+    return sum;
+  }
+
+  @Override
   public Comparator<BytesRef> getComparator() {
     return termComp;
   }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Jan 20 19:52:03 2011
@@ -270,6 +270,19 @@ public final class MultiTermsEnum extend
   }
 
   @Override
+  public long totalTermFreq() {
+    long sum = 0;
+    for(int i=0;i<numTop;i++) {
+      final long v = top[i].terms.totalTermFreq();
+      if (v == -1) {
+        return v;
+      }
+      sum += v;
+    }
+    return sum;
+  }
+
+  @Override
   public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
     final MultiDocsEnum docsEnum;
     if (reuse != null) {

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/SegmentReader.java Thu Jan 20 19:52:03 2011
@@ -335,29 +335,6 @@ public class SegmentReader extends Index
       }
     }
 
-    // Load bytes but do not cache them if they were not
-    // already cached
-    public synchronized void bytes(byte[] bytesOut, int offset, int len) throws IOException {
-      assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
-      if (bytes != null) {
-        // Already cached -- copy from cache:
-        assert len <= maxDoc();
-        System.arraycopy(bytes, 0, bytesOut, offset, len);
-      } else {
-        // Not cached
-        if (origNorm != null) {
-          // Ask origNorm to load
-          origNorm.bytes(bytesOut, offset, len);
-        } else {
-          // We are orig -- read ourselves from disk:
-          synchronized(in) {
-            in.seek(normSeek);
-            in.readBytes(bytesOut, offset, len, false);
-          }
-        }
-      }
-    }
-
     // Load & cache full bytes array.  Returns bytes.
     public synchronized byte[] bytes() throws IOException {
       assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/Terms.java Thu Jan 20 19:52:03 2011
@@ -57,6 +57,18 @@ public abstract class Terms {
     }
   }
 
+  /** Returns the number of documents containing the
+   *  specified term text.  Returns 0 if the term does not
+   *  exist. */
+  public long totalTermFreq(BytesRef text) throws IOException {
+    final TermsEnum termsEnum = getThreadTermsEnum();
+    if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
+      return termsEnum.totalTermFreq();
+    } else {
+      return 0;
+    }
+  }
+
   /** Get {@link DocsEnum} for the specified term.  This
    *  method may return null if the term does not exist. */
   public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -133,6 +145,14 @@ public abstract class Terms {
     throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
   }
 
+  /** Returns the sum of {@link TermsEnum#totalTermFreq} for
+   *  all terms in this field, or -1 if this measure isn't
+   *  stored by the codec (or if this fields omits term freq
+   *  and positions).  Note that, just like other term
+   *  measures, this measure does not take deleted documents
+   *  into account. */
+  public abstract long getSumTotalTermFreq() throws IOException;
+
   /**
    * Returns a thread-private {@link TermsEnum} instance. Obtaining
    * {@link TermsEnum} from this method might be more efficient than using

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jan 20 19:52:03 2011
@@ -125,7 +125,15 @@ public abstract class TermsEnum {
    *  first time, after next() returns null or seek returns
    *  {@link SeekStatus#END}.*/
   public abstract int docFreq();
-  
+
+  /** Returns the total number of occurrences of this term
+   *  across all documents (the sum of the freq() for each
+   *  doc that has this term).  This will be -1 if the
+   *  codec doesn't support this measure.  Note that, like
+   *  other term measures, this measure does not take
+   *  deleted documents into account. */
+  public abstract long totalTermFreq();
+
   /** Get {@link DocsEnum} for the current term.  Do not
    *  call this before calling {@link #next} or {@link
    *  #seek} for the first time.  This method will not
@@ -202,6 +210,11 @@ public abstract class TermsEnum {
     public int docFreq() {
       throw new IllegalStateException("this method should never be called");
     }
+
+    @Override
+    public long totalTermFreq() {
+      throw new IllegalStateException("this method should never be called");
+    }
       
     @Override
     public long ord() {

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Thu Jan 20 19:52:03 2011
@@ -128,7 +128,7 @@ public class FixedGapTermsIndexWriter ex
     }
 
     @Override
-    public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
       // First term is first indexed term:
       if (0 == (numTerms++ % termIndexInterval)) {
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java Thu Jan 20 19:52:03 2011
@@ -55,9 +55,10 @@ public abstract class PostingsConsumer {
 
   /** Default merge impl: append documents, mapping around
    *  deletes */
-  public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
+  public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
 
     int df = 0;
+    long totTF = 0;
 
     if (mergeState.fieldInfo.omitTermFreqAndPositions) {
       while(true) {
@@ -68,6 +69,7 @@ public abstract class PostingsConsumer {
         this.startDoc(doc, postings.freq());
         this.finishDoc();
         df++;
+        totTF++;
       }
     } else {
       final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -78,6 +80,7 @@ public abstract class PostingsConsumer {
         }
         final int freq = postingsEnum.freq();
         this.startDoc(doc, freq);
+        totTF += freq;
         for(int i=0;i<freq;i++) {
           final int position = postingsEnum.nextPosition();
           final BytesRef payload;
@@ -92,6 +95,6 @@ public abstract class PostingsConsumer {
         df++;
       }
     }
-    return df;
+    return new TermStats(df, totTF);
   }
 }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Thu Jan 20 19:52:03 2011
@@ -34,7 +34,7 @@ public abstract class PostingsWriterBase
   public abstract void startTerm() throws IOException;
 
   /** Finishes the current term */
-  public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
+  public abstract void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException;
 
   public abstract void setField(FieldInfo fieldInfo);
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermState.java Thu Jan 20 19:52:03 2011
@@ -16,6 +16,7 @@ package org.apache.lucene.index.codecs;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.DocsEnum; // javadocs
 import org.apache.lucene.index.OrdTermState;
 import org.apache.lucene.index.TermState;
 
@@ -27,7 +28,8 @@ import org.apache.lucene.index.TermState
 public class PrefixCodedTermState extends OrdTermState {
   public int docFreq; // how many docs have this term
   public long filePointer; // fp into the terms dict primary file (_X.tis)
-
+  public long totalTermFreq;                           // total number of occurrences of this term
+  
   @Override
   public void copyFrom(TermState _other) {
     assert _other instanceof PrefixCodedTermState : "can not copy from " + _other.getClass().getName();
@@ -35,11 +37,12 @@ public class PrefixCodedTermState extend
     super.copyFrom(_other);
     filePointer = other.filePointer;
     docFreq = other.docFreq;
+    totalTermFreq = other.totalTermFreq;
   }
 
   @Override
   public String toString() {
-    return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
+    return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + ", docFreq=" + docFreq + ", totalTermFreq=" + totalTermFreq + "]";
   }
   
 }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Thu Jan 20 19:52:03 2011
@@ -130,18 +130,17 @@ public class PrefixCodedTermsReader exte
       // Read per-field details
       seekDir(in, dirOffset);
 
-      final int numFields = in.readInt();
+      final int numFields = in.readVInt();
 
       for(int i=0;i<numFields;i++) {
-        final int field = in.readInt();
-        final long numTerms = in.readLong();
+        final int field = in.readVInt();
+        final long numTerms = in.readVLong();
         assert numTerms >= 0;
-        final long termsStartPointer = in.readLong();
+        final long termsStartPointer = in.readVLong();
         final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-        if (numTerms > 0) {
-          assert !fields.containsKey(fieldInfo.name);
-          fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
-        }
+        final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong();
+        assert !fields.containsKey(fieldInfo.name);
+        fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq));
       }
       success = true;
     } finally {
@@ -246,12 +245,14 @@ public class PrefixCodedTermsReader exte
     final long numTerms;
     final FieldInfo fieldInfo;
     final long termsStartPointer;
+    final long sumTotalTermFreq;
 
-    FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
+    FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) {
       assert numTerms > 0;
       this.fieldInfo = fieldInfo;
       this.numTerms = numTerms;
       this.termsStartPointer = termsStartPointer;
+      this.sumTotalTermFreq = sumTotalTermFreq;
     }
 
     @Override
@@ -274,6 +275,11 @@ public class PrefixCodedTermsReader exte
       return numTerms;
     }
 
+    @Override
+    public long getSumTotalTermFreq() {
+      return sumTotalTermFreq;
+    }
+
     // Iterates through terms in this field, not supporting ord()
     private final class SegmentTermsEnum extends TermsEnum {
       private final IndexInput in;
@@ -296,6 +302,7 @@ public class PrefixCodedTermsReader exte
         bytesReader = new DeltaBytesReader(in);
         fieldTerm.field = fieldInfo.name;
         state = postingsReader.newTermState();
+        state.totalTermFreq = -1;
         state.ord = -1;
       }
 
@@ -496,6 +503,10 @@ public class PrefixCodedTermsReader exte
           state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
         }
 
+        if (!fieldInfo.omitTermFreqAndPositions) {
+          state.totalTermFreq = state.docFreq + in.readVLong();
+        }
+
         postingsReader.readTerm(in,
                                 fieldInfo, state,
                                 isIndexTerm);
@@ -514,6 +525,11 @@ public class PrefixCodedTermsReader exte
       }
 
       @Override
+      public long totalTermFreq() {
+        return state.totalTermFreq;
+      }
+
+      @Override
       public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
         final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
         assert docsEnum != null;

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Thu Jan 20 19:52:03 2011
@@ -60,7 +60,7 @@ public class PrefixCodedTermsWriter exte
   final FieldInfos fieldInfos;
   FieldInfo currentField;
   private final TermsIndexWriterBase termsIndexWriter;
-  private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
+  private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
   private final Comparator<BytesRef> termComp;
 
   public PrefixCodedTermsWriter(
@@ -96,7 +96,7 @@ public class PrefixCodedTermsWriter exte
     assert currentField == null || currentField.name.compareTo(field.name) < 0;
     currentField = field;
     TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
-    TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
+    final TermsWriter terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
     fields.add(terms);
     return terms;
   }
@@ -105,16 +105,26 @@ public class PrefixCodedTermsWriter exte
   public void close() throws IOException {
 
     try {
-      final int fieldCount = fields.size();
+      
+      int nonZeroCount = 0;
+      for(TermsWriter field : fields) {
+        if (field.numTerms > 0) {
+          nonZeroCount++;
+        }
+      }
 
       final long dirStart = out.getFilePointer();
 
-      out.writeInt(fieldCount);
-      for(int i=0;i<fieldCount;i++) {
-        TermsWriter field = (TermsWriter) fields.get(i);
-        out.writeInt(field.fieldInfo.number);
-        out.writeLong(field.numTerms);
-        out.writeLong(field.termsStartPointer);
+      out.writeVInt(nonZeroCount);
+      for(TermsWriter field : fields) {
+        if (field.numTerms > 0) {
+          out.writeVInt(field.fieldInfo.number);
+          out.writeVLong(field.numTerms);
+          out.writeVLong(field.termsStartPointer);
+          if (!field.fieldInfo.omitTermFreqAndPositions) {
+            out.writeVLong(field.sumTotalTermFreq);
+          }
+        }
       }
       writeTrailer(dirStart);
     } finally {
@@ -142,6 +152,7 @@ public class PrefixCodedTermsWriter exte
     private final long termsStartPointer;
     private long numTerms;
     private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
+    long sumTotalTermFreq;
 
     TermsWriter(
         TermsIndexWriterBase.FieldWriter fieldIndexWriter,
@@ -169,12 +180,12 @@ public class PrefixCodedTermsWriter exte
     }
 
     @Override
-    public void finishTerm(BytesRef text, int numDocs) throws IOException {
+    public void finishTerm(BytesRef text, TermStats stats) throws IOException {
 
-      assert numDocs > 0;
+      assert stats.docFreq > 0;
       //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp="  + out.getFilePointer());
 
-      final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
+      final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
 
       termWriter.write(text);
       final int highBit = isIndexTerm ? 0x80 : 0;
@@ -182,23 +193,28 @@ public class PrefixCodedTermsWriter exte
 
       // This is a vInt, except, we steal top bit to record
       // whether this was an indexed term:
-      if ((numDocs & ~0x3F) == 0) {
+      if ((stats.docFreq & ~0x3F) == 0) {
         // Fast case -- docFreq fits in 6 bits
-        out.writeByte((byte) (highBit | numDocs));
+        out.writeByte((byte) (highBit | stats.docFreq));
       } else {
         // Write bottom 6 bits of docFreq, then write the
         // remainder as vInt:
-        out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
-        out.writeVInt(numDocs >>> 6);
+        out.writeByte((byte) (highBit | 0x40 | (stats.docFreq & 0x3F)));
+        out.writeVInt(stats.docFreq >>> 6);
+      }
+      if (!fieldInfo.omitTermFreqAndPositions) {
+        assert stats.totalTermFreq >= stats.docFreq;
+        out.writeVLong(stats.totalTermFreq - stats.docFreq);
       }
-      postingsWriter.finishTerm(numDocs, isIndexTerm);
+      postingsWriter.finishTerm(stats, isIndexTerm);
       numTerms++;
     }
 
     // Finishes all terms in this field
     @Override
-    public void finish() throws IOException {
+    public void finish(long sumTotalTermFreq) throws IOException {
       // EOF marker:
+      this.sumTotalTermFreq = sumTotalTermFreq;
       out.writeVInt(DeltaBytesWriter.TERM_EOF);
       fieldIndexWriter.finish();
     }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Thu Jan 20 19:52:03 2011
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
   public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
 
   /** Finishes the current term; numDocs must be > 0. */
-  public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
+  public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
 
   /** Called when we are done adding terms to this field */
-  public abstract void finish() throws IOException;
+  public abstract void finish(long sumTotalTermFreq) throws IOException;
 
   /** Return the BytesRef Comparator used to sort terms
    *  before feeding to this API. */
@@ -55,6 +55,7 @@ public abstract class TermsConsumer {
 
     BytesRef term;
     assert termsEnum != null;
+    long sumTotalTermFreq = 0;
 
     if (mergeState.fieldInfo.omitTermFreqAndPositions) {
       if (docsEnum == null) {
@@ -69,9 +70,9 @@ public abstract class TermsConsumer {
         if (docsEnumIn != null) {
           docsEnum.reset(docsEnumIn);
           final PostingsConsumer postingsConsumer = startTerm(term);
-          final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
-          if (numDocs > 0) {
-            finishTerm(term, numDocs);
+          final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
+          if (stats.docFreq > 0) {
+            finishTerm(term, stats);
           }
         }
       }
@@ -94,14 +95,15 @@ public abstract class TermsConsumer {
             }
           }
           final PostingsConsumer postingsConsumer = startTerm(term);
-          final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
-          if (numDocs > 0) {
-            finishTerm(term, numDocs);
+          final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
+          if (stats.docFreq > 0) {
+            finishTerm(term, stats);
+            sumTotalTermFreq += stats.totalTermFreq;
           }
         }
       }
     }
 
-    finish();
+    finish(sumTotalTermFreq);
   }
 }

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Thu Jan 20 19:52:03 2011
@@ -28,7 +28,7 @@ public abstract class TermsIndexWriterBa
   public abstract void setTermsOutput(IndexOutput out);
 
   public abstract class FieldWriter {
-    public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
+    public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
     public abstract void finish() throws IOException;
   }
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Thu Jan 20 19:52:03 2011
@@ -59,7 +59,7 @@ public class VariableGapTermsIndexWriter
   public static abstract class IndexTermSelector {
     // Called sequentially on every term being written,
     // returning true if this term should be indexed
-    public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+    public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
   }
 
   /** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,9 +74,9 @@ public class VariableGapTermsIndexWriter
     }
 
     @Override
-    public boolean isIndexTerm(BytesRef term, int docFreq) {
+    public boolean isIndexTerm(BytesRef term, TermStats stats) {
       if (count >= interval) {
-        count = 0;
+        count = 1;
         return true;
       } else {
         count++;
@@ -99,9 +99,9 @@ public class VariableGapTermsIndexWriter
     }
 
     @Override
-    public boolean isIndexTerm(BytesRef term, int docFreq) {
-      if (docFreq >= docFreqThresh || count >= interval) {
-        count = 0;
+    public boolean isIndexTerm(BytesRef term, TermStats stats) {
+      if (stats.docFreq >= docFreqThresh || count >= interval) {
+        count = 1;
         return true;
       } else {
         count++;
@@ -214,8 +214,8 @@ public class VariableGapTermsIndexWriter
     }
 
     @Override
-    public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
-      if (policy.isIndexTerm(text, docFreq) || first) {
+    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+      if (policy.isIndexTerm(text, stats) || first) {
         first = false;
         //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
         final int lengthSave = text.length;

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Jan 20 19:52:03 2011
@@ -34,7 +34,6 @@ import org.apache.lucene.index.FieldsEnu
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.CompoundFileReader;
@@ -265,6 +264,11 @@ public class PreFlexFields extends Field
         return BytesRef.getUTF8SortedAsUTF16Comparator();
       }
     }
+
+    @Override
+    public long getSumTotalTermFreq() {
+      return -1;
+    }
   }
 
   private class PreTermsEnum extends TermsEnum {
@@ -941,6 +945,11 @@ public class PreFlexFields extends Field
     }
 
     @Override
+    public long totalTermFreq() {
+      return -1;
+    }
+
+    @Override
     public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
       PreDocsEnum docsEnum;
       if (reuse == null || !(reuse instanceof PreDocsEnum)) {

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Thu Jan 20 19:52:03 2011
@@ -45,6 +45,7 @@ public class PulsingPostingsReaderImpl e
 
   // Fallback reader for non-pulsed terms:
   final PostingsReaderBase wrappedPostingsReader;
+  int maxPositions;
 
   public PulsingPostingsReaderImpl(PostingsReaderBase wrappedPostingsReader) throws IOException {
     this.wrappedPostingsReader = wrappedPostingsReader;
@@ -54,6 +55,7 @@ public class PulsingPostingsReaderImpl e
   public void init(IndexInput termsIn) throws IOException {
     CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
       PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+    maxPositions = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }
 
@@ -115,8 +117,10 @@ public class PulsingPostingsReaderImpl e
 
     termState.pendingIndexTerm |= isIndexTerm;
 
-    // TODO: wasteful to use whole byte for this (need just a 1 bit);
-    if (termsIn.readByte() == 1) {
+    // total TF, but in the omitTFAP case its computed based on docFreq.
+    long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+    
+    if (count <= maxPositions) {
 
       // Inlined into terms dict -- just read the byte[] blob in,
       // but don't decode it now (we only decode when a DocsEnum

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Thu Jan 20 19:52:03 2011
@@ -21,15 +21,16 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined.  Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined.  Though this is
 // presumably rare in practice...
 
 /** @lucene.experimental */
@@ -85,6 +86,7 @@ public final class PulsingPostingsWriter
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    termsOut.writeVInt(pending.length); // encode maxPositions in header
     wrappedPostingsWriter.start(termsOut);
   }
 
@@ -177,7 +179,7 @@ public final class PulsingPostingsWriter
 
   /** Called when we are done adding docs to this term */
   @Override
-  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+  public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
     //System.out.println("PW   finishTerm docCount=" + docCount);
 
     assert pendingCount > 0 || pendingCount == -1;
@@ -185,8 +187,7 @@ public final class PulsingPostingsWriter
     pendingIsIndexTerm |= isIndexTerm;
 
     if (pendingCount == -1) {
-      termsOut.writeByte((byte) 0);
-      wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
+      wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
       pendingIsIndexTerm = false;
     } else {
 
@@ -194,8 +195,6 @@ public final class PulsingPostingsWriter
       // term, so we fully inline our postings data into
       // terms dict, now:
 
-      termsOut.writeByte((byte) 1);
-
       // TODO: it'd be better to share this encoding logic
       // in some inner codec that knows how to write a
       // single doc / single position, etc.  This way if a

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Thu Jan 20 19:52:03 2011
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
@@ -244,11 +245,11 @@ public final class SepPostingsWriterImpl
 
   /** Called when we are done adding docs to this term */
   @Override
-  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
+  public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
 
     // TODO: -- wasteful we are counting this in two places?
-    assert docCount > 0;
-    assert docCount == df;
+    assert stats.docFreq > 0;
+    assert stats.docFreq == df;
 
     docIndex.write(termsOut, isIndexTerm);
 

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java Thu Jan 20 19:52:03 2011
@@ -21,7 +21,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.codecs.FieldsProducer;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.FieldsEnum;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -120,28 +119,31 @@ class SimpleTextFieldsReader extends Fie
     private final IndexInput in;
     private final boolean omitTF;
     private int docFreq;
+    private long totalTermFreq;
     private long docsStart;
     private boolean ended;
-    private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;
+    private final BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstEnum;
 
-    public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
+    public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst, boolean omitTF) throws IOException {
       this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
       this.omitTF = omitTF;
-      fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
+      fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(fst);
     }
 
     public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
 
       //System.out.println("seek to text=" + text.utf8ToString());
-      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.seekCeil(text);
+      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.seekCeil(text);
       if (result == null) {
         //System.out.println("  end");
         return SeekStatus.END;
       } else {
         //System.out.println("  got text=" + term.utf8ToString());
-        PairOutputs.Pair<Long,Long> pair = result.output;
-        docsStart = pair.output1;
-        docFreq = pair.output2.intValue();
+        PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+        PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+        docsStart = pair1.output1;
+        docFreq = pair2.output1.intValue();
+        totalTermFreq = pair2.output2;
 
         if (result.input.equals(text)) {
           //System.out.println("  match docsStart=" + docsStart);
@@ -156,11 +158,13 @@ class SimpleTextFieldsReader extends Fie
     @Override
     public BytesRef next() throws IOException {
       assert !ended;
-      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
+      final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> result = fstEnum.next();
       if (result != null) {
-        final PairOutputs.Pair<Long,Long> pair = result.output;
-        docsStart = pair.output1;
-        docFreq = pair.output2.intValue();
+        PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>> pair1 = result.output;
+        PairOutputs.Pair<Long,Long> pair2 = pair1.output2;
+        docsStart = pair1.output1;
+        docFreq = pair2.output1.intValue();
+        totalTermFreq = pair2.output2;
         return result.input;
       } else {
         return null;
@@ -188,6 +192,11 @@ class SimpleTextFieldsReader extends Fie
     }
 
     @Override
+    public long totalTermFreq() {
+      return totalTermFreq;
+    }
+ 
+    @Override
     public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
       SimpleTextDocsEnum docsEnum;
       if (reuse != null && reuse instanceof SimpleTextDocsEnum && ((SimpleTextDocsEnum) reuse).canReuse(SimpleTextFieldsReader.this.in)) {
@@ -636,8 +645,9 @@ class SimpleTextFieldsReader extends Fie
   private class SimpleTextTerms extends Terms {
     private final long termsStart;
     private final boolean omitTF;
-    private FST<PairOutputs.Pair<Long,Long>> fst;
-
+    private long sumTotalTermFreq;
+    private FST<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fst;
+    private int termCount;
     private final BytesRef scratch = new BytesRef(10);
 
     public SimpleTextTerms(String field, long termsStart) throws IOException {
@@ -648,24 +658,38 @@ class SimpleTextFieldsReader extends Fie
 
     private void loadTerms() throws IOException {
       PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
-      Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
+      final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
+      b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
+                                                                          0,
+                                                                          0,
+                                                                          true,
+                                                                          new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
+                                                                                                                            new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
       IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
       in.seek(termsStart);
       final BytesRef lastTerm = new BytesRef(10);
       long lastDocsStart = -1;
       int docFreq = 0;
+      long totalTermFreq = 0;
       while(true) {
         readLine(in, scratch);
         if (scratch.equals(END) || scratch.startsWith(FIELD)) {
           if (lastDocsStart != -1) {
-            b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+            b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+                                                                                   new PairOutputs.Pair<Long,Long>((long) docFreq,
+                                                                                                                   posIntOutputs.get(totalTermFreq))));
+            sumTotalTermFreq += totalTermFreq;
           }
           break;
         } else if (scratch.startsWith(DOC)) {
           docFreq++;
+        } else if (scratch.startsWith(POS)) {
+          totalTermFreq++;
         } else if (scratch.startsWith(TERM)) {
           if (lastDocsStart != -1) {
-            b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
+            b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,
+                                                                                   new PairOutputs.Pair<Long,Long>((long) docFreq,
+                                                                                                                   posIntOutputs.get(totalTermFreq))));
           }
           lastDocsStart = in.getFilePointer();
           final int len = scratch.length - TERM.length;
@@ -675,6 +699,9 @@ class SimpleTextFieldsReader extends Fie
           System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
           lastTerm.length = len;
           docFreq = 0;
+          sumTotalTermFreq += totalTermFreq;
+          totalTermFreq = 0;
+          termCount++;
         }
       }
       fst = b.finish();
@@ -700,6 +727,16 @@ class SimpleTextFieldsReader extends Fie
     public Comparator<BytesRef> getComparator() {
       return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
+
+    @Override
+    public long getUniqueTermCount() {
+      return (long) termCount;
+    }
+
+    @Override
+    public long getSumTotalTermFreq() {
+      return sumTotalTermFreq;
+    }
   }
 
   @Override

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java Thu Jan 20 19:52:03 2011
@@ -22,6 +22,7 @@ import org.apache.lucene.util.UnicodeUti
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.TermsConsumer;
 import org.apache.lucene.index.codecs.PostingsConsumer;
+import org.apache.lucene.index.codecs.TermStats;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.IndexOutput;
@@ -84,11 +85,11 @@ class SimpleTextFieldsWriter extends Fie
     }
 
     @Override
-    public void finishTerm(BytesRef term, int numDocs) throws IOException {
+    public void finishTerm(BytesRef term, TermStats stats) throws IOException {
     }
 
     @Override
-    public void finish() throws IOException {
+    public void finish(long sumTotalTermFreq) throws IOException {
     }
 
     @Override

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Thu Jan 20 19:52:03 2011
@@ -28,6 +28,7 @@ import org.apache.lucene.index.SegmentWr
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
@@ -184,12 +185,12 @@ public final class StandardPostingsWrite
 
   /** Called when we are done adding docs to this term */
   @Override
-  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
-    assert docCount > 0;
+  public void finishTerm(TermStats stats, boolean isIndexTerm) throws IOException {
+    assert stats.docFreq > 0;
 
     // TODO: wasteful we are counting this (counting # docs
     // for this term) in two places?
-    assert docCount == df;
+    assert stats.docFreq == df;
 
     if (isIndexTerm) {
       // Write absolute at seek points

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanQuery.java Thu Jan 20 19:52:03 2011
@@ -169,7 +169,7 @@ public class BooleanQuery extends Query 
 
     public BooleanWeight(IndexSearcher searcher, boolean disableCoord)
       throws IOException {
-      this.similarity = getSimilarity(searcher);
+      this.similarity = searcher.getSimilarity();
       this.disableCoord = disableCoord;
       weights = new ArrayList<Weight>(clauses.size());
       for (int i = 0 ; i < clauses.size(); i++) {
@@ -201,6 +201,9 @@ public class BooleanQuery extends Query 
       return sum ;
     }
 
+    public float coord(int overlap, int maxOverlap) {
+      return similarity.coord(overlap, maxOverlap);
+    }
 
     @Override
     public void normalize(float norm) {
@@ -273,7 +276,7 @@ public class BooleanQuery extends Query 
       sumExpl.setMatch(0 < coord ? Boolean.TRUE : Boolean.FALSE);
       sumExpl.setValue(sum);
       
-      final float coordFactor = disableCoord ? 1.0f : similarity.coord(coord, maxCoord);
+      final float coordFactor = disableCoord ? 1.0f : coord(coord, maxCoord);
       if (coordFactor == 1.0f) {
         return sumExpl;                             // eliminate wrapper
       } else {
@@ -312,7 +315,7 @@ public class BooleanQuery extends Query 
       
       // Check if we can return a BooleanScorer
       if (!scorerContext.scoreDocsInOrder && scorerContext.topScorer && required.size() == 0 && prohibited.size() < 32) {
-        return new BooleanScorer(this, disableCoord, similarity, minNrShouldMatch, optional, prohibited, maxCoord);
+        return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord);
       }
       
       if (required.size() == 0 && optional.size() == 0) {
@@ -326,7 +329,7 @@ public class BooleanQuery extends Query 
       }
       
       // Return a BooleanScorer2
-      return new BooleanScorer2(this, disableCoord, similarity, minNrShouldMatch, required, prohibited, optional, maxCoord);
+      return new BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord);
     }
     
     @Override

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java?rev=1061480&r1=1061479&r2=1061480&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/search/BooleanScorer.java Thu Jan 20 19:52:03 2011
@@ -22,6 +22,7 @@ import java.util.List;
 
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery.BooleanWeight;
 
 /* Description from Doug Cutting (excerpted from
  * LUCENE-1483):
@@ -197,7 +198,7 @@ final class BooleanScorer extends Scorer
   private Bucket current;
   private int doc = -1;
   
-  BooleanScorer(Weight weight, boolean disableCoord, Similarity similarity, int minNrShouldMatch,
+  BooleanScorer(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch,
       List<Scorer> optionalScorers, List<Scorer> prohibitedScorers, int maxCoord) throws IOException {
     super(null, weight);   // Similarity not used
     this.minNrShouldMatch = minNrShouldMatch;
@@ -223,7 +224,7 @@ final class BooleanScorer extends Scorer
 
     coordFactors = new float[optionalScorers.size() + 1];
     for (int i = 0; i < coordFactors.length; i++) {
-      coordFactors[i] = disableCoord ? 1.0f : similarity.coord(i, maxCoord); 
+      coordFactors[i] = disableCoord ? 1.0f : weight.coord(i, maxCoord); 
     }
   }
 



Mime
View raw message