lucene-java-commits mailing list archives

From mikemcc...@apache.org
Subject svn commit: r636458 [3/3] - /lucene/java/trunk/src/java/org/apache/lucene/index/
Date Wed, 12 Mar 2008 19:09:16 GMT
Added: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,773 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.store.IndexOutput;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+
+/** Used by DocumentsWriter to hold data associated with a
+ *  single field in a single ThreadState, including the
+ *  Postings hash.  A document may have many occurrences for
+ *  a given field name; we gather all such occurrences here
+ *  (in docFields) so that we can process the entire field
+ *  at once. */
+
+final class DocumentsWriterFieldData implements Comparable {
+
+  final DocumentsWriterThreadState threadState;
+  FieldInfo fieldInfo;
+
+  int fieldCount;
+  Fieldable[] docFields = new Fieldable[1];
+
+  int lastGen = -1;
+  DocumentsWriterFieldData next;
+
+  boolean doNorms;
+  boolean doVectors;
+  boolean doVectorPositions;
+  boolean doVectorOffsets;
+  boolean postingsCompacted;
+
+  int numPostings;
+      
+  Posting[] postingsHash;
+  int postingsHashSize;
+  int postingsHashHalfSize;
+  int postingsHashMask;
+
+  int position;
+  int length;
+  int offset;
+  float boost;
+  int postingsVectorsUpto;
+
+  public DocumentsWriterFieldData(DocumentsWriterThreadState threadState, FieldInfo fieldInfo) {
+    this.fieldInfo = fieldInfo;
+    this.threadState = threadState;
+  }
+
+  void resetPostingArrays() {
+    if (!postingsCompacted)
+      compactPostings();
+    threadState.docWriter.recyclePostings(this.postingsHash, numPostings);
+    Arrays.fill(postingsHash, 0, postingsHash.length, null);
+    postingsCompacted = false;
+    numPostings = 0;
+  }
+
+  void initPostingArrays() {
+    // Target hash fill factor of <= 50%
+    // NOTE: must be a power of two for hash collision
+    // strategy to work correctly
+    postingsHashSize = 4;
+    postingsHashHalfSize = 2;
+    postingsHashMask = postingsHashSize-1;
+    postingsHash = new Posting[postingsHashSize];
+  }
+
+  public int compareTo(Object o) {
+    return fieldInfo.name.compareTo(((DocumentsWriterFieldData) o).fieldInfo.name);
+  }
+
+  private void compactPostings() {
+    int upto = 0;
+    for(int i=0;i<postingsHashSize;i++)
+      if (postingsHash[i] != null)
+        postingsHash[upto++] = postingsHash[i];
+
+    assert upto == numPostings;
+    postingsCompacted = true;
+  }
+
+  /** Collapse the hash table & sort in-place. */
+  public Posting[] sortPostings() {
+    compactPostings();
+    threadState.doPostingSort(postingsHash, numPostings);
+    return postingsHash;
+  }
+
+  /** Process all occurrences of one field in the document. */
+  public void processField(Analyzer analyzer) throws IOException, AbortException {
+    length = 0;
+    position = 0;
+    offset = 0;
+    boost = threadState.docBoost;
+
+    final int maxFieldLength = threadState.docWriter.writer.getMaxFieldLength();
+
+    final int limit = fieldCount;
+    final Fieldable[] docFieldsFinal = docFields;
+
+    boolean doWriteVectors = true;
+
+    // Walk through all occurrences in this doc for this
+    // field:
+    try {
+      for(int j=0;j<limit;j++) {
+        Fieldable field = docFieldsFinal[j];
+
+        if (field.isIndexed())
+          invertField(field, analyzer, maxFieldLength);
+
+        if (field.isStored()) {
+          threadState.numStoredFields++;
+          boolean success = false;
+          try {
+            threadState.localFieldsWriter.writeField(fieldInfo, field);
+            success = true;
+          } finally {
+            // If we hit an exception inside
+            // localFieldsWriter.writeField, the
+            // contents of fdtLocal can be corrupt, so
+            // we must discard all stored fields for
+            // this document:
+            if (!success)
+              threadState.fdtLocal.reset();
+          }
+        }
+
+        docFieldsFinal[j] = null;
+      }
+    } catch (AbortException ae) {
+      doWriteVectors = false;
+      throw ae;
+    } finally {
+      if (postingsVectorsUpto > 0) {
+        try {
+          if (doWriteVectors) {
+            // Add term vectors for this field
+            boolean success = false;
+            try {
+              writeVectors(fieldInfo);
+              success = true;
+            } finally {
+              if (!success) {
+                // If we hit an exception inside
+                // writeVectors, the contents of tvfLocal
+                // can be corrupt, so we must discard all
+                // term vectors for this document:
+                threadState.numVectorFields = 0;
+                threadState.tvfLocal.reset();
+              }
+            }
+          }
+        } finally {
+          if (postingsVectorsUpto > threadState.maxPostingsVectors)
+            threadState.maxPostingsVectors = postingsVectorsUpto;
+          postingsVectorsUpto = 0;
+          threadState.vectorsPool.reset();
+        }
+      }
+    }
+  }
+
+  int offsetEnd;
+  Token localToken = new Token();
+
+  /* Invert one occurrence of one field in the document */
+  public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException, AbortException {
+
+    if (length>0)
+      position += analyzer.getPositionIncrementGap(fieldInfo.name);
+
+    if (!field.isTokenized()) {		  // un-tokenized field
+      String stringValue = field.stringValue();
+      final int valueLength = stringValue.length();
+      Token token = localToken;
+      token.clear();
+      char[] termBuffer = token.termBuffer();
+      if (termBuffer.length < valueLength)
+        termBuffer = token.resizeTermBuffer(valueLength);
+      stringValue.getChars(0, valueLength, termBuffer, 0);
+      token.setTermLength(valueLength);
+      token.setStartOffset(offset);
+      token.setEndOffset(offset + stringValue.length());
+      addPosition(token);
+      offset += stringValue.length();
+      length++;
+    } else {                                  // tokenized field
+      final TokenStream stream;
+      final TokenStream streamValue = field.tokenStreamValue();
+
+      if (streamValue != null) 
+        stream = streamValue;
+      else {
+        // the field does not have a TokenStream,
+        // so we have to obtain one from the analyzer
+        final Reader reader;			  // find or make Reader
+        final Reader readerValue = field.readerValue();
+
+        if (readerValue != null)
+          reader = readerValue;
+        else {
+          String stringValue = field.stringValue();
+          if (stringValue == null)
+            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
+          threadState.stringReader.init(stringValue);
+          reader = threadState.stringReader;
+        }
+          
+        // Tokenize field and add to postingTable
+        stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      try {
+        offsetEnd = offset-1;
+        Token token;
+        for(;;) {
+          token = stream.next(localToken);
+          if (token == null) break;
+          position += (token.getPositionIncrement() - 1);
+          addPosition(token);
+          if (++length >= maxFieldLength) {
+            if (threadState.docWriter.infoStream != null)
+              threadState.docWriter.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
+            break;
+          }
+        }
+        offset = offsetEnd+1;
+      } finally {
+        stream.close();
+      }
+    }
+
+    boost *= field.getBoost();
+  }
+
+  /** Only called when term vectors are enabled.  This
+   *  is called the first time we see a given term for
+   *  each document, to allocate a PostingVector
+   *  instance that is used to record data needed to
+   *  write the posting vectors. */
+  private PostingVector addNewVector() {
+
+    if (postingsVectorsUpto == threadState.postingsVectors.length) {
+      final int newSize;
+      if (threadState.postingsVectors.length < 2)
+        newSize = 2;
+      else
+        newSize = (int) (1.5*threadState.postingsVectors.length);
+      PostingVector[] newArray = new PostingVector[newSize];
+      System.arraycopy(threadState.postingsVectors, 0, newArray, 0, threadState.postingsVectors.length);
+      threadState.postingsVectors = newArray;
+    }
+        
+    p.vector = threadState.postingsVectors[postingsVectorsUpto];
+    if (p.vector == null)
+      p.vector = threadState.postingsVectors[postingsVectorsUpto] = new PostingVector();
+
+    postingsVectorsUpto++;
+
+    final PostingVector v = p.vector;
+    v.p = p;
+
+    if (doVectorPositions) {
+      final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+      v.posStart = v.posUpto = threadState.vectorsPool.byteOffset + upto;
+    }
+
+    if (doVectorOffsets) {
+      final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+      v.offsetStart = v.offsetUpto = threadState.vectorsPool.byteOffset + upto;
+    }
+
+    return v;
+  }
+
+  int offsetStartCode;
+  int offsetStart;
+
+  // Current posting we are working on
+  Posting p;
+  PostingVector vector;
+
+  /** Test whether the text for current Posting p equals
+   *  current tokenText. */
+  boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
+
+    final char[] text = threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    assert text != null;
+    int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+    int tokenPos = 0;
+    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+      if (tokenText[tokenPos] != text[pos])
+        return false;
+    return 0xffff == text[pos];
+  }
+
+  /** This is the hotspot of indexing: it's called once
+   *  for every term of every document.  Its job is to
+   *  update the postings byte stream (Postings hash)
+   *  based on the occurrence of a single term. */
+  private void addPosition(Token token) throws AbortException {
+
+    final Payload payload = token.getPayload();
+
+    // Get the text of this term.  Term can either
+    // provide a String token or offset into a char[]
+    // array
+    final char[] tokenText = token.termBuffer();
+    final int tokenTextLen = token.termLength();
+
+    int code = 0;
+
+    // Compute hashcode
+    int downto = tokenTextLen;
+    while (downto > 0)
+      code = (code*31) + tokenText[--downto];
+
+    // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
+
+    int hashPos = code & postingsHashMask;
+
+    assert !postingsCompacted;
+
+    // Locate Posting in hash
+    p = postingsHash[hashPos];
+
+    if (p != null && !postingEquals(tokenText, tokenTextLen)) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & postingsHashMask;
+        p = postingsHash[hashPos];
+      } while (p != null && !postingEquals(tokenText, tokenTextLen));
+    }
+
+    final int proxCode;
+
+    // If we hit an exception below, it's possible the
+    // posting list or term vectors data will be
+    // partially written and thus inconsistent if
+    // flushed, so we have to abort all documents
+    // since the last flush:
+
+    try {
+
+      if (p != null) {       // term seen since last flush
+
+        if (threadState.docID != p.lastDocID) { // term not yet seen in this doc
+            
+          // System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);
+
+          assert p.docFreq > 0;
+
+          // Now that we know doc freq for previous doc,
+          // write it & lastDocCode
+          freqUpto = p.freqUpto & DocumentsWriter.BYTE_BLOCK_MASK;
+          freq = threadState.postingsPool.buffers[p.freqUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+          if (1 == p.docFreq)
+            writeFreqVInt(p.lastDocCode|1);
+          else {
+            writeFreqVInt(p.lastDocCode);
+            writeFreqVInt(p.docFreq);
+          }
+          p.freqUpto = freqUpto + (p.freqUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
+
+          if (doVectors) {
+            vector = addNewVector();
+            if (doVectorOffsets) {
+              offsetStartCode = offsetStart = offset + token.startOffset();
+              offsetEnd = offset + token.endOffset();
+            }
+          }
+
+          proxCode = position;
+
+          p.docFreq = 1;
+
+          // Store code so we can write this after we're
+          // done with this new doc
+          p.lastDocCode = (threadState.docID-p.lastDocID) << 1;
+          p.lastDocID = threadState.docID;
+
+        } else {                                // term already seen in this doc
+          // System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);
+          p.docFreq++;
+
+          proxCode = position-p.lastPosition;
+
+          if (doVectors) {
+            vector = p.vector;
+            if (vector == null)
+              vector = addNewVector();
+            if (doVectorOffsets) {
+              offsetStart = offset + token.startOffset();
+              offsetEnd = offset + token.endOffset();
+              offsetStartCode = offsetStart-vector.lastOffset;
+            }
+          }
+        }
+      } else {					  // term not seen before
+        // System.out.println("    never seen docID=" + docID);
+
+        // Refill?
+        if (0 == threadState.postingsFreeCount) {
+          threadState.docWriter.getPostings(threadState.postingsFreeList);
+          threadState.postingsFreeCount = threadState.postingsFreeList.length;
+        }
+
+        final int textLen1 = 1+tokenTextLen;
+        if (textLen1 + threadState.charPool.byteUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
+          if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+            // Just skip this term, to remain as robust as
+            // possible during indexing.  A TokenFilter
+            // can be inserted into the analyzer chain if
+            // other behavior is wanted (pruning the term
+            // to a prefix, throwing an exception, etc).
+            if (threadState.maxTermPrefix == null)
+              threadState.maxTermPrefix = new String(tokenText, 0, 30);
+
+            // Still increment position:
+            position++;
+            return;
+          }
+          threadState.charPool.nextBuffer();
+        }
+
+        final char[] text = threadState.charPool.buffer;
+        final int textUpto = threadState.charPool.byteUpto;
+
+        // Pull next free Posting from free list
+        p = threadState.postingsFreeList[--threadState.postingsFreeCount];
+
+        p.textStart = textUpto + threadState.charPool.byteOffset;
+        threadState.charPool.byteUpto += textLen1;
+
+        System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
+
+        text[textUpto+tokenTextLen] = 0xffff;
+          
+        assert postingsHash[hashPos] == null;
+
+        postingsHash[hashPos] = p;
+        numPostings++;
+
+        if (numPostings == postingsHashHalfSize)
+          rehashPostings(2*postingsHashSize);
+
+        // Init first slice for freq & prox streams
+        final int upto1 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        p.freqStart = p.freqUpto = threadState.postingsPool.byteOffset + upto1;
+
+        final int upto2 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        p.proxStart = p.proxUpto = threadState.postingsPool.byteOffset + upto2;
+
+        p.lastDocCode = threadState.docID << 1;
+        p.lastDocID = threadState.docID;
+        p.docFreq = 1;
+
+        if (doVectors) {
+          vector = addNewVector();
+          if (doVectorOffsets) {
+            offsetStart = offsetStartCode = offset + token.startOffset();
+            offsetEnd = offset + token.endOffset();
+          }
+        }
+
+        proxCode = position;
+      }
+
+      proxUpto = p.proxUpto & DocumentsWriter.BYTE_BLOCK_MASK;
+      prox = threadState.postingsPool.buffers[p.proxUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+      assert prox != null;
+
+      if (payload != null && payload.length > 0) {
+        writeProxVInt((proxCode<<1)|1);
+        writeProxVInt(payload.length);
+        writeProxBytes(payload.data, payload.offset, payload.length);
+        fieldInfo.storePayloads = true;
+      } else
+        writeProxVInt(proxCode<<1);
+
+      p.proxUpto = proxUpto + (p.proxUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
+
+      p.lastPosition = position++;
+
+      if (doVectorPositions) {
+        posUpto = vector.posUpto & DocumentsWriter.BYTE_BLOCK_MASK;
+        pos = threadState.vectorsPool.buffers[vector.posUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+        writePosVInt(proxCode);
+        vector.posUpto = posUpto + (vector.posUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
+      }
+
+      if (doVectorOffsets) {
+        offsetUpto = vector.offsetUpto & DocumentsWriter.BYTE_BLOCK_MASK;
+        offsets = threadState.vectorsPool.buffers[vector.offsetUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+        writeOffsetVInt(offsetStartCode);
+        writeOffsetVInt(offsetEnd-offsetStart);
+        vector.lastOffset = offsetEnd;
+        vector.offsetUpto = offsetUpto + (vector.offsetUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
+      }
+    } catch (Throwable t) {
+      throw new AbortException(t, threadState.docWriter);
+    }
+  }
+
+  /** Write vInt into freq stream of current Posting */
+  public void writeFreqVInt(int i) {
+    while ((i & ~0x7F) != 0) {
+      writeFreqByte((byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writeFreqByte((byte) i);
+  }
+
+  /** Write vInt into prox stream of current Posting */
+  public void writeProxVInt(int i) {
+    while ((i & ~0x7F) != 0) {
+      writeProxByte((byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writeProxByte((byte) i);
+  }
+
+  /** Write byte into freq stream of current Posting */
+  byte[] freq;
+  int freqUpto;
+  public void writeFreqByte(byte b) {
+    assert freq != null;
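+    // A non-zero byte here marks the end of the current slice;
+    // chain a new, larger slice and continue writing there: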
+    if (freq[freqUpto] != 0) {
+      freqUpto = threadState.postingsPool.allocSlice(freq, freqUpto);
+      freq = threadState.postingsPool.buffer;
+      p.freqUpto = threadState.postingsPool.byteOffset;
+    }
+    freq[freqUpto++] = b;
+  }
+
+  /** Write byte into prox stream of current Posting */
+  byte[] prox;
+  int proxUpto;
+  public void writeProxByte(byte b) {
+    assert prox != null;
+    if (prox[proxUpto] != 0) {
+      proxUpto = threadState.postingsPool.allocSlice(prox, proxUpto);
+      prox = threadState.postingsPool.buffer;
+      p.proxUpto = threadState.postingsPool.byteOffset;
+      assert prox != null;
+    }
+    prox[proxUpto++] = b;
+    assert proxUpto != prox.length;
+  }
+
+  /** Currently only used to copy a payload into the prox
+   *  stream. */
+  public void writeProxBytes(byte[] b, int offset, int len) {
+    final int offsetEnd = offset + len;
+    while(offset < offsetEnd) {
+      if (prox[proxUpto] != 0) {
+        // End marker
+        proxUpto = threadState.postingsPool.allocSlice(prox, proxUpto);
+        prox = threadState.postingsPool.buffer;
+        p.proxUpto = threadState.postingsPool.byteOffset;
+      }
+
+      prox[proxUpto++] = b[offset++];
+      assert proxUpto != prox.length;
+    }
+  }
+
+  /** Write vInt into offsets stream of current
+   *  PostingVector */
+  public void writeOffsetVInt(int i) {
+    while ((i & ~0x7F) != 0) {
+      writeOffsetByte((byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writeOffsetByte((byte) i);
+  }
+
+  byte[] offsets;
+  int offsetUpto;
+
+  /** Write byte into offsets stream of current
+   *  PostingVector */
+  public void writeOffsetByte(byte b) {
+    assert offsets != null;
+    if (offsets[offsetUpto] != 0) {
+      offsetUpto = threadState.vectorsPool.allocSlice(offsets, offsetUpto);
+      offsets = threadState.vectorsPool.buffer;
+      vector.offsetUpto = threadState.vectorsPool.byteOffset;
+    }
+    offsets[offsetUpto++] = b;
+  }
+
+  /** Write vInt into pos stream of current
+   *  PostingVector */
+  public void writePosVInt(int i) {
+    while ((i & ~0x7F) != 0) {
+      writePosByte((byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writePosByte((byte) i);
+  }
+
+  byte[] pos;
+  int posUpto;
+
+  /** Write byte into pos stream of current
+   *  PostingVector */
+  public void writePosByte(byte b) {
+    assert pos != null;
+    if (pos[posUpto] != 0) {
+      posUpto = threadState.vectorsPool.allocSlice(pos, posUpto);
+      pos = threadState.vectorsPool.buffer;
+      vector.posUpto = threadState.vectorsPool.byteOffset;
+    }
+    pos[posUpto++] = b;
+  }
+
+  /** Called when postings hash is too small (> 50%
+   *  occupied) or too large (< 20% occupied). */
+  void rehashPostings(final int newSize) {
+
+    final int newMask = newSize-1;
+
+    Posting[] newHash = new Posting[newSize];
+    for(int i=0;i<postingsHashSize;i++) {
+      Posting p0 = postingsHash[i];
+      if (p0 != null) {
+        final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+        final char[] text = threadState.charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        int pos = start;
+        while(text[pos] != 0xffff)
+          pos++;
+        int code = 0;
+        while (pos > start)
+          code = (code*31) + text[--pos];
+
+        int hashPos = code & newMask;
+        assert hashPos >= 0;
+        if (newHash[hashPos] != null) {
+          final int inc = ((code>>8)+code)|1;
+          do {
+            code += inc;
+            hashPos = code & newMask;
+          } while (newHash[hashPos] != null);
+        }
+        newHash[hashPos] = p0;
+      }
+    }
+
+    postingsHashMask =  newMask;
+    postingsHash = newHash;
+    postingsHashSize = newSize;
+    postingsHashHalfSize = newSize >> 1;
+  }
+      
+  final ByteSliceReader vectorSliceReader = new ByteSliceReader();
+
+  /** Called once per field per document if term vectors
+   *  are enabled, to write the vectors to the
+   *  RAMOutputStream, which is then quickly flushed to
+   *  the real term vectors files in the Directory. */
+  void writeVectors(FieldInfo fieldInfo) throws IOException {
+
+    assert fieldInfo.storeTermVector;
+    assert threadState.vectorFieldsInOrder(fieldInfo);
+
+    threadState.vectorFieldNumbers[threadState.numVectorFields] = fieldInfo.number;
+    threadState.vectorFieldPointers[threadState.numVectorFields] = threadState.tvfLocal.getFilePointer();
+    threadState.numVectorFields++;
+
+    final int numPostingsVectors = postingsVectorsUpto;
+    final PostingVector[] postingsVectors = threadState.postingsVectors;
+
+    final IndexOutput tvfLocal = threadState.tvfLocal;
+
+    threadState.tvfLocal.writeVInt(numPostingsVectors);
+    byte bits = 0x0;
+    if (doVectorPositions)
+      bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
+    if (doVectorOffsets) 
+      bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
+    threadState.tvfLocal.writeByte(bits);
+
+    threadState.doVectorSort(postingsVectors, numPostingsVectors);
+
+    Posting lastPosting = null;
+
+    final ByteSliceReader reader = vectorSliceReader;
+    final char[][] charBuffers = threadState.charPool.buffers;
+
+    for(int j=0;j<numPostingsVectors;j++) {
+      final PostingVector vector = postingsVectors[j];
+      Posting posting = vector.p;
+      final int freq = posting.docFreq;
+          
+      final int prefix;
+      final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+      final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+      int pos2 = start2;
+
+      // Compute common prefix between last term and
+      // this term
+      if (lastPosting == null)
+        prefix = 0;
+      else {
+        final char[] text1 = charBuffers[lastPosting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        final int start1 = lastPosting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+        int pos1 = start1;
+        while(true) {
+          final char c1 = text1[pos1];
+          final char c2 = text2[pos2];
+          if (c1 != c2 || c1 == 0xffff) {
+            prefix = pos1-start1;
+            break;
+          }
+          pos1++;
+          pos2++;
+        }
+      }
+      lastPosting = posting;
+
+      // Compute length
+      while(text2[pos2] != 0xffff)
+        pos2++;
+
+      final int suffix = pos2 - start2 - prefix;
+      tvfLocal.writeVInt(prefix);
+      tvfLocal.writeVInt(suffix);
+      tvfLocal.writeChars(text2, start2 + prefix, suffix);
+      tvfLocal.writeVInt(freq);
+
+      if (doVectorPositions) {
+        reader.init(threadState.vectorsPool, vector.posStart, vector.posUpto);
+        reader.writeTo(tvfLocal);
+      }
+
+      if (doVectorOffsets) {
+        reader.init(threadState.vectorsPool, vector.offsetStart, vector.offsetUpto);
+        reader.writeTo(tvfLocal);
+      }
+    }
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java
------------------------------------------------------------------------------
    svn:eol-style = native
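
A note on the postings hash used by addPosition() and rehashPostings() above: terms are located with open addressing over a power-of-two table, and collisions are resolved by stepping with an increment derived from the hash code, forced odd so the probe sequence can reach every slot. A minimal standalone sketch of that probe, assuming plain String keys in place of the char-pool text (illustrative only, not part of this patch):

    final class ProbeSketch {
      // Power-of-two table, as in initPostingArrays()
      private final String[] table = new String[8];
      private final int mask = table.length - 1;

      /** Returns the slot holding term, or the empty slot where it belongs. */
      int findSlot(String term) {
        // Same hash as addPosition(): code = code*31 + ch, scanned in reverse
        int code = 0;
        for (int i = term.length() - 1; i >= 0; i--)
          code = code * 31 + term.charAt(i);

        int pos = code & mask;
        if (table[pos] != null && !term.equals(table[pos])) {
          // Collision: step by an odd increment so all slots are reachable
          final int inc = ((code >> 8) + code) | 1;
          do {
            code += inc;
            pos = code & mask;
          } while (table[pos] != null && !term.equals(table[pos]));
        }
        return pos;
      }
    }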

Added: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,89 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/** Used by DocumentsWriter to merge the postings from
+ *  multiple ThreadStates when creating a segment */
+final class DocumentsWriterFieldMergeState {
+
+  DocumentsWriterFieldData field;
+
+  Posting[] postings;
+
+  private Posting p;
+  char[] text;
+  int textOffset;
+
+  private int postingUpto = -1;
+
+  ByteSliceReader freq = new ByteSliceReader();
+  ByteSliceReader prox = new ByteSliceReader();
+
+  int docID;
+  int termFreq;
+
+  boolean nextTerm() throws IOException {
+    postingUpto++;
+    if (postingUpto == field.numPostings)
+      return false;
+
+    p = postings[postingUpto];
+    docID = 0;
+
+    text = field.threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+    if (p.freqUpto > p.freqStart)
+      freq.init(field.threadState.postingsPool, p.freqStart, p.freqUpto);
+    else
+      freq.bufferOffset = freq.upto = freq.endIndex = 0;
+
+    prox.init(field.threadState.postingsPool, p.proxStart, p.proxUpto);
+
+    // Should always be true
+    boolean result = nextDoc();
+    assert result;
+
+    return true;
+  }
+
+  public boolean nextDoc() throws IOException {
+    if (freq.bufferOffset + freq.upto == freq.endIndex) {
+      if (p.lastDocCode != -1) {
+        // Return last doc
+        docID = p.lastDocID;
+        termFreq = p.docFreq;
+        p.lastDocCode = -1;
+        return true;
+      } else 
+        // EOF
+        return false;
+    }
+
+    final int code = freq.readVInt();
+    docID += code >>> 1;
+    if ((code & 1) != 0)
+      termFreq = 1;
+    else
+      termFreq = freq.readVInt();
+
+    return true;
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java
------------------------------------------------------------------------------
    svn:eol-style = native
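
For reference, the encoding that nextDoc() above decodes: each document is recorded as a VInt holding the docID delta shifted left by one, with the low bit set when the term frequency is exactly 1 (in which case the frequency VInt is omitted). A small self-contained sketch of that encode/decode pair, using byte-array streams as stand-ins for the byte-slice pools (illustrative only):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;

    final class DocCodeSketch {
      static void writeVInt(ByteArrayOutputStream out, int i) {
        while ((i & ~0x7F) != 0) {      // 7 bits per byte, as in writeFreqVInt()
          out.write((i & 0x7F) | 0x80);
          i >>>= 7;
        }
        out.write(i);
      }

      static int readVInt(ByteArrayInputStream in) {
        int b = in.read(), i = b & 0x7F;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
          b = in.read();
          i |= (b & 0x7F) << shift;
        }
        return i;
      }

      /** Encode one (docDelta, freq) entry the way addPosition() does. */
      static void writeEntry(ByteArrayOutputStream out, int docDelta, int freq) {
        if (freq == 1)
          writeVInt(out, (docDelta << 1) | 1); // low bit set: freq==1 implied
        else {
          writeVInt(out, docDelta << 1);
          writeVInt(out, freq);
        }
      }

      public static void main(String[] args) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        writeEntry(out, 3, 1);   // docID 3, freq 1
        writeEntry(out, 4, 5);   // docID 7, freq 5

        ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
        int docID = 0;
        while (in.available() > 0) {
          int code = readVInt(in);
          docID += code >>> 1;                            // as in nextDoc()
          int termFreq = ((code & 1) != 0) ? 1 : readVInt(in);
          System.out.println("doc=" + docID + " freq=" + termFreq);
        }
      }
    }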

Added: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,719 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.Analyzer;
+
+/** Used by DocumentsWriter to maintain per-thread state.
+ *  We keep a separate Posting hash and other state for each
+ *  thread and then merge postings hashes from all threads
+ *  when writing the segment. */
+final class DocumentsWriterThreadState {
+
+  Posting[] postingsFreeList;           // Free Posting instances
+  int postingsFreeCount;
+
+  RAMOutputStream tvfLocal = new RAMOutputStream();    // Term vectors for one doc
+  RAMOutputStream fdtLocal = new RAMOutputStream();    // Stored fields for one doc
+  FieldsWriter localFieldsWriter;       // Fields for one doc
+
+  long[] vectorFieldPointers;
+  int[] vectorFieldNumbers;
+
+  boolean isIdle = true;                // Whether we are in use
+  int numThreads = 1;                   // Number of threads that use this instance
+
+  int docID;                            // docID we are now working on
+  int numStoredFields;                  // How many stored fields in current doc
+  float docBoost;                       // Boost for current doc
+
+  DocumentsWriterFieldData[] fieldDataArray;           // Fields touched by current doc
+  int numFieldData;                     // How many fields in current doc
+  int numVectorFields;                  // How many vector fields in current doc
+
+  DocumentsWriterFieldData[] allFieldDataArray = new DocumentsWriterFieldData[10]; // All FieldData instances
+  int numAllFieldData;
+  DocumentsWriterFieldData[] fieldDataHash;            // Hash FieldData instances by field name
+  int fieldDataHashMask;
+  String maxTermPrefix;                 // Non-null prefix of a too-large term if this
+  // doc has one
+
+  boolean doFlushAfter;
+
+  final DocumentsWriter docWriter;
+
+  final ByteBlockPool postingsPool;
+  final ByteBlockPool vectorsPool;
+  final CharBlockPool charPool;
+
+  public DocumentsWriterThreadState(DocumentsWriter docWriter) {
+    this.docWriter = docWriter;
+    fieldDataArray = new DocumentsWriterFieldData[8];
+
+    fieldDataHash = new DocumentsWriterFieldData[16];
+    fieldDataHashMask = 15;
+
+    vectorFieldPointers = new long[10];
+    vectorFieldNumbers = new int[10];
+    postingsFreeList = new Posting[256];
+    postingsFreeCount = 0;
+
+    postingsPool = new ByteBlockPool(docWriter, true);
+    vectorsPool = new ByteBlockPool(docWriter, false);
+    charPool = new CharBlockPool(docWriter);
+  }
+
+  /** Clear the postings hash and return objects back to
+   *  shared pool */
+  public void resetPostings() throws IOException {
+    fieldGen = 0;
+    maxPostingsVectors = 0;
+    doFlushAfter = false;
+    if (localFieldsWriter != null) {
+      localFieldsWriter.close();
+      localFieldsWriter = null;
+    }
+    postingsPool.reset();
+    charPool.reset();
+    docWriter.recyclePostings(postingsFreeList, postingsFreeCount);
+    postingsFreeCount = 0;
+    for(int i=0;i<numAllFieldData;i++) {
+      DocumentsWriterFieldData fp = allFieldDataArray[i];
+      fp.lastGen = -1;
+      if (fp.numPostings > 0)
+        fp.resetPostingArrays();
+    }
+  }
+
+  /** Move all per-document state that was accumulated in
+   *  the ThreadState into the "real" stores. */
+  public void writeDocument() throws IOException, AbortException {
+
+    // If we hit an exception while appending to the
+    // stored fields or term vectors files, we have to
+    // abort all documents since we last flushed because
+    // it means those files are possibly inconsistent.
+    try {
+
+      docWriter.numDocsInStore++;
+
+      // Append stored fields to the real FieldsWriter:
+      docWriter.fieldsWriter.flushDocument(numStoredFields, fdtLocal);
+      fdtLocal.reset();
+
+      // Append term vectors to the real outputs:
+      final IndexOutput tvx = docWriter.tvx;
+      final IndexOutput tvd = docWriter.tvd;
+      final IndexOutput tvf = docWriter.tvf;
+      if (tvx != null) {
+        tvx.writeLong(tvd.getFilePointer());
+        tvx.writeLong(tvf.getFilePointer());
+        tvd.writeVInt(numVectorFields);
+        if (numVectorFields > 0) {
+          for(int i=0;i<numVectorFields;i++)
+            tvd.writeVInt(vectorFieldNumbers[i]);
+          assert 0 == vectorFieldPointers[0];
+          long lastPos = vectorFieldPointers[0];
+          for(int i=1;i<numVectorFields;i++) {
+            long pos = vectorFieldPointers[i];
+            tvd.writeVLong(pos-lastPos);
+            lastPos = pos;
+          }
+          tvfLocal.writeTo(tvf);
+          tvfLocal.reset();
+        }
+      }
+
+      // Append norms for the fields we saw:
+      for(int i=0;i<numFieldData;i++) {
+        DocumentsWriterFieldData fp = fieldDataArray[i];
+        if (fp.doNorms) {
+          BufferedNorms bn = docWriter.norms[fp.fieldInfo.number];
+          assert bn != null;
+          assert bn.upto <= docID;
+          bn.fill(docID);
+          float norm = fp.boost * docWriter.writer.getSimilarity().lengthNorm(fp.fieldInfo.name, fp.length);
+          bn.add(norm);
+        }
+      }
+    } catch (Throwable t) {
+      // Forcefully idle this threadstate -- its state will
+      // be reset by abort()
+      isIdle = true;
+      throw new AbortException(t, docWriter);
+    }
+
+    if (docWriter.bufferIsFull && !docWriter.flushPending) {
+      docWriter.flushPending = true;
+      doFlushAfter = true;
+    }
+  }
+
+  int fieldGen;
+
+  /** Initializes shared state for this new document */
+  void init(Document doc, int docID) throws IOException, AbortException {
+
+    assert !isIdle;
+    assert docWriter.writer.testPoint("DocumentsWriter.ThreadState.init start");
+
+    this.docID = docID;
+    docBoost = doc.getBoost();
+    numStoredFields = 0;
+    numFieldData = 0;
+    numVectorFields = 0;
+    maxTermPrefix = null;
+
+    assert 0 == fdtLocal.length();
+    assert 0 == fdtLocal.getFilePointer();
+    assert 0 == tvfLocal.length();
+    assert 0 == tvfLocal.getFilePointer();
+    final int thisFieldGen = fieldGen++;
+
+    List docFields = doc.getFields();
+    final int numDocFields = docFields.size();
+    boolean docHasVectors = false;
+
+    // Absorb any new fields first seen in this document.
+    // Also absorb any changes to fields we had already
+    // seen before (eg suddenly turning on norms or
+    // vectors, etc.):
+
+    for(int i=0;i<numDocFields;i++) {
+      Fieldable field = (Fieldable) docFields.get(i);
+
+      FieldInfo fi = docWriter.fieldInfos.add(field.name(), field.isIndexed(), field.isTermVectorStored(),
+                                              field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
+                                              field.getOmitNorms(), false);
+      if (fi.isIndexed && !fi.omitNorms) {
+        // Maybe grow our buffered norms
+        if (docWriter.norms.length <= fi.number) {
+          int newSize = (int) ((1+fi.number)*1.25);
+          BufferedNorms[] newNorms = new BufferedNorms[newSize];
+          System.arraycopy(docWriter.norms, 0, newNorms, 0, docWriter.norms.length);
+          docWriter.norms = newNorms;
+        }
+          
+        if (docWriter.norms[fi.number] == null)
+          docWriter.norms[fi.number] = new BufferedNorms();
+
+        docWriter.hasNorms = true;
+      }
+
+      // Make sure we have a FieldData allocated
+      int hashPos = fi.name.hashCode() & fieldDataHashMask;
+      DocumentsWriterFieldData fp = fieldDataHash[hashPos];
+      while(fp != null && !fp.fieldInfo.name.equals(fi.name))
+        fp = fp.next;
+
+      if (fp == null) {
+
+        fp = new DocumentsWriterFieldData(this, fi);
+        fp.next = fieldDataHash[hashPos];
+        fieldDataHash[hashPos] = fp;
+
+        if (numAllFieldData == allFieldDataArray.length) {
+          int newSize = (int) (allFieldDataArray.length*1.5);
+          int newHashSize = fieldDataHash.length*2;
+
+          DocumentsWriterFieldData newArray[] = new DocumentsWriterFieldData[newSize];
+          DocumentsWriterFieldData newHashArray[] = new DocumentsWriterFieldData[newHashSize];
+          System.arraycopy(allFieldDataArray, 0, newArray, 0, numAllFieldData);
+
+          // Rehash (the mask must match the doubled hash size)
+          fieldDataHashMask = newHashSize-1;
+          for(int j=0;j<fieldDataHash.length;j++) {
+            DocumentsWriterFieldData fp0 = fieldDataHash[j];
+            while(fp0 != null) {
+              hashPos = fp0.fieldInfo.name.hashCode() & fieldDataHashMask;
+              DocumentsWriterFieldData nextFP0 = fp0.next;
+              fp0.next = newHashArray[hashPos];
+              newHashArray[hashPos] = fp0;
+              fp0 = nextFP0;
+            }
+          }
+
+          allFieldDataArray = newArray;
+          fieldDataHash = newHashArray;
+        }
+        allFieldDataArray[numAllFieldData++] = fp;
+      } else {
+        assert fp.fieldInfo == fi;
+      }
+
+      if (thisFieldGen != fp.lastGen) {
+
+        // First time we're seeing this field for this doc
+        fp.lastGen = thisFieldGen;
+        fp.fieldCount = 0;
+        fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false;
+        fp.doNorms = fi.isIndexed && !fi.omitNorms;
+
+        if (numFieldData == fieldDataArray.length) {
+          int newSize = fieldDataArray.length*2;
+          DocumentsWriterFieldData newArray[] = new DocumentsWriterFieldData[newSize];
+          System.arraycopy(fieldDataArray, 0, newArray, 0, numFieldData);
+          fieldDataArray = newArray;
+
+        }
+        fieldDataArray[numFieldData++] = fp;
+      }
+
+      if (field.isTermVectorStored()) {
+        if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.length) {
+          final int newSize = (int) (numVectorFields*1.5);
+          vectorFieldPointers = new long[newSize];
+          vectorFieldNumbers = new int[newSize];
+        }
+        fp.doVectors = true;
+        docHasVectors = true;
+
+        fp.doVectorPositions |= field.isStorePositionWithTermVector();
+        fp.doVectorOffsets |= field.isStoreOffsetWithTermVector();
+      }
+
+      if (fp.fieldCount == fp.docFields.length) {
+        Fieldable[] newArray = new Fieldable[fp.docFields.length*2];
+        System.arraycopy(fp.docFields, 0, newArray, 0, fp.docFields.length);
+        fp.docFields = newArray;
+      }
+
+      // Lazily allocate arrays for postings:
+      if (field.isIndexed() && fp.postingsHash == null)
+        fp.initPostingArrays();
+
+      fp.docFields[fp.fieldCount++] = field;
+    }
+
+    // Maybe init the local & global fieldsWriter
+    if (localFieldsWriter == null) {
+      if (docWriter.fieldsWriter == null) {
+        assert docWriter.docStoreSegment == null;
+        assert docWriter.segment != null;
+        docWriter.docStoreSegment = docWriter.segment;
+        // If we hit an exception while init'ing the
+        // fieldsWriter, we must abort this segment
+        // because those files will be in an unknown
+        // state:
+        try {
+          docWriter.fieldsWriter = new FieldsWriter(docWriter.directory, docWriter.docStoreSegment, docWriter.fieldInfos);
+        } catch (Throwable t) {
+          throw new AbortException(t, docWriter);
+        }
+        docWriter.files = null;
+      }
+      localFieldsWriter = new FieldsWriter(null, fdtLocal, docWriter.fieldInfos);
+    }
+
+    // First time we see a doc that has field(s) with
+    // stored vectors, we init our tvx writer
+    if (docHasVectors) {
+      if (docWriter.tvx == null) {
+        assert docWriter.docStoreSegment != null;
+        // If we hit an exception while init'ing the term
+        // vector output files, we must abort this segment
+        // because those files will be in an unknown
+        // state:
+        try {
+          docWriter.tvx = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+          docWriter.tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
+          docWriter.tvd = docWriter.directory.createOutput(docWriter.docStoreSegment +  "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+          docWriter.tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
+          docWriter.tvf = docWriter.directory.createOutput(docWriter.docStoreSegment +  "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+          docWriter.tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
+
+          // We must "catch up" for all docs before us
+          // that had no vectors:
+          for(int i=0;i<docWriter.numDocsInStore;i++) {
+            docWriter.tvx.writeLong(docWriter.tvd.getFilePointer());
+            docWriter.tvd.writeVInt(0);
+            docWriter.tvx.writeLong(0);
+          }
+        } catch (Throwable t) {
+          throw new AbortException(t, docWriter);
+        }
+        docWriter.files = null;
+      }
+      numVectorFields = 0;
+    }
+  }
+
+  /** Do in-place sort of Posting array */
+  void doPostingSort(Posting[] postings, int numPosting) {
+    quickSort(postings, 0, numPosting-1);
+  }
+
+  void quickSort(Posting[] postings, int lo, int hi) {
+    if (lo >= hi)
+      return;
+    else if (hi == 1+lo) {
+      if (comparePostings(postings[lo], postings[hi]) > 0) {
+        final Posting tmp = postings[lo];
+        postings[lo] = postings[hi];
+        postings[hi] = tmp;
+      }
+      return;
+    }
+
+    int mid = (lo + hi) >>> 1;
+
+    if (comparePostings(postings[lo], postings[mid]) > 0) {
+      Posting tmp = postings[lo];
+      postings[lo] = postings[mid];
+      postings[mid] = tmp;
+    }
+
+    if (comparePostings(postings[mid], postings[hi]) > 0) {
+      Posting tmp = postings[mid];
+      postings[mid] = postings[hi];
+      postings[hi] = tmp;
+
+      if (comparePostings(postings[lo], postings[mid]) > 0) {
+        Posting tmp2 = postings[lo];
+        postings[lo] = postings[mid];
+        postings[mid] = tmp2;
+      }
+    }
+
+    int left = lo + 1;
+    int right = hi - 1;
+
+    if (left >= right)
+      return;
+
+    Posting partition = postings[mid];
+
+    for (; ;) {
+      while (comparePostings(postings[right], partition) > 0)
+        --right;
+
+      while (left < right && comparePostings(postings[left], partition) <= 0)
+        ++left;
+
+      if (left < right) {
+        Posting tmp = postings[left];
+        postings[left] = postings[right];
+        postings[right] = tmp;
+        --right;
+      } else {
+        break;
+      }
+    }
+
+    quickSort(postings, lo, left);
+    quickSort(postings, left + 1, hi);
+  }
+
+  /** Do in-place sort of PostingVector array */
+  void doVectorSort(PostingVector[] postings, int numPosting) {
+    quickSort(postings, 0, numPosting-1);
+  }
+
+  void quickSort(PostingVector[] postings, int lo, int hi) {
+    if (lo >= hi)
+      return;
+    else if (hi == 1+lo) {
+      if (comparePostings(postings[lo].p, postings[hi].p) > 0) {
+        final PostingVector tmp = postings[lo];
+        postings[lo] = postings[hi];
+        postings[hi] = tmp;
+      }
+      return;
+    }
+
+    int mid = (lo + hi) >>> 1;
+
+    if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
+      PostingVector tmp = postings[lo];
+      postings[lo] = postings[mid];
+      postings[mid] = tmp;
+    }
+
+    if (comparePostings(postings[mid].p, postings[hi].p) > 0) {
+      PostingVector tmp = postings[mid];
+      postings[mid] = postings[hi];
+      postings[hi] = tmp;
+
+      if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
+        PostingVector tmp2 = postings[lo];
+        postings[lo] = postings[mid];
+        postings[mid] = tmp2;
+      }
+    }
+
+    int left = lo + 1;
+    int right = hi - 1;
+
+    if (left >= right)
+      return;
+
+    PostingVector partition = postings[mid];
+
+    for (; ;) {
+      while (comparePostings(postings[right].p, partition.p) > 0)
+        --right;
+
+      while (left < right && comparePostings(postings[left].p, partition.p) <= 0)
+        ++left;
+
+      if (left < right) {
+        PostingVector tmp = postings[left];
+        postings[left] = postings[right];
+        postings[right] = tmp;
+        --right;
+      } else {
+        break;
+      }
+    }
+
+    quickSort(postings, lo, left);
+    quickSort(postings, left + 1, hi);
+  }
+
+  void quickSort(DocumentsWriterFieldData[] array, int lo, int hi) {
+    if (lo >= hi)
+      return;
+    else if (hi == 1+lo) {
+      if (array[lo].compareTo(array[hi]) > 0) {
+        final DocumentsWriterFieldData tmp = array[lo];
+        array[lo] = array[hi];
+        array[hi] = tmp;
+      }
+      return;
+    }
+
+    int mid = (lo + hi) >>> 1;
+
+    if (array[lo].compareTo(array[mid]) > 0) {
+      DocumentsWriterFieldData tmp = array[lo];
+      array[lo] = array[mid];
+      array[mid] = tmp;
+    }
+
+    if (array[mid].compareTo(array[hi]) > 0) {
+      DocumentsWriterFieldData tmp = array[mid];
+      array[mid] = array[hi];
+      array[hi] = tmp;
+
+      if (array[lo].compareTo(array[mid]) > 0) {
+        DocumentsWriterFieldData tmp2 = array[lo];
+        array[lo] = array[mid];
+        array[mid] = tmp2;
+      }
+    }
+
+    int left = lo + 1;
+    int right = hi - 1;
+
+    if (left >= right)
+      return;
+
+    DocumentsWriterFieldData partition = array[mid];
+
+    for (; ;) {
+      while (array[right].compareTo(partition) > 0)
+        --right;
+
+      while (left < right && array[left].compareTo(partition) <= 0)
+        ++left;
+
+      if (left < right) {
+        DocumentsWriterFieldData tmp = array[left];
+        array[left] = array[right];
+        array[right] = tmp;
+        --right;
+      } else {
+        break;
+      }
+    }
+
+    quickSort(array, lo, left);
+    quickSort(array, left + 1, hi);
+  }
+
+  /** If there are fields we've seen but did not see again
+   *  in the last run, then free them up.  Also reduce
+   *  postings hash size. */
+  void trimFields() {
+
+    int upto = 0;
+    for(int i=0;i<numAllFieldData;i++) {
+      DocumentsWriterFieldData fp = allFieldDataArray[i];
+      if (fp.lastGen == -1) {
+        // This field was not seen since the previous
+        // flush, so, free up its resources now
+
+        // Unhash
+        final int hashPos = fp.fieldInfo.name.hashCode() & fieldDataHashMask;
+        DocumentsWriterFieldData last = null;
+        DocumentsWriterFieldData fp0 = fieldDataHash[hashPos];
+        while(fp0 != fp) {
+          last = fp0;
+          fp0 = fp0.next;
+        }
+        assert fp0 != null;
+
+        if (last == null)
+          fieldDataHash[hashPos] = fp.next;
+        else
+          last.next = fp.next;
+
+        if (docWriter.infoStream != null)
+          docWriter.infoStream.println("  remove field=" + fp.fieldInfo.name);
+
+      } else {
+        // Reset
+        fp.lastGen = -1;
+        allFieldDataArray[upto++] = fp;
+          
+        if (fp.numPostings > 0 && ((float) fp.numPostings) / fp.postingsHashSize < 0.2) {
+          int hashSize = fp.postingsHashSize;
+
+          // Reduce hash so it's between 25-50% full
+          while (fp.numPostings < (hashSize>>1) && hashSize >= 2)
+            hashSize >>= 1;
+          hashSize <<= 1;
+
+          if (hashSize != fp.postingsHash.length)
+            fp.rehashPostings(hashSize);
+        }
+      }
+    }
+
+    // If we didn't see any norms for this field since
+    // last flush, free it
+    for(int i=0;i<docWriter.norms.length;i++) {
+      BufferedNorms n = docWriter.norms[i];
+      if (n != null && n.upto == 0)
+        docWriter.norms[i] = null;
+    }
+
+    numAllFieldData = upto;
+
+    // Also pare back PostingsVectors if it's excessively
+    // large
+    if (maxPostingsVectors * 1.5 < postingsVectors.length) {
+      final int newSize;
+      if (0 == maxPostingsVectors)
+        newSize = 1;
+      else
+        newSize = (int) (1.5*maxPostingsVectors);
+      PostingVector[] newArray = new PostingVector[newSize];
+      System.arraycopy(postingsVectors, 0, newArray, 0, newSize);
+      postingsVectors = newArray;
+    }
+  }
+
+  /** Tokenizes the fields of a document into Postings */
+  void processDocument(Analyzer analyzer)
+    throws IOException, AbortException {
+
+    final int numFields = numFieldData;
+    assert clearLastVectorFieldName();
+
+    assert 0 == fdtLocal.length();
+
+    if (docWriter.tvx != null)
+      // If we are writing vectors then we must visit
+      // fields in sorted order so they are written in
+      // sorted order.  TODO: we actually only need to
+      // sort the subset of fields that have vectors
+      // enabled; we could save [small amount of] CPU
+      // here.
+      quickSort(fieldDataArray, 0, numFields-1);
+
+    // We process the document one field at a time
+    for(int i=0;i<numFields;i++)
+      fieldDataArray[i].processField(analyzer);
+
+    if (docWriter.infoStream != null && maxTermPrefix != null)
+      docWriter.infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + maxTermPrefix + "...'"); 
+  }
+
+  // USE ONLY FOR DEBUGGING!
+  /*
+  public String getPostingText() {
+    char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
+    int upto = p.textStart & CHAR_BLOCK_MASK;
+    while(text[upto] != 0xffff)
+      upto++;
+    return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK));
+  }
+  */
+
+  /** Compares term text for two Posting instances and
+   *  returns -1 if p1 < p2; 1 if p1 > p2; else 0.
+   */
+  int comparePostings(Posting p1, Posting p2) {
+    if (p1 == p2)
+      return 0;
+    final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    while(true) {
+      final char c1 = text1[pos1++];
+      final char c2 = text2[pos2++];
+      if (c1 < c2)
+        if (0xffff == c2)
+          return 1;
+        else
+          return -1;
+      else if (c2 < c1)
+        if (0xffff == c1)
+          return -1;
+        else
+          return 1;
+      else if (0xffff == c1)
+        return 0;
+    }
+  }
+
+  String lastVectorFieldName;
+
+  // Called only by assert
+  final boolean clearLastVectorFieldName() {
+    lastVectorFieldName = null;
+    return true;
+  }
+
+  // Called only by assert
+  final boolean vectorFieldsInOrder(FieldInfo fi) {
+    try {
+      if (lastVectorFieldName != null)
+        return lastVectorFieldName.compareTo(fi.name) < 0;
+      else
+        return true;
+    } finally {
+      lastVectorFieldName = fi.name;
+    }
+  }
+
+  PostingVector[] postingsVectors = new PostingVector[1];
+  int maxPostingsVectors;
+
+  // Used to read a string value for a field
+  ReusableStringReader stringReader = new ReusableStringReader();
+}
+

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
------------------------------------------------------------------------------
    svn:eol-style = native
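
One detail worth calling out from comparePostings() above: term text in the char pool is terminated by the sentinel 0xFFFF rather than carrying a length, and the comparison treats that sentinel as "end of term" so a shorter term sorts before any extension of it. A compact sketch of the same sentinel-terminated compare over plain char arrays (hypothetical helper, same ordering rules):

    final class TermCompareSketch {
      static final char END = 0xFFFF; // sentinel terminating each term

      /** -1 if a < b, 1 if a > b, 0 if equal; both arrays are END-terminated. */
      static int compare(char[] a, int posA, char[] b, int posB) {
        while (true) {
          final char c1 = a[posA++];
          final char c2 = b[posB++];
          if (c1 != c2) {
            // A sentinel on either side means that term ended first and
            // therefore sorts lower, even though 0xFFFF is the max char:
            if (c2 == END) return 1;
            if (c1 == END) return -1;
            return c1 < c2 ? -1 : 1;
          } else if (c1 == END)
            return 0;
        }
      }

      public static void main(String[] args) {
        char[] foo  = {'f', 'o', 'o', END};
        char[] food = {'f', 'o', 'o', 'd', END};
        System.out.println(compare(foo, 0, food, 0)); // -1: "foo" < "food"
      }
    }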

Added: lucene/java/trunk/src/java/org/apache/lucene/index/Posting.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/Posting.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/Posting.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/Posting.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,34 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Used by DocumentsWriter to track postings for a single
+ * term.  One of these exists per unique term seen since the
+ * last flush. */
+final class Posting {
+  int textStart;                                  // Address into char[] blocks where our text is stored
+  int docFreq;                                    // # times this term occurs in the current doc
+  int freqStart;                                  // Address of first byte[] slice for freq
+  int freqUpto;                                   // Next write address for freq
+  int proxStart;                                  // Address of first byte[] slice
+  int proxUpto;                                   // Next write address for prox
+  int lastDocID;                                  // Last docID where this term occurred
+  int lastDocCode;                                // Code for prior doc
+  int lastPosition;                               // Last position where this term occurred
+  PostingVector vector;                           // Corresponding PostingVector instance
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/Posting.java
------------------------------------------------------------------------------
    svn:eol-style = native
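
The textStart, freqStart and proxStart fields above are single-int addresses into shared block pools: the high bits select a block and the low bits an offset inside it, via the SHIFT/MASK constants defined in DocumentsWriter. A tiny sketch of that addressing scheme for the char pool, with an arbitrary block size standing in for the real constants (illustrative only):

    final class BlockAddressSketch {
      // Hypothetical pool parameters; DocumentsWriter defines the real ones
      static final int CHAR_BLOCK_SHIFT = 14;               // 16K chars per block
      static final int CHAR_BLOCK_SIZE  = 1 << CHAR_BLOCK_SHIFT;
      static final int CHAR_BLOCK_MASK  = CHAR_BLOCK_SIZE - 1;

      final char[][] buffers = new char[4][CHAR_BLOCK_SIZE];

      /** Read the sentinel-terminated term stored at textStart. */
      String termAt(int textStart) {
        final char[] block = buffers[textStart >> CHAR_BLOCK_SHIFT]; // which block
        int pos = textStart & CHAR_BLOCK_MASK;                       // offset in it
        final int start = pos;
        while (block[pos] != 0xFFFF)                                 // scan to sentinel
          pos++;
        return new String(block, start, pos - start);
      }
    }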

Added: lucene/java/trunk/src/java/org/apache/lucene/index/PostingVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/PostingVector.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/PostingVector.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/PostingVector.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,30 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Used by DocumentsWriter to track data for term vectors.
+ * One of these exists per unique term seen in each field in
+ * the document. */
+class PostingVector {
+  Posting p;                                      // Corresponding Posting instance for this term
+  int lastOffset;                                 // Last offset we saw
+  int offsetStart;                                // Address of first slice for offsets
+  int offsetUpto;                                 // Next write address for offsets
+  int posStart;                                   // Address of first slice for positions
+  int posUpto;                                    // Next write address for positions
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/PostingVector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/src/java/org/apache/lucene/index/ReusableStringReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/ReusableStringReader.java?rev=636458&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/ReusableStringReader.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/ReusableStringReader.java Wed Mar 12 12:09:12 2008
@@ -0,0 +1,55 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+/** Used by DocumentsWriter to implement a StringReader
+ *  that can be reset to a new string; we use this when
+ *  tokenizing the string value from a Field. */
+final class ReusableStringReader extends Reader {
+  int upto;
+  int left;
+  String s;
+  void init(String s) {
+    this.s = s;
+    left = s.length();
+    this.upto = 0;
+  }
+  public int read(char[] c) {
+    return read(c, 0, c.length);
+  }
+  public int read(char[] c, int off, int len) {
+    if (left > len) {
+      s.getChars(upto, upto+len, c, off);
+      upto += len;
+      left -= len;
+      return len;
+    } else if (0 == left) {
+      return -1;
+    } else {
+      s.getChars(upto, upto+left, c, off);
+      int r = left;
+      left = 0;
+      upto = s.length();
+      return r;
+    }
+  }
+  public void close() {}
+}
+

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/ReusableStringReader.java
------------------------------------------------------------------------------
    svn:eol-style = native
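
A quick usage sketch of the reader above: a single instance lives in each DocumentsWriterThreadState and is re-pointed at each field's string value via init(), avoiding a new StringReader allocation per field. Hypothetical driver code, assuming it runs in the same package since the class is package-private:

    ReusableStringReader reader = new ReusableStringReader();
    String[] values = {"hello world", "second field"};
    char[] buf = new char[4];

    // Reuse the same reader for several field values:
    for (int i = 0; i < values.length; i++) {
      reader.init(values[i]);                // re-point; no new allocation
      StringBuffer sb = new StringBuffer();
      int n;
      while ((n = reader.read(buf, 0, buf.length)) != -1)
        sb.append(buf, 0, n);
      System.out.println(sb);
    }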


