lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r882252 - in /lucene/java/branches/flex_1458/src: java/org/apache/lucene/index/ java/org/apache/lucene/index/codecs/ java/org/apache/lucene/index/codecs/intblock/ java/org/apache/lucene/index/codecs/sep/ java/org/apache/lucene/index/codecs/...
Date Thu, 19 Nov 2009 19:08:49 GMT
Author: mikemccand
Date: Thu Nov 19 19:08:47 2009
New Revision: 882252

URL: http://svn.apache.org/viewvc?rev=882252&view=rev
Log:
LUCENE-1458 (on flex branch): switch terms data in DocumentsWriter's RAM buffer to use UTF8 byte[] instead of char[]

Removed:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CharBlockPool.java
Modified:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerThread.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestCodecs.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/CheckIndex.java Thu Nov 19 19:08:47 2009
@@ -361,7 +361,7 @@
       else if (format == SegmentInfos.FORMAT_DIAGNOSTICS)
         sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
       else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS)
-        sFormat = "FORMAT_FLEX_POSTINGS [Lucene 2.9]";
+        sFormat = "FORMAT_FLEX_POSTINGS [Lucene 3.1]";
       else if (format < SegmentInfos.CURRENT_FORMAT) {
         sFormat = "int=" + format + " [newer version of Lucene than this tool]";
         skip = true;
@@ -610,7 +610,6 @@
           status.termCount++;
 
           int lastDoc = -1;
-          int freq0 = 0;
           while(true) {
             final int doc = docs.next();
             if (doc == DocsEnum.NO_MORE_DOCS) {
@@ -619,7 +618,6 @@
             final int freq = docs.freq();
             status.totPos += freq;
 
-            freq0++;
             if (doc <= lastDoc) {
               throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
             }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java Thu Nov 19 19:08:47 2009
@@ -246,7 +246,7 @@
       fields[i].consumer.processFields(fields[i].fields, fields[i].fieldCount);
 
     if (docState.maxTermPrefix != null && docState.infoStream != null)
-      docState.infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"); 
+      docState.infoStream.println("WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"); 
 
     final DocumentsWriter.DocWriter one = fieldsWriter.finishDocument();
     final DocumentsWriter.DocWriter two = consumer.finishDocument();

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/DocumentsWriter.java Thu Nov 19 19:08:47 2009
@@ -1181,6 +1181,8 @@
   final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
   final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
 
+  final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-1;
+
   private class ByteBlockAllocator extends ByteBlockPool.Allocator {
 
     ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
@@ -1266,55 +1268,19 @@
 
   ByteBlockAllocator byteBlockAllocator = new ByteBlockAllocator();
 
-  /* Initial chunk size of the shared char[] blocks used to
-     store term text */
-  final static int CHAR_BLOCK_SHIFT = 14;
-  final static int CHAR_BLOCK_SIZE = 1 << CHAR_BLOCK_SHIFT;
-  final static int CHAR_BLOCK_MASK = CHAR_BLOCK_SIZE - 1;
-
-  final static int MAX_TERM_LENGTH = CHAR_BLOCK_SIZE-1;
-
-  private ArrayList<char[]> freeCharBlocks = new ArrayList<char[]>();
-
-  /* Allocate another char[] from the shared pool */
-  synchronized char[] getCharBlock() {
-    final int size = freeCharBlocks.size();
-    final char[] c;
-    if (0 == size) {
-      numBytesAlloc += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
-      c = new char[CHAR_BLOCK_SIZE];
-    } else
-      c = freeCharBlocks.remove(size-1);
-    // We always track allocations of char blocks, for now,
-    // because nothing that skips allocation tracking
-    // (currently only term vectors) uses its own char
-    // blocks.
-    numBytesUsed += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
-    assert numBytesUsed <= numBytesAlloc;
-    return c;
-  }
-
-  /* Return char[]s to the pool */
-  synchronized void recycleCharBlocks(char[][] blocks, int numBlocks) {
-    for(int i=0;i<numBlocks;i++)
-      freeCharBlocks.add(blocks[i]);
-  }
-
   String toMB(long v) {
     return nf.format(v/1024./1024.);
   }
 
-  /* We have three pools of RAM: Postings, byte blocks
-   * (holds freq/prox posting data) and char blocks (holds
-   * characters in the term).  Different docs require
-   * varying amount of storage from these three classes.
-   * For example, docs with many unique single-occurrence
-   * short terms will use up the Postings RAM and hardly any
-   * of the other two.  Whereas docs with very large terms
-   * will use alot of char blocks RAM and relatively less of
-   * the other two.  This method just frees allocations from
-   * the pools once we are over-budget, which balances the
-   * pools to match the current docs. */
+  /* We have two pools of RAM: Postings and byte blocks
+   * (holds freq/prox posting data).  Different docs require
+   * varying amount of storage from these classes.  For
+   * example, docs with many unique single-occurrence short
+   * terms will use up the Postings RAM and hardly any of
+   * the other two.  Whereas docs with very large terms will
+   * use alot of byte blocks RAM.  This method just frees
+   * allocations from the pools once we are over-budget,
+   * which balances the pools to match the current docs. */
   void balanceRAM() {
 
     // We flush when we've used our target usage
@@ -1330,8 +1296,7 @@
                 " allocMB=" + toMB(numBytesAlloc) +
                 " deletesMB=" + toMB(deletesRAMUsed) +
                 " vs trigger=" + toMB(freeTrigger) +
-                " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
-                " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
+                " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE));
 
       final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
 
@@ -1346,7 +1311,7 @@
       while(numBytesAlloc+deletesRAMUsed > freeLevel) {
       
         synchronized(this) {
-          if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) {
+          if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeIntBlocks.size() && !any) {
             // Nothing else to free -- must flush now.
             bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
             if (infoStream != null) {
@@ -1359,23 +1324,18 @@
             break;
           }
 
-          if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) {
+          if ((0 == iter % 3) && byteBlockAllocator.freeByteBlocks.size() > 0) {
             byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1);
             numBytesAlloc -= BYTE_BLOCK_SIZE;
           }
 
-          if ((1 == iter % 4) && freeCharBlocks.size() > 0) {
-            freeCharBlocks.remove(freeCharBlocks.size()-1);
-            numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
-          }
-
-          if ((2 == iter % 4) && freeIntBlocks.size() > 0) {
+          if ((1 == iter % 3) && freeIntBlocks.size() > 0) {
             freeIntBlocks.remove(freeIntBlocks.size()-1);
             numBytesAlloc -= INT_BLOCK_SIZE * INT_NUM_BYTE;
           }
         }
 
-        if ((3 == iter % 4) && any)
+        if ((2 == iter % 3) && any)
           // Ask consumer to free any recycled state
           any = consumer.freeRAM();
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Thu Nov 19 19:08:47 2009
@@ -29,12 +29,11 @@
 
   final FreqProxTermsWriterPerField field;
   final int numPostings;
-  final CharBlockPool charPool;
+  private final ByteBlockPool bytePool;
   final RawPostingList[] postings;
 
   private FreqProxTermsWriter.PostingList p;
-  char[] text;
-  int textOffset;
+  final TermRef text = new TermRef();
 
   private int postingUpto = -1;
 
@@ -46,25 +45,33 @@
 
   public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) {
     this.field = field;
-    this.charPool = field.perThread.termsHashPerThread.charPool;
     this.numPostings = field.termsHashPerField.numPostings;
     this.postings = field.termsHashPerField.sortPostings();
+    this.bytePool = field.perThread.termsHashPerThread.bytePool;
   }
 
   boolean nextTerm() throws IOException {
     postingUpto++;
-    if (postingUpto == numPostings)
+    if (postingUpto == numPostings) {
       return false;
+    }
 
     p = (FreqProxTermsWriter.PostingList) postings[postingUpto];
     docID = 0;
 
-    text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    text.bytes = bytePool.buffers[p.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    text.offset = p.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+    // nocommit -- how to avoid this added cost?
+    int pos = text.offset;
+    while(text.bytes[pos] != TermsHashPerField.END_OF_TERM) {
+      pos++;
+    }
+    text.length = pos - text.offset;
 
     field.termsHashPerField.initReader(freq, p, 0);
-    if (!field.fieldInfo.omitTermFreqAndPositions)
+    if (!field.fieldInfo.omitTermFreqAndPositions) {
       field.termsHashPerField.initReader(prox, p, 1);
+    }
 
     // Should always be true
     boolean result = nextDoc();

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Thu Nov 19 19:08:47 2009
@@ -45,19 +45,30 @@
       postings[i] = new PostingList();
   }
 
-  private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
+  private static int compareText(final TermRef text1, final TermRef text2) {
+
+    int pos1 = text1.offset;
+    int pos2 = text2.offset;
+    final byte[] bytes1 = text1.bytes;
+    final byte[] bytes2 = text2.bytes;
     while(true) {
-      final char c1 = text1[pos1++];
-      final char c2 = text2[pos2++];
-      if (c1 != c2) {
-        if (0xffff == c2)
+      final byte b1 = bytes1[pos1++];
+      final byte b2 = bytes2[pos2++];
+      if (b1 != b2) {
+        if (TermsHashPerField.END_OF_TERM == b2) {
+          //text2.length = pos2 - text2.offset;
           return 1;
-        else if (0xffff == c1)
+        } else if (TermsHashPerField.END_OF_TERM == b1) {
+          //text1.length = pos1 - text1.offset;
           return -1;
-        else
-          return c1-c2;
-      } else if (0xffff == c1)
+        } else {
+          return (b1&0xff)-(b2&0xff);
+        }
+      } else if (TermsHashPerField.END_OF_TERM == b1) {
+        //text1.length = pos1 - text1.offset;
+        //text2.length = pos2 - text2.offset;
         return 0;
+      }
     }
   }
 
@@ -169,6 +180,8 @@
 
     int numFields = fields.length;
 
+    final TermRef text = new TermRef();
+
     final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];
 
     for(int i=0;i<numFields;i++) {
@@ -187,6 +200,11 @@
 
     final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
 
+    // TODO: really TermsHashPerField should take over most
+    // of this loop, including merge sort of terms from
+    // multiple threads and interacting with the
+    // TermsConsumer, only calling out to us (passing us the
+    // DocsConsumer) to handle delivery of docs/positions
     while(numFields > 0) {
 
       // Get the next term to merge
@@ -194,24 +212,20 @@
       int numToMerge = 1;
 
       for(int i=1;i<numFields;i++) {
-        final char[] text = mergeStates[i].text;
-        final int textOffset = mergeStates[i].textOffset;
-        final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
-
+        final int cmp = compareText(mergeStates[i].text, termStates[0].text);
         if (cmp < 0) {
           termStates[0] = mergeStates[i];
           numToMerge = 1;
-        } else if (cmp == 0)
+        } else if (cmp == 0) {
           termStates[numToMerge++] = mergeStates[i];
+        }
       }
 
-      final char[] termText = termStates[0].text;
-      final int termTextOffset = termStates[0].textOffset;
-
-      // nocommit
-      //System.out.println("FLUSH term=" + new String(termText, termTextOffset, 10));
+      text.bytes = termStates[0].text.bytes;
+      text.offset = termStates[0].text.offset;
+      text.length = termStates[0].text.length;  
 
-      final DocsConsumer docConsumer = termsConsumer.startTerm(termText, termTextOffset);
+      final DocsConsumer docConsumer = termsConsumer.startTerm(text);
 
       // Now termStates has numToMerge FieldMergeStates
       // which all share the same term.  Now we must
@@ -220,16 +234,17 @@
       while(numToMerge > 0) {
         
         FreqProxFieldMergeState minState = termStates[0];
-        for(int i=1;i<numToMerge;i++)
-          if (termStates[i].docID < minState.docID)
+        for(int i=1;i<numToMerge;i++) {
+          if (termStates[i].docID < minState.docID) {
             minState = termStates[i];
+          }
+        }
 
         final int termDocFreq = minState.termFreq;
         numDocs++;
 
         assert minState.docID < flushedDocCount: "doc=" + minState.docID + " maxDoc=" + flushedDocCount;
 
-        //System.out.println("  docID=" + minState.docID);
         final PositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
 
         final ByteSliceReader prox = minState.prox;
@@ -269,9 +284,13 @@
 
           // Remove from termStates
           int upto = 0;
-          for(int i=0;i<numToMerge;i++)
-            if (termStates[i] != minState)
+          // TODO: inefficient O(N) where N = number of
+          // threads that had seen this term:
+          for(int i=0;i<numToMerge;i++) {
+            if (termStates[i] != minState) {
               termStates[upto++] = termStates[i];
+            }
+          }
           numToMerge--;
           assert upto == numToMerge;
 
@@ -290,7 +309,7 @@
         }
       }
 
-      termsConsumer.finishTerm(termText, termTextOffset, numDocs);
+      termsConsumer.finishTerm(text, numDocs);
     }
 
     termsConsumer.finish();

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/IndexWriter.java Thu Nov 19 19:08:47 2009
@@ -224,12 +224,13 @@
   public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
 
   /**
-   * Absolute hard maximum length for a term.  If a term
-   * arrives from the analyzer longer than this length, it
-   * is skipped and a message is printed to infoStream, if
-   * set (see {@link #setInfoStream}).
+   * Absolute hard maximum length for a term, in bytes once
+   * encoded as UTF8.  If a term arrives from the analyzer
+   * longer than this length, it is skipped and a message is
+   * printed to infoStream, if set (see {@link
+   * #setInfoStream}).
    */
-  public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH;
+  public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH_UTF8;
 
   // The normal read buffer size defaults to 1024, but
   // increasing this during merging seems to yield

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentMerger.java Thu Nov 19 19:08:47 2009
@@ -33,7 +33,6 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.codecs.DocsConsumer;
 import org.apache.lucene.index.codecs.PositionsConsumer;
 
@@ -708,8 +707,6 @@
     return delCounts;
   }
   
-  private final UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
-
   /** Process postings from multiple segments all positioned on the
    *  same term. Writes out merged entries into freqOutput and
    *  the proxOutput streams.
@@ -723,19 +720,9 @@
   private final int appendPostings(final TermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
         throws CorruptIndexException, IOException {
 
-    // nocommit -- maybe cutover TermsConsumer API to
-    // TermRef as well?
     final TermRef text = smis[0].term;
-    UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, termBuffer);
-
-    // Make space for terminator
-    final int length = termBuffer.length;
-    termBuffer.setLength(1+termBuffer.length);
-
-    // nocommit -- make this a static final constant somewhere:
-    termBuffer.result[length] = 0xffff;
 
-    final DocsConsumer docConsumer = termsConsumer.startTerm(termBuffer.result, 0);
+    final DocsConsumer docConsumer = termsConsumer.startTerm(text);
 
     int df = 0;
     for (int i = 0; i < n; i++) {
@@ -793,7 +780,7 @@
         }
       }
     }
-    termsConsumer.finishTerm(termBuffer.result, 0, df);
+    termsConsumer.finishTerm(text, df);
 
     return df;
   }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Thu Nov 19 19:08:47 2009
@@ -22,7 +22,6 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.UnicodeUtil;
 
 final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
 
@@ -135,46 +134,48 @@
       bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
     tvf.writeByte(bits);
 
-    int encoderUpto = 0;
-    int lastTermBytesCount = 0;
-
+    int lastLen = 0;
+    byte[] lastBytes = null;
+    int lastStart = 0;
+      
     final ByteSliceReader reader = perThread.vectorSliceReader;
-    final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
+    final byte[][] byteBuffers = perThread.termsHashPerThread.termBytePool.buffers;
+
     for(int j=0;j<numPostings;j++) {
       final TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
       final int freq = posting.freq;
           
-      final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-      final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+      final byte[] bytes = byteBuffers[posting.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+      final int start = posting.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
 
-      // We swap between two encoders to save copying
-      // last Term's byte array
-      final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
-
-      // TODO: we could do this incrementally
-      UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
-      final int termBytesCount = utf8Result.length;
+      // nocommit: we can do this as completion of
+      // prefix-finding loop, below:
+      int upto = start;
+      while(bytes[upto] != TermsHashPerField.END_OF_TERM) {
+        upto++;
+      }
+      final int len = upto - start;
 
-      // TODO: UTF16toUTF8 could tell us this prefix
-      // Compute common prefix between last term and
+      // Compute common byte prefix between last term and
       // this term
       int prefix = 0;
       if (j > 0) {
-        final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
-        final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
-        while(prefix < lastTermBytesCount && prefix < termBytesCount) {
-          if (lastTermBytes[prefix] != termBytes[prefix])
+        while(prefix < lastLen && prefix < len) {
+          if (lastBytes[lastStart+prefix] != bytes[start+prefix]) {
             break;
+          }
           prefix++;
         }
       }
-      encoderUpto = 1-encoderUpto;
-      lastTermBytesCount = termBytesCount;
 
-      final int suffix = termBytesCount - prefix;
+      lastLen = len;
+      lastBytes = bytes;
+      lastStart = start;
+
+      final int suffix = len - prefix;
       tvf.writeVInt(prefix);
       tvf.writeVInt(suffix);
-      tvf.writeBytes(utf8Result.result, prefix, suffix);
+      tvf.writeBytes(bytes, lastStart+prefix, suffix);
       tvf.writeVInt(freq);
 
       if (doVectorPositions) {
@@ -208,9 +209,7 @@
 
   @Override
   void newTerm(RawPostingList p0) {
-
     assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
-
     TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
 
     p.freq = 1;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java Thu Nov 19 19:08:47 2009
@@ -17,8 +17,6 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.util.UnicodeUtil;
-
 final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread {
 
   final TermVectorsTermsWriter termsWriter;
@@ -36,9 +34,6 @@
   // Used by perField when serializing the term vectors
   final ByteSliceReader vectorSliceReader = new ByteSliceReader();
 
-  final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
-                                                new UnicodeUtil.UTF8Result()};
-
   @Override
   public void startDocument() {
     assert clearLastVectorFieldName();

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerField.java Thu Nov 19 19:08:47 2009
@@ -32,11 +32,13 @@
   final DocumentsWriter.DocState docState;
   final FieldInvertState fieldState;
   TermAttribute termAtt;
+
+  static final byte END_OF_TERM = (byte) 0xff;
   
   // Copied from our perThread
-  final CharBlockPool charPool;
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
+  final ByteBlockPool termBytePool;
 
   final int streamCount;
   final int numPostingInt;
@@ -50,17 +52,19 @@
   private int postingsHashMask = postingsHashSize-1;
   private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
   private RawPostingList p;
+  private final UnicodeUtil.UTF8Result utf8;
   
   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
     intPool = perThread.intPool;
-    charPool = perThread.charPool;
     bytePool = perThread.bytePool;
+    termBytePool = perThread.termBytePool;
     docState = perThread.docState;
     fieldState = docInverterPerField.fieldState;
     this.consumer = perThread.consumer.addField(this, fieldInfo);
     streamCount = consumer.getStreamCount();
     numPostingInt = 2*streamCount;
+    utf8 = perThread.utf8;
     this.fieldInfo = fieldInfo;
     if (nextPerThread != null)
       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
@@ -204,46 +208,49 @@
    *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
   int comparePostings(RawPostingList p1, RawPostingList p2) {
 
-    if (p1 == p2)
+    if (p1 == p2) {
       return 0;
+    }
 
-    final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-    final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    final byte[] text1 = termBytePool.buffers[p1.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    int pos1 = p1.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+    final byte[] text2 = termBytePool.buffers[p2.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    int pos2 = p2.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
 
     assert text1 != text2 || pos1 != pos2;
 
     while(true) {
-      final char c1 = text1[pos1++];
-      final char c2 = text2[pos2++];
-      if (c1 != c2) {
-        if (0xffff == c2)
+      final byte b1 = text1[pos1++];
+      final byte b2 = text2[pos2++];
+      if (b1 != b2) {
+        if (END_OF_TERM == b2)
           return 1;
-        else if (0xffff == c1)
+        else if (END_OF_TERM == b1)
           return -1;
         else
-          return c1-c2;
+          return (b1&0xff)-(b2&0xff);
       } else
         // This method should never compare equal postings
         // unless p1==p2
-        assert c1 != 0xffff;
+        assert b1 != END_OF_TERM;
     }
   }
 
   /** Test whether the text for current RawPostingList p equals
-   *  current tokenText. */
-  private boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
+   *  current tokenText in utf8. */
+  private boolean postingEquals() {
 
-    final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    final byte[] text = termBytePool.buffers[p.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
     assert text != null;
-    int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    int pos = p.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
 
-    int tokenPos = 0;
-    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
-      if (tokenText[tokenPos] != text[pos])
+    final byte[] utf8Bytes = utf8.result;
+    for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
+      if (utf8Bytes[tokenPos] != text[pos]) {
         return false;
-    return 0xffff == text[pos];
+      }
+    }
+    return END_OF_TERM == text[pos];
   }
   
   private boolean doCall;
@@ -354,38 +361,13 @@
     final char[] tokenText = termAtt.termBuffer();;
     final int tokenTextLen = termAtt.termLength();
 
-    // System.out.println("thpf.add: field=" + fieldInfo.name + " text=" + new String(tokenText, 0, tokenTextLen) + " c0=" + ((int) tokenText[0]) );
+    UnicodeUtil.UTF16toUTF8(tokenText, 0, tokenTextLen, utf8);
 
-    // Compute hashcode & replace any invalid UTF16 sequences
-    int downto = tokenTextLen;
+    // nocommit -- modify UnicodeUtil to compute hash for us
+    // so we don't do 2nd pass here
     int code = 0;
-    while (downto > 0) {
-      char ch = tokenText[--downto];
-
-      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
-        if (0 == downto) {
-          // Unpaired
-          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-        } else {
-          final char ch2 = tokenText[downto-1];
-          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
-            // OK: high followed by low.  This is a valid
-            // surrogate pair.
-            code = ((code*31) + ch)*31+ch2;
-            downto--;
-            continue;
-          } else {
-            // Unpaired
-            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-          }            
-        }
-      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
-                                                          ch == 0xffff)) {
-        // Unpaired or 0xffff
-        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-      }
-
-      code = (code*31) + ch;
+    for(int i=0;i<utf8.length;i++) {
+      code = 31*code + utf8.result[i];
     }
 
     int hashPos = code & postingsHashMask;
@@ -393,7 +375,7 @@
     // Locate RawPostingList in hash
     p = postingsHash[hashPos];
 
-    if (p != null && !postingEquals(tokenText, tokenTextLen)) {
+    if (p != null && !postingEquals()) {
       // Conflict: keep searching different locations in
       // the hash table.
       final int inc = ((code>>8)+code)|1;
@@ -401,59 +383,65 @@
         code += inc;
         hashPos = code & postingsHashMask;
         p = postingsHash[hashPos];
-      } while (p != null && !postingEquals(tokenText, tokenTextLen));
+      } while (p != null && !postingEquals());
     }
 
     if (p == null) {
 
       // First time we are seeing this token since we last
       // flushed the hash.
-      final int textLen1 = 1+tokenTextLen;
-      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
-        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+      final int textLen1 = 1+utf8.length;
+      if (textLen1 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
+        if (textLen1 > DocumentsWriter.BYTE_BLOCK_SIZE) {
           // Just skip this term, to remain as robust as
           // possible during indexing.  A TokenFilter
           // can be inserted into the analyzer chain if
           // other behavior is wanted (pruning the term
           // to a prefix, throwing an exception, etc).
 
-          if (docState.maxTermPrefix == null)
+          if (docState.maxTermPrefix == null) {
             docState.maxTermPrefix = new String(tokenText, 0, 30);
+          }
 
           consumer.skippingLongTerm();
           return;
         }
-        charPool.nextBuffer();
+        bytePool.nextBuffer();
       }
 
       // Refill?
-      if (0 == perThread.freePostingsCount)
+      if (0 == perThread.freePostingsCount) {
         perThread.morePostings();
+      }
 
       // Pull next free RawPostingList from free list
       p = perThread.freePostings[--perThread.freePostingsCount];
       assert p != null;
 
-      final char[] text = charPool.buffer;
-      final int textUpto = charPool.charUpto;
-      p.textStart = textUpto + charPool.charOffset;
-      charPool.charUpto += textLen1;
-      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
-      text[textUpto+tokenTextLen] = 0xffff;
+      final byte[] text = bytePool.buffer;
+      final int textUpto = bytePool.byteUpto;
+      p.textStart = textUpto + bytePool.byteOffset;
+
+      bytePool.byteUpto += textLen1;
+      System.arraycopy(utf8.result, 0, text, textUpto, utf8.length);
+      text[textUpto+utf8.length] = END_OF_TERM;
           
       assert postingsHash[hashPos] == null;
       postingsHash[hashPos] = p;
       numPostings++;
 
-      if (numPostings == postingsHashHalfSize)
+      if (numPostings == postingsHashHalfSize) {
         rehashPostings(2*postingsHashSize);
+      }
 
       // Init stream slices
-      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
         intPool.nextBuffer();
+      }
 
-      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
         bytePool.nextBuffer();
+      }
 
       intUptos = intPool.buffer;
       intUptoStart = intPool.intUpto;
@@ -532,16 +520,16 @@
       if (p0 != null) {
         int code;
         if (perThread.primary) {
-          final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-          final char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-          int pos = start;
-          while(text[pos] != 0xffff)
-            pos++;
+          final int start = p0.textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+          final byte[] text = bytePool.buffers[p0.textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
           code = 0;
-          while (pos > start)
-            code = (code*31) + text[--pos];
-        } else
+          int pos = start;
+          while(text[pos] != END_OF_TERM) {
+            code = (code*31) + text[pos++];
+          }
+        } else {
           code = p0.textStart;
+        }
 
         int hashPos = code & newMask;
         assert hashPos >= 0;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerThread.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermsHashPerThread.java Thu Nov 19 19:08:47 2009
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.util.UnicodeUtil;
+
 import java.io.IOException;
 
 final class TermsHashPerThread extends InvertedDocConsumerPerThread {
@@ -25,33 +27,36 @@
   final TermsHashConsumerPerThread consumer;
   final TermsHashPerThread nextPerThread;
 
-  final CharBlockPool charPool;
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
+  final ByteBlockPool termBytePool;
   final boolean primary;
   final DocumentsWriter.DocState docState;
 
   final RawPostingList freePostings[] = new RawPostingList[256];
   int freePostingsCount;
 
+  // Used by perField:
+  final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+
   public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
     docState = docInverterPerThread.docState;
 
     this.termsHash = termsHash;
     this.consumer = termsHash.consumer.addThread(this);
 
+    intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
+    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
+
     if (nextTermsHash != null) {
       // We are primary
-      charPool = new CharBlockPool(termsHash.docWriter);
       primary = true;
+      termBytePool = bytePool;
     } else {
-      charPool = primaryPerThread.charPool;
       primary = false;
+      termBytePool = primaryPerThread.bytePool;
     }
 
-    intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
-    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
-
     if (nextTermsHash != null)
       nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
     else
@@ -114,9 +119,6 @@
     intPool.reset();
     bytePool.reset();
 
-    if (primary)
-      charPool.reset();
-
     if (recyclePostings) {
       termsHash.recyclePostings(freePostings, freePostingsCount);
       freePostingsCount = 0;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Thu Nov 19 19:08:47 2009
@@ -19,19 +19,19 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.index.TermRef;
+
 /**
  * NOTE: this API is experimental and will likely change
  */
 
 public abstract class TermsConsumer {
 
-  // nocommit -- CharSequence?
-  /** Starts a new term in this field; term ends with U+FFFF
-   *  char */
-  public abstract DocsConsumer startTerm(char[] text, int start) throws IOException;
+  /** Starts a new term in this field. */
+  public abstract DocsConsumer startTerm(TermRef text) throws IOException;
 
   /** Finishes the current term */
-  public abstract void finishTerm(char[] text, int start, int numDocs) throws IOException;
+  public abstract void finishTerm(TermRef text, int numDocs) throws IOException;
 
   /** Called when we are done adding terms to this field */
   public abstract void finish() throws IOException;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java Thu Nov 19 19:08:47 2009
@@ -32,8 +32,6 @@
   private int blockSize;
   private int[] pending;
   private int upto;
-  private long lastSavedFilePointer; //nocommit: not read
-  private int lastSavedUpto; //nocommit: not read
 
   protected void init(IndexOutput out, int fixedBlockSize) throws IOException {
     blockSize = fixedBlockSize;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/sep/SepSkipListReader.java Thu Nov 19 19:08:47 2009
@@ -43,12 +43,8 @@
   // nocommit -- make private again
   final IntIndexInput.Index lastPosIndex;
   
-  private long lastFreqPointer; //nocommit: not read
-  private long lastDocPointer; //nocommit: not read
-  private long lastPosPointer; //nocommit: not read
   private long lastPayloadPointer;
   private int lastPayloadLength;
-  private int lastChildLevel; //nocommit: not read
                            
   SepSkipListReader(IndexInput skipStream,
                     IntIndexInput freqIn,
@@ -114,12 +110,6 @@
       System.out.println("ssr.init docBase=" + docBaseIndex + " freqBase=" + freqBaseIndex + " posBase=" + posBaseIndex + " payloadBase=" + payloadBasePointer + " df=" + df);
     }
 
-    /*
-    lastFreqPointer = freqBasePointer;
-    lastDocPointer = docBasePointer;
-    lastPosPointer = posBasePointer;
-    */
-
     lastPayloadPointer = payloadBasePointer;
 
     for(int i=0;i<maxNumberOfSkipLevels;i++) {
@@ -149,9 +139,6 @@
   @Override
   protected void seekChild(int level) throws IOException {
     super.seekChild(level);
-    //freqPointer[level] = lastFreqPointer;
-    //docPointer[level] = lastDocPointer;
-    //posPointer[level] = lastPosPointer;
     payloadPointer[level] = lastPayloadPointer;
     payloadLength[level] = lastPayloadLength;
   }
@@ -171,9 +158,6 @@
     }
 
     if (level > 0) {
-      //lastFreqPointer = freqPointer[level];
-      //lastDocPointer = docPointer[level];
-      //lastPosPointer = posPointer[level];
       if (freqIndex != null) {
         freqIndex[level-1].set(freqIndex[level]);
       }
@@ -181,7 +165,6 @@
       if (posIndex != null) {
         posIndex[level-1].set(posIndex[level]);
       }
-      lastChildLevel = level-1;
     }
   }
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/DeltaBytesWriter.java Thu Nov 19 19:08:47 2009
@@ -17,16 +17,14 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.index.TermRef;
 
 import java.io.IOException;
 
 final class DeltaBytesWriter {
 
-  private final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); //nocommit: not read
-
   private byte[] lastBytes = new byte[10];
   private int lastLength;
   final IndexOutput out;
@@ -39,13 +37,18 @@
     lastLength = 0;
   }
 
-  void write(byte[] bytes, int length) throws IOException {
+  void write(TermRef text) throws IOException {
     int start = 0;
+    int upto = text.offset;
+    final int length = text.length;
+    final byte[] bytes = text.bytes;
+
     final int limit = length < lastLength ? length : lastLength;
     while(start < limit) {
-      if (bytes[start] != lastBytes[start])
+      if (bytes[upto] != lastBytes[start])
         break;
       start++;
+      upto++;
     }
 
     final int suffix = length - start;
@@ -54,11 +57,11 @@
 
     out.writeVInt(start);                       // prefix
     out.writeVInt(suffix);                      // suffix
-    out.writeBytes(bytes, start, suffix);
-    if (lastBytes.length < bytes.length) {
-      lastBytes = ArrayUtil.grow(lastBytes, bytes.length);
+    out.writeBytes(bytes, upto, suffix);
+    if (lastBytes.length < length) {
+      lastBytes = ArrayUtil.grow(lastBytes, length);
     }
-    System.arraycopy(bytes, start, lastBytes, start, suffix);
+    System.arraycopy(bytes, upto, lastBytes, start, suffix);
     lastLength = length;
   }
 }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Thu Nov 19 19:08:47 2009
@@ -364,12 +364,16 @@
             termLength[upto] = (short) thisTermLength;
             fileOffset[upto] = pointer;
             blockPointer[upto] = blockUpto * BYTE_BLOCK_SIZE + blockOffset;
+
+            /*
             TermRef tr = new TermRef();
             tr.bytes = blocks[blockUpto];
             tr.offset = blockOffset;
             tr.length = thisTermLength;
+
             //System.out.println("    read index term=" + new String(blocks[blockUpto], blockOffset, thisTermLength, "UTF-8") + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
             //System.out.println("    read index term=" + tr.toBytesString() + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
+            */
 
             lastBlock = block;
             lastBlockOffset = blockOffset;
@@ -403,13 +407,10 @@
         }
       }
 
-      final private TermRef termBuffer = new TermRef(); //nocommit: not read
-      final private TermsIndexResult termsIndexResult = new TermsIndexResult(); //nocommit: not read
-
       public final void getIndexOffset(TermRef term, TermsIndexResult result) throws IOException {
 
         if (Codec.DEBUG) {
-          System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " this=" + this);
+          System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " this=" + this + " numIndexedTerms=" + fileOffset.length);
         }
 
         int lo = 0;					  // binary search
@@ -446,7 +447,6 @@
         result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
         result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
         result.term.length = termLength[hi];
-        //System.out.println("    hi term=" + result.term);
 
         result.position = hi*totalIndexInterval;
         result.offset = fileOffset[hi];

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Thu Nov 19 19:08:47 2009
@@ -22,6 +22,7 @@
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermRef;
 import org.apache.lucene.index.codecs.Codec;
 
 import java.util.List;
@@ -65,11 +66,9 @@
   }
   
   final private DeltaBytesWriter termWriter;
-  private FieldInfo currentField;
 
   @Override
   public FieldWriter addField(FieldInfo field) {
-    currentField = field;
     SimpleFieldWriter writer = new SimpleFieldWriter(field);
     fields.add(writer);
     return writer;
@@ -89,16 +88,16 @@
     }
 
     @Override
-    public boolean checkIndexTerm(byte[] term, int termLength, int docFreq) throws IOException {
+    public boolean checkIndexTerm(TermRef text, int docFreq) throws IOException {
       // First term is first indexed term:
       if (0 == (numTerms++ % termIndexInterval)) {
         final long termsPointer = termsOut.getFilePointer();
         if (Codec.DEBUG) {
-          System.out.println("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + new String(term, 0, termLength, "UTF-8") + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
+          System.out.println("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
         }
         // mxx
         //System.out.println(Thread.currentThread().getName() + ": ii seg=" + segment + " term=" + fieldInfo.name + ":" + new String(term, 0, termLength, "UTF-8") + " numTerms=" + (numTerms-1) + " termFP=" + termsPointer);
-        termWriter.write(term, termLength);
+        termWriter.write(text);
         out.writeVLong(termsPointer - lastTermsPointer);
         lastTermsPointer = termsPointer;
         numIndexTerms++;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java Thu Nov 19 19:08:47 2009
@@ -31,7 +31,6 @@
 import org.apache.lucene.index.codecs.DocsConsumer;
 import org.apache.lucene.index.codecs.TermsConsumer;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * Writes terms dict and interacts with docs/positions
@@ -79,7 +78,6 @@
     fieldInfos = state.fieldInfos;
 
     // Count indexed fields up front
-    final int numFields = fieldInfos.size(); //nocommit: not read
     Codec.writeHeader(out, CODEC_NAME, VERSION_CURRENT); 
 
     out.writeLong(0);                             // leave space for end index pointer
@@ -142,8 +140,6 @@
     }
   }
 
-  private final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
-
   long lastIndexPointer;
 
   class TermsWriter extends TermsConsumer {
@@ -169,48 +165,33 @@
     }
     
     @Override
-    public DocsConsumer startTerm(char[] text, int start) throws IOException {
+    public DocsConsumer startTerm(TermRef text) throws IOException {
       consumer.startTerm();
       if (Codec.DEBUG) {
-        // nocommit
-        int len = 0;
-        while(text[start+len] != 0xffff) {
-          len++;
-        }
-        consumer.desc = fieldInfo.name + ":" + new String(text, start, len);
-        System.out.println("stdw.startTerm term=" + fieldInfo.name + ":" + new String(text, start, len) + " seg=" + segment);
+        consumer.desc = fieldInfo.name + ":" + text;
+        System.out.println("stdw.startTerm term=" + fieldInfo.name + ":" + text + " seg=" + segment);
       }
       return consumer;
     }
 
     @Override
-    public void finishTerm(char[] text, int start, int numDocs) throws IOException {
+    public void finishTerm(TermRef text, int numDocs) throws IOException {
 
       // mxx
       if (Codec.DEBUG) {
         // nocommit
-        int len = 0;
-        while(text[start+len] != 0xffff) {
-          len++;
-        }
-        System.out.println(Thread.currentThread().getName() + ": stdw.finishTerm seg=" + segment + " text=" + fieldInfo.name + ":" + new String(text, start, len) + " numDocs=" + numDocs + " numTerms=" + numTerms);
+        System.out.println(Thread.currentThread().getName() + ": stdw.finishTerm seg=" + segment + " text=" + fieldInfo.name + ":" + text + " numDocs=" + numDocs + " numTerms=" + numTerms);
       }
 
       if (numDocs > 0) {
-        // TODO: we could do this incrementally
-        UnicodeUtil.UTF16toUTF8(text, start, utf8);
-
-        final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(utf8.result, utf8.length, numDocs);
+        final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
 
         // mxx
         if (Codec.DEBUG) {
           System.out.println(Thread.currentThread().getName() + ":  filePointer=" + out.getFilePointer() + " isIndexTerm?=" + isIndexTerm);
-          TermRef tr = new TermRef();
-          tr.bytes = utf8.result;
-          tr.length = utf8.length;
-          System.out.println("  term bytes=" + tr.toBytesString());
+          System.out.println("  term bytes=" + text.toBytesString());
         }
-        termWriter.write(utf8.result, utf8.length);
+        termWriter.write(text);
         out.writeVInt(numDocs);
 
         consumer.finishTerm(numDocs, isIndexTerm);

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java Thu Nov 19 19:08:47 2009
@@ -19,6 +19,7 @@
 
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.TermRef;
 import java.io.IOException;
 
 public abstract class StandardTermsIndexWriter {
@@ -26,7 +27,7 @@
   public abstract void setTermsOutput(IndexOutput out);
 
   public abstract class FieldWriter {
-    public abstract boolean checkIndexTerm(byte[] bytes, int length, int docFreq) throws IOException;
+    public abstract boolean checkIndexTerm(TermRef text, int docFreq) throws IOException;
   }
 
   public abstract FieldWriter addField(FieldInfo fieldInfo);

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/TestExternalCodecs.java Thu Nov 19 19:08:47 2009
@@ -122,18 +122,14 @@
         this.field = field;
       }
         
-      public DocsConsumer startTerm(char[] text, int start) {
-        int upto = start;
-        while(text[upto] != 0xffff) {
-          upto++;
-        }
-        final String term = new String(text, start, upto-start);
+      public DocsConsumer startTerm(TermRef text) {
+        final String term = text.toString();
         current = new RAMTerm(term);
         docsConsumer.reset(current);
         return docsConsumer;
       }
 
-      public void finishTerm(char[] text, int start, int numDocs) {
+      public void finishTerm(TermRef text, int numDocs) {
         // nocommit -- are we even called when numDocs == 0?
         if (numDocs > 0) {
           assert numDocs == current.docs.size();

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestCodecs.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestCodecs.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestCodecs.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestCodecs.java Thu Nov 19 19:08:47 2009
@@ -142,16 +142,14 @@
   }
 
   class TermData implements Comparable {
-    char[] text;
     String text2;
+    final TermRef text;
     int[] docs;
     PositionData[][] positions;
     FieldData field;
     
     public TermData(String text, int[] docs, PositionData[][] positions) {
-      this.text = new char[text.length()+1];
-      text.getChars(0, text.length(), this.text, 0);
-      this.text[text.length()] = 0xffff;
+      this.text = new TermRef(text);
       this.text2 = text;
       this.docs = docs;
       this.positions = positions;
@@ -164,7 +162,7 @@
     public void write(TermsConsumer termsConsumer) throws Throwable {
       if (Codec.DEBUG)
         System.out.println("  term=" + text2);
-      final DocsConsumer docsConsumer = termsConsumer.startTerm(text, 0);
+      final DocsConsumer docsConsumer = termsConsumer.startTerm(text);
       for(int i=0;i<docs.length;i++) {
         final int termDocFreq;
         if (field.omitTF)
@@ -184,7 +182,7 @@
         } else
           assert posConsumer==null;
       }
-      termsConsumer.finishTerm(text, 0, docs.length);
+      termsConsumer.finishTerm(text, docs.length);
     }
   }
 

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=882252&r1=882251&r2=882252&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Nov 19 19:08:47 2009
@@ -542,7 +542,7 @@
       RAMDirectory dir = new RAMDirectory();
       IndexWriter writer  = new IndexWriter(dir, new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
 
-      char[] chars = new char[DocumentsWriter.CHAR_BLOCK_SIZE-1];
+      char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8-1];
       Arrays.fill(chars, 'x');
       Document doc = new Document();
       final String bigTerm = new String(chars);



Mime
View raw message