lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r931278 [6/10] - in /lucene/dev/trunk: lucene/ lucene/backwards/src/ lucene/backwards/src/java/org/apache/lucene/index/ lucene/backwards/src/java/org/apache/lucene/index/codecs/ lucene/backwards/src/java/org/apache/lucene/search/ lucene/bac...
Date Tue, 06 Apr 2010 19:19:36 GMT
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReader.java Tue Apr  6 19:19:27 2010
@@ -37,8 +37,16 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BitVector;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.CloseableThreadLocal;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflex.PreFlexFields;
+import org.apache.lucene.index.codecs.preflex.SegmentTermDocs;
+import org.apache.lucene.index.codecs.preflex.SegmentTermPositions;
+import org.apache.lucene.index.codecs.FieldsProducer;
 import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
+import org.apache.lucene.util.BytesRef;
 
 /**
  * @lucene.experimental
@@ -83,10 +91,11 @@ public class SegmentReader extends Index
 
     final String segment;
     final FieldInfos fieldInfos;
-    final IndexInput freqStream;
-    final IndexInput proxStream;
-    final TermInfosReader tisNoIndex;
 
+    final FieldsProducer fields;
+    final boolean isPreFlex;
+    final CodecProvider codecs;
+    
     final Directory dir;
     final Directory cfsDir;
     final int readBufferSize;
@@ -94,14 +103,22 @@ public class SegmentReader extends Index
 
     private final SegmentReader origInstance;
 
-    TermInfosReader tis;
     FieldsReader fieldsReaderOrig;
     TermVectorsReader termVectorsReaderOrig;
     CompoundFileReader cfsReader;
     CompoundFileReader storeCFSReader;
 
-    CoreReaders(SegmentReader origInstance, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor) throws IOException {
+    CoreReaders(SegmentReader origInstance, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor, CodecProvider codecs) throws IOException {
+
+      if (termsIndexDivisor < 1 && termsIndexDivisor != -1) {
+        throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + termsIndexDivisor);
+      }
+
       segment = si.name;
+      if (codecs == null) {
+        codecs = CodecProvider.getDefault();
+      }
+      this.codecs = codecs;      
       this.readBufferSize = readBufferSize;
       this.dir = dir;
 
@@ -118,23 +135,12 @@ public class SegmentReader extends Index
         fieldInfos = new FieldInfos(cfsDir, IndexFileNames.segmentFileName(segment, IndexFileNames.FIELD_INFOS_EXTENSION));
 
         this.termsIndexDivisor = termsIndexDivisor;
-        TermInfosReader reader = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize, termsIndexDivisor);
-        if (termsIndexDivisor == -1) {
-          tisNoIndex = reader;
-        } else {
-          tis = reader;
-          tisNoIndex = null;
-        }
 
-        // make sure that all index files have been read or are kept open
-        // so that if an index update removes them we'll still have them
-        freqStream = cfsDir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FREQ_EXTENSION), readBufferSize);
+        // Ask codec for its Fields
+        fields = si.getCodec().fieldsProducer(new SegmentReadState(cfsDir, si, fieldInfos, readBufferSize, termsIndexDivisor));
+        assert fields != null;
 
-        if (fieldInfos.hasProx()) {
-          proxStream = cfsDir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.PROX_EXTENSION), readBufferSize);
-        } else {
-          proxStream = null;
-        }
+        isPreFlex = fields instanceof PreFlexFields;
         success = true;
       } finally {
         if (!success) {
@@ -165,64 +171,12 @@ public class SegmentReader extends Index
       return cfsReader;
     }
 
-    synchronized TermInfosReader getTermsReader() {
-      if (tis != null) {
-        return tis;
-      } else {
-        return tisNoIndex;
-      }
-    }      
-
-    synchronized boolean termsIndexIsLoaded() {
-      return tis != null;
-    }      
-
-    // NOTE: only called from IndexWriter when a near
-    // real-time reader is opened, or applyDeletes is run,
-    // sharing a segment that's still being merged.  This
-    // method is not fully thread safe, and relies on the
-    // synchronization in IndexWriter
-    synchronized void loadTermsIndex(SegmentInfo si, int termsIndexDivisor) throws IOException {
-      if (tis == null) {
-        Directory dir0;
-        if (si.getUseCompoundFile()) {
-          // In some cases, we were originally opened when CFS
-          // was not used, but then we are asked to open the
-          // terms reader with index, the segment has switched
-          // to CFS
-          if (cfsReader == null) {
-            cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(segment, IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
-          }
-          dir0 = cfsReader;
-        } else {
-          dir0 = dir;
-        }
-
-        tis = new TermInfosReader(dir0, segment, fieldInfos, readBufferSize, termsIndexDivisor);
-      }
-    }
-
     synchronized void decRef() throws IOException {
 
       if (ref.decrementAndGet() == 0) {
 
-        // close everything, nothing is shared anymore with other readers
-        if (tis != null) {
-          tis.close();
-          // null so if an app hangs on to us we still free most ram
-          tis = null;
-        }
-        
-        if (tisNoIndex != null) {
-          tisNoIndex.close();
-        }
-        
-        if (freqStream != null) {
-          freqStream.close();
-        }
-
-        if (proxStream != null) {
-          proxStream.close();
+        if (fields != null) {
+          fields.close();
         }
 
         if (termVectorsReaderOrig != null) {
@@ -543,7 +497,7 @@ public class SegmentReader extends Index
    * @throws IOException if there is a low-level IO error
    */
   public static SegmentReader get(boolean readOnly, SegmentInfo si, int termInfosIndexDivisor) throws CorruptIndexException, IOException {
-    return get(readOnly, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, termInfosIndexDivisor);
+    return get(readOnly, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, termInfosIndexDivisor, null);
   }
 
   /**
@@ -555,8 +509,13 @@ public class SegmentReader extends Index
                                   SegmentInfo si,
                                   int readBufferSize,
                                   boolean doOpenStores,
-                                  int termInfosIndexDivisor)
+                                  int termInfosIndexDivisor,
+                                  CodecProvider codecs)
     throws CorruptIndexException, IOException {
+    if (codecs == null)  {
+      codecs = CodecProvider.getDefault();
+    }
+    
     SegmentReader instance = readOnly ? new ReadOnlySegmentReader() : new SegmentReader();
     instance.readOnly = readOnly;
     instance.si = si;
@@ -565,7 +524,7 @@ public class SegmentReader extends Index
     boolean success = false;
 
     try {
-      instance.core = new CoreReaders(instance, dir, si, readBufferSize, termInfosIndexDivisor);
+      instance.core = new CoreReaders(instance, dir, si, readBufferSize, termInfosIndexDivisor, codecs);
       if (doOpenStores) {
         instance.core.openDocStores(si);
       }
@@ -590,6 +549,11 @@ public class SegmentReader extends Index
     core.openDocStores(si);
   }
 
+  @Override
+  public synchronized Bits getDeletedDocs() {
+    return deletedDocs;
+  }
+
   private boolean checkDeletedCounts() throws IOException {
     final int recomputedCount = deletedDocs.getRecomputedCount();
      
@@ -859,17 +823,36 @@ public class SegmentReader extends Index
   List<String> files() throws IOException {
     return new ArrayList<String>(si.files());
   }
-
+  
   @Override
-  public TermEnum terms() {
+  public TermEnum terms() throws IOException {
     ensureOpen();
-    return core.getTermsReader().terms();
+    if (core.isPreFlex) {
+      // For old API on an old segment, instead of
+      // converting old API -> new API -> old API, just give
+      // direct access to old:
+      return ((PreFlexFields) core.fields).tis.terms();
+    } else {
+      // Emulate pre-flex API on top of flex index
+      return new LegacyTermEnum(null);
+    }
   }
 
+  /** @deprecated Please switch to the flex API ({@link
+   * #fields}) instead. */
+  @Deprecated
   @Override
   public TermEnum terms(Term t) throws IOException {
     ensureOpen();
-    return core.getTermsReader().terms(t);
+    if (core.isPreFlex) {
+      // For old API on an old segment, instead of
+      // converting old API -> new API -> old API, just give
+      // direct access to old:
+      return ((PreFlexFields) core.fields).tis.terms(t);
+    } else {
+      // Emulate pre-flex API on top of flex index
+      return new LegacyTermEnum(t);
+    }
   }
 
   FieldInfos fieldInfos() {
@@ -887,6 +870,9 @@ public class SegmentReader extends Index
     return (deletedDocs != null && deletedDocs.get(n));
   }
 
+  /** @deprecated Switch to the flex API ({@link
+   * IndexReader#termDocsEnum}) instead. */
+  @Deprecated
   @Override
   public TermDocs termDocs(Term term) throws IOException {
     if (term == null) {
@@ -895,27 +881,73 @@ public class SegmentReader extends Index
       return super.termDocs(term);
     }
   }
+  
+  @Override
+  public Fields fields() throws IOException {
+    return core.fields;
+  }
 
+  /** @deprecated Switch to the flex API {@link
+   *  IndexReader#termDocsEnum} instead. */
+  @Deprecated
   @Override
   public TermDocs termDocs() throws IOException {
     ensureOpen();
-    return new SegmentTermDocs(this);
+    if (core.isPreFlex) {
+      // For old API on an old segment, instead of
+      // converting old API -> new API -> old API, just give
+      // direct access to old:
+      final PreFlexFields pre = (PreFlexFields) core.fields;
+      SegmentTermDocs std = new SegmentTermDocs(pre.freqStream, pre.tis, core.fieldInfos);
+      std.setSkipDocs(deletedDocs);
+      return std;
+    } else {
+      // Emulate old API
+      return new LegacyTermDocs();
+    }
   }
 
+  /** @deprecated Switch to the flex API ({@link
+   *  DocsAndPositionsEnum}) instead. */
+  @Deprecated
   @Override
   public TermPositions termPositions() throws IOException {
     ensureOpen();
-    return new SegmentTermPositions(this);
+    if (core.isPreFlex) {
+      // For old API on an old segment, instead of
+      // converting old API -> new API -> old API, just give
+      // direct access to old:
+      final PreFlexFields pre = (PreFlexFields) core.fields;
+      SegmentTermPositions stp = new SegmentTermPositions(pre.freqStream, pre.proxStream, pre.tis, core.fieldInfos);
+      stp.setSkipDocs(deletedDocs);
+      return stp;
+    } else {
+      // Emulate old API
+      return new LegacyTermPositions();
+    }
   }
 
   @Override
   public int docFreq(Term t) throws IOException {
     ensureOpen();
-    TermInfo ti = core.getTermsReader().get(t);
-    if (ti != null)
-      return ti.docFreq;
-    else
+    Terms terms = core.fields.terms(t.field);
+    if (terms != null) {
+      return terms.docFreq(new BytesRef(t.text));
+    } else {
       return 0;
+    }
+  }
+
+  @Override
+  public int docFreq(String field, BytesRef term) throws IOException {
+    ensureOpen();
+
+    Terms terms = core.fields.terms(field);
+    if (terms != null) {
+      return terms.docFreq(term);
+    } else {
+      return 0;
+    }
   }
 
   @Override
@@ -1078,17 +1110,13 @@ public class SegmentReader extends Index
     }
   }
 
-  boolean termsIndexLoaded() {
-    return core.termsIndexIsLoaded();
-  }
-
   // NOTE: only called from IndexWriter when a near
   // real-time reader is opened, or applyDeletes is run,
   // sharing a segment that's still being merged.  This
   // method is not thread safe, and relies on the
   // synchronization in IndexWriter
-  void loadTermsIndex(int termsIndexDivisor) throws IOException {
-    core.loadTermsIndex(si, termsIndexDivisor);
+  void loadTermsIndex(int indexDivisor) throws IOException {
+    core.fields.loadTermsIndex(indexDivisor);
   }
 
   // for testing only
@@ -1266,14 +1294,9 @@ public class SegmentReader extends Index
   // same entry in the FieldCache.  See LUCENE-1579.
   @Override
   public final Object getFieldCacheKey() {
-    return core.freqStream;
-  }
-
-  @Override
-  public long getUniqueTermCount() {
-    return core.getTermsReader().size();
+    return core;
   }
-
+  
   /**
    * Lotsa tests did hacks like:<br/>
    * SegmentReader reader = (SegmentReader) IndexReader.open(dir);<br/>
@@ -1283,7 +1306,7 @@ public class SegmentReader extends Index
    */
   @Deprecated
   static SegmentReader getOnlySegmentReader(Directory dir) throws IOException {
-    return getOnlySegmentReader(IndexReader.open(dir,false));
+    return getOnlySegmentReader(IndexReader.open(dir, false));
   }
 
   static SegmentReader getOnlySegmentReader(IndexReader reader) {
@@ -1305,4 +1328,372 @@ public class SegmentReader extends Index
   public int getTermInfosIndexDivisor() {
     return core.termsIndexDivisor;
   }
+  
+  // Back compat: pre-flex TermEnum API over flex API
+  @Deprecated
+  final private class LegacyTermEnum extends TermEnum {
+    FieldsEnum fields;
+    TermsEnum terms;
+    boolean done;
+    String currentField;
+    BytesRef currentTerm;
+
+    public LegacyTermEnum(Term t) throws IOException {
+      fields = core.fields.iterator();
+      currentField = fields.next();
+      if (currentField == null) {
+        // no fields
+        done = true;
+      } else if (t != null) {
+        // Pre-seek to this term
+
+        while(currentField.compareTo(t.field) < 0) {
+          currentField = fields.next();
+          if (currentField == null) {
+            // Hit end of fields
+            done = true;
+            break;
+          }
+        }
+
+        if (!done) {
+          // We found some field -- get its terms:
+          terms = fields.terms();
+
+          if (currentField == t.field) {
+            // We found exactly the requested field; now
+            // seek the term text:
+            String text = t.text();
+
+            // this is only for backwards compatibility.
+            // previously you could supply a term with unpaired surrogates,
+            // and it would return the next Term.
+            // if someone does this, tack on the lowest possible trail surrogate.
+            // this emulates the old behavior, and forms "valid UTF-8" unicode.
+            BytesRef tr = new BytesRef(UnicodeUtil.nextValidUTF16String(text));
+            TermsEnum.SeekStatus status = terms.seek(tr);
+
+            if (status == TermsEnum.SeekStatus.END) {
+              // Rollover to the next field
+              terms = null;
+              next();
+            } else if (status == TermsEnum.SeekStatus.FOUND) {
+              // Found exactly the term
+              currentTerm = tr;
+            } else {
+              // Found another term, in this same field
+              currentTerm = terms.term();
+            }
+          } else {
+            // We didn't find exact field (we found the
+            // following field); advance to first term in
+            // this field
+            next();
+          }
+        }
+      } else {
+        terms = fields.terms();
+      }
+    }
+
+    @Override
+    public boolean next() throws IOException {
+
+      if (done) {
+        return false;
+      }
+
+      while(true) {
+        if (terms == null) {
+          // Advance to the next field
+          currentField = fields.next();
+          if (currentField == null) {
+            done = true;
+            return false;
+          }
+          terms = fields.terms();
+        }
+        currentTerm = terms.next();
+        if (currentTerm != null) {
+          // This field still has terms
+          return true;
+        } else {
+          // Done producing terms from this field; advance
+          // to next field
+          terms = null;
+        }
+      }
+    }
+
+    @Override
+    public Term term() {
+      if (!done && terms != null && currentTerm != null) {
+        return new Term(currentField, currentTerm.utf8ToString());
+      }
+      return null;
+    }
+
+    @Override
+    public int docFreq() {
+      return terms == null ? 0 : terms.docFreq();
+    }
+
+    @Override
+    public void close() {}
+  }
+
+  // Back compat: emulates legacy TermDocs API on top of
+  // flex API
+  private class LegacyTermDocs implements TermDocs {
+
+    String currentField;
+    final Fields fields;
+    TermsEnum terms;
+    DocsEnum docsEnum;
+    boolean any;
+
+    LegacyTermDocs() throws IOException {
+      fields = core.fields;
+    }
+
+    public void close() {}
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (!any) {
+        return false;
+      } else {
+        return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS;
+      }
+    }
+
+    public void seek(Term term) throws IOException {
+
+      any = false;
+
+      if (terms != null && !term.field.equals(currentField)) {
+        // new field
+        terms = null;
+      }
+
+      if (terms == null) {
+        currentField = term.field;
+        Terms terms1 = fields.terms(currentField);
+        if (terms1 == null) {
+          // no such field
+          return;
+        } else {
+          terms = terms1.iterator();
+        }
+      }
+
+      if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) {
+        // Term exists
+        any = true;
+        pendingBulkResult = null;
+        docsEnum = terms.docs(deletedDocs, docsEnum);
+      }
+    }
+
+    public int doc() {
+      if (!any) {
+        return 0;
+      } else {
+        return docsEnum.docID();
+      }
+    }
+
+    private DocsEnum.BulkReadResult pendingBulkResult;
+    private int bulkCount;
+    private int pendingBulk;
+
+    public int read(int[] docs, int[] freqs) throws IOException {
+      if (any && pendingBulkResult == null) {
+        pendingBulkResult = docsEnum.getBulkResult();
+      }
+      if (!any) {
+        return 0;
+      } else if (pendingBulk > 0) {
+        final int left = bulkCount - pendingBulk;
+        if (docs.length >= left) {
+          // read all pending
+          System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, left);
+          System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, left);
+          pendingBulk = 0;
+          return left;
+        } else {
+          // read only part of pending
+          System.arraycopy(pendingBulkResult.docs.ints, pendingBulk, docs, 0, docs.length);
+          System.arraycopy(pendingBulkResult.freqs.ints, pendingBulk, freqs, 0, docs.length);
+          pendingBulk += docs.length;
+          return docs.length;
+        }
+      } else {
+        // nothing pending
+        bulkCount = docsEnum.read();
+        if (docs.length >= bulkCount) {
+          System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, bulkCount);
+          System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, bulkCount);
+          return bulkCount;
+        } else {
+          System.arraycopy(pendingBulkResult.docs.ints, 0, docs, 0, docs.length);
+          System.arraycopy(pendingBulkResult.freqs.ints, 0, freqs, 0, docs.length);
+          pendingBulk = docs.length;
+          return docs.length;
+        }
+      }
+    }
+
+    public int freq() {
+      if (!any) {
+        return 0;
+      } else {
+        return docsEnum.freq();
+      }
+    }
+
+    public boolean next() throws IOException {
+      if (!any) {
+        return false;
+      } else {
+        return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS;
+      }
+    }
+  }
+
+  // Back compat: implements legacy TermPositions API on top
+  // of flex API
+  final private class LegacyTermPositions implements TermPositions {
+
+    String currentField;
+    final Fields fields;
+    TermsEnum terms;
+    DocsAndPositionsEnum postingsEnum;
+    DocsEnum docsEnum;
+    boolean any;
+
+    LegacyTermPositions() throws IOException {
+      fields = core.fields;
+    }
+
+    public void close() {}
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (!any) {
+        return false;
+      } else {
+        return docsEnum.advance(target) != docsEnum.NO_MORE_DOCS;
+      }
+    }
+
+    public void seek(Term term) throws IOException {
+
+      any = false;
+
+      if (terms != null && !term.field.equals(currentField)) {
+        // new field
+        terms = null;
+      }
+
+      if (terms == null) {
+        currentField = term.field;
+        Terms terms1 = fields.terms(currentField);
+        if (terms1 == null) {
+          // no such field
+          return;
+        } else {
+          terms = terms1.iterator();
+        }
+      }
+
+      if (terms.seek(new BytesRef(term.text)) == TermsEnum.SeekStatus.FOUND) {
+        // Term exists
+        any = true;
+        postingsEnum = terms.docsAndPositions(deletedDocs, postingsEnum);
+        if (postingsEnum == null) {
+          docsEnum = terms.docs(deletedDocs, postingsEnum);
+        } else {
+          docsEnum = postingsEnum;
+        }
+      }
+    }
+
+    public int doc() {
+      if (!any) {
+        return 0;
+      } else {
+        return docsEnum.docID();
+      }
+    }
+
+    public int freq() {
+      if (!any) {
+        return 0;
+      } else {
+        return docsEnum.freq();
+      }
+    }
+
+    public boolean next() throws IOException {
+      if (!any) {
+        return false;
+      } else {
+        return docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS;
+      }
+    }
+
+    public int read(int[] docs, int[] freqs) throws IOException {
+      throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
+    }
+
+    public int nextPosition() throws IOException {     
+      if (!any || postingsEnum == null) {
+        return 0;
+      } else {
+        return postingsEnum.nextPosition();
+      }
+    }
+
+    public int getPayloadLength() {
+      if (!any || postingsEnum == null) {
+        return 0;
+      } else {
+        return postingsEnum.getPayloadLength();
+      }
+    }
+
+    public byte[] getPayload(byte[] bytes, int offset) throws IOException {
+      if (!any || postingsEnum == null) {
+        return null;
+      }
+      final BytesRef payload = postingsEnum.getPayload();
+      // old API would always use the passed-in bytes if the
+      // payload "fits", else allocate new:
+      if (bytes != null && payload.length <= bytes.length - offset) {
+        System.arraycopy(payload.bytes, payload.offset, bytes, offset, payload.length);
+        return bytes;
+      } else if (payload.offset == 0 && payload.length == payload.bytes.length) {
+        return payload.bytes;
+      } else {
+        final byte[] retBytes = new byte[payload.length];
+        System.arraycopy(payload.bytes, payload.offset, retBytes, 0, payload.length);
+        return retBytes;
+      }
+    }
+
+    public boolean isPayloadAvailable() {
+      if (!any || postingsEnum == null) {
+        return false;
+      } else {
+        return postingsEnum.hasPayload();
+      }
+    }
+  }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Tue Apr  6 19:19:27 2010
@@ -19,32 +19,63 @@ package org.apache.lucene.index;
 
 import java.util.HashSet;
 import java.util.Collection;
+import java.io.PrintStream;
 
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
 
-class SegmentWriteState {
-  DocumentsWriter docWriter;
-  Directory directory;
-  String segmentName;
-  String docStoreSegmentName;
-  int numDocs;
-  int termIndexInterval;
-  int numDocsInStore;
-  Collection<String> flushedFiles;
-
-  public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs,
-                           int numDocsInStore, int termIndexInterval) {
-    this.docWriter = docWriter;
+/**
+ * This class is not meant for public usage; it's only
+ * public in order to expose access across packages.  It's
+ * used internally when updating the index.
+ * @lucene.experimental
+ */
+public class SegmentWriteState {
+  public final PrintStream infoStream;
+  public final Directory directory;
+  public final String segmentName;
+  public final FieldInfos fieldInfos;
+  public final String docStoreSegmentName;
+  public final int numDocs;
+  public int numDocsInStore;
+  public final Collection<String> flushedFiles;
+
+  // Actual codec used
+  final Codec codec;
+
+  /** Expert: The fraction of terms in the "dictionary" which should be stored
+   * in RAM.  Smaller values use more memory, but make searching slightly
+   * faster, while larger values use less memory and make searching slightly
+   * slower.  Searching is typically not dominated by dictionary lookup, so
+   * tweaking this is rarely useful.*/
+  public final int termIndexInterval;
+
+  /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
+   * used to accelerate {@link TermDocs#skipTo(int)}.  Larger values result in
+   * smaller indexes, greater acceleration, but fewer accelerable cases, while
+   * smaller values result in bigger indexes, less acceleration and more
+   * accelerable cases. More detailed experiments would be useful here. */
+  public final int skipInterval = 16;
+  
+  /** Expert: The maximum number of skip levels. Smaller values result in 
+   * slightly smaller indexes, but slower skipping in big posting lists.
+   */
+  public final int maxSkipLevels = 10;
+
+  public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos,
+                           String docStoreSegmentName, int numDocs,
+                           int numDocsInStore, int termIndexInterval,
+                           CodecProvider codecs) {
+    this.infoStream = infoStream;
     this.directory = directory;
     this.segmentName = segmentName;
+    this.fieldInfos = fieldInfos;
     this.docStoreSegmentName = docStoreSegmentName;
     this.numDocs = numDocs;
     this.numDocsInStore = numDocsInStore;
     this.termIndexInterval = termIndexInterval;
+    this.codec = codecs.getWriter(this);
     flushedFiles = new HashSet<String>();
   }
-
-  public String segmentFileName(String ext) {
-    return segmentName + "." + ext;
-  }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java Tue Apr  6 19:19:27 2010
@@ -90,8 +90,8 @@ final class StoredFieldsWriter {
       state.flushedFiles.add(fieldsName);
       state.flushedFiles.add(fieldsIdxName);
 
-      state.docWriter.removeOpenFile(fieldsName);
-      state.docWriter.removeOpenFile(fieldsIdxName);
+      docWriter.removeOpenFile(fieldsName);
+      docWriter.removeOpenFile(fieldsIdxName);
 
       if (4+((long) state.numDocsInStore)*8 != state.directory.fileLength(fieldsIdxName))
         throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fieldsIdxName) + " length in bytes of " + fieldsIdxName + " file exists?=" + state.directory.fileExists(fieldsIdxName));

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/Term.java Tue Apr  6 19:19:27 2010
@@ -1,7 +1,5 @@
 package org.apache.lucene.index;
 
-import org.apache.lucene.util.StringHelper;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,6 +17,8 @@ import org.apache.lucene.util.StringHelp
  * limitations under the License.
  */
 
+import org.apache.lucene.util.StringHelper;
+
 /**
   A Term represents a word from text.  This is the unit of search.  It is
   composed of two elements, the text of the word, as a string, and the name of
@@ -35,7 +35,7 @@ public final class Term implements Compa
    * <p>Note that a null field or null text value results in undefined
    * behavior for most Lucene APIs that accept a Term parameter. */
   public Term(String fld, String txt) {
-    field = StringHelper.intern(fld);
+    field = fld == null ? null : StringHelper.intern(fld);
     text = txt;
   }
 
@@ -49,7 +49,8 @@ public final class Term implements Compa
     this(fld, "", true);
   }
 
-  Term(String fld, String txt, boolean intern) {
+  /** @lucene.experimental */
+  public Term(String fld, String txt, boolean intern) {
     field = intern ? StringHelper.intern(fld) : fld;	  // field names are interned
     text = txt;					          // unless already known to be
   }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermDocs.java Tue Apr  6 19:19:27 2010
@@ -27,8 +27,10 @@ import java.io.Closeable;
  ordered by document number.
 
  @see IndexReader#termDocs()
- */
+ @deprecated Use {@link DocsEnum} instead
+*/
 
+@Deprecated
 public interface TermDocs extends Closeable {
   /** Sets this to the data for a term.
    * The enumeration is reset to the start of the data for this term.

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermEnum.java Tue Apr  6 19:19:27 2010
@@ -23,8 +23,10 @@ import java.io.Closeable;
 /** Abstract class for enumerating terms.
 
   <p>Term enumerations are always ordered by Term.compareTo().  Each term in
-  the enumeration is greater than all that precede it.  */
+  the enumeration is greater than all that precede it.
+* @deprecated Use TermsEnum instead */
 
+@Deprecated
 public abstract class TermEnum implements Closeable {
   /** Increments the enumeration to the next element.  True if one exists.*/
   public abstract boolean next() throws IOException;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermPositions.java Tue Apr  6 19:19:27 2010
@@ -26,8 +26,9 @@ import java.io.IOException;
  * positions of each occurrence of a term in a document.
  *
  * @see IndexReader#termPositions()
+ * @deprecated Use {@link DocsAndPositionsEnum} instead 
  */
-
+@Deprecated
 public interface TermPositions
     extends TermDocs
 {

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Tue Apr  6 19:19:27 2010
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
 
 final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
 
@@ -106,6 +106,8 @@ final class TermVectorsTermsWriterPerFie
 
     final int numPostings = termsHashPerField.numPostings;
 
+    final BytesRef flushTerm = perThread.flushTerm;
+
     assert numPostings >= 0;
 
     if (!doVectors || numPostings == 0)
@@ -126,7 +128,9 @@ final class TermVectorsTermsWriterPerFie
     perThread.doc.addField(termsHashPerField.fieldInfo.number);
     TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
 
-    final int[] termIDs = termsHashPerField.sortPostings();
+    // TODO: we may want to make this sort in the same order
+    // as Codec's terms dict?
+    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
 
     tvf.writeVInt(numPostings);
     byte bits = 0x0;
@@ -136,46 +140,40 @@ final class TermVectorsTermsWriterPerFie
       bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
     tvf.writeByte(bits);
 
-    int encoderUpto = 0;
-    int lastTermBytesCount = 0;
-
+    int lastLen = 0;
+    byte[] lastBytes = null;
+    int lastStart = 0;
+      
     final ByteSliceReader reader = perThread.vectorSliceReader;
-    final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
+    final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;
+
     for(int j=0;j<numPostings;j++) {
       final int termID = termIDs[j];
       final int freq = postings.freqs[termID];
           
-      final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-      final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
+      // Get BytesRef
+      termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);
 
-      // We swap between two encoders to save copying
-      // last Term's byte array
-      final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
-
-      // TODO: we could do this incrementally
-      UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
-      final int termBytesCount = utf8Result.length;
-
-      // TODO: UTF16toUTF8 could tell us this prefix
-      // Compute common prefix between last term and
+      // Compute common byte prefix between last term and
       // this term
       int prefix = 0;
       if (j > 0) {
-        final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
-        final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
-        while(prefix < lastTermBytesCount && prefix < termBytesCount) {
-          if (lastTermBytes[prefix] != termBytes[prefix])
+        while(prefix < lastLen && prefix < flushTerm.length) {
+          if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) {
             break;
+          }
           prefix++;
         }
       }
-      encoderUpto = 1-encoderUpto;
-      lastTermBytesCount = termBytesCount;
 
-      final int suffix = termBytesCount - prefix;
+      lastLen = flushTerm.length;
+      lastBytes = flushTerm.bytes;
+      lastStart = flushTerm.offset;
+
+      final int suffix = flushTerm.length - prefix;
       tvf.writeVInt(prefix);
       tvf.writeVInt(suffix);
-      tvf.writeBytes(utf8Result.result, prefix, suffix);
+      tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix);
       tvf.writeVInt(freq);
 
       if (doVectorPositions) {
@@ -209,9 +207,7 @@ final class TermVectorsTermsWriterPerFie
 
   @Override
   void newTerm(final int termID) {
-
     assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
-
     TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
 
     postings.freqs[termID] = 1;
@@ -275,23 +271,25 @@ final class TermVectorsTermsWriterPerFie
     int[] lastOffsets;                                 // Last offset we saw
     int[] lastPositions;                               // Last position where this term occurred
     
+    ParallelPostingsArray newInstance(int size) {
+      return new TermVectorsPostingsArray(size);
+    }
+
     @Override
-    ParallelPostingsArray resize(int newSize) {
-      TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize);
-      copy(this, newArray);
-      return newArray;
+    void copyTo(ParallelPostingsArray toArray, int numToCopy) {
+      assert toArray instanceof TermVectorsPostingsArray;
+      TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray;
+
+      super.copyTo(toArray, numToCopy);
+
+      System.arraycopy(freqs, 0, to.freqs, 0, size);
+      System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, size);
+      System.arraycopy(lastPositions, 0, to.lastPositions, 0, size);
     }
-    
-    void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) {
-      super.copy(fromArray, toArray);
-      System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length);
-      System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length);
-      System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
+
+    @Override
+    int bytesPerPosting() {
+      return super.bytesPerPosting() + 3 * DocumentsWriter.INT_NUM_BYTE;
     }
   }
-  
-  @Override
-  int bytesPerPosting() {
-    return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE;
-  }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java Tue Apr  6 19:19:27 2010
@@ -17,13 +17,14 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
 
 final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread {
 
   final TermVectorsTermsWriter termsWriter;
   final TermsHashPerThread termsHashPerThread;
   final DocumentsWriter.DocState docState;
+  final BytesRef flushTerm = new BytesRef();
 
   TermVectorsTermsWriter.PerDoc doc;
 
@@ -36,9 +37,6 @@ final class TermVectorsTermsWriterPerThr
   // Used by perField when serializing the term vectors
   final ByteSliceReader vectorSliceReader = new ByteSliceReader();
 
-  final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
-                                                new UnicodeUtil.UTF8Result()};
-
   @Override
   public void startDocument() {
     assert clearLastVectorFieldName();

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java Tue Apr  6 19:19:27 2010
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.UnicodeUtil;
 
@@ -28,8 +29,7 @@ final class TermVectorsWriter {
   
   private IndexOutput tvx = null, tvd = null, tvf = null;
   private FieldInfos fieldInfos;
-  final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
-                                                                             new UnicodeUtil.UTF8Result()};
+  final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)};
 
   public TermVectorsWriter(Directory directory, String segment,
                            FieldInfos fieldInfos)
@@ -107,14 +107,14 @@ final class TermVectorsWriter {
 
           UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
           
-          int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
+          int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].bytes,
                                                    utf8Results[1-utf8Upto].length,
-                                                   utf8Results[utf8Upto].result,
+                                                   utf8Results[utf8Upto].bytes,
                                                    utf8Results[utf8Upto].length);
           int length = utf8Results[utf8Upto].length - start;
           tvf.writeVInt(start);       // write shared prefix length
           tvf.writeVInt(length);        // write delta length
-          tvf.writeBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes
+          tvf.writeBytes(utf8Results[utf8Upto].bytes, start, length);  // write delta bytes
           utf8Upto = 1-utf8Upto;
 
           final int termFreq = freqs[j];

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java Tue Apr  6 19:19:27 2010
@@ -34,8 +34,6 @@ abstract class TermsHashConsumerPerField
   abstract void newTerm(int termID) throws IOException;
   abstract void addTerm(int termID) throws IOException;
   abstract int getStreamCount();
-  
-  abstract ParallelPostingsArray createPostingsArray(int size);
-  abstract int bytesPerPosting();
 
+  abstract ParallelPostingsArray createPostingsArray(int size);
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java Tue Apr  6 19:19:27 2010
@@ -19,10 +19,13 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Comparator;
 
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
 
 final class TermsHashPerField extends InvertedDocConsumerPerField {
 
@@ -32,12 +35,12 @@ final class TermsHashPerField extends In
   final TermsHashPerThread perThread;
   final DocumentsWriter.DocState docState;
   final FieldInvertState fieldState;
-  TermAttribute termAtt;
-  
+  TermToBytesRefAttribute termAtt;
+
   // Copied from our perThread
-  final CharBlockPool charPool;
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
+  final ByteBlockPool termBytePool;
 
   final int streamCount;
   final int numPostingInt;
@@ -52,43 +55,42 @@ final class TermsHashPerField extends In
   private int[] postingsHash;
  
   ParallelPostingsArray postingsArray;
-  
-  private final int bytesPerPosting;
-  
+  private final BytesRef utf8;
+  private Comparator<BytesRef> termComp;
+
   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
     intPool = perThread.intPool;
-    charPool = perThread.charPool;
     bytePool = perThread.bytePool;
+    termBytePool = perThread.termBytePool;
     docState = perThread.docState;
+
     postingsHash = new int[postingsHashSize];
     Arrays.fill(postingsHash, -1);
+    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+
     fieldState = docInverterPerField.fieldState;
     this.consumer = perThread.consumer.addField(this, fieldInfo);
+    postingsArray = consumer.createPostingsArray(postingsHashSize/2);
+    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
+
     streamCount = consumer.getStreamCount();
     numPostingInt = 2*streamCount;
+    utf8 = perThread.utf8;
     this.fieldInfo = fieldInfo;
     if (nextPerThread != null)
       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
     else
       nextPerField = null;
-    
-    //   +3: Posting is referenced by hash, which
-    //       targets 25-50% fill factor; approximate this
-    //       as 3X # pointers
-    bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE;
   }
-  
-  void initPostingsArray() {
-    assert postingsArray == null;
 
-    postingsArray = consumer.createPostingsArray(postingsHashSize);
-    
+  // sugar: just forwards to DW
+  private void bytesUsed(long size) {
     if (perThread.termsHash.trackAllocations) {
-      perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize);
+      perThread.termsHash.docWriter.bytesUsed(size);
     }
   }
-
+  
   void shrinkHash(int targetSize) {
     assert postingsCompacted || numPostings == 0;
 
@@ -100,13 +102,20 @@ final class TermsHashPerField extends In
     }
 
     if (newSize != postingsHash.length) {
+      final long previousSize = postingsHash.length;
       postingsHash = new int[newSize];
+      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
       Arrays.fill(postingsHash, -1);
-      postingsArray = null;
       postingsHashSize = newSize;
       postingsHashHalfSize = newSize/2;
       postingsHashMask = newSize-1;
     }
+
+    if (postingsArray != null) {
+      final int startSize = postingsArray.size;
+      postingsArray = postingsArray.shrink(targetSize, false);
+      bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - startSize));
+    }
   }
 
   public void reset() {
@@ -129,14 +138,10 @@ final class TermsHashPerField extends In
       nextPerField.abort();
   }
   
-  private void growParallelPostingsArray() {
-    int oldSize = postingsArray.byteStarts.length;
-    int newSize = (int) (oldSize * 1.5);
-    this.postingsArray = this.postingsArray.resize(newSize);
-    
-    if (perThread.termsHash.trackAllocations) {
-      perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize));
-    }
+  private final void growParallelPostingsArray() {
+    int oldSize = postingsArray.size;
+    this.postingsArray = this.postingsArray.grow();
+    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
   }
 
   public void initReader(ByteSliceReader reader, int termID, int stream) {
@@ -166,7 +171,8 @@ final class TermsHashPerField extends In
   }
 
   /** Collapse the hash table & sort in-place. */
-  public int[] sortPostings() {
+  public int[] sortPostings(Comparator<BytesRef> termComp) {
+    this.termComp = termComp;
     compactPostings();
     quickSort(postingsHash, 0, numPostings-1);
     return postingsHash;
@@ -237,50 +243,48 @@ final class TermsHashPerField extends In
    *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
   int comparePostings(int term1, int term2) {
 
-    if (term1 == term2)
+    if (term1 == term2) {
+      // Our quicksort does this, eg during partition
       return 0;
-
-    final int textStart1 = postingsArray.textStarts[term1];
-    final int textStart2 = postingsArray.textStarts[term2];
-    
-    final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
-    final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
-
-    assert text1 != text2 || pos1 != pos2;
-
-    while(true) {
-      final char c1 = text1[pos1++];
-      final char c2 = text2[pos2++];
-      if (c1 != c2) {
-        if (0xffff == c2)
-          return 1;
-        else if (0xffff == c1)
-          return -1;
-        else
-          return c1-c2;
-      } else
-        // This method should never compare equal postings
-        // unless p1==p2
-        assert c1 != 0xffff;
     }
+
+    termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]);
+    termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]);
+
+    return termComp.compare(perThread.tr1, perThread.tr2);
   }
 
   /** Test whether the text for current RawPostingList p equals
-   *  current tokenText. */
-  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+   *  current tokenText in utf8. */
+  private boolean postingEquals(final int termID) {
     final int textStart = postingsArray.textStarts[termID];
-    
-    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
     assert text != null;
-    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 
-    int tokenPos = 0;
-    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
-      if (tokenText[tokenPos] != text[pos])
-        return false;
-    return 0xffff == text[pos];
+    int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+    
+    final int len;
+    if ((text[pos] & 0x80) == 0) {
+      // length is 1 byte
+      len = text[pos];
+      pos += 1;
+    } else {
+      // length is 2 bytes
+      len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
+      pos += 2;
+    }
+
+    if (len == utf8.length) {
+      final byte[] utf8Bytes = utf8.bytes;
+      for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
+        if (utf8Bytes[tokenPos] != text[pos]) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
   }
   
   private boolean doCall;
@@ -288,10 +292,14 @@ final class TermsHashPerField extends In
 
   @Override
   void start(Fieldable f) {
-    if (postingsArray == null) {
-      initPostingsArray();
+    if (fieldState.attributeSource.hasAttribute(TermToBytesRefAttribute.class)) {
+      termAtt = fieldState.attributeSource.getAttribute(TermToBytesRefAttribute.class);
+    } else if (fieldState.attributeSource.hasAttribute(TermAttribute.class)) {
+      perThread.legacyTermAttributeWrapper.setTermAttribute(fieldState.attributeSource.getAttribute(TermAttribute.class));
+      termAtt = perThread.legacyTermAttributeWrapper;
+    } else {
+      throw new IllegalArgumentException("Could not find a term attribute (that implements TermToBytesRefAttribute) in the TokenStream");
     }
-    termAtt = fieldState.attributeSource.addAttribute(TermAttribute.class);
     consumer.start(f);
     if (nextPerField != null) {
       nextPerField.start(f);
@@ -337,12 +345,9 @@ final class TermsHashPerField extends In
 
       // New posting
       termID = numPostings++;
-      if (termID >= postingsArray.textStarts.length) {
+      if (termID >= postingsArray.size) {
         growParallelPostingsArray();
       }
-      if (perThread.termsHash.trackAllocations) {
-        perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
-      }
 
       assert termID >= 0;
 
@@ -392,48 +397,15 @@ final class TermsHashPerField extends In
     // We are first in the chain so we must "intern" the
     // term text into textStart address
 
-    // Get the text of this term.
-    final char[] tokenText = termAtt.termBuffer();
-    final int tokenTextLen = termAtt.termLength();
-
-    // Compute hashcode & replace any invalid UTF16 sequences
-    int downto = tokenTextLen;
-    int code = 0;
-    while (downto > 0) {
-      char ch = tokenText[--downto];
-
-      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
-        if (0 == downto) {
-          // Unpaired
-          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-        } else {
-          final char ch2 = tokenText[downto-1];
-          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
-            // OK: high followed by low.  This is a valid
-            // surrogate pair.
-            code = ((code*31) + ch)*31+ch2;
-            downto--;
-            continue;
-          } else {
-            // Unpaired
-            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-          }            
-        }
-      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
-                                                          ch == 0xffff)) {
-        // Unpaired or 0xffff
-        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
-      }
-
-      code = (code*31) + ch;
-    }
+    // Get the text & hash of this term.
+    int code = termAtt.toBytesRef(utf8);
 
     int hashPos = code & postingsHashMask;
 
     // Locate RawPostingList in hash
     int termID = postingsHash[hashPos];
 
-    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
+    if (termID != -1 && !postingEquals(termID)) {
       // Conflict: keep searching different locations in
       // the hash table.
       final int inc = ((code>>8)+code)|1;
@@ -441,61 +413,86 @@ final class TermsHashPerField extends In
         code += inc;
         hashPos = code & postingsHashMask;
         termID = postingsHash[hashPos];
-      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
+      } while (termID != -1 && !postingEquals(termID));
     }
 
     if (termID == -1) {
 
       // First time we are seeing this token since we last
       // flushed the hash.
-      final int textLen1 = 1+tokenTextLen;
-      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
-        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+      final int textLen2 = 2+utf8.length;
+      if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
+        // Not enough room in current block
+
+        if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
           // Just skip this term, to remain as robust as
           // possible during indexing.  A TokenFilter
           // can be inserted into the analyzer chain if
           // other behavior is wanted (pruning the term
           // to a prefix, throwing an exception, etc).
-
-          if (docState.maxTermPrefix == null)
-            docState.maxTermPrefix = new String(tokenText, 0, 30);
+          if (docState.maxTermPrefix == null) {
+            final int saved = utf8.length;
+            try {
+              utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
+              docState.maxTermPrefix = utf8.toString();
+            } finally {
+              utf8.length = saved;
+            }
+          }
 
           consumer.skippingLongTerm();
           return;
         }
-        charPool.nextBuffer();
+        bytePool.nextBuffer();
       }
 
       // New posting
       termID = numPostings++;
-      if (termID >= postingsArray.textStarts.length) {
+      if (termID >= postingsArray.size) {
         growParallelPostingsArray();
       }
-      if (perThread.termsHash.trackAllocations) {
-        perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
-      }
 
       assert termID != -1;
-
-      final char[] text = charPool.buffer;
-      final int textUpto = charPool.charUpto;
-      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
-      charPool.charUpto += textLen1;
-      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
-      text[textUpto+tokenTextLen] = 0xffff;
-          
       assert postingsHash[hashPos] == -1;
+
       postingsHash[hashPos] = termID;
 
-      if (numPostings == postingsHashHalfSize)
+      final byte[] text = bytePool.buffer;
+      final int textUpto = bytePool.byteUpto;
+      postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;
+
+      // We first encode the length, followed by the UTF8
+      // bytes.  Length is encoded as vInt, but will consume
+      // 1 or 2 bytes at most (we reject too-long terms,
+      // above).
+
+      // encode length @ start of bytes
+      if (utf8.length < 128) {
+        // 1 byte to store length
+        text[textUpto] = (byte) utf8.length;
+        bytePool.byteUpto += utf8.length + 1;
+        System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
+      } else {
+        // 2 bytes to store length
+        text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
+        text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
+        bytePool.byteUpto += utf8.length + 2;
+        System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
+      }
+
+      if (numPostings == postingsHashHalfSize) {
         rehashPostings(2*postingsHashSize);
+        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
+      }
 
       // Init stream slices
-      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
         intPool.nextBuffer();
+      }
 
-      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
         bytePool.nextBuffer();
+      }
 
       intUptos = intPool.buffer;
       intUptoStart = intPool.intUpto;
@@ -577,16 +574,28 @@ final class TermsHashPerField extends In
         int code;
         if (perThread.primary) {
           final int textStart = postingsArray.textStarts[termID];
-          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-          int pos = start;
-          while(text[pos] != 0xffff)
-            pos++;
+          final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+          final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
           code = 0;
-          while (pos > start)
-            code = (code*31) + text[--pos];
-        } else
+
+          final int len;
+          int pos;
+          if ((text[start] & 0x80) == 0) {
+            // length is 1 byte
+            len = text[start];
+            pos = start+1;
+          } else {
+            len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
+            pos = start+2;
+          }
+
+          final int endPos = pos+len;
+          while(pos < endPos) {
+            code = (code*31) + text[pos++];
+          }
+        } else {
           code = postingsArray.textStarts[termID];
+        }
 
         int hashPos = code & newMask;
         assert hashPos >= 0;
@@ -603,6 +612,7 @@ final class TermsHashPerField extends In
 
     postingsHashMask = newMask;
     postingsHash = newHash;
+
     postingsHashSize = newSize;
     postingsHashHalfSize = newSize >> 1;
   }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java Tue Apr  6 19:19:27 2010
@@ -17,6 +17,11 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+
 import java.io.IOException;
 
 final class TermsHashPerThread extends InvertedDocConsumerPerThread {
@@ -25,30 +30,54 @@ final class TermsHashPerThread extends I
   final TermsHashConsumerPerThread consumer;
   final TermsHashPerThread nextPerThread;
 
-  final CharBlockPool charPool;
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
+  final ByteBlockPool termBytePool;
   final boolean primary;
   final DocumentsWriter.DocState docState;
 
+  // Used when comparing postings via termRefComp, in TermsHashPerField
+  final BytesRef tr1 = new BytesRef();
+  final BytesRef tr2 = new BytesRef();
+
+  // Used by perField:
+  final BytesRef utf8 = new BytesRef(10);
+  
+  final LegacyTermAttributeWrapper legacyTermAttributeWrapper = new LegacyTermAttributeWrapper();
+  
+  /** This class is used to wrap a legacy TermAttribute without support for {@link TermToBytesRefAttribute}. */
+  @Deprecated
+  static class LegacyTermAttributeWrapper implements TermToBytesRefAttribute {
+    private TermAttribute termAtt = null;
+  
+    void setTermAttribute(TermAttribute termAtt) {
+      this.termAtt = termAtt;
+    }
+  
+    public int toBytesRef(BytesRef target) {
+      assert target.bytes != null : "target byteref must be != null, because utf8 is used here";
+      return UnicodeUtil.UTF16toUTF8WithHash(termAtt.termBuffer(), 0, termAtt.termLength(), target);
+    }
+  }
+
   public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
     docState = docInverterPerThread.docState;
 
     this.termsHash = termsHash;
     this.consumer = termsHash.consumer.addThread(this);
 
+    intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
+    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
+
     if (nextTermsHash != null) {
       // We are primary
-      charPool = new CharBlockPool(termsHash.docWriter);
       primary = true;
+      termBytePool = bytePool;
     } else {
-      charPool = primaryPerThread.charPool;
       primary = false;
+      termBytePool = primaryPerThread.bytePool;
     }
 
-    intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
-    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
-
     if (nextTermsHash != null)
       nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
     else
@@ -97,7 +126,8 @@ final class TermsHashPerThread extends I
     intPool.reset();
     bytePool.reset();
 
-    if (primary)
-      charPool.reset();
+    if (primary) {
+      bytePool.reset();
+    }
   }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java Tue Apr  6 19:19:27 2010
@@ -161,8 +161,8 @@ public class ConstantScoreQuery extends 
   /** Prints a user-readable version of this query. */
   @Override
   public String toString(String field) {
-    return "ConstantScore(" + filter.toString()
-      + (getBoost()==1.0 ? ")" : "^" + getBoost());
+    return "ConstantScore(" + filter.toString() + ")"
+      + (getBoost()==1.0 ? "" : "^" + getBoost());
   }
 
   /** Returns true if <code>o</code> is equal to this. */

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java Tue Apr  6 19:19:27 2010
@@ -22,9 +22,9 @@ import org.apache.lucene.index.*;
 
 final class ExactPhraseScorer extends PhraseScorer {
 
-  ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets,
+  ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets,
       Similarity similarity, byte[] norms) {
-    super(weight, tps, offsets, similarity, norms);
+    super(weight, postings, offsets, similarity, norms);
   }
 
   @Override
@@ -42,11 +42,11 @@ final class ExactPhraseScorer extends Ph
     int freq = 0;
     do {					  // find position w/ all terms
       while (first.position < last.position) {	  // scan forward in first
-	    do {
-	      if (!first.nextPosition())
-	        return freq;
-	    } while (first.position < last.position);
-	      firstToLast();
+        do {
+          if (!first.nextPosition())
+            return freq;
+        } while (first.position < last.position);
+        firstToLast();
       }
       freq++;					  // all equal: a match
     } while (last.nextPosition());

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java?rev=931278&r1=931277&r2=931278&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/FieldCache.java Tue Apr  6 19:19:27 2010
@@ -20,6 +20,7 @@ package org.apache.lucene.search;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.document.NumericField; // for javadocs
 import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
 
@@ -100,7 +101,7 @@ public interface FieldCache {
    */
   public interface ByteParser extends Parser {
     /** Return a single Byte representation of this field's value. */
-    public byte parseByte(String string);
+    public byte parseByte(BytesRef term);
   }
 
   /** Interface to parse shorts from document fields.
@@ -108,7 +109,7 @@ public interface FieldCache {
    */
   public interface ShortParser extends Parser {
     /** Return a short representation of this field's value. */
-    public short parseShort(String string);
+    public short parseShort(BytesRef term);
   }
 
   /** Interface to parse ints from document fields.
@@ -116,7 +117,7 @@ public interface FieldCache {
    */
   public interface IntParser extends Parser {
     /** Return an integer representation of this field's value. */
-    public int parseInt(String string);
+    public int parseInt(BytesRef term);
   }
 
   /** Interface to parse floats from document fields.
@@ -124,7 +125,7 @@ public interface FieldCache {
    */
   public interface FloatParser extends Parser {
     /** Return an float representation of this field's value. */
-    public float parseFloat(String string);
+    public float parseFloat(BytesRef term);
   }
 
   /** Interface to parse long from document fields.
@@ -132,7 +133,7 @@ public interface FieldCache {
    */
   public interface LongParser extends Parser {
     /** Return an long representation of this field's value. */
-    public long parseLong(String string);
+    public long parseLong(BytesRef term);
   }
 
   /** Interface to parse doubles from document fields.
@@ -140,16 +141,20 @@ public interface FieldCache {
    */
   public interface DoubleParser extends Parser {
     /** Return an long representation of this field's value. */
-    public double parseDouble(String string);
+    public double parseDouble(BytesRef term);
   }
 
   /** Expert: The cache used internally by sorting and range query classes. */
   public static FieldCache DEFAULT = new FieldCacheImpl();
-  
+
   /** The default parser for byte values, which are encoded by {@link Byte#toString(byte)} */
   public static final ByteParser DEFAULT_BYTE_PARSER = new ByteParser() {
-    public byte parseByte(String value) {
-      return Byte.parseByte(value);
+    public byte parseByte(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Byte.parseByte(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_BYTE_PARSER;
@@ -162,8 +167,12 @@ public interface FieldCache {
 
   /** The default parser for short values, which are encoded by {@link Short#toString(short)} */
   public static final ShortParser DEFAULT_SHORT_PARSER = new ShortParser() {
-    public short parseShort(String value) {
-      return Short.parseShort(value);
+    public short parseShort(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Short.parseShort(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_SHORT_PARSER;
@@ -176,8 +185,12 @@ public interface FieldCache {
 
   /** The default parser for int values, which are encoded by {@link Integer#toString(int)} */
   public static final IntParser DEFAULT_INT_PARSER = new IntParser() {
-    public int parseInt(String value) {
-      return Integer.parseInt(value);
+    public int parseInt(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Integer.parseInt(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_INT_PARSER;
@@ -190,8 +203,12 @@ public interface FieldCache {
 
   /** The default parser for float values, which are encoded by {@link Float#toString(float)} */
   public static final FloatParser DEFAULT_FLOAT_PARSER = new FloatParser() {
-    public float parseFloat(String value) {
-      return Float.parseFloat(value);
+    public float parseFloat(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Float.parseFloat(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_FLOAT_PARSER;
@@ -204,8 +221,12 @@ public interface FieldCache {
 
   /** The default parser for long values, which are encoded by {@link Long#toString(long)} */
   public static final LongParser DEFAULT_LONG_PARSER = new LongParser() {
-    public long parseLong(String value) {
-      return Long.parseLong(value);
+    public long parseLong(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Long.parseLong(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_LONG_PARSER;
@@ -218,8 +239,12 @@ public interface FieldCache {
 
   /** The default parser for double values, which are encoded by {@link Double#toString(double)} */
   public static final DoubleParser DEFAULT_DOUBLE_PARSER = new DoubleParser() {
-    public double parseDouble(String value) {
-      return Double.parseDouble(value);
+    public double parseDouble(BytesRef term) {
+      // TODO: would be far better to directly parse from
+      // UTF8 bytes... but really users should use
+      // NumericField, instead, which already decodes
+      // directly from byte[]
+      return Double.parseDouble(term.utf8ToString());
     }
     protected Object readResolve() {
       return DEFAULT_DOUBLE_PARSER;
@@ -231,15 +256,14 @@ public interface FieldCache {
   };
 
   /**
-   * A parser instance for int values encoded by {@link NumericUtils#intToPrefixCoded(int)}, e.g. when indexed
+   * A parser instance for int values encoded by {@link NumericUtils}, e.g. when indexed
    * via {@link NumericField}/{@link NumericTokenStream}.
    */
   public static final IntParser NUMERIC_UTILS_INT_PARSER=new IntParser(){
-    public int parseInt(String val) {
-      final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT;
-      if (shift>0 && shift<=31)
+    public int parseInt(BytesRef term) {
+      if (NumericUtils.getPrefixCodedIntShift(term) > 0)
         throw new FieldCacheImpl.StopFillCacheException();
-      return NumericUtils.prefixCodedToInt(val);
+      return NumericUtils.prefixCodedToInt(term);
     }
     protected Object readResolve() {
       return NUMERIC_UTILS_INT_PARSER;
@@ -255,11 +279,10 @@ public interface FieldCache {
    * via {@link NumericField}/{@link NumericTokenStream}.
    */
   public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER=new FloatParser(){
-    public float parseFloat(String val) {
-      final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT;
-      if (shift>0 && shift<=31)
+    public float parseFloat(BytesRef term) {
+      if (NumericUtils.getPrefixCodedIntShift(term) > 0)
         throw new FieldCacheImpl.StopFillCacheException();
-      return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(val));
+      return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(term));
     }
     protected Object readResolve() {
       return NUMERIC_UTILS_FLOAT_PARSER;
@@ -271,15 +294,14 @@ public interface FieldCache {
   };
 
   /**
-   * A parser instance for long values encoded by {@link NumericUtils#longToPrefixCoded(long)}, e.g. when indexed
+   * A parser instance for long values encoded by {@link NumericUtils}, e.g. when indexed
    * via {@link NumericField}/{@link NumericTokenStream}.
    */
   public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser(){
-    public long parseLong(String val) {
-      final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG;
-      if (shift>0 && shift<=63)
+    public long parseLong(BytesRef term) {
+      if (NumericUtils.getPrefixCodedLongShift(term) > 0)
         throw new FieldCacheImpl.StopFillCacheException();
-      return NumericUtils.prefixCodedToLong(val);
+      return NumericUtils.prefixCodedToLong(term);
     }
     protected Object readResolve() {
       return NUMERIC_UTILS_LONG_PARSER;
@@ -295,11 +317,10 @@ public interface FieldCache {
    * via {@link NumericField}/{@link NumericTokenStream}.
    */
   public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser(){
-    public double parseDouble(String val) {
-      final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG;
-      if (shift>0 && shift<=63)
+    public double parseDouble(BytesRef term) {
+      if (NumericUtils.getPrefixCodedLongShift(term) > 0)
         throw new FieldCacheImpl.StopFillCacheException();
-      return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(val));
+      return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term));
     }
     protected Object readResolve() {
       return NUMERIC_UTILS_DOUBLE_PARSER;



Mime
View raw message