lucene-commits mailing list archives
From: busc...@apache.org
Subject: svn commit: r966819 [6/20] - in /lucene/dev/branches/realtime_search: ./ lucene/ lucene/backwards/ lucene/contrib/ lucene/contrib/benchmark/conf/ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ lucene/contrib/benchmark/src/j...
Date: Thu, 22 Jul 2010 19:34:52 GMT
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java Thu Jul 22 19:34:35 2010
@@ -21,6 +21,7 @@ import org.apache.lucene.store.BufferedI
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -34,7 +35,11 @@ class TermVectorsReader implements Clone
   static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
 
   // NOTE: always change this if you switch to a new format!
+  // whenever you add a new format, make it 1 larger (positive version logic)!
   static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+  
+  // when removing support for old versions, leave the last supported version here
+  static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;
 
   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file 
   static final int FORMAT_SIZE = 4;
@@ -74,11 +79,13 @@ class TermVectorsReader implements Clone
       String idxName = IndexFileNames.segmentFileName(segment, "", IndexFileNames.VECTORS_INDEX_EXTENSION);
       if (d.fileExists(idxName)) {
         tvx = d.openInput(idxName, readBufferSize);
-        format = checkValidFormat(tvx);
-        tvd = d.openInput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION), readBufferSize);
-        final int tvdFormat = checkValidFormat(tvd);
-        tvf = d.openInput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.VECTORS_FIELDS_EXTENSION), readBufferSize);
-        final int tvfFormat = checkValidFormat(tvf);
+        format = checkValidFormat(tvx, idxName);
+        String fn = IndexFileNames.segmentFileName(segment, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+        tvd = d.openInput(fn, readBufferSize);
+        final int tvdFormat = checkValidFormat(tvd, fn);
+        fn = IndexFileNames.segmentFileName(segment, "", IndexFileNames.VECTORS_FIELDS_EXTENSION);
+        tvf = d.openInput(fn, readBufferSize);
+        final int tvfFormat = checkValidFormat(tvf, fn);
 
         assert format == tvdFormat;
         assert format == tvfFormat;
@@ -182,13 +189,13 @@ class TermVectorsReader implements Clone
     }
   }
 
-  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
+  private int checkValidFormat(IndexInput in, String fn) throws CorruptIndexException, IOException
   {
     int format = in.readInt();
-    if (format > FORMAT_CURRENT) {
-      throw new CorruptIndexException("Incompatible format version: " + format + " expected " 
-                                      + FORMAT_CURRENT + " or less");
-    }
+    if (format < FORMAT_MINIMUM)
+      throw new IndexFormatTooOldException(fn, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+    if (format > FORMAT_CURRENT)
+      throw new IndexFormatTooNewException(fn, format, FORMAT_MINIMUM, FORMAT_CURRENT);
     return format;
   }
 
@@ -415,14 +422,15 @@ class TermVectorsReader implements Clone
       deltaLength = tvf.readVInt();
       totalLength = start + deltaLength;
 
-      final String term;
+      final BytesRef term = new BytesRef(totalLength);
       
       // Term stored as utf8 bytes
       if (byteBuffer.length < totalLength) {
         byteBuffer = ArrayUtil.grow(byteBuffer, totalLength);
       }
       tvf.readBytes(byteBuffer, start, deltaLength);
-      term = new String(byteBuffer, 0, totalLength, "UTF-8");
+      System.arraycopy(byteBuffer, 0, term.bytes, 0, totalLength);
+      term.length = totalLength;
       int freq = tvf.readVInt();
       int [] positions = null;
       if (storePositions) { //read in the positions
@@ -491,7 +499,7 @@ class TermVectorsReader implements Clone
 class ParallelArrayTermVectorMapper extends TermVectorMapper
 {
 
-  private String[] terms;
+  private BytesRef[] terms;
   private int[] termFreqs;
   private int positions[][];
   private TermVectorOffsetInfo offsets[][];
@@ -503,7 +511,7 @@ class ParallelArrayTermVectorMapper exte
   @Override
   public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
     this.field = field;
-    terms = new String[numTerms];
+    terms = new BytesRef[numTerms];
     termFreqs = new int[numTerms];
     this.storingOffsets = storeOffsets;
     this.storingPositions = storePositions;
@@ -514,7 +522,7 @@ class ParallelArrayTermVectorMapper exte
   }
 
   @Override
-  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+  public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
     terms[currentPosition] = term;
     termFreqs[currentPosition] = frequency;
     if (storingOffsets)

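[Editor's note] For readers skimming the diff: the old checkValidFormat only rejected formats newer than FORMAT_CURRENT, while the new code also rejects formats older than FORMAT_MINIMUM and reports the offending file name. A minimal sketch of that bounded check, with plain IOExceptions standing in for Lucene's IndexFormatTooOldException/IndexFormatTooNewException and with assumed constant values:

import java.io.IOException;

class FormatCheckSketch {
  static final int FORMAT_MINIMUM = 4;  // oldest version still readable (assumed value)
  static final int FORMAT_CURRENT = 4;  // version written by current code (assumed value)

  static int checkValidFormat(int format, String fileName) throws IOException {
    if (format < FORMAT_MINIMUM) {
      // stands in for IndexFormatTooOldException
      throw new IOException(fileName + ": format " + format
          + " is older than the minimum supported version " + FORMAT_MINIMUM);
    }
    if (format > FORMAT_CURRENT) {
      // stands in for IndexFormatTooNewException
      throw new IOException(fileName + ": format " + format
          + " is newer than the current version " + FORMAT_CURRENT);
    }
    return format;
  }
}
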
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Thu Jul 22 19:34:35 2010
@@ -128,7 +128,7 @@ final class TermVectorsTermsWriterPerFie
 
     // TODO: we may want to make this sort in same order
     // as Codec's terms dict?
-    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
+    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
 
     tvf.writeVInt(numPostings);
     byte bits = 0x0;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsWriter.java Thu Jul 22 19:34:35 2010
@@ -21,7 +21,6 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.UnicodeUtil;
 
 import java.io.IOException;
 
@@ -29,7 +28,6 @@ final class TermVectorsWriter {
   
   private IndexOutput tvx = null, tvd = null, tvf = null;
   private FieldInfos fieldInfos;
-  final BytesRef[] utf8Results = new BytesRef[] {new BytesRef(10), new BytesRef(10)};
 
   public TermVectorsWriter(Directory directory, String segment,
                            FieldInfos fieldInfos)
@@ -97,25 +95,19 @@ final class TermVectorsWriter {
 
         tvf.writeVInt(bits);
 
-        final String[] terms = vectors[i].getTerms();
+        final BytesRef[] terms = vectors[i].getTerms();
         final int[] freqs = vectors[i].getTermFrequencies();
 
-        int utf8Upto = 0;
-        utf8Results[1].length = 0;
-
         for (int j=0; j<numTerms; j++) {
-
-          UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
           
-          int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].bytes,
-                                                   utf8Results[1-utf8Upto].length,
-                                                   utf8Results[utf8Upto].bytes,
-                                                   utf8Results[utf8Upto].length);
-          int length = utf8Results[utf8Upto].length - start;
+          int start = j == 0 ? 0 : StringHelper.bytesDifference(terms[j-1].bytes,
+                                                   terms[j-1].length,
+                                                   terms[j].bytes,
+                                                   terms[j].length);
+          int length = terms[j].length - start;
           tvf.writeVInt(start);       // write shared prefix length
           tvf.writeVInt(length);        // write delta length
-          tvf.writeBytes(utf8Results[utf8Upto].bytes, start, length);  // write delta bytes
-          utf8Upto = 1-utf8Upto;
+          tvf.writeBytes(terms[j].bytes, start, length);  // write delta bytes
 
           final int termFreq = freqs[j];
 

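[Editor's note] The rewritten loop above front-codes consecutive sorted terms directly from their BytesRefs: for each term it writes the length of the byte prefix shared with the previous term, the suffix length, and only the suffix bytes. A small illustrative sketch of that encoding, with a local helper in place of StringHelper.bytesDifference (which has the same shared-prefix semantics) and hard-coded example terms:

import java.nio.charset.StandardCharsets;

class FrontCodingSketch {
  // length of the shared byte prefix of two terms (same idea as StringHelper.bytesDifference)
  static int sharedPrefix(byte[] a, int aLen, byte[] b, int bLen) {
    int n = Math.min(aLen, bLen);
    for (int i = 0; i < n; i++) {
      if (a[i] != b[i]) return i;
    }
    return n;
  }

  public static void main(String[] args) {
    byte[][] terms = {
      "apple".getBytes(StandardCharsets.UTF_8),
      "applet".getBytes(StandardCharsets.UTF_8),
      "apply".getBytes(StandardCharsets.UTF_8),
    };
    for (int j = 0; j < terms.length; j++) {
      int start = j == 0 ? 0 : sharedPrefix(terms[j - 1], terms[j - 1].length,
                                            terms[j], terms[j].length);
      int length = terms[j].length - start;
      // a real writer would emit start and length as VInts, then the suffix bytes
      System.out.println("start=" + start + " length=" + length + " suffix="
          + new String(terms[j], start, length, StandardCharsets.UTF_8));
    }
  }
}
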
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jul 22 19:34:35 2010
@@ -144,8 +144,7 @@ public abstract class TermsEnum {
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // return an unused dummy to prevent NPE
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      return null;
     }
       
     @Override

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java Thu Jul 22 19:34:35 2010
@@ -38,6 +38,8 @@ import org.apache.lucene.index.codecs.st
  *  @lucene.experimental */
 
 public abstract class CodecProvider {
+  private SegmentInfosWriter infosWriter = new DefaultSegmentInfosWriter();
+  private SegmentInfosReader infosReader = new DefaultSegmentInfosReader();
 
   private final HashMap<String, Codec> codecs = new HashMap<String, Codec>();
 
@@ -72,6 +74,14 @@ public abstract class CodecProvider {
   }
 
   public abstract Codec getWriter(SegmentWriteState state);
+  
+  public SegmentInfosWriter getSegmentInfosWriter() {
+    return infosWriter;
+  }
+  
+  public SegmentInfosReader getSegmentInfosReader() {
+    return infosReader;
+  }
 
   static private final CodecProvider defaultCodecs = new DefaultCodecProvider();
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java Thu Jul 22 19:34:35 2010
@@ -67,7 +67,7 @@ public class IntBlockCodec extends Codec
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -95,7 +95,7 @@ public class IntBlockCodec extends Codec
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -111,7 +111,7 @@ public class IntBlockCodec extends Codec
                                                        state.segmentInfo.name,
                                                        postingsReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/intblock/SimpleIntBlockIndexInput.java Thu Jul 22 19:34:35 2010
@@ -37,7 +37,8 @@ public class SimpleIntBlockIndexInput ex
 
   public SimpleIntBlockIndexInput(Directory dir, String fileName, int readBufferSize) throws IOException {
     IndexInput in = dir.openInput(fileName, readBufferSize);
-    CodecUtil.checkHeader(in, SimpleIntBlockIndexOutput.CODEC, SimpleIntBlockIndexOutput.VERSION_START);
+    CodecUtil.checkHeader(in, SimpleIntBlockIndexOutput.CODEC,
+      SimpleIntBlockIndexOutput.VERSION_START, SimpleIntBlockIndexOutput.VERSION_START);
     init(in);
   }
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java Thu Jul 22 19:34:35 2010
@@ -33,7 +33,7 @@ import org.apache.lucene.index.codecs.Fi
  *  written segments should use StandardCodec.
  *
  * @deprecated This is only used to read indexes created
- * before 3.1.
+ * before 4.0.
  * @lucene.experimental
  */
 @Deprecated

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Jul 22 19:34:35 2010
@@ -39,11 +39,15 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.ArrayUtil;
 
 /** Exposes flex API on a pre-flex index, as a codec. 
  * @lucene.experimental */
 public class PreFlexFields extends FieldsProducer {
 
+  private static final boolean DEBUG_SURROGATES = false;
+
   public TermInfosReader tis;
   public final TermInfosReader tisNoIndex;
 
@@ -60,6 +64,15 @@ public class PreFlexFields extends Field
     throws IOException {
 
     si = info;
+
+    // NOTE: we must always load terms index, even for
+    // "sequential" scan during merging, because what is
+    // sequential to merger may not be to TermInfosReader
+    // since we do the surrogates dance:
+    if (indexDivisor < 0) {
+      indexDivisor = -indexDivisor;
+    }
+
     TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);    
     if (indexDivisor == -1) {
       tisNoIndex = r;
@@ -174,7 +187,6 @@ public class PreFlexFields extends Field
   private class PreFlexFieldsEnum extends FieldsEnum {
     final Iterator<FieldInfo> it;
     private final PreTermsEnum termsEnum;
-    private int count;
     FieldInfo current;
 
     public PreFlexFieldsEnum() throws IOException {
@@ -185,7 +197,6 @@ public class PreFlexFields extends Field
     @Override
     public String next() {
       if (it.hasNext()) {
-        count++;
         current = it.next();
         return current.name;
       } else {
@@ -195,7 +206,7 @@ public class PreFlexFields extends Field
 
     @Override
     public TermsEnum terms() throws IOException {
-      termsEnum.reset(current, count == 1);
+      termsEnum.reset(current);
       return termsEnum;
     }
   }
@@ -209,14 +220,15 @@ public class PreFlexFields extends Field
     @Override
     public TermsEnum iterator() throws IOException {    
       PreTermsEnum termsEnum = new PreTermsEnum();
-      termsEnum.reset(fieldInfo, false);
+      termsEnum.reset(fieldInfo);
       return termsEnum;
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // Pre-flex indexes always sorted in UTF16 order
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      // Pre-flex indexes always sorted in UTF16 order, but
+      // we remap on-the-fly to unicode order
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
   }
 
@@ -225,39 +237,238 @@ public class PreFlexFields extends Field
     private FieldInfo fieldInfo;
     private boolean skipNext;
     private BytesRef current;
-    private final BytesRef scratchBytesRef = new BytesRef();
 
-    void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
+    private int[] surrogateSeekPending = new int[1];
+    private boolean[] surrogateDidSeekBack = new boolean[1];
+    private int surrogateSeekUpto;
+    private char[] pendingPrefix;
+
+    private SegmentTermEnum seekTermEnum;
+    private Term protoTerm;
+    private int newSuffixStart;
+
+    void reset(FieldInfo fieldInfo) throws IOException {
       this.fieldInfo = fieldInfo;
+      protoTerm = new Term(fieldInfo.name);
       if (termEnum == null) {
-        // First time reset is called
-        if (isFirstField) {
-          termEnum = getTermsDict().terms();
-          skipNext = false;
-        } else {
-          termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
-          skipNext = true;
-        }
+        termEnum = getTermsDict().terms(protoTerm);
+        seekTermEnum = getTermsDict().terms(protoTerm);
       } else {
-        final Term t = termEnum.term();
-        if (t != null && t.field() == fieldInfo.name) {
-          // No need to seek -- we have already advanced onto
-          // this field.  We must be @ first term because
-          // flex API will not advance this enum further, on
-          // seeing a different field.
-        } else {
-          assert t == null || !t.field().equals(fieldInfo.name);  // make sure field name is interned
-          final TermInfosReader tis = getTermsDict();
-          tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
+        getTermsDict().seekEnum(termEnum, protoTerm);
+      }
+      skipNext = true;
+      
+      surrogateSeekUpto = 0;
+      newSuffixStart = 0;
+
+      surrogatesDance();
+    }
+
+    private void surrogatesDance() throws IOException {
+      
+      // Tricky: prior to 4.0, Lucene index sorted terms in
+      // UTF16 order, but as of 4.0 we sort by Unicode code
+      // point order.  These orders differ because of the
+      // surrrogates; so we have to fixup our enum, here, by
+      // carefully first seeking past the surrogates and
+      // then back again at the end.  The process is
+      // recursive, since any given term could have multiple
+      // new occurrences of surrogate pairs, so we use a
+      // stack to record the pending seek-backs.
+      if (DEBUG_SURROGATES) {
+        System.out.println("  dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+      }
+
+      while(popPendingSeek());
+      while(pushNewSurrogate());
+    }
+
+    // only for debugging
+    private String getStack() {
+      if (surrogateSeekUpto == 0) {
+        return "null";
+      } else {
+        StringBuffer sb = new StringBuffer();
+        for(int i=0;i<surrogateSeekUpto;i++) {
+          if (i > 0) {
+            sb.append(' ');
+          }
+          sb.append(surrogateSeekPending[i]);
+        }
+        sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
+        return sb.toString();
+      }
+    }
+
+    private boolean popPendingSeek() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("  check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+      }
+      // if a .next() has advanced beyond the
+      // after-surrogates range we had last seeked to, we
+      // must seek back to the start and resume .next from
+      // there.  this pops the pending seek off the stack.
+      final Term t = termEnum.term();
+      if (surrogateSeekUpto > 0) {
+        final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+        if (DEBUG_SURROGATES) {
+          System.out.println("    seekPrefix=" + seekPrefix);
+        }
+        if (newSuffixStart < seekPrefix) {
+          assert pendingPrefix != null;
+          assert pendingPrefix.length > seekPrefix;
+          pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
+          pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START;
+          Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix));
+          if (DEBUG_SURROGATES) {
+            System.out.println("    do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+          }
+          getTermsDict().seekEnum(termEnum, t2);
+          surrogateDidSeekBack[surrogateSeekUpto-1] = true;
+
+          // +2 because we don't want to re-check the
+          // surrogates we just seek'd back to
+          newSuffixStart = seekPrefix + 2;
+          return true;
+        } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
+          assert pendingPrefix != null;
+          assert pendingPrefix.length > seekPrefix;
+          pendingPrefix[seekPrefix] = 0xffff;
+          Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix));
+          if (DEBUG_SURROGATES) {
+            System.out.println("    finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
+          }
+          getTermsDict().seekEnum(termEnum, t2);
+          if (DEBUG_SURROGATES) {
+            System.out.println("    found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+          }
+          surrogateSeekUpto--;
+
+          if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+            // force pop
+            newSuffixStart = -1;
+          } else {
+            newSuffixStart = termEnum.newSuffixStart;
+          }
+
+          return true;
+        }
+      }
+
+      return false;
+    }
+
+    private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
+    private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result();
+    
+    private boolean pushNewSurrogate() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("  check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+      }
+      final Term t = termEnum.term();
+      if (t == null || t.field() != fieldInfo.name) {
+        return false;
+      }
+
+      final BytesRef bytes = t.bytes();
+      UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer);
+
+      for(int i=Math.max(0,newSuffixStart);i<termBuffer.length;i++) {
+        final char ch = termBuffer.result[i];
+        if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
+
+          if (DEBUG_SURROGATES) {
+            System.out.println("    found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
+          }
+
+          // the next() that we just did read in a new
+          // suffix, containing a surrogate pair
+
+          // seek forward to see if there are any terms with
+          // this same prefix, but with characters after the
+          // surrogate range; if so, we must first iterate
+          // them, then seek back to the surrogates
+
+          char[] testPrefix = new char[i+2];
+          for(int j=0;j<i;j++) {
+            testPrefix[j] = termBuffer.result[j];
+          }
+          testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
+
+          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new BytesRef(testPrefix, 0, i+1)));
+
+          Term t2 = seekTermEnum.term();
+          boolean isPrefix;
+          if (t2 != null && t2.field() == fieldInfo.name) {
+
+            final BytesRef seekBytes = t2.bytes();
+            UnicodeUtil.UTF8toUTF16(seekBytes.bytes, seekBytes.offset, seekBytes.length, seekBuffer);
+
+            isPrefix = true;
+            if (DEBUG_SURROGATES) {
+              System.out.println("      seek found " + UnicodeUtil.toHexString(t2.text()));
+            }
+            for(int j=0;j<i;j++) {
+              if (testPrefix[j] != seekBuffer.result[j]) {
+                isPrefix = false;
+                break;
+              }
+            }
+            if (DEBUG_SURROGATES && !isPrefix) {
+              System.out.println("      no end terms");
+            }
+          } else {
+            if (DEBUG_SURROGATES) {
+              System.out.println("      no end terms");
+            }
+            isPrefix = false;
+          }
+
+          if (isPrefix) {
+            // we found a term, sharing the same prefix,
+            // with characters after the surrogates, so we
+            // must first enum those, and then return the
+            // the surrogates afterwards.  push that pending
+            // seek on the surrogates stack now:
+            pendingPrefix = testPrefix;
+
+            getTermsDict().seekEnum(termEnum, t2);
+
+            if (surrogateSeekUpto == surrogateSeekPending.length) {
+              surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
+            }
+            if (surrogateSeekUpto == surrogateDidSeekBack.length) {
+              surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
+            }
+            surrogateSeekPending[surrogateSeekUpto] = i;
+            surrogateDidSeekBack[surrogateSeekUpto] = false;
+            surrogateSeekUpto++;
+
+            if (DEBUG_SURROGATES) {
+              System.out.println("      do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
+            }
+
+            newSuffixStart = i+1;
+
+            return true;
+          } else {
+            // there are no terms after the surrogates, so
+            // we do nothing to the enum and just step
+            // through the surrogates like normal.  but we
+            // must keep iterating through the term, in case
+            // another surrogate pair appears later
+          }
         }
-        skipNext = true;
       }
+
+      return false;
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // Pre-flex indexes always sorted in UTF16 order
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      // Pre-flex indexes always sorted in UTF16 order, but
+      // we remap on-the-fly to unicode order
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
     @Override
@@ -272,23 +483,27 @@ public class PreFlexFields extends Field
 
     @Override
     public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("TE.seek() term=" + term.utf8ToString());
+      }
       skipNext = false;
       final TermInfosReader tis = getTermsDict();
-      final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
+      final Term t0 = protoTerm.createTerm(term);
+
+      assert termEnum != null;
+
       if (termEnum == null) {
         termEnum = tis.terms(t0);
       } else {
         tis.seekEnum(termEnum, t0);
       }
+
+      surrogateSeekUpto = 0;
+      surrogatesDance();
+
       final Term t = termEnum.term();
 
-      final BytesRef tr;
-      if (t != null) {
-        tr = scratchBytesRef;
-        scratchBytesRef.copy(t.text());
-      } else {
-        tr = null;
-      }
+      final BytesRef tr = t == null ? null : t.bytes();
 
       if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) {
         current = tr;
@@ -304,28 +519,46 @@ public class PreFlexFields extends Field
 
     @Override
     public BytesRef next() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("TE.next() skipNext=" + skipNext);
+      }
       if (skipNext) {
         skipNext = false;
         if (termEnum.term() == null) {
           return null;
         } else {
-          scratchBytesRef.copy(termEnum.term().text());
-          return current = scratchBytesRef;
+          return current = termEnum.term().bytes();
         }
       }
-      if (termEnum.next()) {
+      if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
+        newSuffixStart = termEnum.newSuffixStart;
+        if (DEBUG_SURROGATES) {
+          System.out.println("  set newSuffixStart=" + newSuffixStart);
+        }
+        surrogatesDance();
         final Term t = termEnum.term();
-        if (t.field() == fieldInfo.name) {
-          scratchBytesRef.copy(t.text());
-          current = scratchBytesRef;
-          return current;
+        if (t == null || t.field() != fieldInfo.name) {
+          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+          current = null;
         } else {
-          assert !t.field().equals(fieldInfo.name);  // make sure field name is interned
-          // Crossed into new field
-          return null;
+          current = t.bytes();
         }
+        return current;
       } else {
-        return null;
+        if (DEBUG_SURROGATES) {
+          System.out.println("  force pop");
+        }
+        // force pop
+        newSuffixStart = -1;
+        surrogatesDance();
+        final Term t = termEnum.term();
+        if (t == null || t.field() != fieldInfo.name) {
+          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+          return null;
+        } else {
+          current = t.bytes();
+          return current;
+        }
       }
     }
 

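[Editor's note] To make the long "surrogates dance" comment in PreFlexFields concrete, here is a small standalone demonstration (not part of the commit) of how UTF-16 code-unit order and Unicode code-point (equivalently UTF-8 byte) order diverge once supplementary characters are involved; the strings and helper are illustrative only:

import java.nio.charset.StandardCharsets;

class SurrogateOrderDemo {
  static int compareUtf8(byte[] a, byte[] b) {
    int n = Math.min(a.length, b.length);
    for (int i = 0; i < n; i++) {
      int d = (a[i] & 0xFF) - (b[i] & 0xFF);   // unsigned byte comparison
      if (d != 0) return d;
    }
    return a.length - b.length;
  }

  public static void main(String[] args) {
    String bmp = "\uFFFF";          // U+FFFF, highest BMP code point
    String supp = "\uD800\uDC00";   // surrogate pair encoding U+10000

    // UTF-16 code-unit order (what pre-4.0 indexes use): 0xD800 < 0xFFFF,
    // so the supplementary character sorts BEFORE U+FFFF. Prints 1.
    System.out.println("UTF-16 order: " + Integer.signum(bmp.compareTo(supp)));

    // UTF-8 byte order == code-point order: 0xF0.. > 0xEF.., so the
    // supplementary character sorts AFTER U+FFFF. Prints -1.
    System.out.println("UTF-8 order:  " + Integer.signum(compareUtf8(
        bmp.getBytes(StandardCharsets.UTF_8), supp.getBytes(StandardCharsets.UTF_8))));
  }
}

Because the two orders disagree only on terms containing surrogate pairs, the enum can mostly walk the pre-flex dictionary in place and only seek forward and back around those terms, which is what the pending-seek stack above implements.
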
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java Thu Jul 22 19:34:35 2010
@@ -22,8 +22,6 @@ import java.io.IOException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.codecs.standard.DefaultSkipListReader;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Bits;
@@ -31,7 +29,7 @@ import org.apache.lucene.util.Bits;
 /** @deprecated 
  *  @lucene.experimental */
 @Deprecated
-public class SegmentTermDocs implements TermDocs {
+public class SegmentTermDocs {
   //protected SegmentReader parent;
   private final FieldInfos fieldInfos;
   private final TermInfosReader tis;
@@ -84,17 +82,16 @@ public class SegmentTermDocs implements 
     this.skipDocs = skipDocs;
   }
 
-  public void seek(TermEnum termEnum) throws IOException {
+  public void seek(SegmentTermEnum segmentTermEnum) throws IOException {
     TermInfo ti;
     Term term;
     
     // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
-    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == fieldInfos) {        // optimized case
-      SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
+    if (segmentTermEnum.fieldInfos == fieldInfos) {        // optimized case
       term = segmentTermEnum.term();
       ti = segmentTermEnum.termInfo();
     } else  {                                         // punt case
-      term = termEnum.term();
+      term = segmentTermEnum.term();
       ti = tis.get(term); 
     }
     

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java Thu Jul 22 19:34:35 2010
@@ -20,9 +20,10 @@ package org.apache.lucene.index.codecs.p
 import java.io.IOException;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.index.FieldInfos;
-import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexFormatTooOldException;
+import org.apache.lucene.index.IndexFormatTooNewException;
 
 /**
  * @deprecated No longer used with flex indexing, except for
@@ -30,7 +31,7 @@ import org.apache.lucene.index.CorruptIn
  * @lucene.experimental */
 
 @Deprecated
-public final class SegmentTermEnum extends TermEnum implements Cloneable {
+public final class SegmentTermEnum implements Cloneable {
   private IndexInput input;
   FieldInfos fieldInfos;
   long size;
@@ -41,7 +42,11 @@ public final class SegmentTermEnum exten
   public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
 
   // NOTE: always change this if you switch to a new format!
+  // whenever you add a new format, make it 1 smaller (negative version logic)!
   public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+  
+  // when removing support for old versions, leave the last supported version here
+  public static final int FORMAT_MINIMUM = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
 
   private TermBuffer termBuffer = new TermBuffer();
   private TermBuffer prevBuffer = new TermBuffer();
@@ -54,6 +59,7 @@ public final class SegmentTermEnum exten
   long indexPointer = 0;
   int indexInterval;
   int skipInterval;
+  int newSuffixStart;
   int maxSkipLevels;
   private int formatM1SkipInterval;
 
@@ -78,8 +84,10 @@ public final class SegmentTermEnum exten
       format = firstInt;
 
       // check that it is a format we can understand
-      if (format < FORMAT_CURRENT)
-        throw new CorruptIndexException("Unknown format version:" + format + " expected " + FORMAT_CURRENT + " or higher");
+      if (format > FORMAT_MINIMUM)
+        throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+      if (format < FORMAT_CURRENT)
+        throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
 
       size = input.readLong();                    // read the size
       
@@ -128,7 +136,6 @@ public final class SegmentTermEnum exten
   }
 
   /** Increments the enumeration to the next element.  True if one exists.*/
-  @Override
   public final boolean next() throws IOException {
     if (position++ >= size - 1) {
       prevBuffer.set(termBuffer);
@@ -138,6 +145,7 @@ public final class SegmentTermEnum exten
 
     prevBuffer.set(termBuffer);
     termBuffer.read(input, fieldInfos);
+    newSuffixStart = termBuffer.newSuffixStart;
 
     termInfo.docFreq = input.readVInt();	  // read doc freq
     termInfo.freqPointer += input.readVLong();	  // read freq pointer
@@ -176,7 +184,6 @@ public final class SegmentTermEnum exten
 
   /** Returns the current Term in the enumeration.
    Initially invalid, valid after next() called for the first time.*/
-  @Override
   public final Term term() {
     return termBuffer.toTerm();
   }
@@ -200,7 +207,6 @@ public final class SegmentTermEnum exten
 
   /** Returns the docFreq from the current TermInfo in the enumeration.
    Initially invalid, valid after next() called for the first time.*/
-  @Override
   public final int docFreq() {
     return termInfo.docFreq;
   }
@@ -218,7 +224,6 @@ public final class SegmentTermEnum exten
   }
 
   /** Closes the enumeration to further activity, freeing resources. */
-  @Override
   public final void close() throws IOException {
     input.close();
   }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java Thu Jul 22 19:34:35 2010
@@ -21,12 +21,11 @@ import java.io.IOException;
 
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.store.IndexInput;
 
 /** @lucene.experimental */
 public final class SegmentTermPositions
-extends SegmentTermDocs implements TermPositions {
+extends SegmentTermDocs  {
   private IndexInput proxStream;
   private IndexInput proxStreamOrig;
   private int proxCount;
@@ -55,7 +54,6 @@ extends SegmentTermDocs implements TermP
     this.proxStreamOrig = proxStream;  // the proxStream will be cloned lazily when nextPosition() is called for the first time
   }
 
-  @Override
   final void seek(TermInfo ti, Term term) throws IOException {
     super.seek(ti, term);
     if (ti != null)
@@ -67,7 +65,6 @@ extends SegmentTermDocs implements TermP
     needToLoadPayload = false;
   }
 
-  @Override
   public final void close() throws IOException {
     super.close();
     if (proxStream != null) proxStream.close();
@@ -99,13 +96,11 @@ extends SegmentTermDocs implements TermP
     return delta;
   }
   
-  @Override
   protected final void skippingDoc() throws IOException {
     // we remember to skip a document lazily
     lazySkipProxCount += freq;
   }
 
-  @Override
   public final boolean next() throws IOException {
     // we remember to skip the remaining positions of the current
     // document lazily
@@ -119,14 +114,12 @@ extends SegmentTermDocs implements TermP
     return false;
   }
 
-  @Override
   public final int read(final int[] docs, final int[] freqs) {
     throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
   }
 
 
   /** Called by super.skipTo(). */
-  @Override
   protected void skipProx(long proxPointer, int payloadLength) throws IOException {
     // we save the pointer, we might have to skip there lazily
     lazySkipPointer = proxPointer;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java Thu Jul 22 19:34:35 2010
@@ -19,7 +19,6 @@ package org.apache.lucene.index.codecs.p
 
 import java.io.IOException;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.Term;
@@ -34,6 +33,8 @@ final class TermBuffer implements Clonea
   private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
   private BytesRef bytes = new BytesRef(10);
 
+  int newSuffixStart;
+
   public final int compareTo(TermBuffer other) {
     if (field == other.field) 	  // fields are interned
       return compareChars(text.result, text.length, other.text.result, other.text.length);
@@ -60,23 +61,33 @@ final class TermBuffer implements Clonea
     int start = input.readVInt();
     int length = input.readVInt();
     int totalLength = start + length;
+    if (bytes.bytes.length < totalLength) {
+      bytes.grow(totalLength);
+    }
     if (dirty) {
       // Fully convert all bytes since bytes is dirty
       UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
-      if (bytes.bytes.length < totalLength)
-        bytes.bytes = new byte[totalLength];
       bytes.length = totalLength;
       input.readBytes(bytes.bytes, start, length);
       UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
       dirty = false;
     } else {
       // Incrementally convert only the UTF8 bytes that are new:
-      if (bytes.bytes.length < totalLength)
-        bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
       bytes.length = totalLength;
       input.readBytes(bytes.bytes, start, length);
       UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
     }
+
+    while(true) {
+      newSuffixStart = text.offsets[start];
+      if (newSuffixStart != -1) {
+        break;
+      }
+      if (--start == 0) {
+        newSuffixStart = 0;
+        break;
+      }
+    }
     this.field = fieldInfos.fieldName(input.readVInt());
   }
 
@@ -85,10 +96,9 @@ final class TermBuffer implements Clonea
       reset();
       return;
     }
-    final String termText = term.text();
-    final int termLen = termText.length();
-    text.setLength(termLen);
-    termText.getChars(0, termLen, text.result, 0);
+    
+    final BytesRef termBytes = term.bytes();
+    UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text);
     dirty = true;
     field = term.field();
     this.term = term;
@@ -113,7 +123,7 @@ final class TermBuffer implements Clonea
       return null;
 
     if (term == null)
-      term = new Term(field, new String(text.result, 0, text.length), false);
+      term = new Term(field, new BytesRef(text.result, 0, text.length), false);
 
     return term;
   }
@@ -124,10 +134,11 @@ final class TermBuffer implements Clonea
     try {
       clone = (TermBuffer)super.clone();
     } catch (CloneNotSupportedException e) {}
-
     clone.dirty = true;
     clone.bytes = new BytesRef(10);
     clone.text = new UnicodeUtil.UTF16Result();
+    clone.text.offsets = new int[text.offsets.length];
+    System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
     clone.text.copyText(text);
     return clone;
   }

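[Editor's note] TermBuffer.read above is the decode side of the front-coded terms shown in the TermVectorsWriter sketch earlier: it reads a VInt (start, length) pair, keeps the first start bytes of the previous term, and overwrites only the suffix. A minimal illustrative sketch of that reuse, with hard-coded pairs matching the earlier encode example and Arrays.copyOf standing in for BytesRef.grow:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

class FrontCodingDecodeSketch {
  public static void main(String[] args) {
    // (start, suffix) pairs as produced by the encode sketch for "apple", "applet", "apply"
    int[] starts = {0, 5, 4};
    byte[][] suffixes = {
      "apple".getBytes(StandardCharsets.UTF_8),
      "t".getBytes(StandardCharsets.UTF_8),
      "y".getBytes(StandardCharsets.UTF_8),
    };

    byte[] bytes = new byte[16];   // reused term buffer, grown on demand
    for (int j = 0; j < starts.length; j++) {
      int start = starts[j];
      int length = suffixes[j].length;
      int totalLength = start + length;
      if (bytes.length < totalLength) {
        bytes = Arrays.copyOf(bytes, totalLength * 2);   // stand-in for bytes.grow(totalLength)
      }
      // keep bytes[0..start) from the previous term, append the new suffix
      System.arraycopy(suffixes[j], 0, bytes, start, length);
      System.out.println(new String(bytes, 0, totalLength, StandardCharsets.UTF_8));
    }
  }
}
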
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java Thu Jul 22 19:34:35 2010
@@ -189,7 +189,7 @@ public final class TermInfosReader {
 
     while (hi >= lo) {
       int mid = (lo + hi) >>> 1;
-      int delta = term.compareTo(indexTerms[mid]);
+      int delta = term.compareToUTF16(indexTerms[mid]);
       if (delta < 0)
 	hi = mid - 1;
       else if (delta > 0)
@@ -234,17 +234,17 @@ public final class TermInfosReader {
 
     // optimize sequential access: first try scanning cached enum w/o seeking
     if (enumerator.term() != null                 // term is at or past current
-	&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
-	    || term.compareTo(enumerator.term()) >= 0)) {
+	&& ((enumerator.prev() != null && term.compareToUTF16(enumerator.prev())> 0)
+	    || term.compareToUTF16(enumerator.term()) >= 0)) {
       int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
       if (indexTerms.length == enumOffset	  // but before end of block
-    || term.compareTo(indexTerms[enumOffset]) < 0) {
+    || term.compareToUTF16(indexTerms[enumOffset]) < 0) {
        // no need to seek
 
         final TermInfo ti;
 
         int numScans = enumerator.scanTo(term);
-        if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
+        if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
           ti = enumerator.termInfo();
           if (numScans > 1) {
             // we only  want to put this TermInfo into the cache if
@@ -279,7 +279,7 @@ public final class TermInfosReader {
     seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
     final TermInfo ti;
-    if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
+    if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
       ti = enumerator.termInfo();
       if (tiOrd == null) {
         termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position));
@@ -328,9 +328,9 @@ public final class TermInfosReader {
     SegmentTermEnum enumerator = getThreadResources().termEnum;
     seekEnum(enumerator, indexOffset);
 
-    while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
+    while(term.compareToUTF16(enumerator.term()) > 0 && enumerator.next()) {}
 
-    if (term.compareTo(enumerator.term()) == 0)
+    if (term.compareToUTF16(enumerator.term()) == 0)
       return enumerator.position;
     else
       return -1;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Thu Jul 22 19:34:35 2010
@@ -80,7 +80,7 @@ public class PulsingCodec extends Codec 
     // Terms dict
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -111,7 +111,7 @@ public class PulsingCodec extends Codec 
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -126,7 +126,7 @@ public class PulsingCodec extends Codec 
                                                        state.dir, state.fieldInfos, state.segmentInfo.name,
                                                        pulsingReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Thu Jul 22 19:34:35 2010
@@ -51,7 +51,8 @@ public class PulsingPostingsReaderImpl e
 
   @Override
   public void init(IndexInput termsIn) throws IOException {
-    CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, PulsingPostingsWriterImpl.VERSION_START);
+    CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
+      PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
     maxPulsingDocFreq = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java Thu Jul 22 19:34:35 2010
@@ -229,7 +229,7 @@ public final class PulsingPostingsWriter
 
   @Override
   public void finishDoc() {
-    assert currentDoc.numPositions == currentDoc.termDocFreq;
+    assert omitTF || currentDoc.numPositions == currentDoc.termDocFreq;
   }
 
   boolean pendingIsIndexTerm;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java Thu Jul 22 19:34:35 2010
@@ -63,7 +63,7 @@ public class SepCodec extends Codec {
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -95,7 +95,7 @@ public class SepCodec extends Codec {
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -111,7 +111,7 @@ public class SepCodec extends Codec {
                                                        state.segmentInfo.name,
                                                        postingsReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Thu Jul 22 19:34:35 2010
@@ -95,7 +95,8 @@ public class SepPostingsReaderImpl exten
   @Override
   public void init(IndexInput termsIn) throws IOException {
     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, SepPostingsWriterImpl.CODEC, SepPostingsWriterImpl.VERSION_START);
+    CodecUtil.checkHeader(termsIn, SepPostingsWriterImpl.CODEC,
+      SepPostingsWriterImpl.VERSION_START, SepPostingsWriterImpl.VERSION_START);
     skipInterval = termsIn.readInt();
     maxSkipLevels = termsIn.readInt();
   }

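This and the following checkHeader call sites now pass a minimum and a maximum supported version rather than a single expected one; both bounds are VERSION_START here because only one format exists so far. A self-contained sketch of what such a range check amounts to (this is not Lucene's CodecUtil, and it omits the codec-name check):

import java.io.DataInput;
import java.io.IOException;

final class VersionRangeCheckSketch {
  static int checkVersion(DataInput in, int minVersion, int maxVersion) throws IOException {
    final int actual = in.readInt();
    if (actual < minVersion) {
      throw new IOException("format version " + actual
          + " is older than the oldest supported version " + minVersion);
    }
    if (actual > maxVersion) {
      throw new IOException("format version " + actual
          + " is newer than the newest supported version " + maxVersion);
    }
    return actual;
  }
}
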
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SingleIntIndexInput.java Thu Jul 22 19:34:35 2010
@@ -36,7 +36,8 @@ public class SingleIntIndexInput extends
   public SingleIntIndexInput(Directory dir, String fileName, int readBufferSize)
     throws IOException {
     in = dir.openInput(fileName, readBufferSize);
-    CodecUtil.checkHeader(in, SingleIntIndexOutput.CODEC, SingleIntIndexOutput.VERSION_START);
+    CodecUtil.checkHeader(in, SingleIntIndexOutput.CODEC,
+      SingleIntIndexOutput.VERSION_START, SingleIntIndexOutput.VERSION_START);
   }
 
   @Override

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Thu Jul 22 19:34:35 2010
@@ -86,6 +86,9 @@ public class SimpleStandardTermsIndexRea
   private PagedBytes.Reader termBytesReader;
 
   final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
+  
+  // start of the field info data
+  protected long dirOffset;
 
   public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp)
     throws IOException {
@@ -97,23 +100,21 @@ public class SimpleStandardTermsIndexRea
     boolean success = false;
 
     try {
-      CodecUtil.checkHeader(in, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START);
-
-      final long dirOffset = in.readLong();
-
+      
+      readHeader(in);
       indexInterval = in.readInt();
       this.indexDivisor = indexDivisor;
 
-      if (indexDivisor == -1) {
+      if (indexDivisor < 0) {
         totalIndexInterval = indexInterval;
       } else {
         // In case terms index gets loaded, later, on demand
         totalIndexInterval = indexInterval * indexDivisor;
       }
+      
+      seekDir(in, dirOffset);
 
       // Read directory
-      in.seek(dirOffset);
-
       final int numFields = in.readInt();
 
       for(int i=0;i<numFields;i++) {
@@ -131,18 +132,24 @@ public class SimpleStandardTermsIndexRea
       }
       success = true;
     } finally {
-      if (indexDivisor != -1) {
+      if (indexDivisor > 0) {
         in.close();
         this.in = null;
         if (success) {
           indexLoaded = true;
         }
-        termBytesReader = termBytes.freeze();
+        termBytesReader = termBytes.freeze(true);
       } else {
         this.in = in;
       }
     }
   }
+  
+  protected void readHeader(IndexInput input) throws IOException {
+    CodecUtil.checkHeader(input, SimpleStandardTermsIndexWriter.CODEC_NAME,
+      SimpleStandardTermsIndexWriter.VERSION_START, SimpleStandardTermsIndexWriter.VERSION_START);
+    dirOffset = input.readLong();
+  }
 
   private final class FieldIndexReader extends FieldReader {
 
@@ -173,7 +180,7 @@ public class SimpleStandardTermsIndexRea
       // We still create the indexReader when indexDivisor
       // is -1, so that StandardTermsDictReader can call
       // isIndexTerm for each field:
-      if (indexDivisor != -1) {
+      if (indexDivisor > 0) {
         coreIndex = new CoreFieldIndex(indexStart,
                                        termsStart,
                                        packedIndexStart,
@@ -218,7 +225,8 @@ public class SimpleStandardTermsIndexRea
 
     @Override
     public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
-      // You must call loadTermsIndex if you had specified -1 for indexDivisor
+      // You must call loadTermsIndex if you specified
+      // indexDivisor < 0 to the constructor
       if (coreIndex == null) {
         throw new IllegalStateException("terms index was not loaded");
       }
@@ -413,7 +421,7 @@ public class SimpleStandardTermsIndexRea
 
       indexLoaded = true;
       in.close();
-      termBytesReader = termBytes.freeze();
+      termBytesReader = termBytes.freeze(true);
     }
   }
 
@@ -444,4 +452,8 @@ public class SimpleStandardTermsIndexRea
       termBytesReader.close();
     }
   }
+
+  protected void seekDir(IndexInput input, long dirOffset) throws IOException {
+    input.seek(dirOffset);
+  }
 }

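The protected dirOffset field and the new readHeader/seekDir hooks let a subclass change where the directory offset lives without copying the whole constructor. A hypothetical subclass, purely for illustration (the class name, the bare-version header layout and the tail-stored offset are assumptions, not part of this patch):

import java.io.IOException;
import java.util.Comparator;

import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;

class TailDirTermsIndexReader extends SimpleStandardTermsIndexReader {

  TailDirTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment,
                          int indexDivisor, Comparator<BytesRef> termComp) throws IOException {
    super(dir, fieldInfos, segment, indexDivisor, termComp);
  }

  @Override
  protected void readHeader(IndexInput input) throws IOException {
    final int version = input.readInt();   // assumed header: a bare version int, no dir offset
    if (version != 0) {
      throw new IOException("unsupported terms index version: " + version);
    }
    // dirOffset stays unset; seekDir below finds the real offset at the tail
  }

  @Override
  protected void seekDir(IndexInput input, long dirOffset) throws IOException {
    input.seek(input.length() - 8);        // the file's last long holds the dir start
    input.seek(input.readLong());
  }
}
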
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Thu Jul 22 19:34:35 2010
@@ -33,7 +33,7 @@ import java.io.IOException;
 
 /** @lucene.experimental */
 public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
-  final private IndexOutput out;
+  protected final IndexOutput out;
 
   final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
   final static int VERSION_START = 0;
@@ -50,12 +50,15 @@ public class SimpleStandardTermsIndexWri
     state.flushedFiles.add(indexFileName);
     termIndexInterval = state.termIndexInterval;
     out = state.directory.createOutput(indexFileName);
-    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
     fieldInfos = state.fieldInfos;
-
+    writeHeader(out);
+    out.writeInt(termIndexInterval);
+  }
+  
+  protected void writeHeader(IndexOutput out) throws IOException {
+    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
     // Placeholder for dir offset
     out.writeLong(0);
-    out.writeInt(termIndexInterval);
   }
 
   @Override
@@ -179,8 +182,12 @@ public class SimpleStandardTermsIndexWri
       out.writeLong(field.packedIndexStart);
       out.writeLong(field.packedOffsetsStart);
     }
+    writeTrailer(dirStart);
+    out.close();
+  }
+
+  protected void writeTrailer(long dirStart) throws IOException {
     out.seek(CodecUtil.headerLength(CODEC_NAME));
     out.writeLong(dirStart);
-    out.close();
   }
-}
\ No newline at end of file
+}

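The writer side gets the matching writeHeader/writeTrailer hooks. A hypothetical counterpart to the reader sketch above; the single-argument SegmentWriteState constructor is assumed rather than shown in this hunk:

import java.io.IOException;

import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter;
import org.apache.lucene.store.IndexOutput;

class TailDirTermsIndexWriter extends SimpleStandardTermsIndexWriter {

  TailDirTermsIndexWriter(SegmentWriteState state) throws IOException {
    super(state);                    // constructor signature assumed, not part of this hunk
  }

  @Override
  protected void writeHeader(IndexOutput out) throws IOException {
    out.writeInt(0);                 // bare version int, no dir-offset placeholder to seek back to
  }

  @Override
  protected void writeTrailer(long dirStart) throws IOException {
    out.writeLong(dirStart);         // appended at the current (tail) position instead of seeking back
  }
}
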
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Thu Jul 22 19:34:35 2010
@@ -58,7 +58,7 @@ public class StandardCodec extends Codec
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -85,7 +85,7 @@ public class StandardCodec extends Codec
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -101,7 +101,7 @@ public class StandardCodec extends Codec
                                                        state.segmentInfo.name,
                                                        postings,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReaderImpl.java Thu Jul 22 19:34:35 2010
@@ -73,7 +73,8 @@ public class StandardPostingsReaderImpl 
   public void init(IndexInput termsIn) throws IOException {
 
     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, StandardPostingsWriterImpl.CODEC, StandardPostingsWriterImpl.VERSION_START);
+    CodecUtil.checkHeader(termsIn, StandardPostingsWriterImpl.CODEC,
+      StandardPostingsWriterImpl.VERSION_START, StandardPostingsWriterImpl.VERSION_START);
 
     skipInterval = termsIn.readInt();
     maxSkipLevels = termsIn.readInt();
@@ -152,11 +153,17 @@ public class StandardPostingsReaderImpl 
     
   @Override
   public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
-    final SegmentDocsEnum docsEnum;
-    if (reuse == null) {
+    SegmentDocsEnum docsEnum;
+    if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
       docsEnum = new SegmentDocsEnum(freqIn);
     } else {
       docsEnum = (SegmentDocsEnum) reuse;
+      if (docsEnum.startFreqIn != freqIn) {
+      // If you are using ParallelReader, and pass in a
+        // reused DocsEnum, it could have come from another
+        // reader also using standard codec
+        docsEnum = new SegmentDocsEnum(freqIn);
+      }
     }
     return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
   }
@@ -166,11 +173,17 @@ public class StandardPostingsReaderImpl 
     if (fieldInfo.omitTermFreqAndPositions) {
       return null;
     }
-    final SegmentDocsAndPositionsEnum docsEnum;
-    if (reuse == null) {
+    SegmentDocsAndPositionsEnum docsEnum;
+    if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
       docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
     } else {
       docsEnum = (SegmentDocsAndPositionsEnum) reuse;
+      if (docsEnum.startFreqIn != freqIn) {
+      // If you are using ParallelReader, and pass in a
+        // reused DocsEnum, it could have come from another
+        // reader also using standard codec
+        docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
+      }
     }
     return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
   }
@@ -178,6 +191,7 @@ public class StandardPostingsReaderImpl 
   // Decodes only docs
   private class SegmentDocsEnum extends DocsEnum {
     final IndexInput freqIn;
+    final IndexInput startFreqIn;
 
     boolean omitTF;                               // does current field omit term freq?
     boolean storePayloads;                        // does current field store payloads?
@@ -196,6 +210,7 @@ public class StandardPostingsReaderImpl 
     DefaultSkipListReader skipper;
 
     public SegmentDocsEnum(IndexInput freqIn) throws IOException {
+      startFreqIn = freqIn;
       this.freqIn = (IndexInput) freqIn.clone();
     }
 
@@ -345,6 +360,7 @@ public class StandardPostingsReaderImpl 
 
   // Decodes docs & positions
   private class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
+    final IndexInput startFreqIn;
     private final IndexInput freqIn;
     private final IndexInput proxIn;
 
@@ -372,6 +388,7 @@ public class StandardPostingsReaderImpl 
     private long lazyProxPointer;
 
     public SegmentDocsAndPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
+      startFreqIn = freqIn;
       this.freqIn = (IndexInput) freqIn.clone();
       this.proxIn = (IndexInput) proxIn.clone();
     }

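The new startFreqIn field records which stream an enum was cloned from, so a reused enum that actually came from a different reader (for instance via ParallelReader, where several readers may use the standard codec) is detected and replaced instead of being reset against the wrong file. The guard in isolation, as a self-contained sketch; every name here is illustrative rather than Lucene's:

class ReuseGuardSketch {

  static class Input { }                          // stands in for the codec's freq stream

  static class DocsEnumSketch {
    final Input startInput;                       // the stream this enum was created over
    DocsEnumSketch(Input in) { startInput = in; }
  }

  static DocsEnumSketch docs(Input freqIn, DocsEnumSketch reuse) {
    if (reuse == null || reuse.startInput != freqIn) {
      // reuse is absent, of the wrong type in the real code, or belongs to
      // another reader that happens to use the same codec: allocate a fresh enum
      return new DocsEnumSketch(freqIn);
    }
    return reuse;                                 // safe: same underlying stream
  }

  public static void main(String[] args) {
    Input a = new Input(), b = new Input();
    DocsEnumSketch first = docs(a, null);
    System.out.println(docs(a, first) == first);  // true: reused
    System.out.println(docs(b, first) == first);  // false: replaced
  }
}
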
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java Thu Jul 22 19:34:35 2010
@@ -71,6 +71,9 @@ public class StandardTermsDictReader ext
 
   // Reads the terms index
   private StandardTermsIndexReader indexReader;
+  
+  // keeps the dirStart offset
+  protected long dirOffset;
 
   // Used as key for the terms cache
   private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
@@ -116,15 +119,13 @@ public class StandardTermsDictReader ext
 
     boolean success = false;
     try {
-      CodecUtil.checkHeader(in, StandardTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT);
-
-      final long dirOffset = in.readLong();
+      readHeader(in);
 
       // Have PostingsReader init itself
       postingsReader.init(in);
 
       // Read per-field details
-      in.seek(dirOffset);
+      seekDir(in, dirOffset);
 
       final int numFields = in.readInt();
 
@@ -151,6 +152,17 @@ public class StandardTermsDictReader ext
     this.indexReader = indexReader;
   }
 
+  protected void readHeader(IndexInput input) throws IOException {
+    CodecUtil.checkHeader(input, StandardTermsDictWriter.CODEC_NAME,
+      StandardTermsDictWriter.VERSION_START, StandardTermsDictWriter.VERSION_CURRENT);
+    dirOffset = input.readLong();
+  }
+  
+  protected void seekDir(IndexInput input, long dirOffset)
+      throws IOException {
+    input.seek(dirOffset);
+  }
+  
   @Override
   public void loadTermsIndex(int indexDivisor) throws IOException {
     indexReader.loadTermsIndex(indexDivisor);

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java Thu Jul 22 19:34:35 2010
@@ -55,41 +55,48 @@ public class StandardTermsDictWriter ext
 
   private final DeltaBytesWriter termWriter;
 
-  final IndexOutput out;
+  protected final IndexOutput out;
   final StandardPostingsWriter postingsWriter;
   final FieldInfos fieldInfos;
   FieldInfo currentField;
-  private final StandardTermsIndexWriter indexWriter;
+  private final StandardTermsIndexWriter termsIndexWriter;
   private final List<TermsConsumer> fields = new ArrayList<TermsConsumer>();
   private final Comparator<BytesRef> termComp;
 
-  public StandardTermsDictWriter(StandardTermsIndexWriter indexWriter, SegmentWriteState state, StandardPostingsWriter postingsWriter, Comparator<BytesRef> termComp) throws IOException {
+  public StandardTermsDictWriter(
+      StandardTermsIndexWriter termsIndexWriter,
+      SegmentWriteState state,
+      StandardPostingsWriter postingsWriter,
+      Comparator<BytesRef> termComp) throws IOException
+  {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.TERMS_EXTENSION);
-    this.indexWriter = indexWriter;
+    this.termsIndexWriter = termsIndexWriter;
     this.termComp = termComp;
     out = state.directory.createOutput(termsFileName);
-    indexWriter.setTermsOutput(out);
+    termsIndexWriter.setTermsOutput(out);
     state.flushedFiles.add(termsFileName);
 
     fieldInfos = state.fieldInfos;
-
-    // Count indexed fields up front
-    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); 
-
-    out.writeLong(0);                             // leave space for end index pointer
-
+    writeHeader(out);
     termWriter = new DeltaBytesWriter(out);
     currentField = null;
     this.postingsWriter = postingsWriter;
 
     postingsWriter.start(out);                          // have consumer write its format/header
   }
+  
+  protected void writeHeader(IndexOutput out) throws IOException {
+    // Count indexed fields up front
+    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); 
+
+    out.writeLong(0);                             // leave space for end index pointer    
+  }
 
   @Override
   public TermsConsumer addField(FieldInfo field) {
     assert currentField == null || currentField.name.compareTo(field.name) < 0;
     currentField = field;
-    StandardTermsIndexWriter.FieldWriter fieldIndexWriter = indexWriter.addField(field);
+    StandardTermsIndexWriter.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
     TermsConsumer terms = new TermsWriter(fieldIndexWriter, field, postingsWriter);
     fields.add(terms);
     return terms;
@@ -110,8 +117,7 @@ public class StandardTermsDictWriter ext
         out.writeLong(field.numTerms);
         out.writeLong(field.termsStartPointer);
       }
-      out.seek(CodecUtil.headerLength(CODEC_NAME));
-      out.writeLong(dirStart);
+      writeTrailer(dirStart);
     } finally {
       try {
         out.close();
@@ -119,12 +125,18 @@ public class StandardTermsDictWriter ext
         try {
           postingsWriter.close();
         } finally {
-          indexWriter.close();
+          termsIndexWriter.close();
         }
       }
     }
   }
 
+  protected void writeTrailer(long dirStart) throws IOException {
+    // seek back to the placeholder written in writeHeader and patch in the real dir start
+    out.seek(CodecUtil.headerLength(CODEC_NAME));
+    out.writeLong(dirStart);    
+  }
+  
   class TermsWriter extends TermsConsumer {
     private final FieldInfo fieldInfo;
     private final StandardPostingsWriter postingsWriter;
@@ -132,7 +144,11 @@ public class StandardTermsDictWriter ext
     private long numTerms;
     private final StandardTermsIndexWriter.FieldWriter fieldIndexWriter;
 
-    TermsWriter(StandardTermsIndexWriter.FieldWriter fieldIndexWriter, FieldInfo fieldInfo, StandardPostingsWriter postingsWriter) {
+    TermsWriter(
+        StandardTermsIndexWriter.FieldWriter fieldIndexWriter,
+        FieldInfo fieldInfo,
+        StandardPostingsWriter postingsWriter) 
+    {
       this.fieldInfo = fieldInfo;
       this.fieldIndexWriter = fieldIndexWriter;
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java Thu Jul 22 19:34:35 2010
@@ -101,7 +101,7 @@ public class MultiFieldQueryParser exten
     if (field == null) {
       List<BooleanClause> clauses = new ArrayList<BooleanClause>();
       for (int i = 0; i < fields.length; i++) {
-        Query q = super.getFieldQuery(fields[i], queryText);
+        Query q = super.getFieldQuery(fields[i], queryText, true);
         if (q != null) {
           //If the user passes a map of boosts
           if (boosts != null) {
@@ -119,7 +119,7 @@ public class MultiFieldQueryParser exten
         return null;
       return getBooleanQuery(clauses, true);
     }
-    Query q = super.getFieldQuery(field, queryText);
+    Query q = super.getFieldQuery(field, queryText, true);
     applySlop(q,slop);
     return q;
   }
@@ -134,8 +134,29 @@ public class MultiFieldQueryParser exten
   
 
   @Override
-  protected Query getFieldQuery(String field, String queryText) throws ParseException {
-    return getFieldQuery(field, queryText, 0);
+  protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
+    if (field == null) {
+      List<BooleanClause> clauses = new ArrayList<BooleanClause>();
+      for (int i = 0; i < fields.length; i++) {
+        Query q = super.getFieldQuery(fields[i], queryText, quoted);
+        if (q != null) {
+          //If the user passes a map of boosts
+          if (boosts != null) {
+            //Get the boost from the map and apply them
+            Float boost = boosts.get(fields[i]);
+            if (boost != null) {
+              q.setBoost(boost.floatValue());
+            }
+          }
+          clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD));
+        }
+      }
+      if (clauses.size() == 0)  // happens for stopwords
+        return null;
+      return getBooleanQuery(clauses, true);
+    }
+    Query q = super.getFieldQuery(field, queryText, quoted);
+    return q;
   }
 
 

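With getFieldQuery(String, String, boolean) now doing the multi-field expansion itself, per-field boosts are applied for both quoted and unquoted terms. A minimal usage sketch; the field names, boost values, analyzer and Version constant are illustrative, assuming the 3.x-style constructors still apply on this branch:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class MultiFieldSketch {
  public static void main(String[] args) throws Exception {
    Map<String, Float> boosts = new HashMap<String, Float>();
    boosts.put("title", 5f);
    boosts.put("body", 1f);

    MultiFieldQueryParser parser = new MultiFieldQueryParser(
        Version.LUCENE_CURRENT,
        new String[] { "title", "body" },
        new StandardAnalyzer(Version.LUCENE_CURRENT),
        boosts);

    // A field-less term becomes one SHOULD clause per field, each with its boost,
    // roughly: (title:apache^5.0 body:apache)
    Query q = parser.parse("apache");
    System.out.println(q);
  }
}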

