lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From busc...@apache.org
Subject svn commit: r979860 [2/5] - in /lucene/dev/branches/realtime_search: ./ lucene/ lucene/contrib/ lucene/contrib/highlighter/src/test/ lucene/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/ lucene/contrib/memory/src/test/org/apache/lu...
Date Tue, 27 Jul 2010 20:44:39 GMT
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Tue Jul 27 20:44:34 2010
@@ -40,12 +40,11 @@ import org.apache.lucene.store.IndexInpu
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.ArrayUtil;
 
 /** Exposes flex API on a pre-flex index, as a codec. 
  * @lucene.experimental */
 public class PreFlexFields extends FieldsProducer {
-
+  
   private static final boolean DEBUG_SURROGATES = false;
 
   public TermInfosReader tis;
@@ -60,7 +59,7 @@ public class PreFlexFields extends Field
   private final int readBufferSize;
   private Directory cfsReader;
 
-  PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
+  public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
     throws IOException {
 
     si = info;
@@ -107,6 +106,15 @@ public class PreFlexFields extends Field
     this.dir = dir;
   }
 
+  // If this returns, we do the surrogates dance so that the
+  // terms are sorted by unicode sort order.  This should be
+  // true when segments are used for "normal" searching;
+  // it's only false during testing, to create a pre-flex
+  // index, using the test-only PreFlexRW.
+  protected boolean sortTermsByUnicode() {
+    return true;
+  }
+
   static void files(Directory dir, SegmentInfo info, Collection<String> files) throws IOException {
     files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION));
     files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
@@ -182,6 +190,12 @@ public class PreFlexFields extends Field
     if (cfsReader != null) {
       cfsReader.close();
     }
+    if (freqStream != null) {
+      freqStream.close();
+    }
+    if (proxStream != null) {
+      proxStream.close();
+    }
   }
 
   private class PreFlexFieldsEnum extends FieldsEnum {
@@ -228,7 +242,11 @@ public class PreFlexFields extends Field
     public Comparator<BytesRef> getComparator() {
       // Pre-flex indexes always sorted in UTF16 order, but
       // we remap on-the-fly to unicode order
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (sortTermsByUnicode()) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
   }
 
@@ -238,119 +256,225 @@ public class PreFlexFields extends Field
     private boolean skipNext;
     private BytesRef current;
 
-    private int[] surrogateSeekPending = new int[1];
-    private boolean[] surrogateDidSeekBack = new boolean[1];
-    private int surrogateSeekUpto;
-    private char[] pendingPrefix;
-
     private SegmentTermEnum seekTermEnum;
     private Term protoTerm;
+    
+    private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
+    private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;
+
+    // Returns true if the unicode char is "after" the
+    // surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
+    private final boolean isHighBMPChar(byte[] b, int idx) {
+      return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
+    }
+
+    // Returns true if the unicode char in the UTF8 byte
+    // sequence starting at idx encodes a char outside of
+    // BMP (ie what would be a surrogate pair in UTF16):
+    private final boolean isNonBMPChar(byte[] b, int idx) {
+      return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
+    }
+
+    private final byte[] scratch = new byte[4];
+    private final BytesRef prevTerm = new BytesRef();
+    private final BytesRef scratchTerm = new BytesRef();
     private int newSuffixStart;
 
-    void reset(FieldInfo fieldInfo) throws IOException {
-      this.fieldInfo = fieldInfo;
-      protoTerm = new Term(fieldInfo.name);
-      if (termEnum == null) {
-        termEnum = getTermsDict().terms(protoTerm);
-        seekTermEnum = getTermsDict().terms(protoTerm);
+    // Swap in S, in place of E:
+    private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
+      final int savLength = term.length;
+
+      assert term.offset == 0;
+
+      // The 3 bytes starting at downTo make up 1
+      // unicode character:
+      assert isHighBMPChar(term.bytes, pos);
+
+      // NOTE: we cannot make this assert, because
+      // AutomatonQuery legitimately sends us malformed UTF8
+      // (eg the UTF8 bytes with just 0xee)
+      // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();
+
+      // Save the bytes && length, since we need to
+      // restore this if seek "back" finds no matching
+      // terms
+      if (term.bytes.length < 4+pos) {
+        term.grow(4+pos);
+      }
+
+      scratch[0] = term.bytes[pos];
+      scratch[1] = term.bytes[pos+1];
+      scratch[2] = term.bytes[pos+2];
+
+      term.bytes[pos] = (byte) 0xf0;
+      term.bytes[pos+1] = (byte) 0x90;
+      term.bytes[pos+2] = (byte) 0x80;
+      term.bytes[pos+3] = (byte) 0x80;
+      term.length = 4+pos;
+
+      if (DEBUG_SURROGATES) {
+        System.out.println("      try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
+      }
+
+      // Seek "back":
+      getTermsDict().seekEnum(te, protoTerm.createTerm(term));
+
+      // Test if the term we seek'd to in fact found a
+      // surrogate pair at the same position as the E:
+      Term t2 = te.term();
+
+      // Cannot be null (or move to next field) because at
+      // "worst" it'd seek to the same term we are on now,
+      // unless we are being called from seek
+      if (t2 == null || t2.field() != fieldInfo.name) {
+        return false;
+      }
+
+      if (DEBUG_SURROGATES) {
+        System.out.println("      got term=" + UnicodeUtil.toHexString(t2.text()));
+      }
+
+      // Now test if prefix is identical and we found
+      // a non-BMP char at the same position:
+      BytesRef b2 = t2.bytes();
+      assert b2.offset == 0;
+
+      boolean matches;
+      if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
+        matches = true;
+        for(int i=0;i<pos;i++) {
+          if (term.bytes[i] != b2.bytes[i]) {
+            matches = false;
+            break;
+          }
+        }              
       } else {
-        getTermsDict().seekEnum(termEnum, protoTerm);
+        matches = false;
       }
-      skipNext = true;
-      
-      surrogateSeekUpto = 0;
-      newSuffixStart = 0;
 
-      surrogatesDance();
+      // Restore term:
+      term.length = savLength;
+      term.bytes[pos] = scratch[0];
+      term.bytes[pos+1] = scratch[1];
+      term.bytes[pos+2] = scratch[2];
+
+      return matches;
     }
 
-    private void surrogatesDance() throws IOException {
-      
-      // Tricky: prior to 4.0, Lucene index sorted terms in
-      // UTF16 order, but as of 4.0 we sort by Unicode code
-      // point order.  These orders differ because of the
-      // surrrogates; so we have to fixup our enum, here, by
-      // carefully first seeking past the surrogates and
-      // then back again at the end.  The process is
-      // recursive, since any given term could have multiple
-      // new occurrences of surrogate pairs, so we use a
-      // stack to record the pending seek-backs.
+    // Seek type 2 "continue" (back to the start of the
+    // surrogates): scan the stripped suffix from the
+    // prior term, backwards. If there was an E in that
+    // part, then we try to seek back to S.  If that
+    // seek finds a matching term, we go there.
+    private boolean doContinue() throws IOException {
+
       if (DEBUG_SURROGATES) {
-        System.out.println("  dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+        System.out.println("  try cont");
       }
 
-      while(popPendingSeek());
-      while(pushNewSurrogate());
-    }
+      int downTo = prevTerm.length-1;
 
-    // only for debugging
-    private String getStack() {
-      if (surrogateSeekUpto == 0) {
-        return "null";
-      } else {
-        StringBuffer sb = new StringBuffer();
-        for(int i=0;i<surrogateSeekUpto;i++) {
-          if (i > 0) {
-            sb.append(' ');
+      boolean didSeek = false;
+      
+      final int limit = Math.min(newSuffixStart, scratchTerm.length-1);
+
+      while(downTo > limit) {
+
+        if (isHighBMPChar(prevTerm.bytes, downTo)) {
+
+          if (DEBUG_SURROGATES) {
+            System.out.println("    found E pos=" + downTo + " vs len=" + prevTerm.length);
+          }
+
+          if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
+            // TODO: more efficient seek?
+            getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+            //newSuffixStart = downTo+4;
+            newSuffixStart = downTo;
+            scratchTerm.copy(termEnum.term().bytes());
+            didSeek = true;
+            if (DEBUG_SURROGATES) {
+              System.out.println("      seek!");
+            }
+            break;
+          } else {
+            if (DEBUG_SURROGATES) {
+              System.out.println("      no seek");
+            }
           }
-          sb.append(surrogateSeekPending[i]);
         }
-        sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
-        return sb.toString();
+
+        // Shorten prevTerm in place so that we don't redo
+        // this loop if we come back here:
+        if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
+          prevTerm.length = downTo;
+        }
+        
+        downTo--;
       }
+
+      return didSeek;
     }
 
-    private boolean popPendingSeek() throws IOException {
+    // Look for seek type 3 ("pop"): if the delta from
+    // prev -> current was replacing an S with an E,
+    // we must now seek to beyond that E.  This seek
+    // "finishes" the dance at this character
+    // position.
+    private boolean doPop() throws IOException {
+
       if (DEBUG_SURROGATES) {
-        System.out.println("  check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+        System.out.println("  try pop");
       }
-      // if a .next() has advanced beyond the
-      // after-surrogates range we had last seeked to, we
-      // must seek back to the start and resume .next from
-      // there.  this pops the pending seek off the stack.
-      final Term t = termEnum.term();
-      if (surrogateSeekUpto > 0) {
-        final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+
+      assert newSuffixStart <= prevTerm.length;
+      assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;
+
+      if (prevTerm.length > newSuffixStart &&
+          isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
+          isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {
+
+        // Seek type 2 -- put 0xFF at this position:
+        scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
+        scratchTerm.length = newSuffixStart+1;
+
         if (DEBUG_SURROGATES) {
-          System.out.println("    seekPrefix=" + seekPrefix);
+          System.out.println("    seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
         }
-        if (newSuffixStart < seekPrefix) {
-          assert pendingPrefix != null;
-          assert pendingPrefix.length > seekPrefix;
-          pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
-          pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START;
-          Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix));
+          
+        // TODO: more efficient seek?  can we simply swap
+        // the enums?
+        getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
+
+        final Term t2 = termEnum.term();
+
+        // We could hit EOF or different field since this
+        // was a seek "forward":
+        if (t2 != null && t2.field() == fieldInfo.name) {
+
           if (DEBUG_SURROGATES) {
-            System.out.println("    do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+            System.out.println("      got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
           }
-          getTermsDict().seekEnum(termEnum, t2);
-          surrogateDidSeekBack[surrogateSeekUpto-1] = true;
 
-          // +2 because we don't want to re-check the
-          // surrogates we just seek'd back to
-          newSuffixStart = seekPrefix + 2;
+          final BytesRef b2 = t2.bytes();
+          assert b2.offset == 0;
+
+
+          // Set newSuffixStart -- we can't use
+          // termEnum's since the above seek may have
+          // done no scanning (eg, term was precisely
+          // and index term, or, was in the term seek
+          // cache):
+          scratchTerm.copy(b2);
+          setNewSuffixStart(prevTerm, scratchTerm);
+
           return true;
-        } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
-          assert pendingPrefix != null;
-          assert pendingPrefix.length > seekPrefix;
-          pendingPrefix[seekPrefix] = 0xffff;
-          Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix));
-          if (DEBUG_SURROGATES) {
-            System.out.println("    finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
-          }
-          getTermsDict().seekEnum(termEnum, t2);
+        } else if (newSuffixStart != 0 || scratchTerm.length != 0) {
           if (DEBUG_SURROGATES) {
-            System.out.println("    found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
-          }
-          surrogateSeekUpto--;
-
-          if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
-            // force pop
-            newSuffixStart = -1;
-          } else {
-            newSuffixStart = termEnum.newSuffixStart;
+            System.out.println("      got term=null (or next field)");
           }
-
+          newSuffixStart = 0;
+          scratchTerm.length = 0;
           return true;
         }
       }
@@ -358,117 +482,249 @@ public class PreFlexFields extends Field
       return false;
     }
 
-    private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
-    private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result();
+    // Pre-flex indices store terms in UTF16 sort order, but
+    // certain queries require Unicode codepoint order; this
+    // method carefully seeks around surrogates to handle
+    // this impedance mismatch
+
+    private void surrogateDance() throws IOException {
+
+      if (!unicodeSortOrder) {
+        return;
+      }
+
+      // We are invoked after TIS.next() (by UTF16 order) to
+      // possibly seek to a different "next" (by unicode
+      // order) term.
+
+      // We scan only the "delta" from the last term to the
+      // current term, in UTF8 bytes.  We look at 1) the bytes
+      // stripped from the prior term, and then 2) the bytes
+      // appended to that prior term's prefix.
     
-    private boolean pushNewSurrogate() throws IOException {
+      // We don't care about specific UTF8 sequences, just
+      // the "category" of the UTF16 character.  Category S
+      // is a high/low surrogate pair (it non-BMP).
+      // Category E is any BMP char > UNI_SUR_LOW_END (and <
+      // U+FFFF). Category A is the rest (any unicode char
+      // <= UNI_SUR_HIGH_START).
+
+      // The core issue is that pre-flex indices sort the
+      // characters as ASE, while flex must sort as AES.  So
+      // when scanning, when we hit S, we must 1) seek
+      // forward to E and enum the terms there, then 2) seek
+      // back to S and enum all terms there, then 3) seek to
+      // after E.  Three different seek points (1, 2, 3).
+    
+      // We can easily detect S in UTF8: if a byte has
+      // prefix 11110 (0xf0), then that byte and the
+      // following 3 bytes encode a single unicode codepoint
+      // in S.  Similary,we can detect E: if a byte has
+      // prefix 1110111 (0xee), then that byte and the
+      // following 2 bytes encode a single unicode codepoint
+      // in E.
+
+      // Note that this is really a recursive process --
+      // maybe the char at pos 2 needs to dance, but any
+      // point in its dance, suddenly pos 4 needs to dance
+      // so you must finish pos 4 before returning to pos
+      // 2.  But then during pos 4's dance maybe pos 7 needs
+      // to dance, etc.  However, despite being recursive,
+      // we don't need to hold any state because the state
+      // can always be derived by looking at prior term &
+      // current term.
+
+      // TODO: can we avoid this copy?
+      if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+        scratchTerm.length = 0;
+      } else {
+        scratchTerm.copy(termEnum.term().bytes());
+      }
+      
       if (DEBUG_SURROGATES) {
-        System.out.println("  check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+        System.out.println("  dance");
+        System.out.println("    prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
+        System.out.println("         " + prevTerm.toString());
+        System.out.println("    term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
+        System.out.println("         " + scratchTerm.toString());
+      }
+
+      // This code assumes TermInfosReader/SegmentTermEnum
+      // always use BytesRef.offset == 0
+      assert prevTerm.offset == 0;
+      assert scratchTerm.offset == 0;
+
+      // Need to loop here because we may need to do multiple
+      // pops, and possibly a continue in the end, ie:
+      //
+      //  cont
+      //  pop, cont
+      //  pop, pop, cont
+      //  <nothing>
+      //
+
+      while(true) {
+        if (doContinue()) {
+          break;
+        } else {
+          if (!doPop()) {
+            break;
+          }
+        }
       }
-      final Term t = termEnum.term();
-      if (t == null || t.field() != fieldInfo.name) {
-        return false;
+
+      if (DEBUG_SURROGATES) {
+        System.out.println("  finish bmp ends");
       }
 
-      final BytesRef bytes = t.bytes();
-      UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer);
+      doPushes();
+    }
 
-      for(int i=Math.max(0,newSuffixStart);i<termBuffer.length;i++) {
-        final char ch = termBuffer.result[i];
-        if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
 
-          if (DEBUG_SURROGATES) {
-            System.out.println("    found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
-          }
+    // Look for seek type 1 ("push"): if the newly added
+    // suffix contains any S, we must try to seek to the
+    // corresponding E.  If we find a match, we go there;
+    // else we keep looking for additional S's in the new
+    // suffix.  This "starts" the dance, at this character
+    // position:
+    private void doPushes() throws IOException {
+
+      int upTo = newSuffixStart;
+      if (DEBUG_SURROGATES) {
+        System.out.println("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
+      }
 
-          // the next() that we just did read in a new
-          // suffix, containing a surrogate pair
+      while(upTo < scratchTerm.length) {
+        if (isNonBMPChar(scratchTerm.bytes, upTo) &&
+            (upTo > newSuffixStart ||
+             (upTo >= prevTerm.length ||
+              (!isNonBMPChar(prevTerm.bytes, upTo) &&
+               !isHighBMPChar(prevTerm.bytes, upTo))))) {
+
+          // A non-BMP char (4 bytes UTF8) starts here:
+          assert scratchTerm.length >= upTo + 4;
+          
+          final int savLength = scratchTerm.length;
+          scratch[0] = scratchTerm.bytes[upTo];
+          scratch[1] = scratchTerm.bytes[upTo+1];
+          scratch[2] = scratchTerm.bytes[upTo+2];
+
+          scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
+          scratchTerm.bytes[upTo+1] = (byte) 0x80;
+          scratchTerm.bytes[upTo+2] = (byte) 0x80;
+          scratchTerm.length = upTo+3;
 
-          // seek forward to see if there are any terms with
-          // this same prefix, but with characters after the
-          // surrogate range; if so, we must first iterate
-          // them, then seek back to the surrogates
+          if (DEBUG_SURROGATES) {
+            System.out.println("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
+          }
 
-          char[] testPrefix = new char[i+2];
-          for(int j=0;j<i;j++) {
-            testPrefix[j] = termBuffer.result[j];
+          // Seek "forward":
+          // TODO: more efficient seek?
+          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
+
+          scratchTerm.bytes[upTo] = scratch[0];
+          scratchTerm.bytes[upTo+1] = scratch[1];
+          scratchTerm.bytes[upTo+2] = scratch[2];
+          scratchTerm.length = savLength;
+
+          // Did we find a match?
+          final Term t2 = seekTermEnum.term();
+            
+          if (DEBUG_SURROGATES) {
+            if (t2 == null) {
+              System.out.println("      hit term=null");
+            } else {
+              System.out.println("      hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
+            }
           }
-          testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
 
-          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new BytesRef(testPrefix, 0, i+1)));
+          // Since this was a seek "forward", we could hit
+          // EOF or a different field:
+          boolean matches;
 
-          Term t2 = seekTermEnum.term();
-          boolean isPrefix;
           if (t2 != null && t2.field() == fieldInfo.name) {
-
-            final BytesRef seekBytes = t2.bytes();
-            UnicodeUtil.UTF8toUTF16(seekBytes.bytes, seekBytes.offset, seekBytes.length, seekBuffer);
-
-            isPrefix = true;
-            if (DEBUG_SURROGATES) {
-              System.out.println("      seek found " + UnicodeUtil.toHexString(t2.text()));
-            }
-            for(int j=0;j<i;j++) {
-              if (testPrefix[j] != seekBuffer.result[j]) {
-                isPrefix = false;
-                break;
-              }
-            }
-            if (DEBUG_SURROGATES && !isPrefix) {
-              System.out.println("      no end terms");
+            final BytesRef b2 = t2.bytes();
+            assert b2.offset == 0;
+            if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
+              matches = true;
+              for(int i=0;i<upTo;i++) {
+                if (scratchTerm.bytes[i] != b2.bytes[i]) {
+                  matches = false;
+                  break;
+                }
+              }              
+                
+            } else {
+              matches = false;
             }
           } else {
-            if (DEBUG_SURROGATES) {
-              System.out.println("      no end terms");
-            }
-            isPrefix = false;
+            matches = false;
           }
 
-          if (isPrefix) {
-            // we found a term, sharing the same prefix,
-            // with characters after the surrogates, so we
-            // must first enum those, and then return the
-            // the surrogates afterwards.  push that pending
-            // seek on the surrogates stack now:
-            pendingPrefix = testPrefix;
-
-            getTermsDict().seekEnum(termEnum, t2);
-
-            if (surrogateSeekUpto == surrogateSeekPending.length) {
-              surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
-            }
-            if (surrogateSeekUpto == surrogateDidSeekBack.length) {
-              surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
-            }
-            surrogateSeekPending[surrogateSeekUpto] = i;
-            surrogateDidSeekBack[surrogateSeekUpto] = false;
-            surrogateSeekUpto++;
+          if (matches) {
 
             if (DEBUG_SURROGATES) {
-              System.out.println("      do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
+              System.out.println("      matches!");
             }
 
-            newSuffixStart = i+1;
-
-            return true;
+            // OK seek "back"
+            // TODO: more efficient seek?
+            getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+
+            scratchTerm.copy(seekTermEnum.term().bytes());
+
+            // +3 because we don't need to check the char
+            // at upTo: we know it's > BMP
+            upTo += 3;
+
+            // NOTE: we keep iterating, now, since this
+            // can easily "recurse".  Ie, after seeking
+            // forward at a certain char position, we may
+            // find another surrogate in our [new] suffix
+            // and must then do another seek (recurse)
           } else {
-            // there are no terms after the surrogates, so
-            // we do nothing to the enum and just step
-            // through the surrogates like normal.  but we
-            // must keep iterating through the term, in case
-            // another surrogate pair appears later
+            upTo++;
           }
+        } else {
+          upTo++;
         }
       }
+    }
 
-      return false;
+    private boolean unicodeSortOrder;
+
+    void reset(FieldInfo fieldInfo) throws IOException {
+      //System.out.println("pff.reset te=" + termEnum);
+      this.fieldInfo = fieldInfo;
+      protoTerm = new Term(fieldInfo.name);
+      if (termEnum == null) {
+        termEnum = getTermsDict().terms(protoTerm);
+        seekTermEnum = getTermsDict().terms(protoTerm);
+        //System.out.println("  term=" + termEnum.term());
+      } else {
+        getTermsDict().seekEnum(termEnum, protoTerm);
+      }
+      skipNext = true;
+
+      unicodeSortOrder = sortTermsByUnicode();
+
+      final Term t = termEnum.term();
+      if (t != null && t.field() == fieldInfo.name) {
+        newSuffixStart = 0;
+        prevTerm.length = 0;
+        surrogateDance();
+      }
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
       // Pre-flex indexes always sorted in UTF16 order, but
       // we remap on-the-fly to unicode order
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (unicodeSortOrder) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
 
     @Override
@@ -484,7 +740,7 @@ public class PreFlexFields extends Field
     @Override
     public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
       if (DEBUG_SURROGATES) {
-        System.out.println("TE.seek() term=" + term.utf8ToString());
+        System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
       }
       skipNext = false;
       final TermInfosReader tis = getTermsDict();
@@ -492,50 +748,142 @@ public class PreFlexFields extends Field
 
       assert termEnum != null;
 
-      if (termEnum == null) {
-        termEnum = tis.terms(t0);
-      } else {
-        tis.seekEnum(termEnum, t0);
-      }
-
-      surrogateSeekUpto = 0;
-      surrogatesDance();
+      tis.seekEnum(termEnum, t0);
 
       final Term t = termEnum.term();
 
-      final BytesRef tr = t == null ? null : t.bytes();
-
-      if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) {
-        current = tr;
+      if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
+        // If we found an exact match, no need to do the
+        // surrogate dance
+        if (DEBUG_SURROGATES) {
+          System.out.println("  seek exact match");
+        }
+        current = t.bytes();
         return SeekStatus.FOUND;
       } else if (t == null || t.field() != fieldInfo.name) {
+
+        // TODO: maybe we can handle this like the next()
+        // into null?  set term as prevTerm then dance?
+
+        if (DEBUG_SURROGATES) {
+          System.out.println("  seek hit EOF");
+        }
+
+        // We hit EOF; try end-case surrogate dance: if we
+        // find an E, try swapping in S, backwards:
+        scratchTerm.copy(term);
+
+        assert scratchTerm.offset == 0;
+
+        for(int i=scratchTerm.length-1;i>=0;i--) {
+          if (isHighBMPChar(scratchTerm.bytes, i)) {
+            if (DEBUG_SURROGATES) {
+              System.out.println("    found E pos=" + i + "; try seek");
+            }
+
+            if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
+
+              scratchTerm.copy(seekTermEnum.term().bytes());
+              getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+
+              newSuffixStart = 1+i;
+
+              doPushes();
+
+              // Found a match
+              // TODO: faster seek?
+              current = termEnum.term().bytes();
+              return SeekStatus.NOT_FOUND;
+            }
+          }
+        }
+        
+        if (DEBUG_SURROGATES) {
+          System.out.println("  seek END");
+        }
+
         current = null;
         return SeekStatus.END;
       } else {
-        current = tr;
-        return SeekStatus.NOT_FOUND;
+
+        // We found a non-exact but non-null term; this one
+        // is fun -- just treat it like next, by pretending
+        // requested term was prev:
+        prevTerm.copy(term);
+
+        if (DEBUG_SURROGATES) {
+          System.out.println("  seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
+        }
+
+        final BytesRef br = t.bytes();
+        assert br.offset == 0;
+
+        setNewSuffixStart(term, br);
+
+        surrogateDance();
+
+        final Term t2 = termEnum.term();
+        if (t2 == null || t2.field() != fieldInfo.name) {
+          assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
+          current = null;
+          return SeekStatus.END;
+        } else {
+          current = t2.bytes();
+          assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
+          return SeekStatus.NOT_FOUND;
+        }
+      }
+    }
+
+    private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
+      final int limit = Math.min(br1.length, br2.length);
+      int lastStart = 0;
+      for(int i=0;i<limit;i++) {
+        if ((br1.bytes[br1.offset+i] & 0xc0) == 0xc0 || (br1.bytes[br1.offset+i] & 0x80) == 0) {
+          lastStart = i;
+        }
+        if (br1.bytes[br1.offset+i] != br2.bytes[br2.offset+i]) {
+          newSuffixStart = lastStart;
+          if (DEBUG_SURROGATES) {
+            System.out.println("    set newSuffixStart=" + newSuffixStart);
+          }
+          return;
+        }
+      }
+      newSuffixStart = limit;
+      if (DEBUG_SURROGATES) {
+        System.out.println("    set newSuffixStart=" + newSuffixStart);
       }
     }
 
     @Override
     public BytesRef next() throws IOException {
       if (DEBUG_SURROGATES) {
-        System.out.println("TE.next() skipNext=" + skipNext);
+        System.out.println("TE.next()");
       }
       if (skipNext) {
+        if (DEBUG_SURROGATES) {
+          System.out.println("  skipNext=true");
+        }
         skipNext = false;
         if (termEnum.term() == null) {
           return null;
+        } else if (termEnum.term().field() != fieldInfo.name) {
+          return null;
         } else {
           return current = termEnum.term().bytes();
         }
       }
+
+      // TODO: can we use STE's prevBuffer here?
+      prevTerm.copy(termEnum.term().bytes());
+
       if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
         newSuffixStart = termEnum.newSuffixStart;
         if (DEBUG_SURROGATES) {
-          System.out.println("  set newSuffixStart=" + newSuffixStart);
+          System.out.println("  newSuffixStart=" + newSuffixStart);
         }
-        surrogatesDance();
+        surrogateDance();
         final Term t = termEnum.term();
         if (t == null || t.field() != fieldInfo.name) {
           assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@@ -545,12 +893,15 @@ public class PreFlexFields extends Field
         }
         return current;
       } else {
+        // This field is exhausted, but we have to give
+        // surrogateDance a chance to seek back:
         if (DEBUG_SURROGATES) {
-          System.out.println("  force pop");
+          System.out.println("  force cont");
         }
-        // force pop
-        newSuffixStart = -1;
-        surrogatesDance();
+        //newSuffixStart = prevTerm.length;
+        newSuffixStart = 0;
+        surrogateDance();
+        
         final Term t = termEnum.term();
         if (t == null || t.field() != fieldInfo.name) {
           assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@@ -574,20 +925,32 @@ public class PreFlexFields extends Field
 
     @Override
     public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
-      if (reuse != null) {
-        return ((PreDocsEnum) reuse).reset(termEnum, skipDocs);        
+      PreDocsEnum docsEnum;
+      if (reuse == null || !(reuse instanceof PreDocsEnum)) {
+        docsEnum = new PreDocsEnum();
       } else {
-        return (new PreDocsEnum()).reset(termEnum, skipDocs);
+        docsEnum = (PreDocsEnum) reuse;
+        if (docsEnum.getFreqStream() != freqStream) {
+          docsEnum = new PreDocsEnum();
+        }
       }
+      return docsEnum.reset(termEnum, skipDocs);
     }
 
     @Override
     public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
-      if (reuse != null) {
-        return ((PreDocsAndPositionsEnum) reuse).reset(termEnum, skipDocs);        
+      PreDocsAndPositionsEnum docsPosEnum;
+      if (fieldInfo.omitTermFreqAndPositions) {
+        return null;
+      } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
+        docsPosEnum = new PreDocsAndPositionsEnum();
       } else {
-        return (new PreDocsAndPositionsEnum()).reset(termEnum, skipDocs);
+        docsPosEnum = (PreDocsAndPositionsEnum) reuse;
+        if (docsPosEnum.getFreqStream() != freqStream) {
+          docsPosEnum = new PreDocsAndPositionsEnum();
+        }
       }
+      return docsPosEnum.reset(termEnum, skipDocs);        
     }
   }
 
@@ -598,6 +961,10 @@ public class PreFlexFields extends Field
       docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
     }
 
+    IndexInput getFreqStream() {
+      return freqStream;
+    }
+
     public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
       docs.setSkipDocs(skipDocs);
       docs.seek(termEnum);
@@ -650,6 +1017,10 @@ public class PreFlexFields extends Field
       pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
     }
 
+    IndexInput getFreqStream() {
+      return freqStream;
+    }
+
     public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
       pos.setSkipDocs(skipDocs);
       pos.seek(termEnum);

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java Tue Jul 27 20:44:34 2010
@@ -84,26 +84,16 @@ public final class SegmentTermEnum imple
       format = firstInt;
 
       // check that it is a format we can understand
-    if (format > FORMAT_MINIMUM)
-      throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
-    if (format < FORMAT_CURRENT)
-      throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+      if (format > FORMAT_MINIMUM)
+        throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
+      if (format < FORMAT_CURRENT)
+        throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT);
 
       size = input.readLong();                    // read the size
       
-      if(format == -1){
-        if (!isIndex) {
-          indexInterval = input.readInt();
-          formatM1SkipInterval = input.readInt();
-        }
-        // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in 
-        // skipTo implementation of these versions
-        skipInterval = Integer.MAX_VALUE;
-      } else {
-        indexInterval = input.readInt();
-        skipInterval = input.readInt();
-        maxSkipLevels = input.readInt();
-      }
+      indexInterval = input.readInt();
+      skipInterval = input.readInt();
+      maxSkipLevels = input.readInt();
       assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0";
       assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0";
     }
@@ -132,18 +122,21 @@ public final class SegmentTermEnum imple
     position = p;
     termBuffer.set(t);
     prevBuffer.reset();
+    //System.out.println("  ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this);
     termInfo.set(ti);
   }
 
   /** Increments the enumeration to the next element.  True if one exists.*/
   public final boolean next() throws IOException {
+    prevBuffer.set(termBuffer);
+    //System.out.println("  ste setPrev=" + prev() + " this=" + this);
+
     if (position++ >= size - 1) {
-      prevBuffer.set(termBuffer);
       termBuffer.reset();
+      //System.out.println("    EOF");
       return false;
     }
 
-    prevBuffer.set(termBuffer);
     termBuffer.read(input, fieldInfos);
     newSuffixStart = termBuffer.newSuffixStart;
 
@@ -168,6 +161,7 @@ public final class SegmentTermEnum imple
     if (isIndex)
       indexPointer += input.readVLong();	  // read index pointer
 
+    //System.out.println("  ste ret term=" + term());
     return true;
   }
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java Tue Jul 27 20:44:34 2010
@@ -18,9 +18,10 @@ package org.apache.lucene.index.codecs.p
  */
 
 import java.io.IOException;
+import java.util.Comparator;
+
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.FieldInfos;
 
@@ -28,102 +29,65 @@ final class TermBuffer implements Clonea
 
   private String field;
   private Term term;                            // cached
-  private boolean dirty;                          // true if text was set externally (ie not read via UTF8 bytes)
 
-  private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
   private BytesRef bytes = new BytesRef(10);
 
-  int newSuffixStart;
+  private static final Comparator<BytesRef> utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+
+  int newSuffixStart;                             // only valid right after .read is called
 
-  public final int compareTo(TermBuffer other) {
+  public int compareTo(TermBuffer other) {
     if (field == other.field) 	  // fields are interned
-      return compareChars(text.result, text.length, other.text.result, other.text.length);
+      return utf8AsUTF16Comparator.compare(bytes, other.bytes);
     else
       return field.compareTo(other.field);
   }
 
-  private static int compareChars(char[] chars1, int len1,
-                                  char[] chars2, int len2) {
-    final int end = len1 < len2 ? len1:len2;
-    for (int k = 0; k < end; k++) {
-      char c1 = chars1[k];
-      char c2 = chars2[k];
-      if (c1 != c2) {
-        return c1 - c2;
-      }
-    }
-    return len1 - len2;
-  }
-
-  public final void read(IndexInput input, FieldInfos fieldInfos)
+  public void read(IndexInput input, FieldInfos fieldInfos)
     throws IOException {
     this.term = null;                           // invalidate cache
-    int start = input.readVInt();
+    newSuffixStart = input.readVInt();
     int length = input.readVInt();
-    int totalLength = start + length;
+    int totalLength = newSuffixStart + length;
     if (bytes.bytes.length < totalLength) {
       bytes.grow(totalLength);
     }
-    if (dirty) {
-      // Fully convert all bytes since bytes is dirty
-      UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
-      bytes.length = totalLength;
-      input.readBytes(bytes.bytes, start, length);
-      UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
-      dirty = false;
-    } else {
-      // Incrementally convert only the UTF8 bytes that are new:
-      bytes.length = totalLength;
-      input.readBytes(bytes.bytes, start, length);
-      UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
-    }
-
-    while(true) {
-      newSuffixStart = text.offsets[start];
-      if (newSuffixStart != -1) {
-        break;
-      }
-      if (--start == 0) {
-        newSuffixStart = 0;
-        break;
-      }
-    }
+    bytes.length = totalLength;
+    input.readBytes(bytes.bytes, newSuffixStart, length);
     this.field = fieldInfos.fieldName(input.readVInt());
   }
 
-  public final void set(Term term) {
+  public void set(Term term) {
     if (term == null) {
       reset();
       return;
     }
-    
-    final BytesRef termBytes = term.bytes();
-    UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text);
-    dirty = true;
+    bytes.copy(term.bytes());
     field = term.field();
     this.term = term;
   }
 
-  public final void set(TermBuffer other) {
-    text.copyText(other.text);
-    dirty = true;
+  public void set(TermBuffer other) {
     field = other.field;
-    term = other.term;
+    // dangerous to copy Term over, since the underlying
+    // BytesRef could subsequently be modified:
+    term = null;
+    bytes.copy(other.bytes);
   }
 
   public void reset() {
     field = null;
-    text.setLength(0);
     term = null;
-    dirty = true;
   }
 
   public Term toTerm() {
     if (field == null)                            // unset
       return null;
 
-    if (term == null)
-      term = new Term(field, new BytesRef(text.result, 0, text.length), false);
+    if (term == null) {
+      term = new Term(field, new BytesRef(bytes), false);
+      //term = new Term(field, bytes, false);
+    }
 
     return term;
   }
@@ -134,12 +98,7 @@ final class TermBuffer implements Clonea
     try {
       clone = (TermBuffer)super.clone();
     } catch (CloneNotSupportedException e) {}
-    clone.dirty = true;
-    clone.bytes = new BytesRef(10);
-    clone.text = new UnicodeUtil.UTF16Result();
-    clone.text.offsets = new int[text.offsets.length];
-    System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
-    clone.text.copyText(text);
+    clone.bytes = new BytesRef(bytes);
     return clone;
   }
 }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java Tue Jul 27 20:44:34 2010
@@ -23,30 +23,30 @@ package org.apache.lucene.index.codecs.p
  * indexing. */
 
 @Deprecated
-class TermInfo {
+public class TermInfo {
   /** The number of documents which contain the term. */
-  int docFreq = 0;
+  public int docFreq = 0;
 
-  long freqPointer = 0;
-  long proxPointer = 0;
-  int skipOffset;
+  public long freqPointer = 0;
+  public long proxPointer = 0;
+  public int skipOffset;
 
-  TermInfo() {}
+  public TermInfo() {}
 
-  TermInfo(int df, long fp, long pp) {
+  public TermInfo(int df, long fp, long pp) {
     docFreq = df;
     freqPointer = fp;
     proxPointer = pp;
   }
 
-  TermInfo(TermInfo ti) {
+  public TermInfo(TermInfo ti) {
     docFreq = ti.docFreq;
     freqPointer = ti.freqPointer;
     proxPointer = ti.proxPointer;
     skipOffset = ti.skipOffset;
   }
 
-  final void set(int docFreq,
+  public final void set(int docFreq,
                  long freqPointer, long proxPointer, int skipOffset) {
     this.docFreq = docFreq;
     this.freqPointer = freqPointer;
@@ -54,7 +54,7 @@ class TermInfo {
     this.skipOffset = skipOffset;
   }
 
-  final void set(TermInfo ti) {
+  public final void set(TermInfo ti) {
     docFreq = ti.docFreq;
     freqPointer = ti.freqPointer;
     proxPointer = ti.proxPointer;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java Tue Jul 27 20:44:34 2010
@@ -119,9 +119,12 @@ public final class TermInfosReader {
           indexTerms = new Term[indexSize];
           indexInfos = new TermInfo[indexSize];
           indexPointers = new long[indexSize];
-        
-          for (int i = 0; indexEnum.next(); i++) {
+
+          for (int i=0;indexEnum.next(); i++) {
             indexTerms[i] = indexEnum.term();
+            assert indexTerms[i] != null;
+            assert indexTerms[i].text() != null;
+            assert indexTerms[i].field() != null;
             indexInfos[i] = indexEnum.termInfo();
             indexPointers[i] = indexEnum.indexPointer;
         
@@ -160,14 +163,14 @@ public final class TermInfosReader {
     return origEnum.maxSkipLevels;
   }
 
-  final void close() throws IOException {
+  void close() throws IOException {
     if (origEnum != null)
       origEnum.close();
     threadResources.close();
   }
 
   /** Returns the number of term/value pairs in the set. */
-  final long size() {
+  long size() {
     return size;
   }
 
@@ -183,12 +186,13 @@ public final class TermInfosReader {
 
 
   /** Returns the offset of the greatest index entry which is less than or equal to term.*/
-  private final int getIndexOffset(Term term) {
+  private int getIndexOffset(Term term) {
     int lo = 0;					  // binary search indexTerms[]
     int hi = indexTerms.length - 1;
 
     while (hi >= lo) {
       int mid = (lo + hi) >>> 1;
+      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
       int delta = term.compareToUTF16(indexTerms[mid]);
       if (delta < 0)
 	hi = mid - 1;
@@ -200,7 +204,7 @@ public final class TermInfosReader {
     return hi;
   }
 
-  private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
     enumerator.seek(indexPointers[indexOffset],
                     ((long) indexOffset * totalIndexInterval) - 1,
                     indexTerms[indexOffset], indexInfos[indexOffset]);
@@ -231,6 +235,9 @@ public final class TermInfosReader {
   }
 
   TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
+    if (size == 0) {
+      return null;
+    }
 
     // optimize sequential access: first try scanning cached enum w/o seeking
     if (enumerator.term() != null                 // term is at or past current
@@ -242,7 +249,6 @@ public final class TermInfosReader {
        // no need to seek
 
         final TermInfo ti;
-
         int numScans = enumerator.scanTo(term);
         if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
           ti = enumerator.termInfo();
@@ -279,6 +285,7 @@ public final class TermInfosReader {
     seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
     final TermInfo ti;
+
     if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
       ti = enumerator.termInfo();
       if (tiOrd == null) {
@@ -294,7 +301,7 @@ public final class TermInfosReader {
   }
 
   // called only from asserts
-  private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
+  private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
     if (ti1.docFreq != ti2.docFreq) {
       return false;
     }
@@ -319,7 +326,7 @@ public final class TermInfosReader {
   }
 
   /** Returns the position of a Term in the set or -1. */
-  final long getPosition(Term term) throws IOException {
+  long getPosition(Term term) throws IOException {
     if (size == 0) return -1;
 
     ensureIndexIsRead();

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Tue Jul 27 20:44:34 2010
@@ -217,22 +217,40 @@ public class SepPostingsReaderImpl exten
   @Override
   public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
     final SepTermState termState = (SepTermState) _termState;
-    if (reuse == null) {
-      return (new SepDocsEnum()).init(fieldInfo, termState, skipDocs);
+    SepDocsEnum docsEnum;
+    if (reuse == null || !(reuse instanceof SepDocsEnum)) {
+      docsEnum = new SepDocsEnum();
     } else {
-      return ((SepDocsEnum) reuse).init(fieldInfo, termState, skipDocs);
+      docsEnum = (SepDocsEnum) reuse;
+      if (docsEnum.startDocIn != docIn) {
+        // If you are using ParellelReader, and pass in a
+        // reused DocsAndPositionsEnum, it could have come
+        // from another reader also using sep codec
+        docsEnum = new SepDocsEnum();        
+      }
     }
+
+    return docsEnum.init(fieldInfo, termState, skipDocs);
   }
 
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
     assert !fieldInfo.omitTermFreqAndPositions;
     final SepTermState termState = (SepTermState) _termState;
-    if (reuse == null) {
-      return (new SepDocsAndPositionsEnum()).init(fieldInfo, termState, skipDocs);
+    SepDocsAndPositionsEnum postingsEnum;
+    if (reuse == null || !(reuse instanceof SepDocsAndPositionsEnum)) {
+      postingsEnum = new SepDocsAndPositionsEnum();
     } else {
-      return ((SepDocsAndPositionsEnum) reuse).init(fieldInfo, termState, skipDocs);
+      postingsEnum = (SepDocsAndPositionsEnum) reuse;
+      if (postingsEnum.startDocIn != docIn) {
+        // If you are using ParellelReader, and pass in a
+        // reused DocsAndPositionsEnum, it could have come
+        // from another reader also using sep codec
+        postingsEnum = new SepDocsAndPositionsEnum();        
+      }
     }
+
+    return postingsEnum.init(fieldInfo, termState, skipDocs);
   }
 
   class SepDocsEnum extends DocsEnum {
@@ -253,6 +271,7 @@ public class SepPostingsReaderImpl exten
     private final IntIndexInput.Index docIndex;
     private final IntIndexInput.Index freqIndex;
     private final IntIndexInput.Index posIndex;
+    private final IntIndexInput startDocIn;
 
     // TODO: -- should we do hasProx with 2 different enum classes?
 
@@ -260,6 +279,7 @@ public class SepPostingsReaderImpl exten
     SepSkipListReader skipper;
 
     SepDocsEnum() throws IOException {
+      startDocIn = docIn;
       docReader = docIn.reader();
       docIndex = docIn.index();
       if (freqIn != null) {
@@ -439,6 +459,8 @@ public class SepPostingsReaderImpl exten
     private final IntIndexInput.Index docIndex;
     private final IntIndexInput.Index freqIndex;
     private final IntIndexInput.Index posIndex;
+    private final IntIndexInput startDocIn;
+
     private long payloadOffset;
 
     private int pendingPosCount;
@@ -452,6 +474,7 @@ public class SepPostingsReaderImpl exten
     private boolean posSeekPending;
 
     SepDocsAndPositionsEnum() throws IOException {
+      startDocIn = docIn;
       docReader = docIn.reader();
       docIndex = docIn.index();
       freqReader = freqIn.reader();

Propchange: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 27 20:44:34 2010
@@ -1,5 +1,6 @@
-/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730,957490,960490,961612
-/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:953476-978809
+/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730,957490,960490,961612,979161
+/lucene/dev/branches/preflexfixes/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:967125-979432
+/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:953476-979858
 /lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:824912-931101
 /lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:909334,948516
 /lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:924483-924731,924781,925176-925462

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/FSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/FSDirectory.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/FSDirectory.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/FSDirectory.java Tue Jul 27 20:44:34 2010
@@ -33,6 +33,8 @@ import java.util.Collections;
 import static java.util.Collections.synchronizedSet;
 import java.util.HashSet;
 import java.util.Set;
+
+import org.apache.lucene.store.SimpleFSDirectory.SimpleFSIndexInput;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.Constants;
@@ -126,6 +128,12 @@ public abstract class FSDirectory extend
   protected final Set<String> staleFiles = synchronizedSet(new HashSet<String>()); // Files written, but not yet sync'ed
   private int chunkSize = DEFAULT_READ_CHUNK_SIZE; // LUCENE-1566
 
+  /**
+   * Chunk size used to read when using FileChannel API. If an attempt to read a
+   * large file is made without limiting the chunk size, an OOM may occur.
+   */
+  private static final long CHANNEL_CHUNK_SIZE = 1 << 21; // Use 2MB chunk size - LUCENE-2537
+
   // returns the canonical version of the directory, creating it if it doesn't exist.
   private static File getCanonicalPath(File file) throws IOException {
     return new File(file.getCanonicalPath());
@@ -441,7 +449,7 @@ public abstract class FSDirectory extend
       try {
         input = new FileInputStream(new File(directory, src)).getChannel();
         output = new FileOutputStream(new File(target.directory, dest)).getChannel();
-        output.transferFrom(input, 0, input.size());
+        copy(input, output, input.size());
       } catch (IOException ioe) {
         priorException = ioe;
       } finally {
@@ -451,6 +459,25 @@ public abstract class FSDirectory extend
       super.copy(to, src, dest);
     }
   }
+
+  /**
+   * Copies the content of a given {@link FileChannel} to a destination one. The
+   * copy is done in chunks of 2MB because if transferFrom is used without a
+   * limit when copying a very large file, then an OOM may be thrown (depends on
+   * the state of the RAM in the machine, as well as the OS used). Performance
+   * measurements showed that chunk sizes larger than 2MB do not result in much
+   * faster file copy, therefore we limit the size to be safe with different
+   * file sizes and systems.
+   */
+  static void copy(FileChannel input, FileChannel output, long numBytes) throws IOException {
+    long pos = output.position();
+    long writeTo = numBytes + pos;
+    while (pos < writeTo) {
+      pos += output.transferFrom(input, pos, Math.min(CHANNEL_CHUNK_SIZE, writeTo - pos));
+    }
+    // transferFrom does not change the position of the channel. Need to change it manually
+    output.position(pos);
+  }
   
   protected static class FSIndexOutput extends BufferedIndexOutput {
     private final FSDirectory parent;
@@ -472,6 +499,28 @@ public abstract class FSDirectory extend
     }
 
     @Override
+    public void copyBytes(DataInput input, long numBytes) throws IOException {
+      // Optimized copy only if the number of bytes to copy is larger than the
+      // buffer size, and the given IndexInput supports FileChannel copying ..
+      // NOTE: the below check relies on NIOIndexInput extending Simple. If that
+      // changes in the future, we should change the check as well.
+      if (numBytes > BUFFER_SIZE && input instanceof SimpleFSIndexInput) {
+        // flush any bytes in the buffer
+        flush();
+        // do the optimized copy
+        FileChannel in = ((SimpleFSIndexInput) input).file.getChannel();
+        FileChannel out = file.getChannel();
+        copy(in, out, numBytes);
+        // corrects the position in super (BufferedIndexOutput), so that calls
+        // to getFilePointer will return the correct pointer.
+        // Perhaps a specific method is better?
+        super.seek(out.position());
+      } else {
+        super.copyBytes(input, numBytes);
+      }
+    }
+    
+    @Override
     public void close() throws IOException {
       // only close the file if it has not been closed yet
       if (isOpen) {

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java Tue Jul 27 20:44:34 2010
@@ -125,7 +125,7 @@ public class SimpleFSDirectory extends F
           final OutOfMemoryError outOfMemoryError = new OutOfMemoryError(
               "OutOfMemoryError likely caused by the Sun VM Bug described in "
               + "https://issues.apache.org/jira/browse/LUCENE-1566; try calling FSDirectory.setReadChunkSize "
-              + "with a a value smaller than the current chunks size (" + chunkSize + ")");
+              + "with a value smaller than the current chunks size (" + chunkSize + ")");
           outOfMemoryError.initCause(e);
           throw outOfMemoryError;
         }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/util/BytesRef.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/util/BytesRef.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/util/BytesRef.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/util/BytesRef.java Tue Jul 27 20:44:34 2010
@@ -331,12 +331,17 @@ public final class BytesRef implements C
           // We know the terms are not equal, but, we may
           // have to carefully fixup the bytes at the
           // difference to match UTF16's sort order:
+          
+          // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff, 
+          // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences]
+          // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such
+          // that 6-byte sequences are needed we have much bigger problems anyway.
           if (aByte >= 0xee && bByte >= 0xee) {
             if ((aByte & 0xfe) == 0xee) {
-              aByte += 0x10;
+              aByte += 0xe;
             }
             if ((bByte&0xfe) == 0xee) {
-              bByte += 0x10;
+              bByte += 0xe;
             }
           }
           return aByte - bByte;
@@ -346,10 +351,6 @@ public final class BytesRef implements C
       // One is a prefix of the other, or, they are equal:
       return a.length - b.length;
     }
-
-    public boolean equals(Object other) {
-      return this == other;
-    }
   }
 
   public void writeExternal(ObjectOutput out)

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java Tue Jul 27 20:44:34 2010
@@ -39,7 +39,8 @@ public class TestCharTokenizers extends 
     Random newRandom = newRandom();
     // create random input
     int num = 1024 + newRandom.nextInt(1024);
-    for (int i = 1; i < num*_TestUtil.getRandomMultiplier(); i++) {
+    num *= RANDOM_MULTIPLIER;
+    for (int i = 1; i < num; i++) {
       builder.append("\ud801\udc1cabc");
       if((i % 10) == 0)
         builder.append(" ");

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java Tue Jul 27 20:44:34 2010
@@ -2,9 +2,7 @@ package org.apache.lucene.document;
 
 import org.apache.lucene.util.LuceneTestCase;
 
-import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.store.MockRAMDirectory;
 
@@ -58,8 +56,7 @@ public class TestBinaryDocument extends 
     
     /** add the doc to a ram index */
     MockRAMDirectory dir = new MockRAMDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, 
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
     writer.addDocument(doc);
     
     /** open a reader and fetch the document */ 
@@ -98,8 +95,7 @@ public class TestBinaryDocument extends 
     
     /** add the doc to a ram index */
     MockRAMDirectory dir = new MockRAMDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, 
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
     writer.addDocument(doc);
     
     /** open a reader and fetch the document */ 

Propchange: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestDateTools.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 27 20:44:34 2010
@@ -1,5 +1,6 @@
-/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/document/TestDateTools.java:943137,949730,957490,960490,961612
-/lucene/dev/trunk/lucene/src/test/org/apache/lucene/document/TestDateTools.java:953476-978809
+/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/document/TestDateTools.java:943137,949730,957490,960490,961612,979161
+/lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestDateTools.java:967125-979432
+/lucene/dev/trunk/lucene/src/test/org/apache/lucene/document/TestDateTools.java:953476-979858
 /lucene/java/branches/flex_1458/src/test/org/apache/lucene/document/TestDateTools.java:824912-931101
 /lucene/java/branches/lucene_2_4/src/test/org/apache/lucene/document/TestDateTools.java:748824
 /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestDateTools.java:829134,829881,831036,896850,909334,948516

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestDocument.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestDocument.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestDocument.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestDocument.java Tue Jul 27 20:44:34 2010
@@ -1,8 +1,6 @@
 package org.apache.lucene.document;
 
-import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.IndexSearcher;
@@ -155,8 +153,7 @@ public class TestDocument extends Lucene
    */
   public void testGetValuesForIndexedDocument() throws Exception {
     RAMDirectory dir = new RAMDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
     writer.addDocument(makeDocumentWithFields());
     IndexReader reader = writer.getReader();
     
@@ -234,8 +231,7 @@ public class TestDocument extends Lucene
         Field.Index.NOT_ANALYZED));
     
     RAMDirectory dir = new RAMDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir,
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+    RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir);
     writer.addDocument(doc);
     field.setValue("id2");
     writer.addDocument(doc);

Propchange: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/document/TestNumberTools.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 27 20:44:34 2010
@@ -1,5 +1,6 @@
-/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:943137,949730,957490,960490,961612
-/lucene/dev/trunk/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:953476-978809
+/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:943137,949730,957490,960490,961612,979161
+/lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:967125-979432
+/lucene/dev/trunk/lucene/src/test/org/apache/lucene/document/TestNumberTools.java:953476-979858
 /lucene/java/branches/flex_1458/src/test/org/apache/lucene/document/TestNumberTools.java:824912-931101
 /lucene/java/branches/lucene_2_4/src/test/org/apache/lucene/document/TestNumberTools.java:748824
 /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/document/TestNumberTools.java:829134,829881,831036,896850,909334,948516

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java Tue Jul 27 20:44:34 2010
@@ -17,20 +17,18 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import java.util.Random;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.Random;
 
-import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.CodecProvider;
-import org.apache.lucene.index.codecs.intblock.IntBlockCodec;
-import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
-import org.apache.lucene.index.codecs.pulsing.PulsingCodec;
-import org.apache.lucene.index.codecs.sep.SepCodec;
-import org.apache.lucene.index.codecs.standard.StandardCodec;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCaseJ4;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
 
 /** Silly class that randomizes the indexing experience.  EG
  *  it may swap in a different merge policy/scheduler; may
@@ -45,32 +43,48 @@ public class RandomIndexWriter implement
   int docCount;
   int flushAt;
 
-  public RandomIndexWriter(Random r, Directory dir, IndexWriterConfig c) throws IOException {
-    this.r = r;
-    if (r.nextBoolean()) {
-      c.setMergePolicy(new LogDocMergePolicy());
-    }
-    if (r.nextBoolean()) {
-      c.setMergeScheduler(new SerialMergeScheduler());
+  // Randomly calls Thread.yield so we mixup thread scheduling
+  private static final class MockIndexWriter extends IndexWriter {
+
+    private final Random r;
+
+    public MockIndexWriter(Random r,Directory dir, IndexWriterConfig conf) throws IOException {
+      super(dir, conf);
+      this.r = r;
     }
-    if (r.nextBoolean()) {
-      c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000));
+
+    @Override
+    boolean testPoint(String name) {
+      if (r.nextInt(4) == 2)
+        Thread.yield();
+      return true;
     }
-    if (r.nextBoolean()) {
-      c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000));
-    }
-    
-    if (c.getMergePolicy() instanceof LogMergePolicy) {
-      LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy();
-      logmp.setUseCompoundDocStore(r.nextBoolean());
-      logmp.setUseCompoundFile(r.nextBoolean());
-      logmp.setCalibrateSizeByDeletes(r.nextBoolean());
-    }
-    
-    c.setReaderPooling(r.nextBoolean());
-    c.setCodecProvider(new RandomCodecProvider(r));
-    w = new IndexWriter(dir, c);
+  }
+
+  /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT and MockAnalyzer */
+  public RandomIndexWriter(Random r, Directory dir) throws IOException {
+    this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, new MockAnalyzer()));
+  }
+  
+  /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT */
+  public RandomIndexWriter(Random r, Directory dir, Analyzer a) throws IOException {
+    this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, a));
+  }
+  
+  /** create a RandomIndexWriter with a random config */
+  public RandomIndexWriter(Random r, Directory dir, Version v, Analyzer a) throws IOException {
+    this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, v, a));
+  }
+  
+  /** create a RandomIndexWriter with the provided config */
+  public RandomIndexWriter(Random r, Directory dir, IndexWriterConfig c) throws IOException {
+    this.r = r;
+    w = new MockIndexWriter(r, dir, c);
     flushAt = _TestUtil.nextInt(r, 10, 1000);
+    if (LuceneTestCaseJ4.VERBOSE) {
+      System.out.println("RIW config=" + w.getConfig());
+      System.out.println("codec default=" + CodecProvider.getDefaultCodec());
+    }
   } 
 
   public void addDocument(Document doc) throws IOException {
@@ -89,14 +103,27 @@ public class RandomIndexWriter implement
     w.deleteDocuments(term);
   }
   
+  public void commit() throws CorruptIndexException, IOException {
+    w.commit();
+  }
+  
   public int maxDoc() {
     return w.maxDoc();
   }
 
   public IndexReader getReader() throws IOException {
-    if (r.nextBoolean()) {
+    // If we are writing with PreFlexRW, force a full
+    // IndexReader.open so terms are sorted in codepoint
+    // order during searching:
+    if (!w.codecs.getWriter(null).name.equals("PreFlex") && r.nextBoolean()) {
+      if (LuceneTestCaseJ4.VERBOSE) {
+        System.out.println("RIW.getReader: use NRT reader");
+      }
       return w.getReader();
     } else {
+      if (LuceneTestCaseJ4.VERBOSE) {
+        System.out.println("RIW.getReader: open new reader");
+      }
       w.commit();
       return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10));
     }
@@ -112,22 +139,4 @@ public class RandomIndexWriter implement
   public void optimize() throws IOException {
     w.optimize();
   }
-  
-  class RandomCodecProvider extends CodecProvider {
-    final String codec;
-    
-    RandomCodecProvider(Random random) {
-      register(new StandardCodec());
-      register(new IntBlockCodec());
-      register(new PreFlexCodec());
-      register(new PulsingCodec());
-      register(new SepCodec());
-      codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
-    }
-    
-    @Override
-    public Codec getWriter(SegmentWriteState state) {
-      return lookup(codec);
-    }
-  }
 }

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java Tue Jul 27 20:44:34 2010
@@ -19,7 +19,6 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -139,7 +138,6 @@ public class TestAddIndexes extends Luce
 
     setUpDirs(dir, aux);
     IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
-
     writer.addIndexes(new Directory[] {aux});
 
     // Adds 10 docs, then replaces them with another 10

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAtomicUpdate.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAtomicUpdate.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAtomicUpdate.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestAtomicUpdate.java Tue Jul 27 20:44:34 2010
@@ -47,7 +47,7 @@ public class TestAtomicUpdate extends Lu
   private static abstract class TimedThread extends Thread {
     volatile boolean failed;
     int count;
-    private static float RUN_TIME_SEC = 0.5f * (float)_TestUtil.getRandomMultiplier();
+    private static float RUN_TIME_SEC = 0.5f * RANDOM_MULTIPLIER;
     private TimedThread[] allThreads;
 
     abstract public void doWork() throws Throwable;

Propchange: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 27 20:44:34 2010
@@ -1,5 +1,6 @@
-/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:943137,949730,957490,960490,961612
-/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:953476-978809
+/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:943137,949730,957490,960490,961612,979161
+/lucene/dev/branches/preflexfixes/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:967125-979432
+/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:953476-979858
 /lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:824912-931101
 /lucene/java/branches/lucene_2_4/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:748824
 /lucene/java/branches/lucene_2_9/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java:829134,829881,831036,896850,909334,948516

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestByteSlices.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestByteSlices.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestByteSlices.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestByteSlices.java Tue Jul 27 20:44:34 2010
@@ -18,7 +18,6 @@ import java.util.Random;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
 
 public class TestByteSlices extends LuceneTestCase {
 
@@ -55,7 +54,7 @@ public class TestByteSlices extends Luce
   public void testBasic() throws Throwable {
     ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator());
 
-    final int NUM_STREAM = 100*_TestUtil.getRandomMultiplier();
+    final int NUM_STREAM = 100 * RANDOM_MULTIPLIER;
 
     ByteSliceWriter writer = new ByteSliceWriter(pool);
 
@@ -74,7 +73,8 @@ public class TestByteSlices extends Luce
         counters[stream] = 0;
       }
       
-      for(int iter=0;iter<10000*_TestUtil.getRandomMultiplier();iter++) {
+      int num = 10000 * RANDOM_MULTIPLIER;
+      for (int iter = 0; iter < num; iter++) {
         int stream = r.nextInt(NUM_STREAM);
         if (VERBOSE)
           System.out.println("write stream=" + stream);

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestCodecs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestCodecs.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestCodecs.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestCodecs.java Tue Jul 27 20:44:34 2010
@@ -44,7 +44,6 @@ import org.apache.lucene.store.RAMDirect
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.MultiCodecTestCase;
 import org.apache.lucene.util.Version;
-import org.apache.lucene.util._TestUtil;
 
 // TODO: test multiple codecs here?
 
@@ -69,7 +68,7 @@ public class TestCodecs extends MultiCod
   private Random RANDOM;
   private static String[] fieldNames = new String[] {"one", "two", "three", "four"};
 
-  private final static int NUM_TEST_ITER = 20*_TestUtil.getRandomMultiplier();
+  private final static int NUM_TEST_ITER = 20 * RANDOM_MULTIPLIER;
   private final static int NUM_TEST_THREADS = 3;
   private final static int NUM_FIELDS = 4;
   private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping
@@ -493,14 +492,21 @@ public class TestCodecs extends MultiCod
         // Test random seek by ord:
         final int idx = TestCodecs.this.nextInt(field.terms.length);
         term = field.terms[idx];
-        status = termsEnum.seek(idx);
-        assertEquals(status, TermsEnum.SeekStatus.FOUND);
-        assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
-        assertEquals(term.docs.length, termsEnum.docFreq());
-        if (field.omitTF) {
-          this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
-        } else {
-          this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
+        try {
+          status = termsEnum.seek(idx);
+        } catch (UnsupportedOperationException uoe) {
+          // ok -- skip it
+          status = null;
+        }
+        if (status != null) {
+          assertEquals(status, TermsEnum.SeekStatus.FOUND);
+          assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
+          assertEquals(term.docs.length, termsEnum.docFreq());
+          if (field.omitTF) {
+            this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
+          } else {
+            this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
+          }
         }
 
         // Test seek to non-existent terms:
@@ -520,9 +526,12 @@ public class TestCodecs extends MultiCod
 
         // Seek to each term by ord, backwards
         for(int i=field.terms.length-1;i>=0;i--) {
-          assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
-          assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
-          assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
+          try {
+            assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
+            assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
+            assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
+          } catch (UnsupportedOperationException uoe) {
+          }
         }
 
         // Seek to non-existent empty-string term

Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestFlex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestFlex.java?rev=979860&r1=979859&r2=979860&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestFlex.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/index/TestFlex.java Tue Jul 27 20:44:34 2010
@@ -20,6 +20,8 @@ package org.apache.lucene.index;
 import java.io.*;
 import java.util.*;
 import org.apache.lucene.store.*;
+import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.index.codecs.standard.*;
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.document.*;
@@ -64,7 +66,8 @@ public class TestFlex extends LuceneTest
 
   public void testTermOrd() throws Exception {
     Directory d = new MockRAMDirectory();
-    IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
+    IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT,
+                                                             new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
     Document doc = new Document();
     doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED));
     w.addDocument(doc);



Mime
View raw message