lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1670929 [2/5] - in /lucene/dev/branches/lucene6271: ./ lucene/ lucene/codecs/ lucene/codecs/src/java/org/apache/lucene/codecs/autoprefix/ lucene/codecs/src/resources/META-INF/services/ lucene/codecs/src/test/org/apache/lucene/codecs/autopr...
Date Thu, 02 Apr 2015 15:37:41 GMT
Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java Thu Apr  2 15:37:39 2015
@@ -35,9 +35,14 @@ final class IntersectTermsEnumFrame {
   long fpEnd;
   long lastSubFP;
 
+  // private static boolean DEBUG = IntersectTermsEnum.DEBUG;
+
   // State in automaton
   int state;
 
+  // State just before the last label
+  int lastState;
+
   int metaDataUpto;
 
   byte[] suffixBytes = new byte[128];
@@ -73,6 +78,8 @@ final class IntersectTermsEnumFrame {
   int transitionIndex;
   int transitionCount;
 
+  final boolean versionAutoPrefix;
+
   FST.Arc<BytesRef> arc;
 
   final BlockTermState termState;
@@ -89,6 +96,17 @@ final class IntersectTermsEnumFrame {
   int startBytePos;
   int suffix;
 
+  // When we are on an auto-prefix term this is the starting lead byte
+  // of the suffix (e.g. 'a' for the foo[a-m]* case):
+  int floorSuffixLeadStart;
+
+  // When we are on an auto-prefix term this is the ending lead byte
+  // of the suffix (e.g. 'm' for the foo[a-m]* case):
+  int floorSuffixLeadEnd;
+
+  // True if the term we are currently on is an auto-prefix term:
+  boolean isAutoPrefixTerm;
+
   private final IntersectTermsEnum ite;
 
   public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException {
@@ -97,35 +115,39 @@ final class IntersectTermsEnumFrame {
     this.termState = ite.fr.parent.postingsReader.newTermState();
     this.termState.totalTermFreq = -1;
     this.longs = new long[ite.fr.longsSize];
+    this.versionAutoPrefix = ite.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
   }
 
   void loadNextFloorBlock() throws IOException {
     assert numFollowFloorBlocks > 0;
-    //if (DEBUG) System.out.println("    loadNextFoorBlock trans=" + transitions[transitionIndex]);
+    //if (DEBUG) System.out.println("    loadNextFloorBlock transition.min=" + transition.min);
 
     do {
       fp = fpOrig + (floorDataReader.readVLong() >>> 1);
       numFollowFloorBlocks--;
-      // if (DEBUG) System.out.println("    skip floor block2!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+      //if (DEBUG) System.out.println("    skip floor block2!  nextFloorLabel=" + (char) nextFloorLabel + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
       if (numFollowFloorBlocks != 0) {
         nextFloorLabel = floorDataReader.readByte() & 0xff;
       } else {
         nextFloorLabel = 256;
       }
-      // if (DEBUG) System.out.println("    nextFloorLabel=" + (char) nextFloorLabel);
+      //if (DEBUG) System.out.println("    nextFloorLabel=" + (char) nextFloorLabel);
     } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
 
+    //if (DEBUG) System.out.println("      done loadNextFloorBlock");
+
     load(null);
   }
 
   public void setState(int state) {
     this.state = state;
     transitionIndex = 0;
-    transitionCount = ite.compiledAutomaton.automaton.getNumTransitions(state);
+    transitionCount = ite.automaton.getNumTransitions(state);
     if (transitionCount != 0) {
-      ite.compiledAutomaton.automaton.initTransition(state, transition);
-      ite.compiledAutomaton.automaton.getNextTransition(transition);
+      ite.automaton.initTransition(state, transition);
+      ite.automaton.getNextTransition(transition);
       curTransitionMax = transition.max;
+      //if (DEBUG) System.out.println("    after setState state=" + state + " trans: " + transition + " transCount=" + transitionCount);
     } else {
       curTransitionMax = -1;
     }
@@ -133,7 +155,7 @@ final class IntersectTermsEnumFrame {
 
   void load(BytesRef frameIndexData) throws IOException {
 
-    // if (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
+    //xif (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
 
     if (frameIndexData != null && transitionCount != 0) {
       // Floor frame
@@ -148,7 +170,7 @@ final class IntersectTermsEnumFrame {
       if ((code & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
         numFollowFloorBlocks = floorDataReader.readVInt();
         nextFloorLabel = floorDataReader.readByte() & 0xff;
-        // if (DEBUG) System.out.println("    numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
+        //if (DEBUG) System.out.println("    numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);
 
         // If current state is accept, we must process
         // first block in case it has empty suffix:
@@ -158,7 +180,7 @@ final class IntersectTermsEnumFrame {
           while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
             fp = fpOrig + (floorDataReader.readVLong() >>> 1);
             numFollowFloorBlocks--;
-            // if (DEBUG) System.out.println("    skip floor block!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
+            //xif (DEBUG) System.out.println("    skip floor block!  nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
             if (numFollowFloorBlocks != 0) {
               nextFloorLabel = floorDataReader.readByte() & 0xff;
             } else {
@@ -179,7 +201,7 @@ final class IntersectTermsEnumFrame {
     code = ite.in.readVInt();
     isLeafBlock = (code & 1) != 0;
     int numBytes = code >>> 1;
-    // if (DEBUG) System.out.println("      entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
+    //if (DEBUG) System.out.println("      entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
     if (suffixBytes.length < numBytes) {
       suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
     }
@@ -214,41 +236,106 @@ final class IntersectTermsEnumFrame {
       // written one after another -- tail recurse:
       fpEnd = ite.in.getFilePointer();
     }
+
+    // Necessary in case this ord previously was an auto-prefix
+    // term but now we recurse to a new leaf block
+    isAutoPrefixTerm = false;
   }
 
   // TODO: maybe add scanToLabel; should give perf boost
 
+  // Decodes next entry; returns true if it's a sub-block
   public boolean next() {
-    return isLeafBlock ? nextLeaf() : nextNonLeaf();
+    if (isLeafBlock) {
+      nextLeaf();
+      return false;
+    } else {
+      return nextNonLeaf();
+    }
   }
 
-  // Decodes next entry; returns true if it's a sub-block
-  public boolean nextLeaf() {
-    //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+  public void nextLeaf() {
+    //if (DEBUG) {
+    //  System.out.println("  frame.nextLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+    //}
     assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
     nextEnt++;
     suffix = suffixesReader.readVInt();
     startBytePos = suffixesReader.getPosition();
     suffixesReader.skipBytes(suffix);
-    return false;
   }
 
   public boolean nextNonLeaf() {
-    //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
+    //if (DEBUG) {
+    //  System.out.println("  frame.nextNonLeaf ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " versionAutoPrefix=" + versionAutoPrefix + " fp=" + suffixesReader.getPosition());
+    // }
     assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
     nextEnt++;
     final int code = suffixesReader.readVInt();
-    suffix = code >>> 1;
-    startBytePos = suffixesReader.getPosition();
-    suffixesReader.skipBytes(suffix);
-    if ((code & 1) == 0) {
-      // A normal term
-      termState.termBlockOrd++;
-      return false;
+    if (versionAutoPrefix == false) {
+      suffix = code >>> 1;
+      startBytePos = suffixesReader.getPosition();
+      suffixesReader.skipBytes(suffix);
+      if ((code & 1) == 0) {
+        // A normal term
+        termState.termBlockOrd++;
+        return false;
+      } else {
+        // A sub-block; make sub-FP absolute:
+        lastSubFP = fp - suffixesReader.readVLong();
+        return true;
+      }
     } else {
-      // A sub-block; make sub-FP absolute:
-      lastSubFP = fp - suffixesReader.readVLong();
-      return true;
+      suffix = code >>> 2;
+      startBytePos = suffixesReader.getPosition();
+      suffixesReader.skipBytes(suffix);
+      switch (code & 3) {
+      case 0:
+        // A normal term
+        //if (DEBUG) System.out.println("    ret: term");
+        isAutoPrefixTerm = false;
+        termState.termBlockOrd++;
+        return false;
+      case 1:
+        // A sub-block; make sub-FP absolute:
+        isAutoPrefixTerm = false;
+        lastSubFP = fp - suffixesReader.readVLong();
+        //if (DEBUG) System.out.println("    ret: sub-block");
+        return true;
+      case 2:
+        // A normal prefix term, suffix leads with empty string
+        floorSuffixLeadStart = -1;
+        termState.termBlockOrd++;
+        floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
+        if (floorSuffixLeadEnd == 0xff) {
+          floorSuffixLeadEnd = -1;
+          //System.out.println("  fill in -1");
+        }
+        //if (DEBUG) System.out.println("    ret: floor prefix term: start=-1 end=" + floorSuffixLeadEnd);
+        isAutoPrefixTerm = true;
+        return false;
+      case 3:
+        // A floor'd prefix term, suffix leads with real byte
+        if (suffix == 0) {
+          // TODO: this is messy, but necessary because we are an auto-prefix term, but our suffix is the empty string here, so we have to
+          // look at the parent block to get the lead suffix byte:
+          assert ord > 0;
+          IntersectTermsEnumFrame parent = ite.stack[ord-1];
+          floorSuffixLeadStart = parent.suffixBytes[parent.startBytePos+parent.suffix-1] & 0xff;
+          //if (DEBUG) System.out.println("    peek-parent: suffix=" + floorSuffixLeadStart);
+        } else {
+          floorSuffixLeadStart = suffixBytes[startBytePos+suffix-1] & 0xff;
+        }
+        termState.termBlockOrd++;
+        isAutoPrefixTerm = true;
+        floorSuffixLeadEnd = suffixesReader.readByte() & 0xff;
+        //if (DEBUG) System.out.println("    ret: floor prefix term start=" + floorSuffixLeadStart + " end=" + floorSuffixLeadEnd);
+        return false;
+      default:
+        // Silly javac:
+        assert false;
+        return false;
+      }
     }
   }
 

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnum.java Thu Apr  2 15:37:39 2015
@@ -34,7 +34,9 @@ import org.apache.lucene.util.RamUsageEs
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
 
-/** Iterates through terms in this field */
+/** Iterates through terms in this field.  This implementation skips
+ *  any auto-prefix terms it encounters. */
+
 final class SegmentTermsEnum extends TermsEnum {
 
   // Lazy init:
@@ -48,7 +50,7 @@ final class SegmentTermsEnum extends Ter
 
   private int targetBeforeCurrentLength;
 
-  // static boolean DEBUG = false;
+  //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
 
   private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
 
@@ -119,6 +121,8 @@ final class SegmentTermsEnum extends Ter
    *  computing aggregate statistics. */
   public Stats computeBlockStats() throws IOException {
 
+    // TODO: add total auto-prefix term count
+
     Stats stats = new Stats(fr.parent.segment, fr.fieldInfo.name);
     if (fr.index != null) {
       stats.indexNodeCount = fr.index.getNodeCount();
@@ -152,8 +156,10 @@ final class SegmentTermsEnum extends Ter
       while (currentFrame.nextEnt == currentFrame.entCount) {
         stats.endBlock(currentFrame);
         if (!currentFrame.isLastInFloor) {
+          // Advance to next floor block
           currentFrame.loadNextFloorBlock();
           stats.startBlock(currentFrame, true);
+          break;
         } else {
           if (currentFrame.ord == 0) {
             break allTerms;
@@ -175,8 +181,6 @@ final class SegmentTermsEnum extends Ter
           // This is a "next" frame -- even if it's
           // floor'd we must pretend it isn't so we don't
           // try to scan to the right floor frame:
-          currentFrame.isFloor = false;
-          //currentFrame.hasTerms = true;
           currentFrame.loadBlock();
           stats.startBlock(currentFrame, !currentFrame.isLastInFloor);
         } else {
@@ -294,6 +298,7 @@ final class SegmentTermsEnum extends Ter
     return true;
   }
 
+  /*
   // for debugging
   @SuppressWarnings("unused")
   static String brToString(BytesRef b) {
@@ -307,8 +312,15 @@ final class SegmentTermsEnum extends Ter
     }
   }
 
+  // for debugging
+  @SuppressWarnings("unused")
+  static String brToString(BytesRefBuilder b) {
+    return brToString(b.get());
+  }
+  */
+
   @Override
-  public boolean seekExact(final BytesRef target) throws IOException {
+  public boolean seekExact(BytesRef target) throws IOException {
 
     if (fr.index == null) {
       throw new IllegalStateException("terms index was not loaded");
@@ -565,7 +577,8 @@ final class SegmentTermsEnum extends Ter
   }
 
   @Override
-  public SeekStatus seekCeil(final BytesRef target) throws IOException {
+  public SeekStatus seekCeil(BytesRef target) throws IOException {
+
     if (fr.index == null) {
       throw new IllegalStateException("terms index was not loaded");
     }
@@ -575,7 +588,7 @@ final class SegmentTermsEnum extends Ter
     assert clearEOF();
 
     // if (DEBUG) {
-    //   System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=  " + validIndexPrefix);
+    //   System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=  " + validIndexPrefix);
     //   printSeekState(System.out);
     // }
 
@@ -617,7 +630,7 @@ final class SegmentTermsEnum extends Ter
       while (targetUpto < targetLimit) {
         cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
         //if (DEBUG) {
-        //System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"   + " arc.output=" + arc.output + " output=" + output);
+        //System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")"   + " arc.output=" + arc.output + " output=" + output);
         //}
         if (cmp != 0) {
           break;
@@ -647,7 +660,7 @@ final class SegmentTermsEnum extends Ter
         while (targetUpto < targetLimit2) {
           cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
           //if (DEBUG) {
-          //System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
+          //System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")");
           //}
           if (cmp != 0) {
             break;
@@ -733,7 +746,7 @@ final class SegmentTermsEnum extends Ter
 
         // Index is exhausted
         // if (DEBUG) {
-        //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
+        //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " + targetLabel);
         // }
             
         validIndexPrefix = currentFrame.prefix;
@@ -743,6 +756,7 @@ final class SegmentTermsEnum extends Ter
 
         currentFrame.loadBlock();
 
+        //if (DEBUG) System.out.println("  now scanToTerm");
         final SeekStatus result = currentFrame.scanToTerm(target, false);
         if (result == SeekStatus.END) {
           term.copyBytes(target);
@@ -750,7 +764,7 @@ final class SegmentTermsEnum extends Ter
 
           if (next() != null) {
             //if (DEBUG) {
-            //System.out.println("  return NOT_FOUND term=" + brToString(term) + " " + term);
+            //System.out.println("  return NOT_FOUND term=" + brToString(term));
             //}
             return SeekStatus.NOT_FOUND;
           } else {
@@ -761,7 +775,7 @@ final class SegmentTermsEnum extends Ter
           }
         } else {
           //if (DEBUG) {
-          //System.out.println("  return " + result + " term=" + brToString(term) + " " + term);
+          //System.out.println("  return " + result + " term=" + brToString(term));
           //}
           return result;
         }
@@ -776,7 +790,7 @@ final class SegmentTermsEnum extends Ter
         }
 
         //if (DEBUG) {
-        //System.out.println("    index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
+        //System.out.println("    index: follow label=" + (target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
         //}
         targetUpto++;
 
@@ -802,7 +816,7 @@ final class SegmentTermsEnum extends Ter
       termExists = false;
       if (next() != null) {
         //if (DEBUG) {
-        //System.out.println("  return NOT_FOUND term=" + term.utf8ToString() + " " + term);
+        //System.out.println("  return NOT_FOUND term=" + term.get().utf8ToString() + " " + term);
         //}
         return SeekStatus.NOT_FOUND;
       } else {
@@ -906,7 +920,9 @@ final class SegmentTermsEnum extends Ter
     // Pop finished blocks
     while (currentFrame.nextEnt == currentFrame.entCount) {
       if (!currentFrame.isLastInFloor) {
+        // Advance to next floor block
         currentFrame.loadNextFloorBlock();
+        break;
       } else {
         //if (DEBUG) System.out.println("  pop frame");
         if (currentFrame.ord == 0) {
@@ -946,11 +962,9 @@ final class SegmentTermsEnum extends Ter
         // This is a "next" frame -- even if it's
         // floor'd we must pretend it isn't so we don't
         // try to scan to the right floor frame:
-        currentFrame.isFloor = false;
-        //currentFrame.hasTerms = true;
         currentFrame.loadBlock();
       } else {
-        //if (DEBUG) System.out.println("  return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
+        //if (DEBUG) System.out.println("  return term=" + brToString(term) + " currentFrame.ord=" + currentFrame.ord);
         return term.get();
       }
     }

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java Thu Apr  2 15:37:39 2015
@@ -37,6 +37,10 @@ final class SegmentTermsEnumFrame {
 
   FST.Arc<BytesRef> arc;
 
+  final boolean versionAutoPrefix;
+
+  //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
   // File pointer where this block was loaded from
   long fp;
   long fpOrig;
@@ -96,6 +100,7 @@ final class SegmentTermsEnumFrame {
     this.state = ste.fr.parent.postingsReader.newTermState();
     this.state.totalTermFreq = -1;
     this.longs = new long[ste.fr.longsSize];
+    this.versionAutoPrefix = ste.fr.parent.version >= BlockTreeTermsReader.VERSION_AUTO_PREFIX_TERMS;
   }
 
   public void setFloorData(ByteArrayDataInput in, BytesRef source) {
@@ -262,12 +267,17 @@ final class SegmentTermsEnumFrame {
     */
   }
 
-  public boolean next() {
-    return isLeafBlock ? nextLeaf() : nextNonLeaf();
+  // Decodes next entry; returns true if it's a sub-block
+  public boolean next() throws IOException {
+    if (isLeafBlock) {
+      nextLeaf();
+      return false;
+    } else {
+      return nextNonLeaf();
+    }
   }
 
-  // Decodes next entry; returns true if it's a sub-block
-  public boolean nextLeaf() {
+  public void nextLeaf() {
     //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
     assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
     nextEnt++;
@@ -276,36 +286,78 @@ final class SegmentTermsEnumFrame {
     ste.term.setLength(prefix + suffix);
     ste.term.grow(ste.term.length());
     suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
-    // A normal term
     ste.termExists = true;
-    return false;
   }
 
-  public boolean nextNonLeaf() {
-    //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
-    assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
-    nextEnt++;
-    final int code = suffixesReader.readVInt();
-    suffix = code >>> 1;
-    startBytePos = suffixesReader.getPosition();
-    ste.term.setLength(prefix + suffix);
-    ste.term.grow(ste.term.length());
-    suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
-    if ((code & 1) == 0) {
-      // A normal term
-      ste.termExists = true;
-      subCode = 0;
-      state.termBlockOrd++;
-      return false;
-    } else {
-      // A sub-block; make sub-FP absolute:
-      ste.termExists = false;
-      subCode = suffixesReader.readVLong();
-      lastSubFP = fp - subCode;
-      //if (DEBUG) {
-      //System.out.println("    lastSubFP=" + lastSubFP);
-      //}
-      return true;
+  public boolean nextNonLeaf() throws IOException {
+    //if (DEBUG) System.out.println("  stef.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + suffixesReader.getPosition());
+    while (true) {
+      if (nextEnt == entCount) {
+        assert arc == null || (isFloor && isLastInFloor == false): "isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor;
+        loadNextFloorBlock();
+        if (isLeafBlock) {
+          nextLeaf();
+          return false;
+        } else {
+          continue;
+        }
+      }
+        
+      assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
+      nextEnt++;
+      final int code = suffixesReader.readVInt();
+      if (versionAutoPrefix == false) {
+        suffix = code >>> 1;
+      } else {
+        suffix = code >>> 2;
+      }
+      startBytePos = suffixesReader.getPosition();
+      ste.term.setLength(prefix + suffix);
+      ste.term.grow(ste.term.length());
+      suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
+      if (versionAutoPrefix == false) {
+        if ((code & 1) == 0) {
+          // A normal term
+          ste.termExists = true;
+          subCode = 0;
+          state.termBlockOrd++;
+          return false;
+        } else {
+          // A sub-block; make sub-FP absolute:
+          ste.termExists = false;
+          subCode = suffixesReader.readVLong();
+          lastSubFP = fp - subCode;
+          //if (DEBUG) {
+          //System.out.println("    lastSubFP=" + lastSubFP);
+          //}
+          return true;
+        }
+      } else {
+
+        switch(code & 3) {
+        case 0:
+          // A normal term
+          ste.termExists = true;
+          subCode = 0;
+          state.termBlockOrd++;
+          return false;
+        case 1:
+          // A sub-block; make sub-FP absolute:
+          ste.termExists = false;
+          subCode = suffixesReader.readVLong();
+          lastSubFP = fp - subCode;
+          //if (DEBUG) {
+          //System.out.println("    lastSubFP=" + lastSubFP);
+          //}
+          return true;
+        case 2:
+        case 3:
+          // A prefix term: skip it
+          state.termBlockOrd++;
+          suffixesReader.readByte();
+          continue;
+        }
+      }
     }
   }
         
@@ -448,18 +500,38 @@ final class SegmentTermsEnumFrame {
       assert nextEnt < entCount;
       nextEnt++;
       final int code = suffixesReader.readVInt();
-      suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
-      //if (DEBUG) System.out.println("    " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
-      if ((code & 1) != 0) {
-        final long subCode = suffixesReader.readVLong();
-        //if (DEBUG) System.out.println("      subCode=" + subCode);
-        if (targetSubCode == subCode) {
-          //if (DEBUG) System.out.println("        match!");
-          lastSubFP = subFP;
-          return;
+      if (versionAutoPrefix == false) {
+        suffixesReader.skipBytes(code >>> 1);
+        if ((code & 1) != 0) {
+          final long subCode = suffixesReader.readVLong();
+          if (targetSubCode == subCode) {
+            //if (DEBUG) System.out.println("        match!");
+            lastSubFP = subFP;
+            return;
+          }
+        } else {
+          state.termBlockOrd++;
         }
       } else {
-        state.termBlockOrd++;
+        int flag = code & 3;
+        suffixesReader.skipBytes(code >>> 2);
+        //if (DEBUG) System.out.println("    " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
+        if (flag == 1) {
+          // Sub-block
+          final long subCode = suffixesReader.readVLong();
+          //if (DEBUG) System.out.println("      subCode=" + subCode);
+          if (targetSubCode == subCode) {
+            //if (DEBUG) System.out.println("        match!");
+            lastSubFP = subFP;
+            return;
+          }
+        } else {
+          state.termBlockOrd++;
+          if (flag == 2 || flag == 3) {
+            // Floor'd prefix term
+            suffixesReader.readByte();
+          }
+        }
       }
     }
   }
@@ -473,6 +545,21 @@ final class SegmentTermsEnumFrame {
   private int suffix;
   private long subCode;
 
+  // for debugging
+  /*
+  @SuppressWarnings("unused")
+  static String brToString(BytesRef b) {
+    try {
+      return b.utf8ToString() + " " + b;
+    } catch (Throwable t) {
+      // If BytesRef isn't actually UTF8, or it's eg a
+      // prefix of UTF8 that ends mid-unicode-char, we
+      // fallback to hex:
+      return b.toString();
+    }
+  }
+  */
+
   // Target's prefix matches this block's prefix; we
   // scan the entries check if the suffix matches.
   public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
@@ -535,9 +622,6 @@ final class SegmentTermsEnumFrame {
           // keep scanning
 
           if (nextEnt == entCount) {
-            if (exactOnly) {
-              fillTerm();
-            }
             // We are done scanning this block
             break nextTerm;
           } else {
@@ -590,7 +674,7 @@ final class SegmentTermsEnumFrame {
   // scan the entries check if the suffix matches.
   public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
 
-    //if (DEBUG) System.out.println("    scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));
+    //if (DEBUG) System.out.println("    scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(target));
 
     assert nextEnt != -1;
 
@@ -605,30 +689,60 @@ final class SegmentTermsEnumFrame {
     assert prefixMatches(target);
 
     // Loop over each entry (term or sub-block) in this block:
-    //nextTerm: while(nextEnt < entCount) {
-    nextTerm: while (true) {
+    nextTerm: while(nextEnt < entCount) {
+
       nextEnt++;
 
       final int code = suffixesReader.readVInt();
-      suffix = code >>> 1;
-      // if (DEBUG) {
-      //   BytesRef suffixBytesRef = new BytesRef();
-      //   suffixBytesRef.bytes = suffixBytes;
-      //   suffixBytesRef.offset = suffixesReader.getPosition();
-      //   suffixBytesRef.length = suffix;
-      //   System.out.println("      cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
-      // }
+      if (versionAutoPrefix == false) {
+        suffix = code >>> 1;
+      } else {
+        suffix = code >>> 2;
+      }
+
+      //if (DEBUG) {
+      //  BytesRef suffixBytesRef = new BytesRef();
+      //  suffixBytesRef.bytes = suffixBytes;
+      //  suffixBytesRef.offset = suffixesReader.getPosition();
+      //  suffixBytesRef.length = suffix;
+      //  System.out.println("      cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
+      //}
 
-      ste.termExists = (code & 1) == 0;
       final int termLen = prefix + suffix;
       startBytePos = suffixesReader.getPosition();
       suffixesReader.skipBytes(suffix);
-      if (ste.termExists) {
-        state.termBlockOrd++;
-        subCode = 0;
+      if (versionAutoPrefix == false) {
+        ste.termExists = (code & 1) == 0;
+        if (ste.termExists) {
+          state.termBlockOrd++;
+          subCode = 0;
+        } else {
+          subCode = suffixesReader.readVLong();
+          lastSubFP = fp - subCode;
+        }
       } else {
-        subCode = suffixesReader.readVLong();
-        lastSubFP = fp - subCode;
+        switch (code & 3) {
+        case 0:
+          // Normal term
+          ste.termExists = true;
+          state.termBlockOrd++;
+          subCode = 0;
+          break;
+        case 1:
+          // Sub-block
+          ste.termExists = false;
+          subCode = suffixesReader.readVLong();
+          lastSubFP = fp - subCode;
+          break;
+        case 2:
+        case 3:
+          // Floor prefix term: skip it
+          //if (DEBUG) System.out.println("        skip floor prefix term");
+          suffixesReader.readByte();
+          ste.termExists = false;
+          state.termBlockOrd++;
+          continue;
+        }
       }
 
       final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
@@ -637,7 +751,7 @@ final class SegmentTermsEnumFrame {
       // Loop over bytes in the suffix, comparing to
       // the target
       int bytePos = startBytePos;
-      while(true) {
+      while (true) {
         final int cmp;
         final boolean stop;
         if (targetPos < targetLimit) {
@@ -652,24 +766,18 @@ final class SegmentTermsEnumFrame {
         if (cmp < 0) {
           // Current entry is still before the target;
           // keep scanning
-
-          if (nextEnt == entCount) {
-            if (exactOnly) {
-              fillTerm();
-              //termExists = true;
-            }
-            // We are done scanning this block
-            break nextTerm;
-          } else {
-            continue nextTerm;
-          }
+          continue nextTerm;
         } else if (cmp > 0) {
 
           // Done!  Current entry is after target --
           // return NOT_FOUND:
           fillTerm();
 
+          //if (DEBUG) System.out.println("        maybe done exactOnly=" + exactOnly + " ste.termExists=" + ste.termExists);
+
           if (!exactOnly && !ste.termExists) {
+            //System.out.println("  now pushFrame");
+            // TODO this 
             // We are on a sub-block, and caller wants
             // us to position to the next term after
             // the target, so we must recurse into the

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/codecs/blocktree/Stats.java Thu Apr  2 15:37:39 2015
@@ -48,6 +48,8 @@ public class Stats {
   /** Total number of bytes (sum of term lengths) across all terms in the field. */
   public long totalTermBytes;
 
+  // TODO: add total auto-prefix term count
+
   /** The number of normal (non-floor) blocks in the terms file. */
   public int nonFloorBlockCount;
 

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/AutomatonTermsEnum.java Thu Apr  2 15:37:39 2015
@@ -43,9 +43,9 @@ import org.apache.lucene.util.automaton.
  * completely accepted. This is not possible when the language accepted by the
  * FSM is not finite (i.e. * operator).
  * </p>
- * @lucene.experimental
+ * @lucene.internal
  */
-class AutomatonTermsEnum extends FilteredTermsEnum {
+public class AutomatonTermsEnum extends FilteredTermsEnum {
   // a tableized array-based form of the DFA
   private final ByteRunAutomaton runAutomaton;
   // common suffix of the automaton
@@ -70,9 +70,8 @@ class AutomatonTermsEnum extends Filtere
   /**
    * Construct an enumerator based upon an automaton, enumerating the specified
    * field, working on a supplied TermsEnum
-   * <p>
+   *
    * @lucene.experimental 
-   * <p>
    * @param compiled CompiledAutomaton
    */
   public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java Thu Apr  2 15:37:39 2015
@@ -25,7 +25,9 @@ import java.nio.file.Paths;
 import java.text.NumberFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Deque;
 import java.util.HashMap;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -56,6 +58,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LongBitSet;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 /**
  * Basic tool and API to check the health of an index and
@@ -902,6 +906,180 @@ public class CheckIndex implements Close
     return status;
   }
 
+  /** Visits all terms in the range minTerm (inclusive) to maxTerm (exclusive), marking all doc IDs encountered into allDocsSeen, and
+   *  returning the total number of terms visited. */
+  private static long getDocsFromTermRange(String field, int maxDoc, TermsEnum termsEnum, FixedBitSet docsSeen, BytesRef minTerm, BytesRef maxTerm, boolean isIntersect) throws IOException {
+    docsSeen.clear(0, docsSeen.length());
+
+    long termCount = 0;
+    PostingsEnum postingsEnum = null;
+    BytesRefBuilder lastTerm = null;
+    while (true) {
+      BytesRef term;
+
+      // Kinda messy: for intersect, we must first next(), but for "normal", we are already on our first term:
+      if (isIntersect || termCount != 0) {
+        term = termsEnum.next();
+      } else {
+        term = termsEnum.term();
+      }
+
+      if (term == null) {
+        if (isIntersect == false) {
+          throw new RuntimeException("didn't see max term field=" + field + " term=" + maxTerm);
+        }
+        return termCount;
+      }
+
+      assert term.isValid();
+        
+      if (lastTerm == null) {
+        lastTerm = new BytesRefBuilder();
+        lastTerm.copyBytes(term);
+      } else {
+        if (lastTerm.get().compareTo(term) >= 0) {
+          throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
+        }
+        lastTerm.copyBytes(term);
+      }
+
+      //System.out.println("    term=" + term);
+
+      // Caller already ensured terms enum positioned >= minTerm:
+      if (term.compareTo(minTerm) < 0) {
+        throw new RuntimeException("saw term before min term field=" + field + " term=" + minTerm);
+      }
+
+      if (isIntersect == false) {
+        int cmp = term.compareTo(maxTerm);
+        if (cmp == 0) {
+          // Done!
+          return termCount;
+        } else if (cmp > 0) {
+          throw new RuntimeException("didn't see end term field=" + field + " term=" + maxTerm);
+        }
+      }
+
+      postingsEnum = termsEnum.postings(null, postingsEnum, 0);
+
+      int lastDoc = -1;
+      while (true) {
+        int doc = postingsEnum.nextDoc();
+        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+          break;
+        }
+        if (doc <= lastDoc) {
+          throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
+        }
+        if (doc >= maxDoc) {
+          throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
+        }
+
+        //System.out.println("      doc=" + doc);
+        docsSeen.set(doc);
+
+        lastDoc = doc;
+      }
+
+      termCount++;
+    }
+  }
+
+  /** Test Terms.intersect on this range, and validates that it returns the same doc ids as using non-intersect TermsEnum.  Returns true if
+   *  any fake terms were seen. */
+  private static boolean checkSingleTermRange(String field, int maxDoc, Terms terms, BytesRef minTerm, BytesRef maxTerm, FixedBitSet normalDocs, FixedBitSet intersectDocs) throws IOException {
+    // System.out.println("  check minTerm=" + minTerm + " maxTerm=" + maxTerm);
+
+    TermsEnum termsEnum = terms.iterator(null);
+    TermsEnum.SeekStatus status = termsEnum.seekCeil(minTerm);
+    if (status != TermsEnum.SeekStatus.FOUND) {
+      throw new RuntimeException("failed to seek to existing term field=" + field + " term=" + minTerm);
+    }
+
+    // Do "dumb" iteration to visit all terms in the range:
+    long normalTermCount = getDocsFromTermRange(field, maxDoc, termsEnum, normalDocs, minTerm, maxTerm, false);
+
+    // Now do the same operation using intersect:
+    long intersectTermCount = getDocsFromTermRange(field, maxDoc, terms.intersect(new CompiledAutomaton(Automata.makeBinaryInterval(minTerm, true, maxTerm, false), true, false, Integer.MAX_VALUE, true), null), intersectDocs, minTerm, maxTerm, true);
+
+    if (intersectTermCount > normalTermCount) {
+      throw new RuntimeException("intersect returned too many terms: field=" + field + " intersectTermCount=" + intersectTermCount + " normalTermCount=" + normalTermCount);
+    }
+
+    if (normalDocs.equals(intersectDocs) == false) {
+      throw new RuntimeException("intersect visited different docs than straight terms enum: " + normalDocs.cardinality() + " for straight enum, vs " + intersectDocs.cardinality() + " for intersect, minTerm=" + minTerm + " maxTerm=" + maxTerm);
+    }
+    //System.out.println("    " + intersectTermCount + " vs " + normalTermCount);
+    return intersectTermCount != normalTermCount;
+  }
+
+  /** Make an effort to visit "fake" (e.g. auto-prefix) terms.  We do this by running term range intersections across an initially wide
+   *  interval of terms, at different boundaries, and then gradually decrease the interval.  This is not guaranteed to hit all non-real
+   *  terms (doing that in general is non-trivial), but it should hit many of them, and validate their postings against the postings for the
+   *  real terms. */
+  private static void checkTermRanges(String field, int maxDoc, Terms terms, long numTerms) throws IOException {
+
+    // We'll target this many terms in our interval for the current level:
+    double currentInterval = numTerms;
+
+    FixedBitSet normalDocs = new FixedBitSet(maxDoc);
+    FixedBitSet intersectDocs = new FixedBitSet(maxDoc);
+
+    TermsEnum termsEnum = null;
+    //System.out.println("CI.checkTermRanges field=" + field + " numTerms=" + numTerms);
+
+    while (currentInterval >= 10.0) {
+      //System.out.println("  cycle interval=" + currentInterval);
+
+      // We iterate this terms enum to locate min/max term for each sliding/overlapping interval we test at the current level:
+      termsEnum = terms.iterator(termsEnum);
+
+      long termCount = 0;
+
+      Deque<BytesRef> termBounds = new LinkedList<>();
+
+      long lastTermAdded = Long.MIN_VALUE;
+
+      BytesRefBuilder lastTerm = null;
+
+      while (true) {
+        BytesRef term = termsEnum.next();
+        if (term == null) {
+          break;
+        }
+        //System.out.println("  top: term=" + term.utf8ToString());
+        if (termCount >= lastTermAdded + currentInterval/4) {
+          termBounds.add(BytesRef.deepCopyOf(term));
+          lastTermAdded = termCount;
+          if (termBounds.size() == 5) {
+            BytesRef minTerm = termBounds.removeFirst();
+            BytesRef maxTerm = termBounds.getLast();
+            checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
+          }
+        }
+        termCount++;
+
+        if (lastTerm == null) {
+          lastTerm = new BytesRefBuilder();
+          lastTerm.copyBytes(term);
+        } else {
+          if (lastTerm.get().compareTo(term) >= 0) {
+            throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
+          }
+          lastTerm.copyBytes(term);
+        }
+      }
+
+      if (lastTerm != null && termBounds.isEmpty() == false) {
+        BytesRef minTerm = termBounds.removeFirst();
+        BytesRef maxTerm = lastTerm.get();
+        checkSingleTermRange(field, maxDoc, terms, minTerm, maxTerm, normalDocs, intersectDocs);
+      }
+
+      currentInterval *= .75;
+    }
+  }
+
   /**
    * checks Fields api is consistent with itself.
    * searcher is optional, to verify with queries. Can be null.
@@ -922,6 +1100,7 @@ public class CheckIndex implements Close
     
     String lastField = null;
     for (String field : fields) {
+
       // MultiFieldsEnum relies upon this order...
       if (lastField != null && field.compareTo(lastField) <= 0) {
         throw new RuntimeException("fields out of order: lastField=" + lastField + " field=" + field);
@@ -1031,7 +1210,8 @@ public class CheckIndex implements Close
         if (term == null) {
           break;
         }
-
+        // System.out.println("CI: field=" + field + " check term=" + term + " docFreq=" + termsEnum.docFreq());
+        
         assert term.isValid();
         
         // make sure terms arrive in order according to
@@ -1323,13 +1503,21 @@ public class CheckIndex implements Close
         // docs got deleted and then merged away):
         
       } else {
+
+        long fieldTermCount = (status.delTermCount+status.termCount)-termCountStart;
+
+        if (hasFreqs == false) {
+          // For DOCS_ONLY fields we recursively test term ranges:
+          checkTermRanges(field, maxDoc, fieldTerms, fieldTermCount);
+        }
+
         final Object stats = fieldTerms.getStats();
         assert stats != null;
         if (status.blockTreeStats == null) {
           status.blockTreeStats = new HashMap<>();
         }
         status.blockTreeStats.put(field, stats);
-        
+
         if (sumTotalTermFreq != 0) {
           final long v = fields.terms(field).getSumTotalTermFreq();
           if (v != -1 && sumTotalTermFreq != v) {
@@ -1344,11 +1532,9 @@ public class CheckIndex implements Close
           }
         }
         
-        if (fieldTerms != null) {
-          final int v = fieldTerms.getDocCount();
-          if (v != -1 && visitedDocs.cardinality() != v) {
-            throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
-          }
+        final int v = fieldTerms.getDocCount();
+        if (v != -1 && visitedDocs.cardinality() != v) {
+          throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
         }
         
         // Test seek to last term:
@@ -1356,6 +1542,9 @@ public class CheckIndex implements Close
           if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) { 
             throw new RuntimeException("seek to last term " + lastTerm + " failed");
           }
+          if (termsEnum.term().equals(lastTerm.get()) == false) {
+            throw new RuntimeException("seek to last term " + lastTerm.get() + " returned FOUND but seeked to the wrong term " + termsEnum.term());
+          }
           
           int expectedDocFreq = termsEnum.docFreq();
           PostingsEnum d = termsEnum.postings(null, null, PostingsEnum.NONE);
@@ -1364,21 +1553,21 @@ public class CheckIndex implements Close
             docFreq++;
           }
           if (docFreq != expectedDocFreq) {
-            throw new RuntimeException("docFreq for last term " + lastTerm + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
+            throw new RuntimeException("docFreq for last term " + lastTerm.toBytesRef() + "=" + expectedDocFreq + " != recomputed docFreq=" + docFreq);
           }
         }
         
         // check unique term count
         long termCount = -1;
         
-        if ((status.delTermCount+status.termCount)-termCountStart > 0) {
+        if (fieldTermCount > 0) {
           termCount = fields.terms(field).size();
           
-          if (termCount != -1 && termCount != status.delTermCount + status.termCount - termCountStart) {
-            throw new RuntimeException("termCount mismatch " + (status.delTermCount + termCount) + " vs " + (status.termCount - termCountStart));
+          if (termCount != -1 && termCount != fieldTermCount) {
+            throw new RuntimeException("termCount mismatch " + termCount + " vs " + fieldTermCount);
           }
         }
-        
+
         // Test seeking by ord
         if (hasOrd && status.termCount-termCountStart > 0) {
           int seekCount = (int) Math.min(10000L, termCount);
@@ -1398,6 +1587,9 @@ public class CheckIndex implements Close
               if (termsEnum.seekCeil(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) {
                 throw new RuntimeException("seek to existing term " + seekTerms[i] + " failed");
               }
+              if (termsEnum.term().equals(seekTerms[i]) == false) {
+                throw new RuntimeException("seek to existing term " + seekTerms[i] + " returned FOUND but seeked to the wrong term " + termsEnum.term());
+              }
               
               postings = termsEnum.postings(liveDocs, postings, PostingsEnum.NONE);
               if (postings == null) {

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java Thu Apr  2 15:37:39 2015
@@ -151,7 +151,6 @@ class FreqProxFields extends Fields {
     }
 
     public SeekStatus seekCeil(BytesRef text) {
-
       // TODO: we could instead keep the BytesRefHash
       // intact so this is a hash lookup
 
@@ -170,17 +169,19 @@ class FreqProxFields extends Fields {
         } else {
           // found:
           ord = mid;
+          assert term().compareTo(text) == 0;
           return SeekStatus.FOUND;
         }
       }
 
       // not found:
-      ord = lo + 1;
+      ord = lo;
       if (ord >= numTerms) {
         return SeekStatus.END;
       } else {
         int textStart = postingsArray.textStarts[sortedTermIDs[ord]];
         terms.bytePool.setBytesRef(scratch, textStart);
+        assert term().compareTo(text) > 0;
         return SeekStatus.NOT_FOUND;
       }
     }
@@ -309,7 +310,7 @@ class FreqProxFields extends Fields {
     final FreqProxPostingsArray postingsArray;
     final ByteSliceReader reader = new ByteSliceReader();
     final boolean readTermFreq;
-    int docID;
+    int docID = -1;
     int freq;
     boolean ended;
     int termID;
@@ -324,7 +325,7 @@ class FreqProxFields extends Fields {
       this.termID = termID;
       terms.initReader(reader, termID, 0);
       ended = false;
-      docID = 0;
+      docID = -1;
     }
 
     @Override
@@ -365,6 +366,9 @@ class FreqProxFields extends Fields {
 
     @Override
     public int nextDoc() throws IOException {
+      if (docID == -1) {
+        docID = 0;
+      }
       if (reader.eof()) {
         if (ended) {
           return NO_MORE_DOCS;
@@ -412,7 +416,7 @@ class FreqProxFields extends Fields {
     final ByteSliceReader reader = new ByteSliceReader();
     final ByteSliceReader posReader = new ByteSliceReader();
     final boolean readOffsets;
-    int docID;
+    int docID = -1;
     int freq;
     int pos;
     int startOffset;
@@ -436,7 +440,7 @@ class FreqProxFields extends Fields {
       terms.initReader(reader, termID, 0);
       terms.initReader(posReader, termID, 1);
       ended = false;
-      docID = 0;
+      docID = -1;
       posLeft = 0;
     }
 
@@ -452,6 +456,9 @@ class FreqProxFields extends Fields {
 
     @Override
     public int nextDoc() throws IOException {
+      if (docID == -1) {
+        docID = 0;
+      }
       while (posLeft != 0) {
         nextPosition();
       }

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java Thu Apr  2 15:37:39 2015
@@ -49,6 +49,7 @@ final class MappingMultiPostingsEnum ext
     this.numSubs = postingsEnum.getNumSubs();
     this.subs = postingsEnum.getSubs();
     upto = -1;
+    doc = -1;
     current = null;
     this.multiDocsAndPositionsEnum = postingsEnum;
     return this;

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/TermContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/TermContext.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/TermContext.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/TermContext.java Thu Apr  2 15:37:39 2015
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
@@ -165,4 +166,30 @@ public final class TermContext {
   public void setDocFreq(int docFreq) {
     this.docFreq = docFreq;
   }
-}
\ No newline at end of file
+
+  /** Returns true if all terms stored here are real (e.g., not auto-prefix terms).
+   *
+   *  @lucene.internal */
+  public boolean hasOnlyRealTerms() {
+    for(TermState termState : states) {
+      if (termState instanceof BlockTermState && ((BlockTermState) termState).isRealTerm == false) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("TermContext\n");
+    for(TermState termState : states) {
+      sb.append("  state=");
+      sb.append(termState.toString());
+      sb.append('\n');
+    }
+
+    return sb.toString();
+  }
+}

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/Terms.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/index/Terms.java Thu Apr  2 15:37:39 2015
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 
+import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
@@ -42,17 +43,23 @@ public abstract class Terms {
    *  implementation can do so. */
   public abstract TermsEnum iterator(TermsEnum reuse) throws IOException;
 
-  /** Returns a TermsEnum that iterates over all terms that
-   *  are accepted by the provided {@link
+  /** Returns a TermsEnum that iterates over all terms and
+   *  documents that are accepted by the provided {@link
    *  CompiledAutomaton}.  If the <code>startTerm</code> is
-   *  provided then the returned enum will only accept terms
+   *  provided then the returned enum will only return terms
    *  {@code > startTerm}, but you still must call
    *  next() first to get to the first term.  Note that the
    *  provided <code>startTerm</code> must be accepted by
    *  the automaton.
    *
    * <p><b>NOTE</b>: the returned TermsEnum cannot
-   * seek</p>. */
+   * seek</p>.
+   *
+   *  <p><b>NOTE</b>: the terms dictionary is free to
+   *  return arbitrary terms as long as the resulted visited
+   *  docs is the same.  E.g., {@link BlockTreeTermsWriter}
+   *  creates auto-prefix terms during indexing to reduce the
+   *  number of terms visited. */
   public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
     
     // TODO: could we factor out a common interface b/w
@@ -64,13 +71,17 @@ public abstract class Terms {
     // TODO: eventually we could support seekCeil/Exact on
     // the returned enum, instead of only being able to seek
     // at the start
+
+    TermsEnum termsEnum = iterator(null);
+
     if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
       throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
     }
+
     if (startTerm == null) {
-      return new AutomatonTermsEnum(iterator(null), compiled);
+      return new AutomatonTermsEnum(termsEnum, compiled);
     } else {
-      return new AutomatonTermsEnum(iterator(null), compiled) {
+      return new AutomatonTermsEnum(termsEnum, compiled) {
         @Override
         protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
           if (term == null) {

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java Thu Apr  2 15:37:39 2015
@@ -99,6 +99,7 @@ public class AutomatonQuery extends Mult
     super(term.field());
     this.term = term;
     this.automaton = automaton;
+    // TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
     this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
   }
 

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java Thu Apr  2 15:37:39 2015
@@ -17,12 +17,7 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.ToStringUtils;
 import org.apache.lucene.util.automaton.Automaton;
@@ -33,6 +28,7 @@ import org.apache.lucene.util.automaton.
  * <p>This query uses the {@link
  * MultiTermQuery#CONSTANT_SCORE_REWRITE}
  * rewrite method. */
+
 public class PrefixQuery extends AutomatonQuery {
 
   /** Constructs a query for terms starting with <code>prefix</code>. */

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java Thu Apr  2 15:37:39 2015
@@ -18,19 +18,19 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
-
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 
 /** 
  * Base rewrite method that translates each term into a query, and keeps
@@ -112,7 +112,7 @@ public abstract class ScoringRewrite<Q e
       for (int i = 0; i < size; i++) {
         final int pos = sort[i];
         final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
-        assert reader.docFreq(term) == termStates[pos].docFreq();
+        assert termStates[pos].hasOnlyRealTerms() == false || reader.docFreq(term) == termStates[pos].docFreq();
         addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
       }
     }
@@ -137,7 +137,7 @@ public abstract class ScoringRewrite<Q e
       final int e = terms.add(bytes);
       final TermState state = termsEnum.termState();
       assert state != null; 
-      if (e < 0 ) {
+      if (e < 0) {
         // duplicate term: update docFreq
         final int pos = (-e)-1;
         array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/search/TermRangeQuery.java Thu Apr  2 15:37:39 2015
@@ -17,22 +17,17 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
 
 /**
  * A Query that matches documents within an range of terms.
  *
  * <p>This query matches the documents looking for terms that fall into the
- * supplied range according to {@link
- * Byte#compareTo(Byte)}. It is not intended
- * for numerical ranges; use {@link NumericRangeQuery} instead.
+ * supplied range according to {@link BytesRef#compareTo(BytesRef)}.
  *
  * <p>This query uses the {@link
  * MultiTermQuery#CONSTANT_SCORE_REWRITE}
@@ -40,12 +35,11 @@ import org.apache.lucene.util.ToStringUt
  * @since 2.9
  */
 
-public class TermRangeQuery extends MultiTermQuery {
-  private BytesRef lowerTerm;
-  private BytesRef upperTerm;
-  private boolean includeLower;
-  private boolean includeUpper;
-
+public class TermRangeQuery extends AutomatonQuery {
+  private final BytesRef lowerTerm;
+  private final BytesRef upperTerm;
+  private final boolean includeLower;
+  private final boolean includeUpper;
 
   /**
    * Constructs a query selecting all terms greater/equal than <code>lowerTerm</code>
@@ -70,13 +64,28 @@ public class TermRangeQuery extends Mult
    *          included in the range.
    */
   public TermRangeQuery(String field, BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
-    super(field);
+    super(new Term(field, lowerTerm), toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper), Integer.MAX_VALUE, true);
     this.lowerTerm = lowerTerm;
     this.upperTerm = upperTerm;
     this.includeLower = includeLower;
     this.includeUpper = includeUpper;
   }
 
+  public static Automaton toAutomaton(BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
+
+    if (lowerTerm == null) {
+      // makeBinaryInterval is more picky than we are:
+      includeLower = true;
+    }
+
+    if (upperTerm == null) {
+      // makeBinaryInterval is more picky than we are:
+      includeUpper = true;
+    }
+
+    return Automata.makeBinaryInterval(lowerTerm, includeLower, upperTerm, includeUpper);
+  }
+
   /**
    * Factory that creates a new TermRangeQuery using Strings for term text.
    */
@@ -98,37 +107,22 @@ public class TermRangeQuery extends Mult
   /** Returns <code>true</code> if the upper endpoint is inclusive */
   public boolean includesUpper() { return includeUpper; }
   
-  @Override
-  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
-    if (lowerTerm != null && upperTerm != null && lowerTerm.compareTo(upperTerm) > 0) {
-      return TermsEnum.EMPTY;
-    }
-    
-    TermsEnum tenum = terms.iterator(null);
-    
-    if ((lowerTerm == null || (includeLower && lowerTerm.length == 0)) && upperTerm == null) {
-      return tenum;
-    }
-    return new TermRangeTermsEnum(tenum,
-        lowerTerm, upperTerm, includeLower, includeUpper);
-  }
-
   /** Prints a user-readable version of this query. */
   @Override
   public String toString(String field) {
-      StringBuilder buffer = new StringBuilder();
-      if (!getField().equals(field)) {
-          buffer.append(getField());
-          buffer.append(":");
-      }
-      buffer.append(includeLower ? '[' : '{');
-      // TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8!
-      buffer.append(lowerTerm != null ? ("*".equals(Term.toString(lowerTerm)) ? "\\*" : Term.toString(lowerTerm))  : "*");
-      buffer.append(" TO ");
-      buffer.append(upperTerm != null ? ("*".equals(Term.toString(upperTerm)) ? "\\*" : Term.toString(upperTerm)) : "*");
-      buffer.append(includeUpper ? ']' : '}');
-      buffer.append(ToStringUtils.boost(getBoost()));
-      return buffer.toString();
+    StringBuilder buffer = new StringBuilder();
+    if (!getField().equals(field)) {
+      buffer.append(getField());
+      buffer.append(":");
+    }
+    buffer.append(includeLower ? '[' : '{');
+    // TODO: all these toStrings for queries should just output the bytes, it might not be UTF-8!
+    buffer.append(lowerTerm != null ? ("*".equals(Term.toString(lowerTerm)) ? "\\*" : Term.toString(lowerTerm))  : "*");
+    buffer.append(" TO ");
+    buffer.append(upperTerm != null ? ("*".equals(Term.toString(upperTerm)) ? "\\*" : Term.toString(upperTerm)) : "*");
+    buffer.append(includeUpper ? ']' : '}');
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
   }
 
   @Override
@@ -167,5 +161,4 @@ public class TermRangeQuery extends Mult
       return false;
     return true;
   }
-
 }

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java Thu Apr  2 15:37:39 2015
@@ -72,6 +72,18 @@ final public class Automata {
     a.finishState();
     return a;
   }
+
+  /**
+   * Returns a new (deterministic) automaton that accepts all binary terms.
+   */
+  public static Automaton makeAnyBinary() {
+    Automaton a = new Automaton();
+    int s = a.createState();
+    a.setAccept(s, true);
+    a.addTransition(s, s, 0, 255);
+    a.finishState();
+    return a;
+  }
   
   /**
    * Returns a new (deterministic) automaton that accepts any single codepoint.
@@ -204,8 +216,172 @@ final public class Automata {
     return s;
   }
 
+  /** Creates a new deterministic, minimal automaton accepting
+   *  all binary terms in the specified interval.  Note that unlike
+   *  {@link #makeDecimalInterval}, the returned automaton is infinite,
+   *  because terms behave like floating point numbers leading with
+   *  a decimal point.  However, in the special case where min == max,
+   *  and both are inclusive, the automata will be finite and accept
+   *  exactly one term. */
+  public static Automaton makeBinaryInterval(BytesRef min, boolean minInclusive, BytesRef max, boolean maxInclusive) {
+
+    if (min == null && minInclusive == false) {
+      throw new IllegalArgumentException("minInclusive must be true when min is null (open ended)");
+    }
+
+    if (max == null && maxInclusive == false) {
+      throw new IllegalArgumentException("maxInclusive must be true when max is null (open ended)");
+    }
+
+    if (min != null && min.length == 0 && minInclusive == true) {
+      // Silly empty string corner case:
+      min = null;
+    }
+
+    if (min == null) {
+      if (max == null) {
+        // Accepts all terms:
+        return makeAnyBinary();
+      }
+      min = new BytesRef();
+      minInclusive = true;
+    }
+    int cmp;
+    if (max != null) {
+      cmp = min.compareTo(max);
+    } else {
+      cmp = -1;
+    }
+    if (cmp == 0) {
+      if (minInclusive == false || maxInclusive == false) {
+        return makeEmpty();
+      } else {
+        return makeBinary(min);
+      }
+    } else if (cmp > 0) {
+      // max > min
+      return makeEmpty();
+    }
+
+    Automaton a = new Automaton();
+    int startState = a.createState();
+    int sinkState = a.createState();
+    a.setAccept(sinkState, true);
+
+    // This state accepts all suffixes:
+    a.addTransition(sinkState, sinkState, 0, 255);
+
+    boolean equalPrefix = true;
+    int lastState = startState;
+    int firstMaxState = -1;
+    int sharedPrefixLength = 0;
+    for(int i=0;i<min.length;i++) {
+      int minLabel = min.bytes[min.offset+i] & 0xff;
+
+      int maxLabel;
+      if (max != null && equalPrefix && i < max.length) {
+        maxLabel = max.bytes[max.offset+i] & 0xff;
+      } else {
+        maxLabel = -1;
+      }
+
+      int nextState;
+      if (minInclusive && i == min.length-1 && (equalPrefix == false || minLabel != maxLabel)) {
+        nextState = sinkState;
+      } else {
+        nextState = a.createState();
+      }
+
+      if (equalPrefix) {
+
+        if (minLabel == maxLabel) {
+          // Still in shared prefix
+          a.addTransition(lastState, nextState, minLabel);
+        } else if (max == null) {
+          equalPrefix = false;
+          sharedPrefixLength = 0;
+          a.addTransition(lastState, sinkState, minLabel+1, 0xff);
+          a.addTransition(lastState, nextState, minLabel);
+        } else {
+          // This is the first point where min & max diverge:
+          assert maxLabel > minLabel;
+
+          a.addTransition(lastState, nextState, minLabel);
+
+          if (maxLabel > minLabel + 1) {
+            a.addTransition(lastState, sinkState, minLabel+1, maxLabel-1);
+          }
+
+          // Now fork off path for max:
+          if (maxInclusive || i < max.length-1) {
+            firstMaxState = a.createState();
+            if (i < max.length-1) {
+              a.setAccept(firstMaxState, true);
+            }
+            a.addTransition(lastState, firstMaxState, maxLabel);
+          }
+          equalPrefix = false;
+          sharedPrefixLength = i;
+        }
+      } else {
+        // OK, already diverged:
+        a.addTransition(lastState, nextState, minLabel);
+        if (minLabel < 255) {
+          a.addTransition(lastState, sinkState, minLabel+1, 255);
+        }
+      }
+      lastState = nextState;
+    }
+
+    // Accept any suffix appended to the min term:
+    if (equalPrefix == false && lastState != sinkState && lastState != startState) {
+      a.addTransition(lastState, sinkState, 0, 255);
+    }
+
+    if (minInclusive) {
+      // Accept exactly the min term:
+      a.setAccept(lastState, true);
+    }
+
+    if (max != null) {
+
+      // Now do max:
+      if (firstMaxState == -1) {
+        // Min was a full prefix of max
+        sharedPrefixLength = min.length;
+      } else {
+        lastState = firstMaxState;
+        sharedPrefixLength++;
+      }
+      for(int i=sharedPrefixLength;i<max.length;i++) {
+        int maxLabel = max.bytes[max.offset+i]&0xff;
+        if (maxLabel > 0) {
+          a.addTransition(lastState, sinkState, 0, maxLabel-1);
+        }
+        if (maxInclusive || i < max.length-1) {
+          int nextState = a.createState();
+          if (i < max.length-1) {
+            a.setAccept(nextState, true);
+          }
+          a.addTransition(lastState, nextState, maxLabel);
+          lastState = nextState;
+        }
+      }
+
+      if (maxInclusive) {
+        a.setAccept(lastState, true);
+      }
+    }
+
+    a.finishState();
+
+    assert a.isDeterministic(): a.toDot();
+
+    return a;
+  }
+
   /**
-   * Returns a new automaton that accepts strings representing decimal
+   * Returns a new automaton that accepts strings representing decimal (base 10)
    * non-negative integers in the given interval.
    * 
    * @param min minimal value of interval
@@ -218,7 +394,7 @@ final public class Automata {
    *              interval cannot be expressed with the given fixed number of
    *              digits
    */
-  public static Automaton makeInterval(int min, int max, int digits)
+  public static Automaton makeDecimalInterval(int min, int max, int digits)
       throws IllegalArgumentException {
     String x = Integer.toString(min);
     String y = Integer.toString(max);
@@ -275,7 +451,30 @@ final public class Automata {
     for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
       int state = a.createState();
       cp = s.codePointAt(i);
-      a.addTransition(lastState, state, cp, cp);
+      a.addTransition(lastState, state, cp);
+      lastState = state;
+    }
+
+    a.setAccept(lastState, true);
+    a.finishState();
+
+    assert a.isDeterministic();
+    assert Operations.hasDeadStates(a) == false;
+
+    return a;
+  }
+
+  /**
+   * Returns a new (deterministic) automaton that accepts the single given
+   * binary term.
+   */
+  public static Automaton makeBinary(BytesRef term) {
+    Automaton a = new Automaton();
+    int lastState = a.createState();
+    for (int i=0;i<term.length;i++) {
+      int state = a.createState();
+      int label = term.bytes[term.offset+i] & 0xff;
+      a.addTransition(lastState, state, label);
       lastState = state;
     }
 

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java Thu Apr  2 15:37:39 2015
@@ -491,11 +491,50 @@ public class Automaton implements Accoun
   public void getNextTransition(Transition t) {
     // Make sure there is still a transition left:
     assert (t.transitionUpto+3 - states[2*t.source]) <= 3*states[2*t.source+1];
+
+    // Make sure transitions are in fact sorted:
+    assert transitionSorted(t);
+
     t.dest = transitions[t.transitionUpto++];
     t.min = transitions[t.transitionUpto++];
     t.max = transitions[t.transitionUpto++];
   }
 
+  private boolean transitionSorted(Transition t) {
+
+    int upto = t.transitionUpto;
+    if (upto == states[2*t.source]) {
+      // Transition isn't initialzed yet (this is the first transition); don't check:
+      return true;
+    }
+
+    int nextDest = transitions[upto];
+    int nextMin = transitions[upto+1];
+    int nextMax = transitions[upto+2];
+    if (nextMin > t.min) {
+      return true;
+    } else if (nextMin < t.min) {
+      return false;
+    }
+
+    // Min is equal, now test max:
+    if (nextMax > t.max) {
+      return true;
+    } else if (nextMax < t.max) {
+      return false;
+    }
+
+    // Max is also equal, now test dest:
+    if (nextDest > t.dest) {
+      return true;
+    } else if (nextDest < t.dest) {
+      return false;
+    }
+
+    // We should never see fully equal transitions here:
+    return false;
+  }
+
   /** Fill the provided {@link Transition} with the index'th
    *  transition leaving the specified state. */
   public void getTransition(int state, int index, Transition t) {
@@ -565,7 +604,7 @@ public class Automaton implements Accoun
       //System.out.println("toDot: state " + state + " has " + numTransitions + " transitions; t.nextTrans=" + t.transitionUpto);
       for(int i=0;i<numTransitions;i++) {
         getNextTransition(t);
-        //System.out.println("  t.nextTrans=" + t.transitionUpto);
+        //System.out.println("  t.nextTrans=" + t.transitionUpto + " t=" + t);
         assert t.max >= t.min;
         b.append("  ");
         b.append(state);

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java Thu Apr  2 15:37:39 2015
@@ -28,8 +28,8 @@ public class ByteRunAutomaton extends Ru
   }
   
   /** expert: if utf8 is true, the input is already byte-based */
-  public ByteRunAutomaton(Automaton a, boolean utf8, int maxDeterminizedStates) {
-    super(utf8 ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
+  public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
+    super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
   }
 
   /**

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Thu Apr  2 15:37:39 2015
@@ -90,12 +90,41 @@ public class CompiledAutomaton {
    */
   public final Boolean finite;
 
+  /** Which state, if any, accepts all suffixes, else -1. */
+  public final int sinkState;
+
   /** Create this, passing simplify=true and finite=null, so that we try
    *  to simplify the automaton and determine if it is finite. */
   public CompiledAutomaton(Automaton automaton) {
     this(automaton, null, true);
   }
 
+  /** Returns sink state, if present, else -1. */
+  private static int findSinkState(Automaton automaton) {
+    int numStates = automaton.getNumStates();
+    Transition t = new Transition();
+    int foundState = -1;
+    for (int s=0;s<numStates;s++) {
+      if (automaton.isAccept(s)) {
+        int count = automaton.initTransition(s, t);
+        boolean isSinkState = false;
+        for(int i=0;i<count;i++) {
+          automaton.getNextTransition(t);
+          if (t.dest == s && t.min == 0 && t.max == 0xff) {
+            isSinkState = true;
+            break;
+          }
+        }
+        if (isSinkState) {
+          foundState = s;
+          break;
+        }
+      }
+    }
+
+    return foundState;
+  }
+
   /** Create this.  If finite is null, we use {@link Operations#isFinite}
    *  to determine whether it is finite.  If simplify is true, we run
    *  possibly expensive operations to determine if the automaton is one
@@ -134,6 +163,7 @@ public class CompiledAutomaton {
         runAutomaton = null;
         this.automaton = null;
         this.finite = null;
+        sinkState = -1;
         return;
       }
 
@@ -154,6 +184,7 @@ public class CompiledAutomaton {
         runAutomaton = null;
         this.automaton = null;
         this.finite = null;
+        sinkState = -1;
         return;
       }
 
@@ -174,7 +205,7 @@ public class CompiledAutomaton {
         } else {
           term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
         }
-
+        sinkState = -1;
         return;
       }
     }
@@ -202,7 +233,8 @@ public class CompiledAutomaton {
     if (this.finite) {
       commonSuffixRef = null;
     } else {
-      // NOTE: this is a very costly operation!  We should test if it's really warranted in practice...
+      // NOTE: this is a very costly operation!  We should test if it's really warranted in practice... we could do a fast match
+      // by looking for a sink state (which means it has no common suffix).  Or maybe we shouldn't do it when simplify is false?:
       BytesRef suffix = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
       if (suffix.length == 0) {
         commonSuffixRef = null;
@@ -215,6 +247,10 @@ public class CompiledAutomaton {
     runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
 
     this.automaton = runAutomaton.automaton;
+
+    // TODO: this is a bit fragile because if the automaton is not minimized there could be more than 1 sink state but auto-prefix will fail
+    // to run for those:
+    sinkState = findSinkState(this.automaton);
   }
 
   private Transition transition = new Transition();

Modified: lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java Thu Apr  2 15:37:39 2015
@@ -599,7 +599,7 @@ public class RegExp {
         a = aa;
         break;
       case REGEXP_INTERVAL:
-        a = Automata.makeInterval(min, max, digits);
+        a = Automata.makeDecimalInterval(min, max, digits);
         break;
     }
     return a;

Modified: lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java Thu Apr  2 15:37:39 2015
@@ -117,8 +117,8 @@ public class TestAutomatonQuery extends
     assertAutomatonHits(2, Automata.makeString("doc"));
     assertAutomatonHits(1, Automata.makeChar('a'));
     assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
-    assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0));
-    assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0));
+    assertAutomatonHits(2, Automata.makeDecimalInterval(1233, 2346, 0));
+    assertAutomatonHits(1, Automata.makeDecimalInterval(0, 2000, 0));
     assertAutomatonHits(2, Operations.union(Automata.makeChar('a'),
         Automata.makeChar('b')));
     assertAutomatonHits(0, Operations.intersection(Automata
@@ -194,7 +194,6 @@ public class TestAutomatonQuery extends
     Automaton pfx = Automata.makeString("do");
     Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
     AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
-    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
     assertEquals(3, automatonQueryNrHits(aq));
   }
   

Modified: lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java?rev=1670929&r1=1670928&r2=1670929&view=diff
==============================================================================
--- lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java (original)
+++ lucene/dev/branches/lucene6271/lucene/core/src/test/org/apache/lucene/search/TestMultiTermQueryRewrites.java Thu Apr  2 15:37:39 2015
@@ -17,16 +17,19 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
@@ -34,8 +37,6 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 
-import java.io.IOException;
-
 public class TestMultiTermQueryRewrites extends LuceneTestCase {
 
   static Directory dir, sdir1, sdir2;
@@ -152,14 +153,27 @@ public class TestMultiTermQueryRewrites
     final MultiTermQuery mtq = new MultiTermQuery("data") {
       @Override
       protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
-        return new TermRangeTermsEnum(terms.iterator(null), new BytesRef("2"), new BytesRef("7"), true, true) {
+        return new FilteredTermsEnum(terms.iterator(null)) {
+
           final BoostAttribute boostAtt =
             attributes().addAttribute(BoostAttribute.class);
         
           @Override
           protected AcceptStatus accept(BytesRef term) {
             boostAtt.setBoost(Float.parseFloat(term.utf8ToString()));
-            return super.accept(term);
+            if (term.length == 0) {
+              return AcceptStatus.NO;
+            }
+            char c = (char) (term.bytes[term.offset] & 0xff);
+            if (c >= '2') {
+              if (c <= '7') {
+                return AcceptStatus.YES;
+              } else {
+                return AcceptStatus.END;
+              }
+            } else {
+              return AcceptStatus.NO;
+            }
           }
         };
       }



Mime
View raw message