lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From busc...@apache.org
Subject svn commit: r966819 [7/20] - in /lucene/dev/branches/realtime_search: ./ lucene/ lucene/backwards/ lucene/contrib/ lucene/contrib/benchmark/conf/ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ lucene/contrib/benchmark/src/j...
Date Thu, 22 Jul 2010 19:34:52 GMT
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java Thu Jul 22 19:34:35 2010
@@ -17,7 +17,7 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -33,7 +33,9 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
 
 /**
  * This class is generated by JavaCC.  The most important method is
@@ -107,6 +109,8 @@ import org.apache.lucene.util.Version;
  * <ul>
  *    <li> As of 2.9, {@link #setEnablePositionIncrements} is true by
  *         default.
+ *    <li> As of 3.1, {@link #setAutoGeneratePhraseQueries} is false by
+ *         default.
  * </ul>
  */
 public class QueryParser implements QueryParserConstants {
@@ -150,6 +154,19 @@ public class QueryParser implements Quer
   // for use when constructing RangeQuerys.
   Collator rangeCollator = null;
 
+  /** @deprecated remove when getFieldQuery is removed */
+  private static final VirtualMethod<QueryParser> getFieldQueryMethod =
+    new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class);
+  /** @deprecated remove when getFieldQuery is removed */
+  private static final VirtualMethod<QueryParser> getFieldQueryWithQuotedMethod =
+    new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class, boolean.class);
+  /** @deprecated remove when getFieldQuery is removed */
+  private final boolean hasNewAPI =
+    VirtualMethod.compareImplementationDistance(getClass(),
+        getFieldQueryWithQuotedMethod, getFieldQueryMethod) >= 0; // it's OK for both to be overridden
+
+  private boolean autoGeneratePhraseQueries;
+
   /** The default operator for parsing queries. 
    * Use {@link QueryParser#setDefaultOperator} to change it.
    */
@@ -169,6 +186,11 @@ public class QueryParser implements Quer
     } else {
       enablePositionIncrements = false;
     }
+    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+      setAutoGeneratePhraseQueries(false);
+    } else {
+      setAutoGeneratePhraseQueries(true);
+    }
   }
 
   /** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -214,6 +236,29 @@ public class QueryParser implements Quer
     return field;
   }
 
+  /**
+   * @see #setAutoGeneratePhraseQueries(boolean)
+   */
+  public final boolean getAutoGeneratePhraseQueries() {
+    return autoGeneratePhraseQueries;
+  }
+
+  /**
+   * Set to true if phrase queries will be automatically generated
+   * when the analyzer returns more than one term from whitespace
+   * delimited text.
+   * NOTE: this behavior may not be suitable for all languages.
+   * <p>
+   * Set to false if phrase queries should only be generated when
+   * surrounded by double quotes.
+   */
+  public final void setAutoGeneratePhraseQueries(boolean value) {
+    if (value == false && !hasNewAPI)
+      throw new IllegalArgumentException("You must implement the new API: getFieldQuery(String,String,boolean)"
+       + " to use setAutoGeneratePhraseQueries(false)");
+    this.autoGeneratePhraseQueries = value;
+  }
+
    /**
    * Get the minimal similarity for fuzzy queries.
    */
@@ -506,11 +551,19 @@ public class QueryParser implements Quer
       throw new RuntimeException("Clause cannot be both required and prohibited");
   }
 
+  /**
+   * @deprecated Use {@link #getFieldQuery(String,String,boolean)} instead.
+   */
+  @Deprecated
+  protected Query getFieldQuery(String field, String queryText) throws ParseException {
+    // treat the text as if it was quoted, to drive phrase logic with old versions.
+    return getFieldQuery(field, queryText, true);
+  }
 
   /**
    * @exception ParseException throw in overridden method to disallow
    */
-  protected Query getFieldQuery(String field, String queryText)  throws ParseException {
+  protected Query getFieldQuery(String field, String queryText, boolean quoted)  throws ParseException {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
 
@@ -522,7 +575,7 @@ public class QueryParser implements Quer
       source = analyzer.tokenStream(field, new StringReader(queryText));
     }
     CachingTokenFilter buffer = new CachingTokenFilter(source);
-    CharTermAttribute termAtt = null;
+    TermToBytesRefAttribute termAtt = null;
     PositionIncrementAttribute posIncrAtt = null;
     int numTokens = 0;
 
@@ -534,8 +587,8 @@ public class QueryParser implements Quer
       // success==false if we hit an exception
     }
     if (success) {
-      if (buffer.hasAttribute(CharTermAttribute.class)) {
-        termAtt = buffer.getAttribute(CharTermAttribute.class);
+      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
       }
       if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
         posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
@@ -577,33 +630,37 @@ public class QueryParser implements Quer
     if (numTokens == 0)
       return null;
     else if (numTokens == 1) {
-      String term = null;
+      BytesRef term = new BytesRef();
       try {
         boolean hasNext = buffer.incrementToken();
         assert hasNext == true;
-        term = termAtt.toString();
+        termAtt.toBytesRef(term);
       } catch (IOException e) {
         // safe to ignore, because we know the number of tokens
       }
       return newTermQuery(new Term(field, term));
     } else {
-      if (severalTokensAtSamePosition) {
-        if (positionCount == 1) {
+      if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
+        if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
           // no phrase query:
-          BooleanQuery q = newBooleanQuery(true);
+          BooleanQuery q = newBooleanQuery(positionCount == 1);
+
+          BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
+            BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+
           for (int i = 0; i < numTokens; i++) {
-            String term = null;
+            BytesRef term = new BytesRef();
             try {
               boolean hasNext = buffer.incrementToken();
               assert hasNext == true;
-              term = termAtt.toString();
+              termAtt.toBytesRef(term);
             } catch (IOException e) {
               // safe to ignore, because we know the number of tokens
             }
 
             Query currentQuery = newTermQuery(
                 new Term(field, term));
-            q.add(currentQuery, BooleanClause.Occur.SHOULD);
+            q.add(currentQuery, occur);
           }
           return q;
         }
@@ -614,12 +671,12 @@ public class QueryParser implements Quer
           List<Term> multiTerms = new ArrayList<Term>();
           int position = -1;
           for (int i = 0; i < numTokens; i++) {
-            String term = null;
+            BytesRef term = new BytesRef();
             int positionIncrement = 1;
             try {
               boolean hasNext = buffer.incrementToken();
               assert hasNext == true;
-              term = termAtt.toString();
+              termAtt.toBytesRef(term);
               if (posIncrAtt != null) {
                 positionIncrement = posIncrAtt.getPositionIncrement();
               }
@@ -653,13 +710,13 @@ public class QueryParser implements Quer
 
 
         for (int i = 0; i < numTokens; i++) {
-          String term = null;
+          BytesRef term = new BytesRef();
           int positionIncrement = 1;
 
           try {
             boolean hasNext = buffer.incrementToken();
             assert hasNext == true;
-            term = termAtt.toString();
+            termAtt.toBytesRef(term);
             if (posIncrAtt != null) {
               positionIncrement = posIncrAtt.getPositionIncrement();
             }
@@ -682,7 +739,7 @@ public class QueryParser implements Quer
 
 
   /**
-   * Base implementation delegates to {@link #getFieldQuery(String,String)}.
+   * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
    * This method may be overridden, for example, to return
    * a SpanNearQuery instead of a PhraseQuery.
    *
@@ -690,7 +747,7 @@ public class QueryParser implements Quer
    */
   protected Query getFieldQuery(String field, String queryText, int slop)
         throws ParseException {
-    Query query = getFieldQuery(field, queryText);
+    Query query = hasNewAPI ? getFieldQuery(field, queryText, true) : getFieldQuery(field, queryText);
 
     if (query instanceof PhraseQuery) {
       ((PhraseQuery) query).setSlop(slop);
@@ -1343,7 +1400,7 @@ public class QueryParser implements Quer
          }
          q = getFuzzyQuery(field, termImage,fms);
        } else {
-         q = getFieldQuery(field, termImage);
+         q = hasNewAPI ? getFieldQuery(field, termImage, false) : getFieldQuery(field, termImage);
        }
       break;
     case RANGEIN_START:
@@ -1512,12 +1569,6 @@ public class QueryParser implements Quer
     finally { jj_save(0, xla); }
   }
 
-  private boolean jj_3R_2() {
-    if (jj_scan_token(TERM)) return true;
-    if (jj_scan_token(COLON)) return true;
-    return false;
-  }
-
   private boolean jj_3_1() {
     Token xsp;
     xsp = jj_scanpos;
@@ -1534,6 +1585,12 @@ public class QueryParser implements Quer
     return false;
   }
 
+  private boolean jj_3R_2() {
+    if (jj_scan_token(TERM)) return true;
+    if (jj_scan_token(COLON)) return true;
+    return false;
+  }
+
   /** Generated Token Manager. */
   public QueryParserTokenManager token_source;
   /** Current token. */

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj Thu Jul 22 19:34:35 2010
@@ -41,7 +41,7 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -57,7 +57,9 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
 
 /**
  * This class is generated by JavaCC.  The most important method is
@@ -131,6 +133,8 @@ import org.apache.lucene.util.Version;
  * <ul>
  *    <li> As of 2.9, {@link #setEnablePositionIncrements} is true by
  *         default.
+ *    <li> As of 3.1, {@link #setAutoGeneratePhraseQueries} is false by
+ *         default.
  * </ul>
  */
 public class QueryParser {
@@ -174,6 +178,19 @@ public class QueryParser {
   // for use when constructing RangeQuerys.
   Collator rangeCollator = null;
 
+  /** @deprecated remove when getFieldQuery is removed */
+  private static final VirtualMethod<QueryParser> getFieldQueryMethod =
+    new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class);
+  /** @deprecated remove when getFieldQuery is removed */
+  private static final VirtualMethod<QueryParser> getFieldQueryWithQuotedMethod =
+    new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class, boolean.class);
+  /** @deprecated remove when getFieldQuery is removed */
+  private final boolean hasNewAPI = 
+    VirtualMethod.compareImplementationDistance(getClass(), 
+        getFieldQueryWithQuotedMethod, getFieldQueryMethod) >= 0; // it's OK for both to be overridden
+
+  private boolean autoGeneratePhraseQueries;
+
   /** The default operator for parsing queries. 
    * Use {@link QueryParser#setDefaultOperator} to change it.
    */
@@ -193,6 +210,11 @@ public class QueryParser {
     } else {
       enablePositionIncrements = false;
     }
+    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+      setAutoGeneratePhraseQueries(false);
+    } else {
+      setAutoGeneratePhraseQueries(true);
+    }
   }
 
   /** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -238,6 +260,29 @@ public class QueryParser {
     return field;
   }
 
+  /**
+   * @see #setAutoGeneratePhraseQueries(boolean)
+   */
+  public final boolean getAutoGeneratePhraseQueries() {
+    return autoGeneratePhraseQueries;
+  }
+  
+  /**
+   * Set to true if phrase queries will be automatically generated
+   * when the analyzer returns more than one term from whitespace
+   * delimited text.
+   * NOTE: this behavior may not be suitable for all languages.
+   * <p>
+   * Set to false if phrase queries should only be generated when
+   * surrounded by double quotes.
+   */
+  public final void setAutoGeneratePhraseQueries(boolean value) {
+    if (value == false && !hasNewAPI)
+      throw new IllegalArgumentException("You must implement the new API: getFieldQuery(String,String,boolean)"
+       + " to use setAutoGeneratePhraseQueries(false)");
+    this.autoGeneratePhraseQueries = value;
+  }
+  
    /**
    * Get the minimal similarity for fuzzy queries.
    */
@@ -530,11 +575,19 @@ public class QueryParser {
       throw new RuntimeException("Clause cannot be both required and prohibited");
   }
 
+  /**
+   * @deprecated Use {@link #getFieldQuery(String,String,boolean)} instead.
+   */
+  @Deprecated
+  protected Query getFieldQuery(String field, String queryText) throws ParseException {
+    // treat the text as if it was quoted, to drive phrase logic with old versions.
+    return getFieldQuery(field, queryText, true);
+  }
 
   /**
    * @exception ParseException throw in overridden method to disallow
    */
-  protected Query getFieldQuery(String field, String queryText)  throws ParseException {
+  protected Query getFieldQuery(String field, String queryText, boolean quoted)  throws ParseException {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
 
@@ -546,7 +599,7 @@ public class QueryParser {
       source = analyzer.tokenStream(field, new StringReader(queryText));
     }
     CachingTokenFilter buffer = new CachingTokenFilter(source);
-    CharTermAttribute termAtt = null;
+    TermToBytesRefAttribute termAtt = null;
     PositionIncrementAttribute posIncrAtt = null;
     int numTokens = 0;
 
@@ -558,8 +611,8 @@ public class QueryParser {
       // success==false if we hit an exception
     }
     if (success) {
-      if (buffer.hasAttribute(CharTermAttribute.class)) {
-        termAtt = buffer.getAttribute(CharTermAttribute.class);
+      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
       }
       if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
         posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
@@ -601,33 +654,37 @@ public class QueryParser {
     if (numTokens == 0)
       return null;
     else if (numTokens == 1) {
-      String term = null;
+      BytesRef term = new BytesRef();
       try {
         boolean hasNext = buffer.incrementToken();
         assert hasNext == true;
-        term = termAtt.toString();
+        termAtt.toBytesRef(term);
       } catch (IOException e) {
         // safe to ignore, because we know the number of tokens
       }
       return newTermQuery(new Term(field, term));
     } else {
-      if (severalTokensAtSamePosition) {
-        if (positionCount == 1) {
+      if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
+        if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
           // no phrase query:
-          BooleanQuery q = newBooleanQuery(true);
+          BooleanQuery q = newBooleanQuery(positionCount == 1);
+          
+          BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ? 
+            BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+
           for (int i = 0; i < numTokens; i++) {
-            String term = null;
+            BytesRef term = new BytesRef();
             try {
               boolean hasNext = buffer.incrementToken();
               assert hasNext == true;
-              term = termAtt.toString();
+              termAtt.toBytesRef(term);
             } catch (IOException e) {
               // safe to ignore, because we know the number of tokens
             }
 
             Query currentQuery = newTermQuery(
                 new Term(field, term));
-            q.add(currentQuery, BooleanClause.Occur.SHOULD);
+            q.add(currentQuery, occur);
           }
           return q;
         }
@@ -638,12 +695,12 @@ public class QueryParser {
           List<Term> multiTerms = new ArrayList<Term>();
           int position = -1;
           for (int i = 0; i < numTokens; i++) {
-            String term = null;
+            BytesRef term = new BytesRef();
             int positionIncrement = 1;
             try {
               boolean hasNext = buffer.incrementToken();
               assert hasNext == true;
-              term = termAtt.toString();
+              termAtt.toBytesRef(term);
               if (posIncrAtt != null) {
                 positionIncrement = posIncrAtt.getPositionIncrement();
               }
@@ -677,13 +734,13 @@ public class QueryParser {
 
 
         for (int i = 0; i < numTokens; i++) {
-          String term = null;
+          BytesRef term = new BytesRef();
           int positionIncrement = 1;
 
           try {
             boolean hasNext = buffer.incrementToken();
             assert hasNext == true;
-            term = termAtt.toString();
+            termAtt.toBytesRef(term);
             if (posIncrAtt != null) {
               positionIncrement = posIncrAtt.getPositionIncrement();
             }
@@ -706,7 +763,7 @@ public class QueryParser {
 
 
   /**
-   * Base implementation delegates to {@link #getFieldQuery(String,String)}.
+   * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}.
    * This method may be overridden, for example, to return
    * a SpanNearQuery instead of a PhraseQuery.
    *
@@ -714,7 +771,7 @@ public class QueryParser {
    */
   protected Query getFieldQuery(String field, String queryText, int slop)
         throws ParseException {
-    Query query = getFieldQuery(field, queryText);
+    Query query = hasNewAPI ? getFieldQuery(field, queryText, true) : getFieldQuery(field, queryText);
 
     if (query instanceof PhraseQuery) {
       ((PhraseQuery) query).setSlop(slop);
@@ -1314,7 +1371,7 @@ Query Term(String field) : {
        	 }
        	 q = getFuzzyQuery(field, termImage,fms);
        } else {
-         q = getFieldQuery(field, termImage);
+         q = hasNewAPI ? getFieldQuery(field, termImage, false) : getFieldQuery(field, termImage);
        }
      }
      | ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java Thu Jul 22 19:34:35 2010
@@ -15,7 +15,7 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -31,7 +31,9 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
 
 /** Token Manager. */
 public class QueryParserTokenManager implements QueryParserConstants

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Thu Jul 22 19:34:35 2010
@@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends 
     // build a cache of sorted transitions for every state
     allTransitions = new Transition[runAutomaton.getSize()][];
     for (State state : this.automaton.getNumberedStates()) {
-      state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+      state.sortTransitions(Transition.CompareByMinMaxThenDest);
       state.trimTransitionsArray();
       allTransitions[state.getNumber()] = state.transitionsArray;
     }
@@ -158,11 +158,7 @@ public class AutomatonTermsEnum extends 
     // seek to the next possible string;
     if (nextString()) {
       // reposition
-      
-      // FIXME: this is really bad to turn off
-      // but it cannot work correctly until terms are in utf8 order.
-      linear = false;
-      
+           
       if (linear)
         setLinear(infinitePosition);
       return seekBytesRef;
@@ -188,15 +184,15 @@ public class AutomatonTermsEnum extends 
     }
     for (int i = 0; i < allTransitions[state].length; i++) {
       Transition t = allTransitions[state][i];
-      if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && 
-          compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+      if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && 
+          (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
         maxInterval = t.getMax();
         break;
       }
     }
-    // 0xef terms don't get the optimization... not worth the trouble.
-    if (maxInterval != 0xef)
-      maxInterval = incrementUTF16(maxInterval);
+    // 0xff terms don't get the optimization... not worth the trouble.
+    if (maxInterval != 0xff)
+      maxInterval = incrementUTF8(maxInterval);
     int length = position + 1; /* position + maxTransition */
     if (linearUpperBound.bytes.length < length)
       linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@ public class AutomatonTermsEnum extends 
       // if the next character is U+FFFF and is not part of the useful portion,
       // then by definition it puts us in a reject state, and therefore this
       // path is dead. there cannot be any higher transitions. backtrack.
-      c = incrementUTF16(c);
+      c = incrementUTF8(c);
       if (c == -1)
         return false;
     }
@@ -295,8 +291,8 @@ public class AutomatonTermsEnum extends 
     
     for (int i = 0; i < transitions.length; i++) {
       Transition transition = transitions[i];
-      if (compareToUTF16(transition.getMax(), c) >= 0) {
-        int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+      if (transition.getMax() >= c) {
+        int nextChar = Math.max(c, transition.getMin());
         // append either the next sequential char, or the minimum transition
         seekBytesRef.grow(seekBytesRef.length + 1);
         seekBytesRef.length++;
@@ -342,9 +338,9 @@ public class AutomatonTermsEnum extends 
   private boolean backtrack(int position) {
     while (position > 0) {
       int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
-      // if a character is 0xef its a dead-end too,
-      // because there is no higher character in UTF-16 sort order.
-      nextChar = incrementUTF16(nextChar);
+      // if a character is 0xff it's a dead-end too,
+      // because there is no higher character in UTF-8 sort order.
+      nextChar = incrementUTF8(nextChar);
       if (nextChar != -1) {
         seekBytesRef.bytes[position - 1] = (byte) nextChar;
         seekBytesRef.length = position;
@@ -355,34 +351,11 @@ public class AutomatonTermsEnum extends 
     return false; /* all solutions exhausted */
   }
 
-  /* return the next utf8 byte in utf16 order, or -1 if exhausted */
-  private final int incrementUTF16(int utf8) {
+  /* return the next utf8 byte in utf8 order, or -1 if exhausted */
+  private final int incrementUTF8(int utf8) {
     switch(utf8) {
-      case 0xed: return 0xf0;
-      case 0xfd: return 0xee;
-      case 0xee: return 0xef;
-      case 0xef: return -1;
+      case 0xff: return -1;
       default: return utf8 + 1;
     }
   }
-  
-  int compareToUTF16(int aByte, int bByte) {
-    if (aByte != bByte) {
-      // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
-      // We know the terms are not equal, but, we may
-      // have to carefully fixup the bytes at the
-      // difference to match UTF16's sort order:
-      if (aByte >= 0xee && bByte >= 0xee) {
-        if ((aByte & 0xfe) == 0xee) {
-          aByte += 0x10;
-        }
-        if ((bByte&0xfe) == 0xee) {
-          bByte += 0x10;
-        }
-      }
-      return aByte - bByte;
-    }
-    return 0;
-  }
 }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java Thu Jul 22 19:34:35 2010
@@ -18,39 +18,317 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.util.Arrays;
+
 import org.apache.lucene.index.*;
 
-final class ExactPhraseScorer extends PhraseScorer {
+final class ExactPhraseScorer extends Scorer {
+  private final Weight weight;
+  private final byte[] norms;
+  private final float value;
+
+  private static final int SCORE_CACHE_SIZE = 32;
+  private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+  private final int endMinus1;
+
+  private final static int CHUNK = 4096;
+
+  private int gen;
+  private final int[] counts = new int[CHUNK];
+  private final int[] gens = new int[CHUNK];
+
+  boolean noDocs;
 
-  ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets,
-      Similarity similarity, byte[] norms) {
-    super(weight, postings, offsets, similarity, norms);
+  private final static class ChunkState {
+    final DocsAndPositionsEnum posEnum;
+    final int offset;
+    final boolean useAdvance;
+    int posUpto;
+    int posLimit;
+    int pos;
+    int lastPos;
+
+    public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
+      this.posEnum = posEnum;
+      this.offset = offset;
+      this.useAdvance = useAdvance;
+    }
+  }
+
+  private final ChunkState[] chunkStates;
+
+  private int docID = -1;
+  private int freq;
+
+  ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
+                    Similarity similarity, byte[] norms) throws IOException {
+    super(similarity);
+    this.weight = weight;
+    this.norms = norms;
+    this.value = weight.getValue();
+
+    chunkStates = new ChunkState[postings.length];
+
+    endMinus1 = postings.length-1;
+
+    for(int i=0;i<postings.length;i++) {
+
+      // Coarse optimization: advance(target) is fairly
+      // costly, so, if the relative freq of the 2nd
+      // rarest term is not that much (> 1/5th) rarer than
+      // the first term, then we just use .nextDoc() when
+      // ANDing.  This buys ~15% gain for phrases where
+      // freq of rarest 2 terms is close:
+      final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
+      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
+      if (i > 0 && postings[i].postings.nextDoc() == DocsEnum.NO_MORE_DOCS) {
+        noDocs = true;
+        return;
+      }
+    }
+
+    for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
+      scoreCache[i] = getSimilarity().tf((float) i) * value;
+    }
   }
 
   @Override
-  protected final float phraseFreq() throws IOException {
-    // sort list with pq
-    pq.clear();
-    for (PhrasePositions pp = first; pp != null; pp = pp.next) {
-      pp.firstPosition();
-      pq.add(pp);				  // build pq from list
-    }
-    pqToList();					  // rebuild list from pq
-
-    // for counting how many times the exact phrase is found in current document,
-    // just count how many times all PhrasePosition's have exactly the same position.   
-    int freq = 0;
-    do {					  // find position w/ all terms
-      while (first.position < last.position) {	  // scan forward in first
-        do {
-          if (!first.nextPosition())
-            return freq;
-        } while (first.position < last.position);
-        firstToLast();
-      }
-      freq++;					  // all equal: a match
-    } while (last.nextPosition());
-  
+  public int nextDoc() throws IOException {
+    while(true) {
+
+      // first (rarest) term
+      final int doc = chunkStates[0].posEnum.nextDoc();
+      if (doc == DocsEnum.NO_MORE_DOCS) {
+        docID = doc;
+        return doc;
+      }
+
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        final ChunkState cs = chunkStates[i];
+        int doc2 = cs.posEnum.docID();
+        if (cs.useAdvance) {
+          if (doc2 < doc) {
+            doc2 = cs.posEnum.advance(doc);
+          }
+        } else {
+          int iter = 0;
+          while(doc2 < doc) {
+            // safety net -- fallback to .advance if we've
+            // done too many .nextDocs
+            if (++iter == 50) {
+              doc2 = cs.posEnum.advance(doc);
+              break;
+            } else {
+              doc2 = cs.posEnum.nextDoc();
+            }
+          }
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+
+    // first term
+    int doc = chunkStates[0].posEnum.advance(target);
+    if (doc == DocsEnum.NO_MORE_DOCS) {
+      docID = DocsEnum.NO_MORE_DOCS;
+      return doc;
+    }
+
+    while(true) {
+      
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        int doc2 = chunkStates[i].posEnum.docID();
+        if (doc2 < doc) {
+          doc2 = chunkStates[i].posEnum.advance(doc);
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+
+      doc = chunkStates[0].posEnum.nextDoc();
+      if (doc == DocsEnum.NO_MORE_DOCS) {
+        docID = doc;
+        return doc;
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ExactPhraseScorer(" + weight + ")";
+  }
+
+  // used by MultiPhraseQuery
+  float currentFreq() {
+    return freq;
+  }
+
+  @Override
+  public int docID() {
+    return docID;
+  }
+
+  @Override
+  public float score() throws IOException {
+    final float raw; // raw score
+    if (freq < SCORE_CACHE_SIZE) {
+      raw = scoreCache[freq];
+    } else {
+      raw = getSimilarity().tf((float) freq) * value;
+    }
+    return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+  }
+
+  private int phraseFreq() throws IOException {
+
+    freq = 0;
+
+    // init chunks
+    for(int i=0;i<chunkStates.length;i++) {
+      final ChunkState cs = chunkStates[i];
+      cs.posLimit = cs.posEnum.freq();
+      cs.pos = cs.offset + cs.posEnum.nextPosition();
+      cs.posUpto = 1;
+      cs.lastPos = -1;
+    }
+
+    int chunkStart = 0;
+    int chunkEnd = CHUNK;
+
+    // process chunk by chunk
+    boolean end = false;
+
+    // TODO: we could fold in chunkStart into offset and
+    // save one subtract per pos incr
+
+    while(!end) {
+
+      gen++;
+
+      if (gen == 0) {
+        // wraparound
+        Arrays.fill(gens, 0);
+        gen++;
+      }
+
+      // first term
+      {
+        final ChunkState cs = chunkStates[0];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            counts[posIndex] = 1;
+            assert gens[posIndex] != gen;
+            gens[posIndex] = gen;
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      // middle terms
+      boolean any = true;
+      for(int t=1;t<endMinus1;t++) {
+        final ChunkState cs = chunkStates[t];
+        any = false;
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
+              // viable
+              counts[posIndex]++;
+              any = true;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+
+        if (!any) {
+          break;
+        }
+      }
+
+      if (!any) {
+        // petered out for this chunk
+        chunkStart += CHUNK;
+        chunkEnd += CHUNK;
+        continue;
+      }
+
+      // last term
+
+      {
+        final ChunkState cs = chunkStates[endMinus1];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
+              freq++;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      chunkStart += CHUNK;
+      chunkEnd += CHUNK;
+    }
+
     return freq;
   }
 }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCache.java Thu Jul 22 19:34:35 2010
@@ -18,11 +18,13 @@ package org.apache.lucene.search;
  */
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.document.NumericField; // for javadocs
 import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
+import org.apache.lucene.util.packed.PackedInts;
 
 import java.io.IOException;
 import java.io.Serializable;
@@ -530,6 +532,12 @@ public interface FieldCache {
 
     /** Number of documents */
     public abstract int size();
+
+    /** Returns a TermsEnum that can iterate over the values in this index entry */
+    public abstract TermsEnum getTermsEnum();
+
+    /** @lucene.internal */
+    public abstract PackedInts.Reader getDocToOrd();
   }
 
   /** Checks the internal cache for an appropriate entry, and if none

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java Thu Jul 22 19:34:35 2010
@@ -19,17 +19,9 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.TermsEnum;
+import java.util.*;
+
+import org.apache.lucene.index.*;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.GrowableWriter;
@@ -642,7 +634,7 @@ class FieldCacheImpl implements FieldCac
     }
   }
 
-  private static class DocTermsIndexImpl extends DocTermsIndex {
+  public static class DocTermsIndexImpl extends DocTermsIndex {
     private final PagedBytes.Reader bytes;
     private final PackedInts.Reader termOrdToBytesOffset;
     private final PackedInts.Reader docToTermOrd;
@@ -656,6 +648,11 @@ class FieldCacheImpl implements FieldCac
     }
 
     @Override
+    public PackedInts.Reader getDocToOrd() {
+      return docToTermOrd;
+    }
+
+    @Override
     public int numOrd() {
       return numOrd;
     }
@@ -674,6 +671,105 @@ class FieldCacheImpl implements FieldCac
     public BytesRef lookup(int ord, BytesRef ret) {
       return bytes.fillUsingLengthPrefix(ret, termOrdToBytesOffset.get(ord));
     }
+
+    @Override
+    public TermsEnum getTermsEnum() {
+      return this.new DocTermsIndexEnum();
+    }
+
+    class DocTermsIndexEnum extends TermsEnum {
+      int currentOrd;
+      int currentBlockNumber;
+      int end;  // end position in the current block
+      final byte[][] blocks;
+      final int[] blockEnds;
+
+      final BytesRef term = new BytesRef();
+
+      public DocTermsIndexEnum() {
+        currentOrd = 0;
+        currentBlockNumber = 0;
+        blocks = bytes.getBlocks();
+        blockEnds = bytes.getBlockEnds();
+        currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get(0));
+        end = blockEnds[currentBlockNumber];
+      }
+
+      @Override
+      public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
+        // TODO - we can support with binary search
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public SeekStatus seek(long ord) throws IOException {
+        assert(ord >= 0 && ord <= numOrd);
+        // TODO: if gap is small, could iterate from current position?  Or let user decide that?
+        currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get((int)ord));
+        end = blockEnds[currentBlockNumber];
+        currentOrd = (int)ord;
+        return SeekStatus.FOUND;
+      }
+
+      @Override
+      public BytesRef next() throws IOException {
+        int start = term.offset + term.length;
+        if (start >= end) {
+          // switch byte blocks
+          if (currentBlockNumber +1 >= blocks.length) {
+            return null;
+          }
+          currentBlockNumber++;
+          term.bytes = blocks[currentBlockNumber];
+          end = blockEnds[currentBlockNumber];
+          start = 0;
+          if (end<=0) return null;  // special case of empty last array
+        }
+
+        currentOrd++;
+
+        byte[] block = term.bytes;
+        if ((block[start] & 128) == 0) {
+          term.length = block[start];
+          term.offset = start+1;
+        } else {
+          term.length = (((int) (block[start] & 0x7f)) << 8) | (block[1+start] & 0xff);
+          term.offset = start+2;
+        }
+
+        return term;
+      }
+
+      @Override
+      public BytesRef term() throws IOException {
+        return term;
+      }
+
+      @Override
+      public long ord() throws IOException {
+        return currentOrd;
+      }
+
+      @Override
+      public int docFreq() {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public Comparator<BytesRef> getComparator() throws IOException {
+        throw new UnsupportedOperationException();
+      }
+    }
   }
 
   private static boolean DEFAULT_FASTER_BUT_MORE_RAM = true;
@@ -706,6 +802,14 @@ class FieldCacheImpl implements FieldCac
       int startTermsBPV;
       int startNumUniqueTerms;
 
+      int maxDoc = reader.maxDoc();
+      final int termCountHardLimit;
+      if (maxDoc == Integer.MAX_VALUE) {
+        termCountHardLimit = Integer.MAX_VALUE;
+      } else {
+        termCountHardLimit = maxDoc+1;
+      }
+
       if (terms != null) {
         // Try for coarse estimate for number of bits; this
         // should be an underestimate most of the time, which
@@ -717,11 +821,17 @@ class FieldCacheImpl implements FieldCac
           numUniqueTerms = -1;
         }
         if (numUniqueTerms != -1) {
+
+          if (numUniqueTerms > termCountHardLimit) {
+            // app is misusing the API (there is more than
+            // one term per doc); in this case we make best
+            // effort to load what we can (see LUCENE-2142)
+            numUniqueTerms = termCountHardLimit;
+          }
+
           startBytesBPV = PackedInts.bitsRequired(numUniqueTerms*4);
           startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
-          if (numUniqueTerms > Integer.MAX_VALUE-1) {
-            throw new IllegalStateException("this field has too many (" + numUniqueTerms + ") unique terms");
-          }
+
           startNumUniqueTerms = (int) numUniqueTerms;
         } else {
           startBytesBPV = 1;
@@ -751,6 +861,10 @@ class FieldCacheImpl implements FieldCac
           if (term == null) {
             break;
           }
+          if (termOrd >= termCountHardLimit) {
+            break;
+          }
+
           if (termOrd == termOrdToBytesOffset.size()) {
             // NOTE: this code only runs if the incoming
             // reader impl doesn't implement
@@ -775,7 +889,7 @@ class FieldCacheImpl implements FieldCac
       }
 
       // maybe an int-only impl?
-      return new DocTermsIndexImpl(bytes.freeze(), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
+      return new DocTermsIndexImpl(bytes.freeze(true), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
     }
   }
 
@@ -829,6 +943,8 @@ class FieldCacheImpl implements FieldCac
 
       final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue();
 
+      final int termCountHardLimit = reader.maxDoc();
+
       // Holds the actual term data, expanded.
       final PagedBytes bytes = new PagedBytes(15);
 
@@ -845,6 +961,9 @@ class FieldCacheImpl implements FieldCac
           numUniqueTerms = -1;
         }
         if (numUniqueTerms != -1) {
+          if (numUniqueTerms > termCountHardLimit) {
+            numUniqueTerms = termCountHardLimit;
+          }
           startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
         } else {
           startBPV = 1;
@@ -859,10 +978,18 @@ class FieldCacheImpl implements FieldCac
       bytes.copyUsingLengthPrefix(new BytesRef());
 
       if (terms != null) {
+        int termCount = 0;
         final TermsEnum termsEnum = terms.iterator();
         final Bits delDocs = MultiFields.getDeletedDocs(reader);
         DocsEnum docs = null;
         while(true) {
+          if (termCount++ == termCountHardLimit) {
+            // app is misusing the API (there is more than
+            // one term per doc); in this case we make best
+            // effort to load what we can (see LUCENE-2142)
+            break;
+          }
+
           final BytesRef term = termsEnum.next();
           if (term == null) {
             break;
@@ -880,7 +1007,7 @@ class FieldCacheImpl implements FieldCac
       }
 
       // maybe an int-only impl?
-      return new DocTermsImpl(bytes.freeze(), docToOffset.getMutable());
+      return new DocTermsImpl(bytes.freeze(true), docToOffset.getMutable());
     }
   }
   private volatile PrintStream infoStream;

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java Thu Jul 22 19:34:35 2010
@@ -22,7 +22,6 @@ import java.io.IOException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.index.TermDocs;  // for javadocs
 
 /**
  * A {@link Filter} that only accepts documents whose single
@@ -70,7 +69,7 @@ import org.apache.lucene.index.TermDocs;
  * 
  * In contrast, TermsFilter builds up an {@link OpenBitSet},
  * keyed by docID, every time it's created, by enumerating
- * through all matching docs using {@link TermDocs} to seek
+ * through all matching docs using {@link DocsEnum} to seek
  * and scan through each term's docID list.  While there is
  * no linear scan of all docIDs, besides the allocation of
  * the underlying array in the {@link OpenBitSet}, this
@@ -96,13 +95,20 @@ import org.apache.lucene.index.TermDocs;
 
 public class FieldCacheTermsFilter extends Filter {
   private String field;
-  private String[] terms;
+  private BytesRef[] terms;
 
-  public FieldCacheTermsFilter(String field, String... terms) {
+  public FieldCacheTermsFilter(String field, BytesRef... terms) {
     this.field = field;
     this.terms = terms;
   }
 
+  public FieldCacheTermsFilter(String field, String... terms) {
+    this.field = field;
+    this.terms = new BytesRef[terms.length];
+    for (int i = 0; i < terms.length; i++)
+      this.terms[i] = new BytesRef(terms[i]);
+  }
+
   public FieldCache getFieldCache() {
     return FieldCache.DEFAULT;
   }
@@ -122,7 +128,7 @@ public class FieldCacheTermsFilter exten
       openBitSet = new OpenBitSet(this.fcsi.size());
       final BytesRef spare = new BytesRef();
       for (int i=0;i<terms.length;i++) {
-        int termNumber = this.fcsi.binarySearchLookup(new BytesRef(terms[i]), spare);
+        int termNumber = this.fcsi.binarySearchLookup(terms[i], spare);
         if (termNumber > 0) {
           openBitSet.fastSet(termNumber);
         }

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FieldComparator.java Thu Jul 22 19:34:35 2010
@@ -31,6 +31,7 @@ import org.apache.lucene.search.FieldCac
 import org.apache.lucene.search.FieldCache.DocTermsIndex;
 import org.apache.lucene.search.FieldCache.DocTerms;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Expert: a FieldComparator compares hits so as to determine their
@@ -709,23 +710,21 @@ public abstract class FieldComparator {
     private final BytesRef[] values;
     private final int[] readerGen;
 
+    private PackedInts.Reader currentDocToOrd;
     private int currentReaderGen = -1;
     private DocTermsIndex termsIndex;
     private final String field;
 
     private int bottomSlot = -1;
     private int bottomOrd;
+    private boolean bottomSameReader;
     private BytesRef bottomValue;
-    private final boolean reversed;
-    private final int sortPos;
     private final BytesRef tempBR = new BytesRef();
 
     public TermOrdValComparator(int numHits, String field, int sortPos, boolean reversed) {
       ords = new int[numHits];
       values = new BytesRef[numHits];
       readerGen = new int[numHits];
-      this.sortPos = sortPos;
-      this.reversed = reversed;
       this.field = field;
     }
 
@@ -754,59 +753,38 @@ public abstract class FieldComparator {
     @Override
     public int compareBottom(int doc) {
       assert bottomSlot != -1;
-      int order = termsIndex.getOrd(doc);
-      final int cmp = bottomOrd - order;
-      if (cmp != 0) {
-        return cmp;
-      }
-
-      if (bottomValue == null) {
-        if (order == 0) {
-          // unset
-          return 0;
+      if (bottomSameReader) {
+        // ord is precisely comparable, even in the equal case
+        return bottomOrd - (int) currentDocToOrd.get(doc);
+      } else {
+        // ord is only approx comparable: if they are not
+        // equal, we can use that; if they are equal, we
+        // must fallback to compare by value
+        final int order = (int) currentDocToOrd.get(doc);
+        final int cmp = bottomOrd - order;
+        if (cmp != 0) {
+          return cmp;
         }
-        // bottom wins
-        return -1;
-      } else if (order == 0) {
-        // doc wins
-        return 1;
-      }
-      termsIndex.lookup(order, tempBR);
-      return bottomValue.compareTo(tempBR);
-    }
 
-    private void convert(int slot) {
-      readerGen[slot] = currentReaderGen;
-      int index = 0;
-      BytesRef value = values[slot];
-      if (value == null) {
-        // 0 ord is null for all segments
-        assert ords[slot] == 0;
-        return;
-      }
-
-      if (sortPos == 0 && bottomSlot != -1 && bottomSlot != slot) {
-        // Since we are the primary sort, the entries in the
-        // queue are bounded by bottomOrd:
-        if (reversed) {
-          index = binarySearch(tempBR, termsIndex, value, bottomOrd, termsIndex.numOrd()-1);
-        } else {
-          index = binarySearch(tempBR, termsIndex, value, 0, bottomOrd);
+        if (bottomValue == null) {
+          if (order == 0) {
+            // unset
+            return 0;
+          }
+          // bottom wins
+          return -1;
+        } else if (order == 0) {
+          // doc wins
+          return 1;
         }
-      } else {
-        // Full binary search
-        index = binarySearch(tempBR, termsIndex, value);
-      }
-
-      if (index < 0) {
-        index = -index - 2;
+        termsIndex.lookup(order, tempBR);
+        return bottomValue.compareTo(tempBR);
       }
-      ords[slot] = index;
     }
 
     @Override
     public void copy(int slot, int doc) {
-      final int ord = termsIndex.getOrd(doc);
+      final int ord = (int) currentDocToOrd.get(doc);
       if (ord == 0) {
         values[slot] = null;
       } else {
@@ -823,21 +801,34 @@ public abstract class FieldComparator {
     @Override
     public void setNextReader(IndexReader reader, int docBase) throws IOException {
       termsIndex = FieldCache.DEFAULT.getTermsIndex(reader, field);
+      currentDocToOrd = termsIndex.getDocToOrd();
       currentReaderGen++;
       if (bottomSlot != -1) {
-        convert(bottomSlot);
-        bottomOrd = ords[bottomSlot];
+        setBottom(bottomSlot);
       }
     }
     
     @Override
     public void setBottom(final int bottom) {
       bottomSlot = bottom;
-      if (readerGen[bottom] != currentReaderGen) {
-        convert(bottomSlot);
+
+      bottomValue = values[bottomSlot];
+      if (bottomValue == null) {
+        // 0 ord is null for all segments
+        assert ords[bottomSlot] == 0;
+        bottomOrd = 0;
+        bottomSameReader = true;
+      } else {
+        final int index = binarySearch(tempBR, termsIndex, bottomValue);
+        if (index < 0) {
+          bottomOrd = -index - 2;
+          bottomSameReader = false;
+        } else {
+          bottomOrd = index;
+          // exact value match
+          bottomSameReader = true;
+        }
       }
-      bottomOrd = ords[bottom];
-      bottomValue = values[bottom];
     }
 
     @Override

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyQuery.java Thu Jul 22 19:34:35 2010
@@ -130,14 +130,6 @@ public class FuzzyQuery extends MultiTer
     return prefixLength;
   }
 
-  @Override @Deprecated
-  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-    if (!termLongEnough) {  // can only match if it's exact
-      return new SingleTermEnum(reader, term);
-    }
-    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
-  }
-  
   @Override
   protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
     if (!termLongEnough) {  // can only match if it's exact

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Jul 22 19:34:35 2010
@@ -33,7 +33,7 @@ import org.apache.lucene.util.automaton.
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 
 import java.io.IOException;
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
 
@@ -135,7 +135,7 @@ public final class FuzzyTermsEnum extend
       LevenshteinAutomata builder = 
         new LevenshteinAutomata(UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength));
 
-      runAutomata = new ArrayList<ByteRunAutomaton>(maxDistance);
+      final ByteRunAutomaton[] ra = new ByteRunAutomaton[maxDistance + 1];
       for (int i = 0; i <= maxDistance; i++) {
         Automaton a = builder.toAutomaton(i);
         // constant prefix
@@ -144,8 +144,9 @@ public final class FuzzyTermsEnum extend
               UnicodeUtil.newString(termText, 0, realPrefixLength));
           a = BasicOperations.concatenate(prefix, a);
         }
-        runAutomata.add(new ByteRunAutomaton(a));
+        ra[i] = new ByteRunAutomaton(a);
       }
+      runAutomata = Arrays.asList(ra);
     }
   }
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java Thu Jul 22 19:34:35 2010
@@ -171,32 +171,64 @@ public class MultiPhraseQuery extends Qu
       if (termArrays.size() == 0)                  // optimize zero-term case
         return null;
 
-      DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[termArrays.size()];
-      for (int i=0; i<postings.length; i++) {
+      final Bits delDocs = MultiFields.getDeletedDocs(reader);
+      
+      PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[termArrays.size()];
+
+      for (int i=0; i<postingsFreqs.length; i++) {
         Term[] terms = termArrays.get(i);
 
         final DocsAndPositionsEnum postingsEnum;
+        int docFreq;
+
         if (terms.length > 1) {
           postingsEnum = new UnionDocsAndPositionsEnum(reader, terms);
+
+          // coarse -- this overcounts, since a given doc can
+          // contain more than one of the terms:
+          docFreq = 0;
+          for(int j=0;j<terms.length;j++) {
+            docFreq += reader.docFreq(terms[j]);
+          }
         } else {
-          postingsEnum = reader.termPositionsEnum(MultiFields.getDeletedDocs(reader),
+          final BytesRef text = new BytesRef(terms[0].text());
+          postingsEnum = reader.termPositionsEnum(delDocs,
                                                   terms[0].field(),
-                                                  new BytesRef(terms[0].text()));
-        }
+                                                  text);
 
-        if (postingsEnum == null) {
-          return null;
+          if (postingsEnum == null) {
+            if (MultiFields.getTermDocsEnum(reader, delDocs, terms[0].field(), text) != null) {
+              // term does exist, but has no positions
+              throw new IllegalStateException("field \"" + terms[0].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[0].text() + ")");
+            } else {
+              // term does not exist
+              return null;
+            }
+          }
+
+          docFreq = reader.docFreq(terms[0].field(), text);
         }
 
-        postings[i] = postingsEnum;
+        postingsFreqs[i] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(i).intValue());
       }
 
-      if (slop == 0)
-        return new ExactPhraseScorer(this, postings, getPositions(), similarity,
-                                     reader.norms(field));
-      else
-        return new SloppyPhraseScorer(this, postings, getPositions(), similarity,
+      // sort by increasing docFreq order
+      if (slop == 0) {
+        Arrays.sort(postingsFreqs);
+      }
+
+      if (slop == 0) {
+        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity,
+                                                    reader.norms(field));
+        if (s.noDocs) {
+          return null;
+        } else {
+          return s;
+        }
+      } else {
+        return new SloppyPhraseScorer(this, postingsFreqs, similarity,
                                       slop, reader.norms(field));
+      }
     }
 
     @Override
@@ -231,13 +263,24 @@ public class MultiPhraseQuery extends Qu
       fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
                                "), product of:");
 
-      PhraseScorer scorer = (PhraseScorer) scorer(reader, true, false);
+      Scorer scorer = (Scorer) scorer(reader, true, false);
       if (scorer == null) {
         return new Explanation(0.0f, "no matching docs");
       }
+
       Explanation tfExplanation = new Explanation();
       int d = scorer.advance(doc);
-      float phraseFreq = (d == doc) ? scorer.currentFreq() : 0.0f;
+      float phraseFreq;
+      if (d == doc) {
+        if (slop == 0) {
+          phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
+        } else {
+          phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
+        }
+      } else {
+        phraseFreq = 0.0f;
+      }
+
       tfExplanation.setValue(similarity.tf(phraseFreq));
       tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
       fieldExpl.addDetail(tfExplanation);
@@ -293,7 +336,7 @@ public class MultiPhraseQuery extends Qu
   @Override
   public final String toString(String f) {
     StringBuilder buffer = new StringBuilder();
-    if (!field.equals(f)) {
+    if (field == null || !field.equals(f)) {
       buffer.append(field);
       buffer.append(":");
     }
@@ -458,9 +501,14 @@ class UnionDocsAndPositionsEnum extends 
     for (int i = 0; i < terms.length; i++) {
       DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs,
                                                                     terms[i].field(),
-                                                                    new BytesRef(terms[i].text()));
+                                                                    terms[i].bytes());
       if (postings != null) {
         docsEnums.add(postings);
+      } else {
+        if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), terms[i].bytes()) != null) {
+          // term does exist, but has no positions
+          throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")");
+        }
       }
     }
 

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java Thu Jul 22 19:34:35 2010
@@ -32,7 +32,7 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.queryParser.QueryParser; // for javadoc
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.VirtualMethod;
+import org.apache.lucene.util.PagedBytes;
 
 /**
  * An abstract {@link Query} that matches documents
@@ -72,17 +72,6 @@ public abstract class MultiTermQuery ext
   protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
   transient int numberOfTerms = 0;
   
-  /** @deprecated remove when getEnum is removed */
-  private static final VirtualMethod<MultiTermQuery> getEnumMethod =
-    new VirtualMethod<MultiTermQuery>(MultiTermQuery.class, "getEnum", IndexReader.class);
-  /** @deprecated remove when getEnum is removed */
-  private static final VirtualMethod<MultiTermQuery> getTermsEnumMethod =
-    new VirtualMethod<MultiTermQuery>(MultiTermQuery.class, "getTermsEnum", IndexReader.class);
-  /** @deprecated remove when getEnum is removed */
-  final boolean hasNewAPI = 
-    VirtualMethod.compareImplementationDistance(getClass(), 
-        getTermsEnumMethod, getEnumMethod) >= 0; // its ok for both to be overridden
-
   /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum}
    * and update the boost on each returned term. This enables to control the boost factor
    * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or
@@ -189,77 +178,49 @@ public abstract class MultiTermQuery ext
   private abstract static class BooleanQueryRewrite extends RewriteMethod {
   
     protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
+      final Fields fields = MultiFields.getFields(reader);
+      if (fields == null) {
+        // reader has no fields
+        return 0;
+      }
 
-      if (query.hasNewAPI) {
-
-        if (query.field == null) {
-          throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
-        }
-
-        final Fields fields = MultiFields.getFields(reader);
-        if (fields == null) {
-          // reader has no fields
-          return 0;
-        }
-
-        final Terms terms = fields.terms(query.field);
-        if (terms == null) {
-          // field does not exist
-          return 0;
-        }
+      final Terms terms = fields.terms(query.field);
+      if (terms == null) {
+        // field does not exist
+        return 0;
+      }
 
-        final TermsEnum termsEnum = query.getTermsEnum(reader);
-        assert termsEnum != null;
+      final TermsEnum termsEnum = query.getTermsEnum(reader);
+      assert termsEnum != null;
 
-        if (termsEnum == TermsEnum.EMPTY)
-          return 0;
-        final BoostAttribute boostAtt =
-          termsEnum.attributes().addAttribute(BoostAttribute.class);
-        collector.boostAtt = boostAtt;
-        int count = 0;
-        BytesRef term;
-        final Term placeholderTerm = new Term(query.field);
-        while ((term = termsEnum.next()) != null) {
-          if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) {
-            count++;
-          } else {
-            break;
-          }
-        }
-        collector.boostAtt = null;
-        return count;
-      } else {
-        // deprecated case
-        final FilteredTermEnum enumerator = query.getEnum(reader);
-        int count = 0;
-        try {
-          do {
-            Term t = enumerator.term();
-            if (t != null) {
-              if (collector.collect(t, enumerator.difference())) {
-                count++;
-              } else {
-                break;
-              }
-            }
-          } while (enumerator.next());    
-        } finally {
-          enumerator.close();
+      if (termsEnum == TermsEnum.EMPTY)
+        return 0;
+      final BoostAttribute boostAtt =
+        termsEnum.attributes().addAttribute(BoostAttribute.class);
+      collector.boostAtt = boostAtt;
+      int count = 0;
+      BytesRef bytes;
+      while ((bytes = termsEnum.next()) != null) {
+        if (collector.collect(bytes, boostAtt.getBoost())) {
+          count++;
+        } else {
+          break;
         }
-        return count;
       }
+      collector.boostAtt = null;
+      return count;
     }
     
     protected static abstract class TermCollector {
-      /** this field is only set if a boostAttribute is used (e.g. {@link FuzzyTermsEnum}) */
       private BoostAttribute boostAtt = null;
     
       /** return false to stop collecting */
-      public abstract boolean collect(Term t, float boost) throws IOException;
+      public abstract boolean collect(BytesRef bytes, float boost) throws IOException;
       
       /** set the minimum boost as a hint for the term producer */
       protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
-        if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
+        assert boostAtt != null;
+        boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
       }
     }
   }
@@ -268,9 +229,11 @@ public abstract class MultiTermQuery ext
     @Override
     public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
       final BooleanQuery result = new BooleanQuery(true);
+      final Term placeholderTerm = new Term(query.field);
       query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
-        public boolean collect(Term t, float boost) {
-          TermQuery tq = new TermQuery(t); // found a match
+        public boolean collect(BytesRef bytes, float boost) {
+          // add new TQ, we must clone the term, else it may get overwritten!
+          TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)));
           tq.setBoost(query.getBoost() * boost); // set the boost
           result.add(tq, BooleanClause.Occur.SHOULD); // add to query
           return true;
@@ -331,16 +294,16 @@ public abstract class MultiTermQuery ext
     protected abstract Query getQuery(Term term);
 
     @Override
-    public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
+    public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
       final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
       final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
       collectTerms(reader, query, new TermCollector() {
-        public boolean collect(Term t, float boost) {
+        public boolean collect(BytesRef bytes, float boost) {
           // ignore uncompetetive hits
           if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
             return true;
-          // add new entry in PQ
-          st.term = t;
+          // add new entry in PQ, we must clone the term, else it may get overwritten!
+          st.bytes.copy(bytes);
           st.boost = boost;
           stQueue.offer(st);
           // possibly drop entries from queue
@@ -353,9 +316,11 @@ public abstract class MultiTermQuery ext
         private ScoreTerm st = new ScoreTerm();
       });
       
+      final Term placeholderTerm = new Term(query.field);
       final BooleanQuery bq = new BooleanQuery(true);
       for (final ScoreTerm st : stQueue) {
-        Query tq = getQuery(st.term);    // found a match
+        // add new query, we must clone the term, else it may get overwritten!
+        Query tq = getQuery(placeholderTerm.createTerm(st.bytes));
         tq.setBoost(query.getBoost() * st.boost); // set the boost
         bq.add(tq, BooleanClause.Occur.SHOULD);   // add to query
       }
@@ -382,12 +347,13 @@ public abstract class MultiTermQuery ext
     }
   
     private static class ScoreTerm implements Comparable<ScoreTerm> {
-      public Term term;
+      public final BytesRef bytes = new BytesRef();
       public float boost;
       
       public int compareTo(ScoreTerm other) {
         if (this.boost == other.boost)
-          return other.term.compareTo(this.term);
+          // TODO: is it OK to use default compare here?
+          return other.bytes.compareTo(this.bytes);
         else
           return Float.compare(this.boost, other.boost);
       }
@@ -564,58 +530,67 @@ public abstract class MultiTermQuery ext
       final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
       final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
 
-      final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit);
+      final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
       collectTerms(reader, query, col);
       
       if (col.hasCutOff) {
         return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
+      } else if (col.termCount == 0) {
+        return new BooleanQuery(true);
       } else {
-        final Query result;
-        if (col.pendingTerms.isEmpty()) {
-          result = new BooleanQuery(true);
-        } else {
-          BooleanQuery bq = new BooleanQuery(true);
-          for(Term term : col.pendingTerms) {
-            TermQuery tq = new TermQuery(term);
-            bq.add(tq, BooleanClause.Occur.SHOULD);
+        final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
+        try {
+          final BooleanQuery bq = new BooleanQuery(true);
+          final Term placeholderTerm = new Term(query.field);
+          long start = col.startOffset;
+          for(int i = 0; i < col.termCount; i++) {
+            final BytesRef bytes = new BytesRef();
+            start = bytesReader.fillUsingLengthPrefix3(bytes, start);
+            bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
           }
           // Strip scores
-          result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+          final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
           result.setBoost(query.getBoost());
+          query.incTotalNumberOfTerms(col.termCount);
+          return result;
+        } finally {
+          bytesReader.close();
         }
-        query.incTotalNumberOfTerms(col.pendingTerms.size());
-        return result;
       }
     }
     
     private static final class CutOffTermCollector extends TermCollector {
-      CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) {
+      CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) {
         this.reader = reader;
+        this.field = field;
         this.docCountCutoff = docCountCutoff;
         this.termCountLimit = termCountLimit;
       }
     
-      public boolean collect(Term t, float boost) throws IOException {
-        pendingTerms.add(t);
-        if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
+      public boolean collect(BytesRef bytes, float boost) throws IOException {
+        termCount++;
+        if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
           hasCutOff = true;
           return false;
         }
+        pendingTerms.copyUsingLengthPrefix(bytes);
         // Loading the TermInfo from the terms dict here
         // should not be costly, because 1) the
         // query/filter will load the TermInfo when it
         // runs, and 2) the terms dict has a cache:
-        // @deprecated: in 4.0 use BytesRef for collectTerms()
-        docVisitCount += reader.docFreq(t);
+        docVisitCount += reader.docFreq(field, bytes);
         return true;
       }
       
       int docVisitCount = 0;
       boolean hasCutOff = false;
+      int termCount = 0;
       
       final IndexReader reader;
+      final String field;
       final int docCountCutoff, termCountLimit;
-      final ArrayList<Term> pendingTerms = new ArrayList<Term>();
+      final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB
+      final long startOffset = pendingTerms.getPointer();
     }
 
     @Override
@@ -681,42 +656,20 @@ public abstract class MultiTermQuery ext
    */
   public MultiTermQuery(final String field) {
     this.field = field;
-  }
-
-  /**
-   * Constructs a query matching terms that cannot be represented with a single
-   * Term.
-   * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can
-   * only work on one field per terms enum. If you override
-   * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor.
-   */
-  @Deprecated
-  public MultiTermQuery() {
-    this(null);
+    assert field != null;
   }
 
   /** Returns the field name for this query */
   public final String getField() { return field; }
 
   /** Construct the enumeration to be used, expanding the
-   * pattern term.
-   * @deprecated Please override {@link #getTermsEnum} instead */
-  @Deprecated
-  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-    throw new UnsupportedOperationException();
-  }
-
-  /** Construct the enumeration to be used, expanding the
    *  pattern term.  This method should only be called if
    *  the field exists (ie, implementations can assume the
    *  field does exist).  This method should not return null
    *  (should instead return {@link TermsEnum#EMPTY} if no
    *  terms match).  The TermsEnum must already be
    *  positioned to the first matching term. */
-  // TODO 4.0: make this method abstract
-  protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
-    throw new UnsupportedOperationException();
-  }
+  protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException;
 
   /**
    * Expert: Return the number of unique terms visited during execution of the query.

Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java?rev=966819&r1=966818&r2=966819&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java Thu Jul 22 19:34:35 2010
@@ -20,11 +20,8 @@ package org.apache.lucene.search;
 import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.MultiFields;
@@ -109,97 +106,50 @@ public class MultiTermQueryWrapperFilter
    */
   @Override
   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
-    if (query.hasNewAPI) {
-      if (query.field == null) {
-        throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery.");
-      }
-
-      final Fields fields = MultiFields.getFields(reader);
-      if (fields == null) {
-        // reader has no fields
-        return DocIdSet.EMPTY_DOCIDSET;
-      }
-
-      final Terms terms = fields.terms(query.field);
-      if (terms == null) {
-        // field does not exist
-        return DocIdSet.EMPTY_DOCIDSET;
-      }
-
-      final TermsEnum termsEnum = query.getTermsEnum(reader);
-      assert termsEnum != null;
-      if (termsEnum.next() != null) {
-        // fill into a OpenBitSet
-        final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
-        int termCount = 0;
-        final Bits delDocs = MultiFields.getDeletedDocs(reader);
-        DocsEnum docsEnum = null;
-        do {
-          termCount++;
-          // System.out.println("  iter termCount=" + termCount + " term=" +
-          // enumerator.term().toBytesString());
-          docsEnum = termsEnum.docs(delDocs, docsEnum);
-          final DocsEnum.BulkReadResult result = docsEnum.getBulkResult();
-          while (true) {
-            final int count = docsEnum.read();
-            if (count != 0) {
-              final int[] docs = result.docs.ints;
-              for (int i = 0; i < count; i++) {
-                bitSet.set(docs[i]);
-              }
-            } else {
-              break;
+    final Fields fields = MultiFields.getFields(reader);
+    if (fields == null) {
+      // reader has no fields
+      return DocIdSet.EMPTY_DOCIDSET;
+    }
+
+    final Terms terms = fields.terms(query.field);
+    if (terms == null) {
+      // field does not exist
+      return DocIdSet.EMPTY_DOCIDSET;
+    }
+
+    final TermsEnum termsEnum = query.getTermsEnum(reader);
+    assert termsEnum != null;
+    if (termsEnum.next() != null) {
+      // fill into a OpenBitSet
+      final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
+      int termCount = 0;
+      final Bits delDocs = MultiFields.getDeletedDocs(reader);
+      DocsEnum docsEnum = null;
+      do {
+        termCount++;
+        // System.out.println("  iter termCount=" + termCount + " term=" +
+        // enumerator.term().toBytesString());
+        docsEnum = termsEnum.docs(delDocs, docsEnum);
+        final DocsEnum.BulkReadResult result = docsEnum.getBulkResult();
+        while (true) {
+          final int count = docsEnum.read();
+          if (count != 0) {
+            final int[] docs = result.docs.ints;
+            for (int i = 0; i < count; i++) {
+              bitSet.set(docs[i]);
             }
+          } else {
+            break;
           }
-        } while (termsEnum.next() != null);
-        // System.out.println("  done termCount=" + termCount);
+        }
+      } while (termsEnum.next() != null);
+      // System.out.println("  done termCount=" + termCount);
 
-        query.incTotalNumberOfTerms(termCount);
-        return bitSet;
-      } else {
-        return DocIdSet.EMPTY_DOCIDSET;
-      }
+      query.incTotalNumberOfTerms(termCount);
+      return bitSet;
     } else {
-      final TermEnum enumerator = query.getEnum(reader);
-      try {
-        // if current term in enum is null, the enum is empty -> shortcut
-        if (enumerator.term() == null)
-          return DocIdSet.EMPTY_DOCIDSET;
-        // else fill into a OpenBitSet
-        final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
-        final int[] docs = new int[32];
-        final int[] freqs = new int[32];
-        TermDocs termDocs = reader.termDocs();
-        try {
-          int termCount = 0;
-          do {
-            Term term = enumerator.term();
-            if (term == null)
-              break;
-            termCount++;
-            termDocs.seek(term);
-            while (true) {
-              final int count = termDocs.read(docs, freqs);
-              if (count != 0) {
-                for (int i = 0; i < count; i++) {
-                  bitSet.set(docs[i]);
-                }
-              } else {
-                break;
-              }
-            }
-          } while (enumerator.next());
-
-          query.incTotalNumberOfTerms(termCount);
-
-        } finally {
-          termDocs.close();
-        }
-        return bitSet;
-      } finally {
-        enumerator.close();
-      }
+      return DocIdSet.EMPTY_DOCIDSET;
     }
   }
-
 }

Propchange: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu Jul 22 19:34:35 2010
@@ -1,4 +1,5 @@
-/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730
+/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:943137,949730,957490,960490,961612
+/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:953476-966816
 /lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:824912-931101
 /lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:909334,948516
 /lucene/java/trunk/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java:924483-924731,924781,925176-925462



Mime
View raw message