lucene-java-commits mailing list archives

From rm...@apache.org
Subject svn commit: r929065 - in /lucene/java/branches/flex_1458/src: java/org/apache/lucene/search/ java/org/apache/lucene/util/ test/org/apache/lucene/search/ test/org/apache/lucene/util/
Date Tue, 30 Mar 2010 10:06:37 GMT
Author: rmuir
Date: Tue Mar 30 10:06:36 2010
New Revision: 929065

URL: http://svn.apache.org/viewvc?rev=929065&view=rev
Log:
LUCENE-2351: optimize automatonquery, improve tests

Added:
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java   (with props)
Modified:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestUnicodeUtil.java

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=929065&r1=929064&r2=929065&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Tue Mar 30 10:06:36 2010
@@ -18,6 +18,7 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -44,15 +45,6 @@ import org.apache.lucene.util.automaton.
  * completely accepted. This is not possible when the language accepted by the
  * FSM is not finite (i.e. * operator).
  * </p>
- * <p>
- * If the DFA has a leading kleene star, or something similar, it will
- * need to run against the entire term dictionary. In this case its much
- * better to do just that than to use smart enumeration.
- * This heuristic looks for an initial loop, with a range of at least 1/3
- * of the unicode BMP.
- * Use {@link #usesLinearMode} to find out if it enumerates all terms
- * in linear mode without seeking.
- * </p>
  * @lucene.experimental
  */
 public class AutomatonTermsEnum extends FilteredTermsEnum {
@@ -60,8 +52,6 @@ public class AutomatonTermsEnum extends 
   private final Automaton automaton;
   // a tableized array-based form of the DFA
   private final RunAutomaton runAutomaton;
-  // true if this enum will not seek around
-  private final boolean linearMode;
   // common suffix of the automaton
   private final BytesRef commonSuffixRef;
   // true if the automaton accepts a finite language
@@ -75,11 +65,16 @@ public class AutomatonTermsEnum extends 
   // used for unicode conversion from BytesRef byte[] to char[]
   private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
   // the reference used for seeking forwards through the term dictionary
-  private final BytesRef seekBytesRef = new BytesRef(10);
-  
-  // this accept stati will be returned by accept() dependent on internal mode
-  private final AcceptStatus NO_MATCH, YES_MATCH;
-  
+  private final BytesRef seekBytesRef = new BytesRef(10); 
+  // true if we are enumerating an infinite portion of the DFA.
+  // in this case it is faster to drive the query based on the terms dictionary.
+  // when this is true, linearUpperBound indicates the end of the range
+  // of terms where we should simply do sequential reads instead.
+  private boolean linear = false;
+  private final BytesRef linearUpperBound = new BytesRef(10);
+  private final UnicodeUtil.UTF16Result linearUpperBoundUTF16 = new UnicodeUtil.UTF16Result();
+  private final Comparator<BytesRef> termComp;
+
   /**
    * Expert ctor:
    * Construct an enumerator based upon an automaton, enumerating the specified
@@ -92,15 +87,15 @@ public class AutomatonTermsEnum extends 
    * State numbering, or you will get undefined behavior.
    * <p>
    * @param preCompiled optional pre-compiled RunAutomaton (can be null)
-   * @param linearMode determines whether or not it will use smart enumeration.
+   * @param finite true if the automaton accepts a finite language
    */
   AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled,
-      Term queryTerm, IndexReader reader, boolean linearMode)
+      Term queryTerm, IndexReader reader, boolean finite)
       throws IOException {
     super(reader, queryTerm.field());
     this.automaton = automaton;
-    this.linearMode = linearMode;
-    
+    this.finite = finite;
+
     /* 
      * tableize the automaton. this also ensures it is deterministic, and has no 
      * transitions to dead states. it also invokes Automaton.setStateNumbers to
@@ -111,130 +106,109 @@ public class AutomatonTermsEnum extends 
     else
       runAutomaton = preCompiled;
 
-    if (this.linearMode) {
-      // iterate all terms in linear mode
-      this.finite = false;
-      allTransitions = null;
-      visited = null;
-      commonSuffixRef = new BytesRef(getValidUTF16Suffix(SpecialOperations
-          .getCommonSuffix(automaton)));
-      NO_MATCH = AcceptStatus.NO;
-      YES_MATCH = AcceptStatus.YES;
-    } else {
-      // if the automaton is finite, we will never read sequentially, but always seek.
-      this.finite = SpecialOperations.isFinite(this.automaton);
-      // in nonlinear mode, the common suffix isn't that helpful.
-      // we will seek each time anyway (and take the unicode conversion hit).
-      // its also currently expensive to calculate, because getCommonSuffix is 
-      // a bit expensive.
-      commonSuffixRef = null;
-      // build a cache of sorted transitions for every state
-      allTransitions = new Transition[runAutomaton.getSize()][];
-      for (State state : this.automaton.getStates())
-        allTransitions[state.getNumber()] = state.getSortedTransitionArray(false);
-      // used for path tracking, where each bit is a numbered state.
-      visited = new long[runAutomaton.getSize()];
-      NO_MATCH = AcceptStatus.NO_AND_SEEK;
-      YES_MATCH = finite ? AcceptStatus.YES_AND_SEEK : AcceptStatus.YES;
-    }
+    commonSuffixRef = finite ? null : new BytesRef(getValidUTF16Suffix(SpecialOperations
+        .getCommonSuffix(automaton)));
+    
+    // build a cache of sorted transitions for every state
+    allTransitions = new Transition[runAutomaton.getSize()][];
+    for (State state : this.automaton.getStates())
+      allTransitions[state.getNumber()] = state.getSortedTransitionArray(false);
+    // used for path tracking, where each bit is a numbered state.
+    visited = new long[runAutomaton.getSize()];
 
     setUseTermsCache(finite);
+    termComp = getComparator();
   }
   
   /**
    * Construct an enumerator based upon an automaton, enumerating the specified
    * field, working on a supplied reader.
    * <p>
-   * It will automagically determine whether or not to enumerate the term dictionary
-   * in a smart way, or to just do a linear scan depending upon a heuristic.
+   * It will automatically calculate whether or not the automaton is finite.
    */
   public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
       throws IOException {
-    this(automaton, null, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
-  }
-  
-  /**
-   * Heuristic to detect if an automaton will be so slow,
-   * that it is better to do a linear enumeration.
-   * <p>
-   * A very slow automaton will simply cause a lot of wasted disk seeks.
-   * Instead in that case it is actually faster to do a linear enumeration.
-   * <p>
-   * @param automaton automaton
-   * @return true if it will result in bad search performance
-   */
-  private static boolean isSlow(Automaton automaton) {
-    /*
-     * If the DFA has a leading kleene star, or something similar, it will
-     * need to run against the entire term dictionary. In this case its much
-     * better to do just that than to use smart enumeration.
-     * 
-     * this heuristic looks for an initial loop, with a range of at least 1/3
-     * of the unicode BMP.
-     */
-    State initialState = automaton.getInitialState();
-    boolean linearMode = false;
-    for (Transition transition : initialState.getTransitions()) {
-      if (transition.getDest() == initialState && 
-          (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) {
-        linearMode = true;
-        break;
-      }
-    }
-    return linearMode;
-  }
-  
-  /**
-   * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode.
-   */
-  public final boolean usesLinearMode() {
-    return linearMode;
+    this(automaton, null, queryTerm, reader, SpecialOperations.isFinite(automaton));
   }
  
   /**
-   * Returns true if the term matches the automaton. Also stashes away the term
-   * to assist with smart enumeration.
-   * <p>In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted.
-   * In smart mode, it will never do this.   
+   * Returns true if the term matches the automaton. 
    */
   @Override
   protected AcceptStatus accept(final BytesRef term) {
     if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) {
       UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
-      return runAutomaton.run(utf16.result, 0, utf16.length) ? YES_MATCH : NO_MATCH;
+      if (runAutomaton.run(utf16.result, 0, utf16.length))
+        return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
+      else
+        return (linear && termComp.compare(term, linearUpperBound) < 0) ? 
+            AcceptStatus.NO : AcceptStatus.NO_AND_SEEK;
     } else {
-      return NO_MATCH;
+      return (linear && termComp.compare(term, linearUpperBound) < 0) ? 
+          AcceptStatus.NO : AcceptStatus.NO_AND_SEEK;
     }
   }
   
   @Override
   protected BytesRef nextSeekTerm(final BytesRef term) throws IOException {
     if (term == null) {
-      // return the first seek term
-      if (linearMode) {
+      // return the empty term, as it's valid
+      if (runAutomaton.run("")) {
         seekBytesRef.copy("");
-      } else {
-        utf16.copyText("");
-        if (!nextString())
-          return null;
-        UnicodeUtil.nextValidUTF16String(utf16);
-        UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef);
-      }
-      return seekBytesRef;
-    } else if (!linearMode) {
-      // seek to the next possible string
-      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
-      if (nextString()) {
-        // reposition
-        UnicodeUtil.nextValidUTF16String(utf16);
-        UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef);
         return seekBytesRef;
       }
+      
+      utf16.copyText("");
+    } else {
+      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+    }
+
+    // seek to the next possible string;
+    if (nextString()) {
+      // reposition
+      if (linear)
+        setLinear(infinitePosition);
+      UnicodeUtil.nextValidUTF16String(utf16);
+      UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef);
+      return seekBytesRef;
     }
     // no more possible strings can match
     return null;
   }
 
+  // this instance prevents unicode conversion during backtracking,
+  // we can just call setLinear once at the end.
+  int infinitePosition;
+
+  /**
+   * Sets the enum to operate in linear fashion, as we have found
+   * a looping transition at position
+   */
+  private void setLinear(int position) {
+    int state = runAutomaton.getInitialState();
+    char maxInterval = 0xffff;
+    for (int i = 0; i < position; i++)
+      state = runAutomaton.step(state, utf16.result[i]);
+    for (int i = 0; i < allTransitions[state].length; i++) {
+      Transition t = allTransitions[state][i];
+      if (t.getMin() <= utf16.result[position] && utf16.result[position] <= t.getMax()) {
+        maxInterval = t.getMax();
+        break;
+      }
+    }
+    // 0xffff terms don't get the optimization... not worth the trouble.
+    if (maxInterval < 0xffff)
+      maxInterval++;
+    int length = position + 1; /* position + maxTransition */
+    if (linearUpperBoundUTF16.result.length < length)
+      linearUpperBoundUTF16.result = new char[length];
+    System.arraycopy(utf16.result, 0, linearUpperBoundUTF16.result, 0, position);
+    linearUpperBoundUTF16.result[position] = maxInterval;
+    linearUpperBoundUTF16.setLength(length);
+    UnicodeUtil.nextValidUTF16String(linearUpperBoundUTF16);
+    UnicodeUtil.UTF16toUTF8(linearUpperBoundUTF16.result, 0, length, linearUpperBound);
+  }
+
   /**
   * Increments the utf16 buffer to the next String in lexicographic order after s that will not put
    * the machine into a reject state. If such a string does not exist, returns
@@ -250,14 +224,21 @@ public class AutomatonTermsEnum extends 
     int pos = 0;
 
     while (true) {
+      curGen++;
+      linear = false;
       state = runAutomaton.getInitialState();
       // walk the automaton until a character is rejected.
       for (pos = 0; pos < utf16.length; pos++) {
+        visited[state] = curGen;
         int nextState = runAutomaton.step(state, utf16.result[pos]);
         if (nextState == -1)
           break;
-        else
-          state = nextState;
+        // we found a loop, record it for faster enumeration
+        if (!finite && !linear && visited[nextState] == curGen) {
+          linear = true;
+          infinitePosition = pos;
+        }
+        state = nextState;
       }
 
       // take the useful portion, and the last non-reject state, and attempt to
@@ -310,8 +291,6 @@ public class AutomatonTermsEnum extends 
         c++;
     }
 
-    curGen++;
-
     utf16.setLength(position);
     visited[state] = curGen;
 
@@ -339,10 +318,15 @@ public class AutomatonTermsEnum extends 
            * then there MUST be at least one transition.
            */
           transition = allTransitions[state][0];
+          state = transition.getDest().getNumber();
+          // we found a loop, record it for faster enumeration
+          if (!finite && !linear && visited[state] == curGen) {
+            linear = true;
+            infinitePosition = utf16.length;
+          }
           // append the minimum transition
           utf16.setLength(utf16.length + 1);
           utf16.result[utf16.length - 1] = transition.getMin();
-          state = transition.getDest().getNumber();
         }
         return true;
       }

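For readers following the change: the old up-front isSlow() heuristic is gone, and linear enumeration is now triggered per seek, whenever nextString() walks the DFA and revisits a state within the same walk. A minimal standalone sketch of that cycle-detection idea, assuming a tableized DFA with a step() function (the LoopDetector and SimpleDfa names below are illustrative stand-ins, not Lucene APIs):

final class LoopDetector {
  private final long[] visited;   // generation stamp per DFA state
  private long curGen;            // bumped on every new walk, like the patch's curGen

  LoopDetector(int numStates) {
    visited = new long[numStates];
  }

  /**
   * Walks the DFA over the prefix and returns the first position at which a
   * state repeats within this walk, or -1 if the walk is rejected or no state
   * repeats. A repeated state means the accepted language loops here, so the
   * enum can switch to sequential (linear) reads instead of seeking.
   */
  int firstLoopPosition(SimpleDfa dfa, char[] prefix, int length) {
    curGen++;
    int state = dfa.initialState();
    for (int pos = 0; pos < length; pos++) {
      visited[state] = curGen;
      int next = dfa.step(state, prefix[pos]);
      if (next == -1) {
        return -1;                // rejected before any loop was seen
      }
      if (visited[next] == curGen) {
        return pos;               // loop found: enumerate linearly from here
      }
      state = next;
    }
    return -1;
  }

  /** Minimal DFA interface assumed by this sketch. */
  interface SimpleDfa {
    int initialState();
    int step(int state, char c);  // returns -1 on reject
  }
}

In the patch itself the position found this way is stored in infinitePosition, from which setLinear() computes linearUpperBound so accept() can keep reading sequentially up to that bound instead of seeking.
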
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=929065&r1=929064&r2=929065&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Tue Mar 30 10:06:36 2010
@@ -268,7 +268,7 @@ final class AutomatonFuzzyTermsEnum exte
   
   public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm,
      IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException {
-    super(automaton, matchers[matchers.length - 1], queryTerm, reader, false);
+    super(automaton, matchers[matchers.length - 1], queryTerm, reader, true);
     this.minimumSimilarity = minSimilarity;
     this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
     this.matchers = matchers;

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=929065&r1=929064&r2=929065&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java Tue Mar 30 10:06:36 2010
@@ -372,8 +372,14 @@ final public class UnicodeUtil {
               s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;             
               return;
             } else { // SMP already enumerated
-              s.setLength(i);
-              s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);              
+              if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) {
+                s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+                s.setLength(i);               
+              } else {
+                s.result[i - 1]++;
+                s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
+                s.setLength(i + 1);
+              }            
               return;
             }
         } else {

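The UnicodeUtil fix changes how nextValidUTF16String() advances past a lead surrogate whose trail range has been exhausted: instead of dropping straight to the lower BMP, it first increments the lead and restarts the trail at U+DC00, and only falls back to U+E000 once the lead itself has reached U+DBFF (see the updated TestUnicodeUtil expectations further below). A rough standalone sketch of just that increment step, using java.lang.Character constants rather than Lucene's UnicodeUtil fields (SurrogateIncrement is an illustrative helper, not part of the patch):

final class SurrogateIncrement {
  /**
   * Given a lead surrogate whose trail range is exhausted, returns the next
   * valid UTF-16 suffix: bump the lead and reset the trail to U+DC00, or, if
   * the lead was already U+DBFF, fall back to U+E000 (the lowest "upper BMP").
   */
  static String nextAfterExhaustedTrail(char lead) {
    if (lead == Character.MAX_HIGH_SURROGATE) {   // U+DBFF: supplementary planes exhausted
      return String.valueOf((char) 0xE000);
    }
    char nextLead = (char) (lead + 1);
    return new String(new char[] { nextLead, Character.MIN_LOW_SURROGATE });
  }

  public static void main(String[] args) {
    // mirrors the new test expectations:
    //   dogs\uD801\uE001 -> dogs\uD802\uDC00
    //   dogs\uDBFF\uE001 -> dogs\uE000
    System.out.println("dogs" + nextAfterExhaustedTrail('\uD801'));
    System.out.println("dogs" + nextAfterExhaustedTrail('\uDBFF'));
  }
}
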
Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java?rev=929065&r1=929064&r2=929065&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java Tue Mar 30 10:06:36 2010
@@ -200,19 +200,7 @@ public class TestAutomatonQuery extends 
   }
   
   /**
-   * Test that a badly-performing automaton that must visit all the terms does
-   * not use the smart enumeration, this will just waste cpu.
-   */
-  public void testLinearOptimization() throws IOException {
-    AutomatonQuery aq = new RegexpQuery(newTerm(".*ument"));
-    assertTrue(((AutomatonTermsEnum) aq.getTermsEnum(searcher.getIndexReader()))
-        .usesLinearMode());
-    assertEquals(1, automatonQueryNrHits(aq));
-  }
-  
-  /**
-   * Test that a badly-performing automaton that must visit all the terms does
-   * not use the smart enumeration, this will just waste cpu.
+   * Test handling of the empty language
    */
   public void testEmptyOptimization() throws IOException {
     AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), BasicAutomata

Added: lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java?rev=929065&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java (added)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java Tue Mar 30 10:06:36 2010
@@ -0,0 +1,221 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.RunAutomaton;
+
+/**
+ * Create an index with random unicode terms
+ * Generates random regexps, and validates against a simple impl.
+ */
+public class TestRegexpRandom2 extends LuceneTestCase {
+  private IndexSearcher searcher;
+  private Random random;
+  
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    random = newRandom(System.nanoTime());
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+        IndexWriter.MaxFieldLength.UNLIMITED);
+    
+    Document doc = new Document();
+    Field field = new Field("field", "", Field.Store.YES, Field.Index.ANALYZED);
+    doc.add(field);
+    
+    for (int i = 0; i < 1000; i++) {
+      field.setValue(randomString());
+      writer.addDocument(doc);
+    }
+    
+    writer.optimize();
+    writer.close();
+    searcher = new IndexSearcher(dir);
+  }
+
+  @Override
+  protected void tearDown() throws Exception {
+    searcher.close();
+    super.tearDown();
+  }
+  
+  /** a stupid regexp query that just blasts thru the terms */
+  private class DumbRegexpQuery extends MultiTermQuery {
+    private final Automaton automaton;
+    
+    DumbRegexpQuery(Term term) {
+      super(term.field());
+      RegExp re = new RegExp(term.text());
+      automaton = re.toAutomaton();
+    }
+    
+    @Override
+    protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
+      return new SimpleAutomatonTermsEnum(reader, field);
+    }
+
+    private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
+      RunAutomaton runAutomaton = new RunAutomaton(automaton);
+      UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+
+      private SimpleAutomatonTermsEnum(IndexReader reader, String field) throws IOException {
+        super(reader, field);
+        setInitialSeekTerm(new BytesRef(""));
+      }
+      
+      @Override
+      protected AcceptStatus accept(BytesRef term) throws IOException {
+        UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+        return runAutomaton.run(utf16.result, 0, utf16.length) ? 
+            AcceptStatus.YES : AcceptStatus.NO;
+      }
+    }
+
+    @Override
+    public String toString(String field) {
+      return field.toString() + automaton.toString();
+    }
+  }
+  
+  /** test a bunch of random regular expressions */
+  public void testRegexps() throws Exception {
+      for (int i = 0; i < 500; i++)
+        assertSame(randomRegex());
+  }
+  
+  /** check that the # of hits is the same as from a very
+   * simple regexpquery implementation.
+   */
+  private void assertSame(String regexp) throws IOException {
+    // we will generate some illegal syntax regular expressions...
+    try {
+      new RegExp(regexp).toAutomaton();
+    } catch (Exception e) {
+      return;
+    }
+    
+    // we will also generate some undefined unicode queries
+    if (!UnicodeUtil.validUTF16String(regexp))
+      return;
+    
+    RegexpQuery smart = new RegexpQuery(new Term("field", regexp));
+    DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp));
+    
+    // we can't compare the two if automaton rewrites to a simpler enum.
+    // for example: "a\uda07\udcc7?.*?" gets rewritten to a simpler query:
+    // a\uda07* prefixquery. Prefixquery then does the "wrong" thing, which
+    // isn't really wrong as the query was undefined to begin with... but not
+    // automatically comparable.
+    if (!(smart.getTermsEnum(searcher.getIndexReader()) instanceof AutomatonTermsEnum))
+      return;
+    
+    TopDocs smartDocs = searcher.search(smart, 25);
+    TopDocs dumbDocs = searcher.search(dumb, 25);
+
+    assertEquals(dumbDocs.totalHits, smartDocs.totalHits);
+  }
+  
+  char buffer[] = new char[20];
+  
+  // start is inclusive and end is exclusive
+  public int nextInt(int start, int end) {
+    return start + random.nextInt(end - start);
+  }
+  
+  public String randomString() {
+    final int end = random.nextInt(20);
+    if (buffer.length < 1 + end) {
+      char[] newBuffer = new char[(int) ((1 + end) * 1.25)];
+      System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
+      buffer = newBuffer;
+    }
+    for (int i = 0; i < end - 1; i++) {
+      int t = random.nextInt(6);
+      if (0 == t && i < end - 1) {
+        // Make a surrogate pair
+        // High surrogate
+        buffer[i++] = (char) nextInt(0xd800, 0xdc00);
+        // Low surrogate
+        buffer[i] = (char) nextInt(0xdc00, 0xe000);
+      } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80);
+      else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800);
+      else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800);
+      else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff);
+      else if (5 == t) {
+        // Illegal unpaired surrogate
+        if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00);
+        else buffer[i] = (char) nextInt(0xdc00, 0xe000);
+      }
+    }
+    return new String(buffer, 0, end);
+  }
+  
+  // a random string biased towards populating a ton of operators
+  public String randomRegex() {
+    final int end = random.nextInt(20);
+    if (buffer.length < 1 + end) {
+      char[] newBuffer = new char[(int) ((1 + end) * 1.25)];
+      System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
+      buffer = newBuffer;
+    }
+    for (int i = 0; i < end - 1; i++) {
+      int t = random.nextInt(10);
+      if (0 == t && i < end - 1) {
+        // Make a surrogate pair
+        // High surrogate
+        buffer[i++] = (char) nextInt(0xd800, 0xdc00);
+        // Low surrogate
+        buffer[i] = (char) nextInt(0xdc00, 0xe000);
+      } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80);
+      else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800);
+      else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800);
+      else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff);
+      else if (5 == t) {
+        // Illegal unpaired surrogate
+        if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00);
+        else buffer[i] = (char) nextInt(0xdc00, 0xe000);
+      } else if (6 == t) {
+        buffer[i] = '.';
+      } else if (7 == t) {
+        buffer[i] = '?';
+      } else if (8 == t) {
+        buffer[i] = '*';
+      } else if (9 == t) {
+        buffer[i] = '+';
+      }
+    }
+    return new String(buffer, 0, end);
+  }
+}

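A note on the new random test: randomString() and randomRegex() intentionally emit unpaired surrogates, and assertSame() skips such patterns via UnicodeUtil.validUTF16String() before comparing hit counts of the smart and dumb queries. A minimal standalone sketch of that kind of UTF-16 validity check, for readers without the Lucene sources at hand (Utf16Check is illustrative and not the UnicodeUtil implementation):

final class Utf16Check {
  /** Returns true if every lead surrogate is followed by a trail and no trail appears alone. */
  static boolean isValidUTF16(CharSequence s) {
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (Character.isHighSurrogate(c)) {
        // a lead surrogate must be directly followed by a trail surrogate
        if (i + 1 >= s.length() || !Character.isLowSurrogate(s.charAt(i + 1))) {
          return false;
        }
        i++; // consume the trail
      } else if (Character.isLowSurrogate(c)) {
        // a trail surrogate without a preceding lead is illegal
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    System.out.println(isValidUTF16("dogs\uD801\uDC00")); // true: valid pair
    System.out.println(isValidUTF16("dogs\uD801"));       // false: unpaired lead
    System.out.println(isValidUTF16("dogs\uDC00"));       // false: unpaired trail
  }
}
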
Propchange: lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom2.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestUnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestUnicodeUtil.java?rev=929065&r1=929064&r2=929065&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestUnicodeUtil.java (original)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/TestUnicodeUtil.java Tue Mar 30 10:06:36 2010
@@ -65,12 +65,16 @@ public class TestUnicodeUtil extends Luc
     assertEquals("dogs\uD801\uDC00", UnicodeUtil
         .nextValidUTF16String("dogs\uD801\uD800"));
     
-    // an illegal combination where we have already enumerated the supp plane
-    // we must replace both H and Z with \uE000 (the lowest possible
-    // "upper BMP")
-    assertEquals("dogs\uE000", UnicodeUtil
+    // an illegal combination where we have already enumerated the trail
+    // we must increment the lead and start the trail back at the beginning.
+    assertEquals("dogs\uD802\uDC00", UnicodeUtil
         .nextValidUTF16String("dogs\uD801\uE001"));
     
+    // an illegal combination where we have exhausted the supp plane
+    // we must now move to the lower bmp.
+    assertEquals("dogs\uE000", UnicodeUtil
+        .nextValidUTF16String("dogs\uDBFF\uE001"));
+
     // an unpaired trail surrogate. this is invalid when not preceded by a lead
     // surrogate. in this case we have to bump to \uE000 (the lowest possible
     // "upper BMP")


