lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r921820 [1/2] - in /lucene/java/branches/flex_1458: ./ src/java/org/apache/lucene/search/ src/java/org/apache/lucene/util/automaton/ src/test/org/apache/lucene/search/ src/test/org/apache/lucene/util/automaton/
Date Thu, 11 Mar 2010 12:15:49 GMT
Author: rmuir
Date: Thu Mar 11 12:15:48 2010
New Revision: 921820

URL: http://svn.apache.org/viewvc?rev=921820&view=rev
Log:
LUCENE-2089: implement FuzzyQuery with automaton

Added:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/fuzzyTestData.txt   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java   (with props)
Modified:
    lucene/java/branches/flex_1458/CHANGES.txt
    lucene/java/branches/flex_1458/LICENSE.txt
    lucene/java/branches/flex_1458/NOTICE.txt
    lucene/java/branches/flex_1458/build.xml
    lucene/java/branches/flex_1458/common-build.xml
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/   (props changed)

Modified: lucene/java/branches/flex_1458/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/CHANGES.txt?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/CHANGES.txt (original)
+++ lucene/java/branches/flex_1458/CHANGES.txt Thu Mar 11 12:15:48 2010
@@ -18,6 +18,13 @@ Bug Fixes
 
 * LUCENE-2222: FixedIntBlockIndexInput incorrectly read one block of
   0s before the actual data.  (Renaud Delbru via Mike McCandless)
+  
+New features
+
+* LUCENE-1606, LUCENE-2089: Adds AutomatonQuery, a MultiTermQuery that 
+  matches terms against a finite-state machine. Implement WildcardQuery
+  and FuzzyQuery with finite-state methods. Adds RegexpQuery.
+  (Robert Muir, Mike McCandless, Uwe Schindler, Mark Miller)
 
 ======================= Trunk (not yet released) =======================
 

Modified: lucene/java/branches/flex_1458/LICENSE.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/LICENSE.txt?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/LICENSE.txt (original)
+++ lucene/java/branches/flex_1458/LICENSE.txt Thu Mar 11 12:15:48 2010
@@ -268,3 +268,29 @@ www.brics.dk/automaton/. Here is the cop
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
  
+The levenshtein automata tables in src/java/org/apache/lucene/util/automaton 
+were automatically generated with the moman/finenight FSA package.
+Here is the copyright for those sources:
+
+# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, <jpb@rrette.com>
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.

Modified: lucene/java/branches/flex_1458/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/NOTICE.txt?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/NOTICE.txt (original)
+++ lucene/java/branches/flex_1458/NOTICE.txt Thu Mar 11 12:15:48 2010
@@ -49,3 +49,9 @@ International Business Machines Corporat
 
 Brics Automaton (under src/java/org/apache/lucene/util/automaton) is 
 BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
+
+The levenshtein automata tables (under src/java/org/apache/lucene/util/automaton) were
+automatically generated with the moman/finenight FSA library, created by
+Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
+see http://sites.google.com/site/rrettesite/moman and 
+http://bitbucket.org/jpbarrette/moman/overview/

Modified: lucene/java/branches/flex_1458/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/build.xml?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/build.xml (original)
+++ lucene/java/branches/flex_1458/build.xml Thu Mar 11 12:15:48 2010
@@ -706,6 +706,41 @@
     </delete>
   </target>
 
+  <macrodef name="createLevAutomaton">
+  	<attribute name="n"/>
+  	<sequential>
+  		<exec dir="src/java/org/apache/lucene/util/automaton"
+  			executable="${python.exe}" failonerror="true">
+  	  	<arg line="createLevAutomata.py @{n}"/>
+  	</exec>
+  	</sequential>
+  </macrodef>
+
+  <target name="createLevAutomata" depends="check-moman,clone-moman,pull-moman">
+  	<createLevAutomaton n="1"/>
+  	<createLevAutomaton n="2"/>
+  </target>
+  
+  <target name="check-moman">
+  	<condition property="moman.cloned">
+      <available file="src/java/org/apache/lucene/util/automaton/moman"/>
+  	</condition>
+  </target>
+	
+  <target name="clone-moman" unless="moman.cloned">
+  	<exec dir="src/java/org/apache/lucene/util/automaton" 
+  		executable="${hg.exe}" failonerror="true">
+      <arg line="clone -r ${moman.rev} ${moman.url} moman"/>
+    </exec>
+  </target>
+
+  <target name="pull-moman" if="moman.cloned">
+	<exec dir="src/java/org/apache/lucene/util/automaton/moman" 
+		executable="${hg.exe}" failonerror="true">
+	  <arg line="pull -f -r ${moman.rev}"/>
+	</exec>
+  </target>
+
   <macrodef name="contrib-crawl">
     <attribute name="target" default=""/>
     <attribute name="failonerror" default="true"/>

Modified: lucene/java/branches/flex_1458/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/common-build.xml?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/common-build.xml (original)
+++ lucene/java/branches/flex_1458/common-build.xml Thu Mar 11 12:15:48 2010
@@ -113,6 +113,11 @@
   <property name="svnversion.exe" value="svnversion" />
   <property name="svn.exe" value="svn" />
   
+  <property name="hg.exe" value="hg" />
+  <property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
+  <property name="moman.rev" value="115" />
+  <property name="python.exe" value="python" />
+
   <property name="gpg.exe" value="gpg" />
   <property name="gpg.key" value="CODE SIGNING KEY" />
 

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Thu Mar 11 12:15:48 2010
@@ -80,12 +80,21 @@ public class AutomatonTermsEnum extends 
   private final AcceptStatus NO_MATCH, YES_MATCH;
   
   /**
+   * Expert ctor:
    * Construct an enumerator based upon an automaton, enumerating the specified
    * field, working on a supplied reader.
    * <p>
-   * The parameter linearMode determines whether or not it will use smart enumeration.
+   * @lucene.internal Use the public ctor instead. This constructor allows the
+   * (dangerous) option of passing in a pre-compiled RunAutomaton. If you use 
+   * this ctor and compile your own RunAutomaton, you are responsible for 
+   * ensuring it is in sync with the Automaton object, including internal
+   * State numbering, or you will get undefined behavior.
+   * <p>
+   * @param preCompiled optional pre-compiled RunAutomaton (can be null)
+   * @param linearMode determines whether or not it will use smart enumeration.
    */
-  AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader, boolean linearMode)
+  AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled,
+      Term queryTerm, IndexReader reader, boolean linearMode)
       throws IOException {
     super(reader, queryTerm.field());
     this.automaton = automaton;
@@ -96,7 +105,10 @@ public class AutomatonTermsEnum extends 
      * transitions to dead states. it also invokes Automaton.setStateNumbers to
      * number the original states (this is how they are tableized)
      */
-    runAutomaton = new RunAutomaton(this.automaton);
+    if (preCompiled == null)
+      runAutomaton = new RunAutomaton(this.automaton);
+    else
+      runAutomaton = preCompiled;
 
     if (this.linearMode) {
       // iterate all terms in linear mode
@@ -135,7 +147,7 @@ public class AutomatonTermsEnum extends 
    */
   public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
       throws IOException {
-    this(automaton, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
+    this(automaton, null, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
   }
   
   /**

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java?rev=921820&r1=921819&r2=921820&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/FuzzyTermsEnum.java Thu Mar 11 12:15:48 2010
@@ -17,21 +17,305 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.RunAutomaton;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
 
-/** Subclass of FilteredTermEnum for enumerating all terms that are similar
+/** Subclass of TermsEnum for enumerating all terms that are similar
  * to the specified filter term.
  *
  * <p>Term enumerations are always ordered by
  * {@link #getTermComparator}.  Each term in the enumeration is
  * greater than all that precede it.</p>
  */
-public final class FuzzyTermsEnum extends FilteredTermsEnum {
+public final class FuzzyTermsEnum extends TermsEnum {
+  private TermsEnum actualEnum;
+  private MultiTermQuery.BoostAttribute actualBoostAtt;
+  
+  private final MultiTermQuery.BoostAttribute boostAtt =
+    attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+
+  private float bottom = boostAtt.getMaxNonCompetitiveBoost();
+  
+  private final float minSimilarity;
+  private final float scale_factor;
+  
+  private final int termLength;
+  
+  private int maxEdits;
+  private List<Automaton> automata;
+  private List<RunAutomaton> runAutomata;
+  
+  private final IndexReader reader;
+  private final Term term;
+  private final int realPrefixLength;
+  
+  /**
+   * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
+   * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
+   * <code>minSimilarity</code>.
+   * <p>
+   * After calling the constructor the enumeration is already pointing to the first 
+   * valid term if such a term exists. 
+   * 
+   * @param reader Delivers terms.
+   * @param term Pattern term.
+   * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
+   * @param prefixLength Length of required common prefix. Default value is 0.
+   * @throws IOException
+   */
+  public FuzzyTermsEnum(IndexReader reader, Term term, 
+      final float minSimilarity, final int prefixLength) throws IOException {
+    if (minSimilarity >= 1.0f)
+      throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1");
+    else if (minSimilarity < 0.0f)
+      throw new IllegalArgumentException("minimumSimilarity cannot be less than 0");
+    if(prefixLength < 0)
+      throw new IllegalArgumentException("prefixLength cannot be less than 0");
+    this.reader = reader;
+    this.term = term;
+    //The prefix could be longer than the word.
+    //It's kind of silly though.  It means we must match the entire word.
+    this.termLength = term.text().length();
+    this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength;
+    this.minSimilarity = minSimilarity;
+    this.scale_factor = 1.0f / (1.0f - minSimilarity);
+    
+    // calculate the maximum k edits for this similarity
+    maxEdits = initialMaxDistance(minSimilarity, termLength);
+  
+    TermsEnum subEnum = getAutomatonEnum(maxEdits, null);
+    setEnum(subEnum != null ? subEnum : 
+      new LinearFuzzyTermsEnum(reader, term, minSimilarity, prefixLength));
+  }
+  
+  /**
+   * return an automata-based enum for matching up to editDistance from
+   * lastTerm, if possible
+   */
+  private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
+      throws IOException {
+    initAutomata(editDistance);
+    if (automata != null && editDistance < automata.size()) {
+      return new AutomatonFuzzyTermsEnum(automata.get(editDistance), term,
+          reader, minSimilarity, runAutomata.subList(0, editDistance + 1)
+              .toArray(new RunAutomaton[0]), lastTerm);
+    } else {
+      return null;
+    }
+  }
+  
+  /** initialize levenshtein DFAs up to maxDistance, if possible */
+  private void initAutomata(int maxDistance) {
+    if (automata == null && 
+        maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      LevenshteinAutomata builder = 
+        new LevenshteinAutomata(term.text().substring(realPrefixLength));
+      automata = new ArrayList<Automaton>(maxDistance);
+      runAutomata = new ArrayList<RunAutomaton>(maxDistance);
+      for (int i = 0; i <= maxDistance; i++) {
+        Automaton a = builder.toAutomaton(i);
+        // constant prefix
+        if (realPrefixLength > 0) {
+          Automaton prefix = BasicAutomata.makeString(
+              term.text().substring(0, realPrefixLength));
+          a = BasicOperations.concatenate(prefix, a);
+        }
+        automata.add(a);
+        runAutomata.add(new RunAutomaton(a));
+      }
+    }
+  }
+ 
+  /** swap in a new actual enum to proxy to */
+  private void setEnum(TermsEnum actualEnum) {
+    this.actualEnum = actualEnum;
+    this.actualBoostAtt = actualEnum.attributes().addAttribute(
+        MultiTermQuery.BoostAttribute.class);
+  }
+  
+  /**
+   * fired when the max non-competitive boost has changed. this is the hook to
+   * swap in a smarter actualEnum
+   */
+  private void bottomChanged(float boostValue, BytesRef lastTerm)
+      throws IOException {
+    int oldMaxEdits = maxEdits;
+    
+    // as long as the max non-competitive boost is >= the max boost
+    // for some edit distance, keep dropping the max edit distance.
+    while (maxEdits > 0 && boostValue >= calculateMaxBoost(maxEdits))
+      maxEdits--;
+    
+    if (oldMaxEdits != maxEdits) { // the maximum n has changed
+      TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
+      if (newEnum != null) {
+        setEnum(newEnum);
+      }
+    }
+    // TODO, besides changing linear -> automaton, and swapping in a smaller
+    // automaton, we can also use this information to optimize the linear case
+    // itself: re-init maxDistances so the fast-fail happens for more terms due
+    // to the now stricter constraints.
+  }
+   
+  // for some raw min similarity and input term length, the maximum # of edits
+  private int initialMaxDistance(float minimumSimilarity, int termLen) {
+    return (int) ((1-minimumSimilarity) * termLen);
+  }
+  
+  // for some number of edits, the maximum possible scaled boost
+  private float calculateMaxBoost(int nEdits) {
+    final float similarity = 1.0f - ((float) nEdits / (float) (termLength));
+    return (similarity - minSimilarity) * scale_factor;
+  }
+
+  @Override
+  public BytesRef next() throws IOException {
+    BytesRef term = actualEnum.next();
+    boostAtt.setBoost(actualBoostAtt.getBoost());
+    
+    final float bottom = boostAtt.getMaxNonCompetitiveBoost();
+    if (bottom != this.bottom) {
+      this.bottom = bottom;
+      // clone the term before potentially doing something with it
+      // this is a rare but wonderful occurrence anyway
+      bottomChanged(bottom, term == null ? null : (BytesRef) term.clone());
+    }
+    
+    return term;
+  }
+  
+  // proxy all other enum calls to the actual enum
+  @Override
+  public int docFreq() {
+    return actualEnum.docFreq();
+  }
+  
+  @Override
+  public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+    return actualEnum.docs(skipDocs, reuse);
+  }
+  
+  @Override
+  public DocsAndPositionsEnum docsAndPositions(Bits skipDocs,
+      DocsAndPositionsEnum reuse) throws IOException {
+    return actualEnum.docsAndPositions(skipDocs, reuse);
+  }
+  
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return actualEnum.getComparator();
+  }
+  
+  @Override
+  public long ord() throws IOException {
+    return actualEnum.ord();
+  }
+  
+  @Override
+  public SeekStatus seek(BytesRef text) throws IOException {
+    return actualEnum.seek(text);
+  }
+  
+  @Override
+  public SeekStatus seek(long ord) throws IOException {
+    return actualEnum.seek(ord);
+  }
+  
+  @Override
+  public BytesRef term() throws IOException {
+    return actualEnum.term();
+  }
+}
+
+/**
+ * Implement fuzzy enumeration with automaton.
+ * <p>
+ * This is the fastest method as opposed to LinearFuzzyTermsEnum:
+ * as enumeration is logarithmic to the number of terms (instead of linear)
+ * and comparison is linear to length of the term (rather than quadratic)
+ */
+final class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum {
+  private final RunAutomaton matchers[];
+  // used for unicode conversion from BytesRef byte[] to char[]
+  private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+  
+  private final float minimumSimilarity;
+  private final float scale_factor;
+  
+  private final int fullSearchTermLength;
+  private final BytesRef termRef;
+  
+  private final BytesRef lastTerm;
+  private final MultiTermQuery.BoostAttribute boostAtt =
+    attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+  
+  public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm,
+      IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException {
+    super(automaton, matchers[matchers.length - 1], queryTerm, reader, false);
+    this.minimumSimilarity = minSimilarity;
+    this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
+    this.matchers = matchers;
+    this.lastTerm = lastTerm;
+    termRef = new BytesRef(queryTerm.text());
+    fullSearchTermLength = queryTerm.text().length();
+  }
+  
+  /** finds the smallest Lev(n) DFA that accepts the term. */
+  @Override
+  protected AcceptStatus accept(BytesRef term) {
+    if (term.equals(termRef)) { // ed = 0
+      boostAtt.setBoost(1.0F);
+      return AcceptStatus.YES_AND_SEEK;
+    }
+    
+    UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+    
+    // TODO: benchmark doing this backwards
+    for (int i = 1; i < matchers.length; i++)
+      if (matchers[i].run(utf16.result, 0, utf16.length)) {
+        final float similarity = 1.0f - ((float) i / (float) 
+            (Math.min(utf16.length, fullSearchTermLength)));
+        if (similarity > minimumSimilarity) {
+          boostAtt.setBoost((float) ((similarity - minimumSimilarity) * scale_factor));
+          return AcceptStatus.YES_AND_SEEK;
+        } else {
+          return AcceptStatus.NO_AND_SEEK;
+        }
+      }
+
+    return AcceptStatus.NO_AND_SEEK;
+  }
+
+  /** defers to superclass, except can start at an arbitrary location */
+  @Override
+  protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
+    if (term == null)
+      term = lastTerm;
+    return super.nextSeekTerm(term);
+  }
+}
+
+/**
+ * Implement fuzzy enumeration with linear brute force.
+ */
+final class LinearFuzzyTermsEnum extends FilteredTermsEnum {
 
   /* This should be somewhere around the average long word.
    * If it is longer, we waste time and space. If it is shorter, we waste a
@@ -68,7 +352,7 @@ public final class FuzzyTermsEnum extend
    * @param prefixLength Length of required common prefix. Default value is 0.
    * @throws IOException
    */
-  public FuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
+  public LinearFuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
     super(reader, term.field());
     
     if (minSimilarity >= 1.0f)

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Mar 11 12:15:48 2010
@@ -0,0 +1 @@
+moman

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java?rev=921820&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java Thu Mar 11 12:15:48 2010
@@ -0,0 +1,117 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The following code was generated with the moman/finenight pkg
+// This package is available under the MIT License, see NOTICE.txt
+// for more details.
+
+import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
+
+/** Parametric description for generating a Levenshtein automaton of degree 1 */
+class Lev1ParametricDescription extends ParametricDescription {
+  
+  @Override
+  int transition(int absState, int position, int vector) {
+    // null absState should never be passed in
+    assert absState != -1;
+    
+    // decode absState -> state, offset
+    int state = absState/(w+1);
+    int offset = absState%(w+1);
+    assert offset >= 0;
+    
+    if (position == w) {
+      if (state < 2) {
+        final int loc = vector * 2 + state;
+        offset += unpack(offsetIncrs0, loc, 1);
+        state = unpack(toStates0, loc, 2)-1;
+      }
+    } else if (position == w-1) {
+      if (state < 3) {
+        final int loc = vector * 3 + state;
+        offset += unpack(offsetIncrs1, loc, 1);
+        state = unpack(toStates1, loc, 2)-1;
+      }
+    } else if (position == w-2) {
+      if (state < 5) {
+        final int loc = vector * 5 + state;
+        offset += unpack(offsetIncrs2, loc, 2);
+        state = unpack(toStates2, loc, 3)-1;
+      }
+    } else {
+      if (state < 5) {
+        final int loc = vector * 5 + state;
+        offset += unpack(offsetIncrs3, loc, 2);
+        state = unpack(toStates3, loc, 3)-1;
+      }
+    }
+    
+    if (state == -1) {
+      // null state
+      return -1;
+    } else {
+      // translate back to abs
+      return state*(w+1)+offset;
+    }
+  }
+    
+  // 1 vectors; 2 states per vector; array length = 2
+  private final static long[] toStates0 = new long[] /*2 bits per value */ {
+    0x2L
+  };
+  private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
+    0x0L
+  };
+    
+  // 2 vectors; 3 states per vector; array length = 6
+  private final static long[] toStates1 = new long[] /*2 bits per value */ {
+    0xa43L
+  };
+  private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
+    0x38L
+  };
+    
+  // 4 vectors; 5 states per vector; array length = 20
+  private final static long[] toStates2 = new long[] /*3 bits per value */ {
+    0x4da292442420003L
+  };
+  private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
+    0x5555528000L
+  };
+    
+  // 8 vectors; 5 states per vector; array length = 40
+  private final static long[] toStates3 = new long[] /*3 bits per value */ {
+    0x14d0812112018003L,0xb1a29b46d48a49L
+  };
+  private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
+    0x555555e80a0f0000L,0x5555L
+  };
+  
+  // state map
+  //   0 -> [(0, 0)]
+  //   1 -> [(0, 1)]
+  //   2 -> [(0, 1), (1, 1)]
+  //   3 -> [(0, 1), (1, 1), (2, 1)]
+  //   4 -> [(0, 1), (2, 1)]
+  
+  
+  public Lev1ParametricDescription(int w) {
+    super(w, 1, new int[] {0,1,0,-1,-1});
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev1ParametricDescription.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java?rev=921820&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java Thu Mar 11 12:15:48 2010
@@ -0,0 +1,217 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The following code was generated with the moman/finenight pkg
+// This package is available under the MIT License, see NOTICE.txt
+// for more details.
+
+import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;
+
+/** Parametric description for generating a Levenshtein automaton of degree 2 */
+class Lev2ParametricDescription extends ParametricDescription {
+  
+  @Override
+  int transition(int absState, int position, int vector) {
+    // null absState should never be passed in
+    assert absState != -1;
+    
+    // decode absState -> state, offset
+    int state = absState/(w+1);
+    int offset = absState%(w+1);
+    assert offset >= 0;
+    
+    if (position == w) {
+      if (state < 3) {
+        final int loc = vector * 3 + state;
+        offset += unpack(offsetIncrs0, loc, 1);
+        state = unpack(toStates0, loc, 2)-1;
+      }
+    } else if (position == w-1) {
+      if (state < 5) {
+        final int loc = vector * 5 + state;
+        offset += unpack(offsetIncrs1, loc, 1);
+        state = unpack(toStates1, loc, 3)-1;
+      }
+    } else if (position == w-2) {
+      if (state < 11) {
+        final int loc = vector * 11 + state;
+        offset += unpack(offsetIncrs2, loc, 2);
+        state = unpack(toStates2, loc, 4)-1;
+      }
+    } else if (position == w-3) {
+      if (state < 21) {
+        final int loc = vector * 21 + state;
+        offset += unpack(offsetIncrs3, loc, 2);
+        state = unpack(toStates3, loc, 5)-1;
+      }
+    } else if (position == w-4) {
+      if (state < 30) {
+        final int loc = vector * 30 + state;
+        offset += unpack(offsetIncrs4, loc, 3);
+        state = unpack(toStates4, loc, 5)-1;
+      }
+    } else {
+      if (state < 30) {
+        final int loc = vector * 30 + state;
+        offset += unpack(offsetIncrs5, loc, 3);
+        state = unpack(toStates5, loc, 5)-1;
+      }
+    }
+    
+    if (state == -1) {
+      // null state
+      return -1;
+    } else {
+      // translate back to abs
+      return state*(w+1)+offset;
+    }
+  }
+    
+  // 1 vectors; 3 states per vector; array length = 3
+  private final static long[] toStates0 = new long[] /*2 bits per value */ {
+    0x23L
+  };
+  private final static long[] offsetIncrs0 = new long[] /*1 bits per value */ {
+    0x0L
+  };
+    
+  // 2 vectors; 5 states per vector; array length = 10
+  private final static long[] toStates1 = new long[] /*3 bits per value */ {
+    0x1a68c105L
+  };
+  private final static long[] offsetIncrs1 = new long[] /*1 bits per value */ {
+    0x3e0L
+  };
+    
+  // 4 vectors; 11 states per vector; array length = 44
+  private final static long[] toStates2 = new long[] /*4 bits per value */ {
+    0x6280b80804280405L,0x2323432321608282L,0x523434543213L
+  };
+  private final static long[] offsetIncrs2 = new long[] /*2 bits per value */ {
+    0x5555502220000800L,0x555555L
+  };
+    
+  // 8 vectors; 21 states per vector; array length = 168
+  private final static long[] toStates3 = new long[] /*5 bits per value */ {
+    0x40300c0108801005L,0x80202a8208801000L,0x4021006280a0288dL,0x30482184802d8414L,
+    0x5990240880010460L,0x191a28118330900L,0x310c413204c1104L,0x8625084811c4710dL,
+    0xa92a398e2188231aL,0x104e351c4a508ca4L,0x21208511c8341483L,0xe6290620946a1910L,
+    0xd47221423216a4a0L,0x28L
+  };
+  private final static long[] offsetIncrs3 = new long[] /*2 bits per value */ {
+    0x33300030c2000800L,0x32828088800c3cfL,0x5555550cace32320L,0x5555555555555555L,
+    0x5555555555555555L,0x5555L
+  };
+    
+  // 16 vectors; 30 states per vector; array length = 480
+  private final static long[] toStates4 = new long[] /*5 bits per value */ {
+    0x80300c0108801005L,0x88210802000L,0x44200401400000L,0x7ae3b88621185c07L,
+    0x101500042100404L,0x20803140501446cL,0x40100420006c2122L,0x490140511b004054L,
+    0x8401f2e3c086411L,0x120861200b100822L,0x641102400081180cL,0x4802c40100001088L,
+    0x8c21195607048418L,0x1421014245bc3f2L,0x23450230661200b1L,0x2108664118240803L,
+    0x8c1984802c802004L,0xbc3e28c41150d140L,0xc4120102209421dL,0x7884c11c4710d031L,
+    0x210842109031bc62L,0xd21484360c431044L,0x9c265293a3a6e741L,0x1cc710c41109ce70L,
+    0x1bce27a846525495L,0x3105425094a108c7L,0x6f735e95254731c4L,0x9ee7a9c234a9393aL,
+    0x144720d0520c4150L,0x211051bc646084c2L,0x3614831048220842L,0x93a460e742351488L,
+    0xc4120a2e70a24656L,0x284642d4941cc520L,0x4094a210c51bce46L,0xb525073148310502L,
+    0x24356939460f7358L,0x4098e7aaL
+  };
+  private final static long[] offsetIncrs4 = new long[] /*3 bits per value */ {
+    0xc0602000010000L,0xa000040000000001L,0x248204041248L,0xb0180c06c3618618L,
+    0x238d861860001861L,0x41040061c6e06041L,0x4004900c2402400L,0x409489001041001L,
+    0x4184184004148124L,0x1041b4980c24c3L,0xd26040938d061061L,0x2492492492494146L,
+    0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
+    0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
+    0x2492492492492492L,0x9249249249249249L,0x24924924L
+  };
+    
+  // 32 vectors; 30 states per vector; array length = 960
+  private final static long[] toStates5 = new long[] /*5 bits per value */ {
+    0x80300c0108801005L,0x88210802000L,0x42200401400000L,0xa088201000300c03L,
+    0x100510842108428L,0x2188461701c01108L,0x108401011eb8eeL,0x85c0700442004014L,
+    0x88267ae3b886211L,0x1446c01015108842L,0xc212202080314050L,0x405440100420006L,
+    0x10201c50140511b0L,0x942528423b08888L,0x240501446c010155L,0x21007cb8f0219045L,
+    0x511b004054402088L,0x2e3c086411490140L,0x200b50904428823fL,0x400081180c120861L,
+    0x100001088641102L,0x46030482184802c4L,0x9ce8990840980030L,0x21061200b709c210L,
+    0xf0fca308465581c1L,0x802c405084050916L,0xc211956070484184L,0x9e4209ee65bc3f28L,
+    0x3450230661200b70L,0x1086641182408032L,0xc1984802c8020042L,0x86098201c8d1408L,
+    0xb88a22529ce399L,0x1045434502306612L,0x4088250876f0f8a3L,0xd1408c1984802c80L,
+    0xee3dbc3e28c41150L,0xd0310c4188984429L,0xbc627884c11c4710L,0x1044210842109031L,
+    0x21704711c4340c43L,0xbdef7bdf0c7a18b4L,0x85210d8310c41ef7L,0x994a4e8e9b9d074L,
+    0x60c4310442739c27L,0x3a3a6e741d214843L,0x41ef77bdf77de529L,0x8465254951cc710cL,
+    0x94a108c71bce27aL,0x5254731c43105425L,0xdb1c7a38b4a15949L,0xc710c41cf73dce7bL,
+    0xe4e9bdcd7a54951cL,0x5427b9ea708d2a4L,0x735e95254731c431L,0xbd677db4a9393a6fL,
+    0x4720d0520c41cf75L,0x1051bc646084c214L,0x1483104822084221L,0x193821708511c834L,
+    0x1bf6fdef6f7f147aL,0xd08d45220d8520c4L,0x9c289195a4e91839L,0x488361483104828bL,
+    0xe5693a460e742351L,0x520c41bf71bdf717L,0xe46284642d4941ccL,0x5024094a210c51bcL,
+    0x590b525073148310L,0xce6f7b147a3938a1L,0x941cc520c41f77ddL,0xd5a4e5183dcd62d4L,
+    0x48310502639ea890L,0x460f7358b5250731L,0xf779bd6717b56939L
+  };
+  private final static long[] offsetIncrs5 = new long[] /*3 bits per value */ {
+    0xc0602000010000L,0x8000040000000001L,0xb6db6d4030180L,0x810104922800010L,
+    0x248a000040000092L,0x618000b649654041L,0x861b0180c06c3618L,0x301b0d861860001L,
+    0x61861800075d6ed6L,0x1871b8181048e3L,0xe56041238d861860L,0x40240041040075c6L,
+    0x4100104004900c2L,0x55b5240309009001L,0x1025224004104005L,0x10410010520490L,
+    0x55495240409489L,0x4980c24c34184184L,0x30d061061001041bL,0x184005556d260309L,
+    0x51b4981024e34184L,0x40938d0610610010L,0x492492495546d260L,0x2492492492492492L,
+    0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
+    0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
+    0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,
+    0x9249249249249249L,0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,
+    0x4924924924924924L,0x2492492492492492L,0x9249249249249249L,0x4924924924924924L,
+    0x2492492492492492L
+  };
+  
+  // state map
+  //   0 -> [(0, 0)]
+  //   1 -> [(0, 2)]
+  //   2 -> [(0, 1)]
+  //   3 -> [(0, 2), (1, 2)]
+  //   4 -> [(0, 1), (1, 1)]
+  //   5 -> [(0, 2), (2, 1)]
+  //   6 -> [(0, 1), (2, 2)]
+  //   7 -> [(0, 2), (1, 2), (2, 2)]
+  //   8 -> [(0, 1), (2, 1)]
+  //   9 -> [(0, 2), (2, 2)]
+  //   10 -> [(0, 1), (1, 1), (2, 1)]
+  //   11 -> [(0, 2), (1, 2), (2, 2), (3, 2)]
+  //   12 -> [(0, 2), (2, 1), (3, 1)]
+  //   13 -> [(0, 2), (3, 2)]
+  //   14 -> [(0, 2), (2, 2), (3, 2)]
+  //   15 -> [(0, 2), (1, 2), (3, 1)]
+  //   16 -> [(0, 2), (1, 2), (3, 2)]
+  //   17 -> [(0, 1), (2, 2), (3, 2)]
+  //   18 -> [(0, 2), (3, 1)]
+  //   19 -> [(0, 1), (3, 2)]
+  //   20 -> [(0, 1), (1, 1), (3, 2)]
+  //   21 -> [(0, 2), (2, 1), (4, 2)]
+  //   22 -> [(0, 2), (1, 2), (4, 2)]
+  //   23 -> [(0, 2), (1, 2), (3, 2), (4, 2)]
+  //   24 -> [(0, 2), (2, 2), (4, 2)]
+  //   25 -> [(0, 2), (2, 2), (3, 2), (4, 2)]
+  //   26 -> [(0, 2), (3, 2), (4, 2)]
+  //   27 -> [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)]
+  //   28 -> [(0, 2), (4, 2)]
+  //   29 -> [(0, 2), (1, 2), (2, 2), (4, 2)]
+  
+  
+  public Lev2ParametricDescription(int w) {
+    super(w, 2, new int[] {0,2,1,1,0,-1,0,0,-1,0,-1,-1,-2,-1,-1,-2,-1,-1,-2,-1,-1,-2,-2,-2,-2,-2,-2,-2,-2,-2});
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Lev2ParametricDescription.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java?rev=921820&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java Thu Mar 11 12:15:48 2010
@@ -0,0 +1,258 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/**
+ * Class to construct DFAs that match a word within some edit distance.
+ * <p>
+ * Implements the algorithm described in:
+ * Schulz and Mihov: Fast String Correction with Levenshtein Automata
+ * <p>
+ * @lucene.experimental
+ */
+public class LevenshteinAutomata {
+  /** @lucene.internal */
+  public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
+  /* input word */
+  final String input;
+  final char word[];
+  /* the automata alphabet. */
+  final char alphabet[];
+
+  /* the unicode ranges outside of alphabet */
+  final char rangeLower[];
+  final char rangeUpper[];
+  int numRanges = 0;
+  
+  ParametricDescription descriptions[]; 
+  
+  /**
+   * Create a new LevenshteinAutomata for some input String.
+   */
+  public LevenshteinAutomata(String input) {
+    this.input = input;
+    this.word = input.toCharArray();
+    
+    // calculate the alphabet
+    SortedSet<Character> set = new TreeSet<Character>();
+    for (int i = 0; i < word.length; i++)
+      set.add(word[i]);
+    alphabet = new char[set.size()];
+    Iterator<Character> iterator = set.iterator();
+    for (int i = 0; i < alphabet.length; i++)
+      alphabet[i] = iterator.next();
+      
+    rangeLower = new char[alphabet.length + 2];
+    rangeUpper = new char[alphabet.length + 2];
+    // calculate the unicode range intervals that exclude the alphabet
+    // these are the ranges for all unicode characters not in the alphabet
+    int lower = 0;
+    for (int i = 0; i < alphabet.length; i++) {
+      char higher = alphabet[i];
+      if (higher > lower) {
+        rangeLower[numRanges] = (char) lower;
+        rangeUpper[numRanges] = (char) (higher - 1);
+        numRanges++;
+      }
+      lower = higher + 1;
+    }
+    /* add the final endpoint */
+    if (lower <= 0xFFFF) {
+      rangeLower[numRanges] = (char) lower;
+      rangeUpper[numRanges] = '\uFFFF';
+      numRanges++;
+    }
+
+    descriptions = new ParametricDescription[] {
+        null, /* for n=0, we do not need to go through the trouble */
+        new Lev1ParametricDescription(input.length()),
+        new Lev2ParametricDescription(input.length()),
+    };
+  }
+  
+  /**
+   * Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
+   * <p>
+   * All automata have the following properties:
+   * <ul>
+   * <li>They are deterministic (DFA).
+   * <li>There are no transitions to dead states.
+   * <li>They are not minimal (some transitions could be combined).
+   * </ul>
+   * </p>
+   */
+  public Automaton toAutomaton(int n) {
+    if (n == 0)
+      return BasicAutomata.makeString(input);
+    
+    if (n >= descriptions.length)
+      return null;
+    
+    final int range = 2*n+1;
+    ParametricDescription description = descriptions[n];
+    // the number of states is based on the length of the word and n
+    State states[] = new State[description.size()];
+    // create all states, and mark as accept states if appropriate
+    for (int i = 0; i < states.length; i++) {
+      states[i] = new State();
+      states[i].setAccept(description.isAccept(i));
+    }
+    // create transitions from state to state
+    for (int k = 0; k < states.length; k++) {
+      final int xpos = description.getPosition(k);
+      if (xpos < 0)
+        continue;
+      final int end = xpos + Math.min(word.length - xpos, range);
+      
+      for (int x = 0; x < alphabet.length; x++) {
+        final char ch = alphabet[x];
+        // get the characteristic vector at this position wrt ch
+        final int cvec = getVector(ch, xpos, end);
+        int dest = description.transition(k, xpos, cvec);
+        if (dest >= 0)
+          states[k].addTransition(new Transition(ch, states[dest]));
+      }
+      // add transitions for all other chars in unicode
+      // by definition, their characteristic vectors are always 0,
+      // because they do not exist in the input string.
+      int dest = description.transition(k, xpos, 0); // by definition
+      if (dest >= 0)
+        for (int r = 0; r < numRanges; r++)
+          states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest]));      
+    }
+
+    Automaton a = new Automaton();
+    a.setInitialState(states[0]);
+    a.setDeterministic(true);
+    // we need not trim transitions to dead states, as they are not created.
+    // a.restoreInvariant();
+    return a;
+  }
+  
+  /**
+   * Get the characteristic vector <code>X(x, V)</code> 
+   * where V is <code>substring(pos, end)</code>
+   */
+  int getVector(char x, int pos, int end) {
+    int vector = 0;
+    for (int i = pos; i < end; i++) {
+      vector <<= 1;
+      if (word[i] == x)
+        vector |= 1;
+    }
+    return vector;
+  }
+    
+  /**
+   * A ParametricDescription describes the structure of a Levenshtein DFA for some degree n.
+   * <p>
+   * There are four components of a parametric description, all parameterized on the length
+   * of the word <code>w</code>:
+   * <ol>
+   * <li>The number of states: {@link #size()}
+   * <li>The set of final states: {@link #isAccept(int)}
+   * <li>The transition function: {@link #transition(int, int, int)}
+   * <li>Minimal boundary function: {@link #getPosition(int)}
+   * </ol>
+   */
+  static abstract class ParametricDescription {
+    protected final int w;
+    protected final int n;
+    private final int[] minErrors;
+    
+    ParametricDescription(int w, int n, int[] minErrors) {
+      this.w = w;
+      this.n = n;
+      this.minErrors = minErrors;
+    }
+    
+    /**
+     * Return the number of states needed to compute a Levenshtein DFA
+     */
+    int size() {
+      return minErrors.length * (w+1);
+    };
+
+    /**
+     * Returns true if the <code>state</code> in any Levenshtein DFA is an accept state (final state).
+     */
+    boolean isAccept(int absState) {
+      // decode absState -> state, offset
+      int state = absState/(w+1);
+      int offset = absState%(w+1);
+      assert offset >= 0;
+      return w - offset + minErrors[state] <= n;
+    }
+
+    /**
+     * Returns the position in the input word for a given <code>state</code>.
+     * This is the minimal boundary for the state.
+     */
+    int getPosition(int absState) {
+      return absState % (w+1);
+    }
+    
+    /**
+     * Returns the state number for a transition from the given <code>state</code>,
+     * assuming <code>position</code> and characteristic vector <code>vector</code>
+     */
+    abstract int transition(int state, int position, int vector);
+
+    private final static long[] MASKS = new long[] {0x1,0x3,0x7,0xf,
+                                                    0x1f,0x3f,0x7f,0xff,
+                                                    0x1ff,0x3ff,0x7ff,0xfff,
+                                                    0x1fff,0x3fff,0x7fff,0xffff,
+                                                    0x1ffff,0x3ffff,0x7ffff,0xfffff,
+                                                    0x1fffff,0x3fffff,0x7fffff,0xffffff,
+                                                    0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff,
+                                                    0x1fffffff,0x3fffffff,0x7fffffffL,0xffffffffL,
+                                                    0x1ffffffffL,0x3ffffffffL,0x7ffffffffL,0xfffffffffL,
+                                                    0x1fffffffffL,0x3fffffffffL,0x7fffffffffL,0xffffffffffL,
+                                                    0x1ffffffffffL,0x3ffffffffffL,0x7ffffffffffL,0xfffffffffffL,
+                                                    0x1fffffffffffL,0x3fffffffffffL,0x7fffffffffffL,0xffffffffffffL,
+                                                    0x1ffffffffffffL,0x3ffffffffffffL,0x7ffffffffffffL,0xfffffffffffffL,
+                                                    0x1fffffffffffffL,0x3fffffffffffffL,0x7fffffffffffffL,0xffffffffffffffL,
+                                                    0x1ffffffffffffffL,0x3ffffffffffffffL,0x7ffffffffffffffL,0xfffffffffffffffL,
+                                                    0x1fffffffffffffffL,0x3fffffffffffffffL,0x7fffffffffffffffL};
+  
+    protected int unpack(long[] data, int index, int bitsPerValue) {
+      final long bitLoc = bitsPerValue * index;
+      final int dataLoc = (int) (bitLoc >> 6);
+      final int bitStart = (int) (bitLoc & 63);
+      //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);
+      if (bitStart + bitsPerValue <= 64) {
+        // not split
+        return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);
+      } else {
+        // split
+        final int part = 64-bitStart;
+        return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +
+                      ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));
+      }
+    }
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py?rev=921820&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py Thu Mar 11 12:15:48 2010
@@ -0,0 +1,492 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Note, this file is known to work with rev 115 of the moman
+# repository (http://bitbucket.org/jpbarrette/moman/overview)
+#
+# See also: http://sites.google.com/site/rrettesite/moman
+
+import math
+import os
+import sys
+sys.path.insert(0, 'moman/finenight/python')
+try:
+  from possibleStates import genTransitions
+except ImportError:
+  from finenight.possibleStates import genTransitions
+
+MODE = 'array'
+PACKED = True
+WORD = 64
+LOG2_WORD = int(math.log(WORD)/math.log(2))
+#MODE = 'switch'
+
+class LineOutput:
+
+  def __init__(self, indent=''):
+    self.l = []
+    self._indent = self.startIndent = indent
+    self.inComment = False
+
+  def __call__(self, s, indent=0):
+    if s.find('}') != -1:
+      assert self._indent != self.startIndent
+      self._indent = self._indent[:-2]
+
+    if indent != 0:
+      indent0 = '  ' * (len(self._indent)/2+indent)
+    else:
+      indent0 = self._indent
+
+    if s.find('/*') != -1:
+      if s.find('*/') == -1:
+        self.inComment = True
+    elif s.find('*/') != -1:
+      self.inComment = True
+
+    if self.inComment:
+      self.l.append(indent0 + s)
+    else:
+      self.l.append(indent0 + s.lstrip())
+
+    self.inComment = self.inComment and s.find('*/') == -1
+
+    if s.find('{') != -1:
+      self._indent += '  '
+
+  def __str__(self):
+    if True:
+      assert self._indent == self.startIndent, 'indent %d vs start indent %d' % \
+             (len(self._indent), len(self.startIndent))
+    return '\n'.join(self.l)
+
+  def indent(self):
+    self._indent += '  '
+
+  def outdent(self):
+    assert self._indent != self.startIndent
+    self._indent = self._indent[:-2]
+    
+def charVarNumber(charVar):
+  """
+  Maps binary number (eg [1, 0, 1]) to its decimal value (5).
+  """
+
+  p = 1
+  sum = 0
+  downTo = len(charVar)-1
+  while downTo >= 0:
+    sum += p * int(charVar[downTo])
+    p *= 2
+    downTo -= 1
+  return sum
+
+def main():
+
+  if len(sys.argv) != 2:
+    print
+    print 'Usage: python -u %s N' % sys.argv[0]
+    print
+    print 'NOTE: the resulting .java file is created in the current working dir!'
+    print
+    sys.exit(1)
+
+  n = int(sys.argv[1])
+
+  tables = genTransitions(n)
+
+  stateMap = {}
+
+  # init null state
+  stateMap['[]'] = -1
+
+  # init start state
+  stateMap['[(0, 0)]'] = 0
+
+  w = LineOutput()
+
+  w('package org.apache.lucene.util.automaton;')
+  w('')
+  w('/**')
+  w(' * Licensed to the Apache Software Foundation (ASF) under one or more')
+  w(' * contributor license agreements.  See the NOTICE file distributed with')
+  w(' * this work for additional information regarding copyright ownership.')
+  w(' * The ASF licenses this file to You under the Apache License, Version 2.0')
+  w(' * (the "License"); you may not use this file except in compliance with')
+  w(' * the License.  You may obtain a copy of the License at')
+  w(' *')
+  w(' *     http://www.apache.org/licenses/LICENSE-2.0')
+  w(' *')
+  w(' * Unless required by applicable law or agreed to in writing, software')
+  w(' * distributed under the License is distributed on an "AS IS" BASIS,')
+  w(' * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.')
+  w(' * See the License for the specific language governing permissions and')
+  w(' * limitations under the License.')
+  w(' */')
+  w('')
+  w('// The following code was generated with the moman/finenight pkg')
+  w('// This package is available under the MIT License, see NOTICE.txt')
+  w('// for more details.')
+  w('')
+  w('import org.apache.lucene.util.automaton.LevenshteinAutomata.ParametricDescription;')
+  w('')
+  w('/** Parametric description for generating a Levenshtein automaton of degree %s */' % n)
+  className = 'Lev%dParametricDescription' % n
+
+  w('class %s extends ParametricDescription {' % className)
+
+  w('')
+  w('@Override')
+  w('int transition(int absState, int position, int vector) {')
+
+  w('  // null absState should never be passed in')
+  w('  assert absState != -1;')
+
+  w('')
+  w('  // decode absState -> state, offset')
+  w('  int state = absState/(w+1);')
+  w('  int offset = absState%(w+1);')
+  w('  assert offset >= 0;')
+  w('')  
+
+  machines = []
+  
+  for i, map in enumerate(tables):
+    if i == 0:
+      w('if (position == w) {')
+    elif i == len(tables)-1:
+      w('} else {')
+    else:
+      w('} else if (position == w-%d) {' % i)
+
+    if i != 0 and MODE == 'switch':
+      w('switch(vector) {')
+
+    l = map.items()
+    l.sort()
+
+    numCasesPerVector = None
+    numVectors = len(l)
+
+    if MODE == 'array':
+      toStateArray = []
+      toOffsetIncrArray = []
+
+    for charVar, states in l:
+
+      # somehow it's a string:
+      charVar = eval(charVar)
+
+      if i != 0 and MODE == 'switch':
+        w('case %s: // <%s>' % (charVarNumber(charVar), ','.join([str(x) for x in charVar])))
+        w.indent()
+        
+      l = states.items()
+
+      byFromState = {}
+
+      # first pass to assign states
+      byAction = {}
+      for s, (toS, offset) in l:
+        state = str(s)
+        if state == '[]':
+          # don't waste code on the null state
+          continue
+        
+        toState = str(toS)
+        if state not in stateMap:
+          stateMap[state] = len(stateMap)-1
+        if toState not in stateMap:
+          stateMap[toState] = len(stateMap)-1
+
+        byFromState[stateMap[state]] = (1+stateMap[toState], offset)
+
+        fromStateDesc = ', '.join([str(x) for x in eval(s)])
+        toStateDesc = ', '.join([str(x) for x in toS])   
+
+        tup = (stateMap[toState], toStateDesc, offset)
+        if tup not in byAction:
+          byAction[tup] = []
+        byAction[tup].append((fromStateDesc, stateMap[state]))
+
+      if numCasesPerVector is None:
+        numCasesPerVector = len(l)-1
+      else:
+        # we require this to be uniform... empirically it seems to be!
+        assert numCasesPerVector == len(l)-1
+
+      if MODE == 'array':
+
+        for s in range(numCasesPerVector):
+          toState, offsetIncr = byFromState[s]
+          toStateArray.append(toState)
+          toOffsetIncrArray.append(offsetIncr)
+
+      else:
+
+        # render switches
+        w('switch(state) {   // %s cases' % len(l))
+
+        for (toState, toStateDesc, offset), lx in byAction.items():
+          for fromStateDesc, fromState in lx:
+            w('case %s: // %s' % (fromState, fromStateDesc))
+          w.indent()
+          w('  state = %s; // %s' % (toState, toStateDesc))
+          if offset > 0:
+            w('  offset += %s;' % offset)
+          w('break;')
+          w.outdent()
+
+        w('}')
+        if i != 0:
+          w('break;')
+          w.outdent()
+
+    if MODE == 'array':
+      # strangely state can come in wildly out of bounds....
+      w('  if (state < %d) {' % numCasesPerVector)
+      w('    final int loc = vector * %d + state;' % numCasesPerVector)
+      if PACKED:
+        w('    offset += unpack(offsetIncrs%d, loc, NBITSOFFSET%d);' % (i, i))
+        w('    state = unpack(toStates%d, loc, NBITSSTATES%d)-1;' % (i, i))
+      else:
+        w('    offset += offsetIncrs%d[loc];' % i)
+        w('    state = toStates%d[loc]-1;' % i)
+      w('  }')
+    elif i != 0:
+      w('}')
+
+    machines.append((toStateArray, toOffsetIncrArray, numCasesPerVector, numVectors))
+
+  # ends switch statement for machine
+  w('}')
+
+  w('')
+
+  w('  if (state == -1) {')
+  w('    // null state')
+  w('    return -1;')
+  w('  } else {')
+  w('    // translate back to abs')
+  w('    return state*(w+1)+offset;')
+  w('  }')
+
+  # ends transition method
+  w('}')
+
+  subs = []
+  if MODE == 'array':
+    w.indent()
+    for i, (toStateArray, toOffsetIncrsArray, numCasesPerVector, numVectors) in enumerate(machines):
+      w('')
+      w.outdent()
+      w('// %d vectors; %d states per vector; array length = %d' % \
+        (numVectors, numCasesPerVector, numVectors*numCasesPerVector))
+      w.indent()
+      if PACKED:
+        # pack in python
+        l, nbits = pack(toStateArray)
+        subs.append(('NBITSSTATES%d' % i, str(nbits)))
+        w('  private final static long[] toStates%d = new long[] /*%d bits per value */ %s;' % \
+          (i, nbits, renderList([hex(long(x)) for x in l])))
+
+        l, nbits = pack(toOffsetIncrsArray)
+        subs.append(('NBITSOFFSET%d' % i, str(nbits)))
+        w('  private final static long[] offsetIncrs%d = new long[] /*%d bits per value */ %s;' % \
+          (i, nbits, renderList([hex(long(x)) for x in l])))
+      else:
+        w('  private final static int[] toStates%d = new int[] %s;' % \
+          (i, renderList([str(x) for x in toStateArray])))
+        w('  private final static int[] offsetIncrs%d = new int[] %s;' % \
+          (i, renderList([str(x) for x in toStateArray])))
+    w.outdent()
+  
+  stateMap2 = dict([[v,k] for k,v in stateMap.items()])
+  w('')
+  w('// state map')
+  sum = 0
+  minErrors = []
+  for i in xrange(len(stateMap2)-1):
+    w('//   %s -> %s' % (i, stateMap2[i]))
+    v = eval(stateMap2[i])
+    minError = min([-i+e for i, e in v])
+    c = len(v)
+    sum += c
+    minErrors.append(minError)
+  w('')
+
+  w.indent()
+  #w('private final static int[] minErrors = new int[] {%s};' % ','.join([str(x) for x in minErrors]))
+
+  w.outdent()
+
+  w('')
+  w('  public %s(int w) {' % className)
+  w('    super(w, %d, new int[] {%s});' % (n, ','.join([str(x) for x in minErrors])), indent=1)
+  w('  }')
+
+  if 0:
+    w('')
+    w('@Override')
+    w('public int size() { // this can now move up?')
+    w('  return %d*(w+1);' % (len(stateMap2)-1))
+    w('}')
+
+    w('')
+    w('@Override')
+    w('public int getPosition(int absState) { // this can now move up?')
+    w('  return absState % (w+1);')
+    w('}')
+
+    w('')
+    w('@Override')
+    w('public boolean isAccept(int absState) { // this can now move up?')
+    w('  // decode absState -> state, offset')
+    w('  int state = absState/(w+1);')
+    w('  if (true || state < minErrors.length) {')
+    w('    int offset = absState%(w+1);')
+    w('    assert offset >= 0;')
+    w('    return w - offset + minErrors[state] <= %d;' % n)
+    w('  } else {')
+    w('    return false;')
+    w('  }')
+    w('}')
+
+  if MODE == 'array' and PACKED:
+
+    # we moved into super class
+    if False:
+      w('')
+
+      v = 2
+      l = []
+      for i in range(63):
+        l.append(hex(v-1))
+        v *= 2
+
+      w('private final static long[] MASKS = new long[] {%s};' % ','.join(l), indent=1)
+      w('')
+
+      # unpack in java
+      w('private int unpack(long[] data, int index, int bitsPerValue) {')
+      w('  final long bitLoc = bitsPerValue * index;')
+      w('  final int dataLoc = (int) (bitLoc >> %d);' % LOG2_WORD)
+      w('  final int bitStart = (int) (bitLoc & %d);' % (WORD-1))
+      w('  //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue);')
+      w('  if (bitStart + bitsPerValue <= %d) {' % WORD)
+      w('    // not split')
+      w('    return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]);')
+      w('  } else {')
+      w('    // split')
+      w('    final int part = %d-bitStart;' % WORD)
+      w('    return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) +')
+      w('      ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part));', indent=1)
+      w('  }')
+      w('}')
+  
+  # class
+  w('}')
+  w('')
+
+  fileOut = '%s.java' % className
+
+  s = str(w)
+  for sub, repl in subs:
+    s = s.replace(sub, repl)
+
+  open(fileOut, 'wb').write(s)
+
+  print 'Wrote %s [%d lines; %.1f KB]' % \
+        (fileOut, len(w.l), os.path.getsize(fileOut)/1024.)
+
+def renderList(l):
+  lx = ['    ']
+  for i in xrange(len(l)):
+    if i > 0:
+      lx.append(',')
+      if i % 4 == 0:
+        lx.append('\n    ')
+    lx.append(l[i])
+  return '{\n%s\n  }' % ''.join(lx)
+
+MASKS = []
+v = 2
+for i in xrange(63):
+  MASKS.append(v-1)
+  v *= 2
+
+# packs into longs; returns long[], numBits
+def pack(l):
+  maxV = max(l)
+  bitsPerValue = max(1, int(math.ceil(math.log(maxV+1)/math.log(2.0))))
+
+  bitsLeft = WORD
+  pendingValue = 0
+
+  packed = []
+  for i in xrange(len(l)):
+    v = l[i]
+    if pendingValue > 0:
+      bitsUsed = math.ceil(math.log(pendingValue)/math.log(2.0))
+      assert bitsUsed <= (WORD-bitsLeft), 'bitsLeft=%s (%s-%s=%s) bitsUsed=%s' % (bitsLeft, WORD, bitsLeft, WORD-bitsLeft, bitsUsed)
+      
+    if bitsLeft >= bitsPerValue:
+      pendingValue += v << (WORD-bitsLeft)
+      bitsLeft -= bitsPerValue
+      if bitsLeft == 0:
+        packed.append(pendingValue)
+        bitsLeft = WORD
+        pendingValue = 0
+    else:
+      # split
+
+      # bottom bitsLeft go in current word:
+      pendingValue += (v & MASKS[bitsLeft-1]) << (WORD-bitsLeft)
+      packed.append(pendingValue)
+
+      pendingValue = v >> bitsLeft
+      bitsLeft = WORD - (bitsPerValue-bitsLeft)
+
+  if bitsLeft < WORD:
+    packed.append(pendingValue)
+
+  # verify(l, packed, bitsPerValue)
+  
+  return packed, bitsPerValue
+
+def verify(data, packedData, bitsPerValue):
+  for i in range(len(data)):
+    assert data[i] == unpack(packedData, i, bitsPerValue)
+
+def unpack(data, index, bitsPerValue):
+  bitLoc = bitsPerValue * index
+  dataLoc = int(bitLoc >> LOG2_WORD)
+  bitStart = int(bitLoc & (WORD-1))
+  if bitStart + bitsPerValue <= WORD:
+    # not split
+    return int(((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]))
+  else:
+    # split
+    part = WORD-bitStart;
+    return int((((data[dataLoc] >> bitStart) & MASKS[part-1]) +
+                ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part)))
+  
+if __name__ == '__main__':
+  if not __debug__:
+    print
+    print 'ERROR: please run without -O'
+    print
+    sys.exit(1)
+  main()

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/createLevAutomata.py
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java?rev=921820&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (added)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java Thu Mar 11 12:15:48 2010
@@ -0,0 +1,142 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** 
+ * Tests the results of fuzzy against pre-recorded output 
+ * The format of the file is the following:
+ * 
+ * Header Row: # of bits: generate 2^n sequential documents 
+ * with a value of Integer.toBinaryString
+ * 
+ * Entries: an entry is a param spec line, a resultCount line, and
+ * then 'resultCount' results lines. The results lines are in the
+ * expected order.
+ * 
+ * param spec line: a comma-separated list of params to FuzzyQuery
+ *   (query, prefixLen, pqSize, minScore)
+ * query = query text as a number (expand with Integer.toBinaryString)
+ * prefixLen = prefix length
+ * pqSize = priority queue maximum size for TopTermsBoostOnlyBooleanQueryRewrite
+ * minScore = minimum similarity
+ * 
+ * resultCount line: total number of expected hits.
+ * 
+ * results line: comma-separated docID, score pair
+ **/
+public class TestFuzzyQuery2 extends LuceneTestCase {
+  /** epsilon for score comparisons */
+  static final float epsilon = 0.00001f;
+
+  public void testFromTestData() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt");
+    BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+    
+    int bits = Integer.parseInt(reader.readLine());
+    int terms = (int) Math.pow(2, bits);
+    
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+        IndexWriter.MaxFieldLength.UNLIMITED);
+    
+    Document doc = new Document();
+    Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(field);
+    
+    for (int i = 0; i < terms; i++) {
+      field.setValue(Integer.toBinaryString(i));
+      writer.addDocument(doc);
+    }
+    
+    writer.optimize();
+    writer.close();   
+    
+    IndexSearcher searcher = new IndexSearcher(dir);
+    String line;
+    while ((line = reader.readLine()) != null) {
+      String params[] = line.split(",");
+      String query = Integer.toBinaryString(Integer.parseInt(params[0]));
+      int prefix = Integer.parseInt(params[1]);
+      int pqSize = Integer.parseInt(params[2]);
+      float minScore = Float.parseFloat(params[3]);
+      FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix);
+      q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
+      int expectedResults = Integer.parseInt(reader.readLine());
+      TopDocs docs = searcher.search(q, expectedResults);
+      assertEquals(expectedResults, docs.totalHits);
+      for (int i = 0; i < expectedResults; i++) {
+        String scoreDoc[] = reader.readLine().split(",");
+        assertEquals(Integer.parseInt(scoreDoc[0]), docs.scoreDocs[i].doc);
+        assertEquals(Float.parseFloat(scoreDoc[1]), docs.scoreDocs[i].score, epsilon);
+      }
+    }
+    searcher.close();
+    dir.close();
+  }
+  
+  /* Code to generate test data
+  public static void main(String args[]) throws Exception {
+    int bits = 3;
+    System.out.println(bits);
+    int terms = (int) Math.pow(2, bits);
+    
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+        IndexWriter.MaxFieldLength.UNLIMITED);
+    
+    Document doc = new Document();
+    Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(field);
+
+    for (int i = 0; i < terms; i++) {
+      field.setValue(Integer.toBinaryString(i));
+      writer.addDocument(doc);
+    }
+    
+    writer.optimize();
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(dir);
+    for (int prefix = 0; prefix < bits; prefix++)
+      for (int pqsize = 1; pqsize <= terms; pqsize++)
+        for (float minscore = 0.1F; minscore < 1F; minscore += 0.2F)
+          for (int query = 0; query < terms; query++) {
+            FuzzyQuery q = new FuzzyQuery(
+                new Term("field", Integer.toBinaryString(query)), minscore, prefix);
+            q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqsize));
+            System.out.println(query + "," + prefix + "," + pqsize + "," + minscore);
+            TopDocs docs = searcher.search(q, terms);
+            System.out.println(docs.totalHits);
+            for (int i = 0; i < docs.totalHits; i++)
+              System.out.println(docs.scoreDocs[i].doc + "," + docs.scoreDocs[i].score);
+          }
+  }
+  */
+}

Propchange: lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message