lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r888891 [1/3] - in /lucene/java/branches/flex_1458: ./ src/java/org/apache/lucene/search/ src/java/org/apache/lucene/util/ src/java/org/apache/lucene/util/automaton/ src/test/org/apache/lucene/search/
Date Wed, 09 Dec 2009 17:45:59 GMT
Author: rmuir
Date: Wed Dec  9 17:45:58 2009
New Revision: 888891

URL: http://svn.apache.org/viewvc?rev=888891&view=rev
Log:
LUCENE-1606: Add Automaton for NFA/DFA queries. Implement Wildcard and Regexp with it

Added:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Transition.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/TransitionComparator.java   (with props)
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/package.html   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQuery.java   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpQuery.java   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestRegexpRandom.java   (with props)
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestWildcardRandom.java   (with props)
Modified:
    lucene/java/branches/flex_1458/LICENSE.txt
    lucene/java/branches/flex_1458/NOTICE.txt
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardQuery.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermEnum.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/search/TestWildcard.java

Modified: lucene/java/branches/flex_1458/LICENSE.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/LICENSE.txt?rev=888891&r1=888890&r2=888891&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/LICENSE.txt (original)
+++ lucene/java/branches/flex_1458/LICENSE.txt Wed Dec  9 17:45:58 2009
@@ -237,4 +237,34 @@
 
   http://www.python.org/download/releases/2.4.2/license/
 
+Some code in src/java/org/apache/lucene/util/automaton was
+derived from Brics automaton sources available at
+www.brics.dk/automaton/. Here is the copyright from those sources:
 
+/*
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 

Modified: lucene/java/branches/flex_1458/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/NOTICE.txt?rev=888891&r1=888890&r2=888891&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/NOTICE.txt (original)
+++ lucene/java/branches/flex_1458/NOTICE.txt Wed Dec  9 17:45:58 2009
@@ -33,3 +33,6 @@
 ICU4J, (under contrib/collation) is licensed under an MIT styles license
 (contrib/collation/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008 
 International Business Machines Corporation and others
+
+Brics Automaton (under src/java/org/apache/lucene/util/automaton) is 
+BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,151 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.SpecialOperations;
+
+/**
+ * A {@link Query} that will match terms against a finite-state machine.
+ * <p>
+ * This query will match documents that contain terms accepted by a given
+ * finite-state machine. The automaton can be constructed with the
+ * {@link org.apache.lucene.util.automaton} API. Alternatively, it can be
+ * created from a regular expression with {@link RegexpQuery} or from
+ * the standard Lucene wildcard syntax with {@link WildcardQuery}.
+ * </p>
+ * <p>
+ * When the query is executed, it will create an equivalent minimal DFA of the
+ * finite-state machine, and will enumerate the term dictionary in an
+ * intelligent way to reduce the number of comparisons. For example: the regular
+ * expression of <code>[dl]og?</code> will make approximately four comparisons:
+ * do, dog, lo, and log.
+ * </p>
+ */
+public class AutomatonQuery extends MultiTermQuery {
+  /** the automaton to match index terms against */
+  protected Automaton automaton;
+  /** term containing the field, and possibly some pattern structure */
+  protected Term term;
+
+  /**
+   * Create a new AutomatonQuery from an {@link Automaton}.
+   * 
+   * @param term Term containing field and possibly some pattern structure. The
+   *        term text is ignored.
+   * @param automaton Automaton to run, terms that are accepted are considered a
+   *        match.
+   */
+  public AutomatonQuery(Term term, Automaton automaton) {
+    super(term.field());
+    this.term = term;
+    this.automaton = automaton;
+    MinimizationOperations.minimize(automaton);
+  }
+
+  @Override
+  protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
+    // matches nothing
+    if (BasicOperations.isEmpty(automaton)) {
+      return new EmptyTermsEnum();
+    }
+    
+    // matches all possible strings
+    if (BasicOperations.isTotal(automaton)) {
+      final Terms terms = reader.fields().terms(getField());
+      return (terms != null) ? terms.iterator() : new EmptyTermsEnum();
+    }
+    
+    // matches a fixed string in singleton representation
+    String singleton = automaton.getSingleton();
+    if (singleton != null)
+      return new SingleTermsEnum(reader, term.createTerm(singleton));
+  
+    // matches a fixed string in expanded representation
+    String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+    if (automaton.equals(BasicAutomata.makeString(commonPrefix))) {
+      return new SingleTermsEnum(reader, term.createTerm(commonPrefix));
+    }
+    
+    // matches a constant prefix
+    Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
+        .makeString(commonPrefix), BasicAutomata.makeAnyString());
+    if (automaton.equals(prefixAutomaton)) {
+      return new PrefixTermsEnum(reader, term.createTerm(commonPrefix));
+    }
+    
+    return new AutomatonTermsEnum(automaton, term, reader);
+  }
+  
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = super.hashCode();
+    result = prime * result + ((automaton == null) ? 0 : automaton.hashCode());
+    result = prime * result + ((term == null) ? 0 : term.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (!super.equals(obj))
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    AutomatonQuery other = (AutomatonQuery) obj;
+    if (automaton == null) {
+      if (other.automaton != null)
+        return false;
+    } else if (!automaton.equals(other.automaton))
+      return false;
+    if (term == null) {
+      if (other.term != null)
+        return false;
+    } else if (!term.equals(other.term))
+      return false;
+    return true;
+  }
+
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+    if (!term.field().equals(field)) {
+      buffer.append(term.field());
+      buffer.append(":");
+    }
+    buffer.append(getClass().getSimpleName());
+    buffer.append(" {");
+    buffer.append('\n');
+    buffer.append(automaton.toString());
+    buffer.append("}");
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonQuery.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,388 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/**
+ * A FilteredTermsEnum that enumerates terms based upon what is accepted by a
+ * DFA.
+ * <p>
+ * The algorithm is such:
+ * <ol>
+ *   <li>As long as matches are successful, keep reading sequentially.
+ *   <li>When a match fails, skip to the next string in lexicographic order that
+ * does not enter a reject state.
+ * </ol>
+ * <p>
+ * The algorithm does not attempt to actually skip to the next string that is
+ * completely accepted. This is not possible when the language accepted by the
+ * FSM is not finite (i.e. * operator).
+ * </p>
+ * <p>
+ * If the DFA has a leading kleene star, or something similar, it will
+ * need to run against the entire term dictionary. In this case its much
+ * better to do just that than to use smart enumeration.
+ * This heuristic looks for an initial loop, with a range of at least 1/3
+ * of the unicode BMP.
+ * Use {@link #usesLinearMode} to find out if it enumerates all terms
+ * in linear mode without seeking.
+ * </p>
+ * <p>
+ * <font color="#FF0000"> WARNING: The status of the <b>Automaton</b> feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.</font>
+ * </p>
+ */
+public class AutomatonTermsEnum extends FilteredTermsEnum {
+  // the object-oriented form of the DFA
+  private final Automaton automaton;
+  // a tableized array-based form of the DFA
+  private final RunAutomaton runAutomaton;
+  // true if this enum will not seek around
+  private final boolean linearMode;
+  // common suffix of the automaton
+  private final TermRef commonSuffixRef;
+  // true if the automaton accepts a finite language
+  private final boolean finite;
+  // array of sorted transitions for each state, indexed by state number
+  private final Transition[][] allTransitions;
+  // for path tracking: each bit is a numbered state
+  private final BitSet visited;
+  // used for unicode conversion from TermRef byte[] to char[]
+  private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+  // used for unicode conversion from char[] to TermRef byte[]
+  private final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+  // the reference used for seeking forwards through the term dictionary
+  private final TermRef seekTermRef = new TermRef();
+  
+  // this accept stati will be returned by accept() dependent on internal mode
+  private final AcceptStatus NO_MATCH, YES_MATCH;
+  
+  /**
+   * Construct an enumerator based upon an automaton, enumerating the specified
+   * field, working on a supplied reader.
+   * <p>
+   * The parameter linearMode determines whether or not it will use smart enumeration.
+   */
+  AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader, boolean linearMode)
+      throws IOException {
+    super(reader, queryTerm.field());
+    this.automaton = automaton;
+    this.linearMode = linearMode;
+    
+    /* 
+     * tableize the automaton. this also ensures it is deterministic, and has no 
+     * transitions to dead states. it also invokes Automaton.setStateNumbers to
+     * number the original states (this is how they are tableized)
+     */
+    runAutomaton = new RunAutomaton(this.automaton);
+
+    if (this.linearMode) {
+      // iterate all terms in linear mode
+      this.finite = false;
+      allTransitions = null;
+      visited = null;
+      commonSuffixRef = new TermRef(getValidUTF16Suffix(SpecialOperations
+          .getCommonSuffix(automaton)));
+      NO_MATCH = AcceptStatus.NO;
+      YES_MATCH = AcceptStatus.YES;
+    } else {
+      // if the automaton is finite, we will never read sequentially, but always seek.
+      this.finite = SpecialOperations.isFinite(this.automaton);
+      // in nonlinear mode, the common suffix isn't that helpful.
+      // we will seek each time anyway (and take the unicode conversion hit).
+      // its also currently expensive to calculate, because getCommonSuffix is 
+      // a bit expensive.
+      commonSuffixRef = new TermRef("");
+      // build a cache of sorted transitions for every state
+      allTransitions = new Transition[runAutomaton.getSize()][];
+      for (State state : this.automaton.getStates())
+        allTransitions[state.getNumber()] = state.getSortedTransitionArray(false);
+      // used for path tracking, where each bit is a numbered state.
+      visited = new BitSet(runAutomaton.getSize());
+      NO_MATCH = AcceptStatus.NO_AND_SEEK;
+      YES_MATCH = finite ? AcceptStatus.YES_AND_SEEK : AcceptStatus.YES;
+    }
+  }
+  
+  /**
+   * Construct an enumerator based upon an automaton, enumerating the specified
+   * field, working on a supplied reader.
+   * <p>
+   * It will automagically determine whether or not to enumerate the term dictionary
+   * in a smart way, or to just do a linear scan depending upon a heuristic.
+   */
+  public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
+      throws IOException {
+    this(automaton, queryTerm, reader, AutomatonTermsEnum.isSlow(automaton));
+  }
+  
+  /**
+   * Heuristic to detect if an automaton will be so slow,
+   * that it is better to do a linear enumeration.
+   * <p>
+   * A very slow automaton will simply cause a lot of wasted disk seeks.
+   * Instead in that case it is actually faster to do a linear enumeration.
+   * <p>
+   * @param automaton automaton
+   * @return true if it will result in bad search performance
+   */
+  private static boolean isSlow(Automaton automaton) {
+    /*
+     * If the DFA has a leading kleene star, or something similar, it will
+     * need to run against the entire term dictionary. In this case its much
+     * better to do just that than to use smart enumeration.
+     * 
+     * this heuristic looks for an initial loop, with a range of at least 1/3
+     * of the unicode BMP.
+     */
+    State initialState = automaton.getInitialState();
+    boolean linearMode = false;
+    for (Transition transition : initialState.getTransitions()) {
+      if (transition.getDest() == initialState && 
+          (transition.getMax() - transition.getMin()) > (Character.MAX_VALUE / 3)) {
+        linearMode = true;
+        break;
+      }
+    }
+    return linearMode;
+  }
+  
+  /**
+   * Returns {@code true} if the enum is in linear mode, {@code false} in smart mode.
+   */
+  public final boolean usesLinearMode() {
+    return linearMode;
+  }
+ 
+  /**
+   * Returns true if the term matches the automaton. Also stashes away the term
+   * to assist with smart enumeration.
+   * <p>In linear mode, it also sets {@link #endEnum} if the enumeration is exhausted.
+   * In smart mode, it will never do this.   
+   */
+  @Override
+  protected AcceptStatus accept(final TermRef term) {
+    if (term.endsWith(commonSuffixRef)) {
+      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+      return runAutomaton.run(utf16.result, 0, utf16.length) ? YES_MATCH : NO_MATCH;
+    } else {
+      return NO_MATCH;
+    }
+  }
+  
+  @Override
+  protected TermRef nextSeekTerm(final TermRef term) throws IOException {
+    if (term == null) {
+      // return the first seek term
+      if (linearMode) {
+        seekTermRef.copy("");
+      } else {
+        utf16.copyText("");
+        if (!nextString())
+          return null;
+        UnicodeUtil.nextValidUTF16String(utf16);
+        UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, utf8);
+        seekTermRef.bytes = utf8.result;
+        seekTermRef.offset = 0;
+        seekTermRef.length = utf8.length;
+      }
+      return seekTermRef;
+    } else if (!linearMode) {
+      // seek to the next possible string
+      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+      if (nextString()) {
+        // reposition
+        UnicodeUtil.nextValidUTF16String(utf16);
+        UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, utf8);
+        seekTermRef.bytes = utf8.result;
+        seekTermRef.offset = 0;
+        seekTermRef.length = utf8.length;
+        return seekTermRef;
+      }
+    }
+    // no more possible strings can match
+    return null;
+  }
+
+  /**
+   * Increments the utf16 buffer to the next String in lexicographic order after s that will not put
+   * the machine into a reject state. If such a string does not exist, returns
+   * false.
+   * 
+   * The correctness of this method depends upon the automaton being deterministic,
+   * and having no transitions to dead states.
+   * 
+   * @return true if more possible solutions exist for the DFA
+   */
+  private boolean nextString() {
+    int state;
+    int pos = 0;
+
+    while (true) {
+      state = runAutomaton.getInitialState();
+      // walk the automaton until a character is rejected.
+      for (pos = 0; pos < utf16.length; pos++) {
+        int nextState = runAutomaton.step(state, utf16.result[pos]);
+        if (nextState == -1)
+          break;
+        else
+          state = nextState;
+      }
+
+      // take the useful portion, and the last non-reject state, and attempt to
+      // append characters that will match.
+      if (nextString(state, pos)) {
+        return true;
+      } else { /* no more solutions exist from this useful portion, backtrack */
+        if (!backtrack(pos)) /* no more solutions at all */
+          return false;
+        else if (runAutomaton.run(utf16.result, 0, utf16.length)) 
+          /* String is good to go as-is */
+          return true;
+        /* else advance further */
+      }
+    }
+  }
+  
+  /**
+   * Returns the next String in lexicographic order that will not put
+   * the machine into a reject state. 
+   * 
+   * This method traverses the DFA from the given position in the String,
+   * starting at the given state.
+   * 
+   * If this cannot satisfy the machine, returns false. This method will
+   * walk the minimal path, in lexicographic order, as long as possible.
+   * 
+   * If this method returns false, then there might still be more solutions,
+   * it is necessary to backtrack to find out.
+   * 
+   * @param state current non-reject state
+   * @param position useful portion of the string
+   * @return true if more possible solutions exist for the DFA from this
+   *         position
+   */
+  private boolean nextString(int state, int position) {
+    /* 
+     * the next lexicographic character must be greater than the existing
+     * character, if it exists.
+     */
+    char c = 0;
+    if (position < utf16.length) {
+      c = utf16.result[position];
+      // if the next character is U+FFFF and is not part of the useful portion,
+      // then by definition it puts us in a reject state, and therefore this
+      // path is dead. there cannot be any higher transitions. backtrack.
+      if (c == '\uFFFF')
+        return false;
+      else
+        c++;
+    }
+
+    utf16.setLength(position);
+    visited.clear();
+    visited.set(state);
+
+    Transition transitions[] = allTransitions[state];
+
+    // find the minimal path (lexicographic order) that is >= c
+    
+    for (int i = 0; i < transitions.length; i++) {
+      Transition transition = transitions[i];
+      if (transition.getMax() >= c) {
+        char nextChar = (char) Math.max(c, transition.getMin());
+        // append either the next sequential char, or the minimum transition
+        utf16.setLength(utf16.length + 1);
+        utf16.result[utf16.length - 1] = nextChar;
+        state = transition.getDest().getNumber();
+        /* 
+         * as long as is possible, continue down the minimal path in
+         * lexicographic order. if a loop or accept state is encountered, stop.
+         */
+        while (!visited.get(state) && !runAutomaton.isAccept(state)) {
+          visited.set(state);
+          /* 
+           * Note: we work with a DFA with no transitions to dead states.
+           * so the below is ok, if it is not an accept state,
+           * then there MUST be at least one transition.
+           */
+          transition = allTransitions[state][0];
+          // append the minimum transition
+          utf16.setLength(utf16.length + 1);
+          utf16.result[utf16.length - 1] = transition.getMin();
+          state = transition.getDest().getNumber();
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  /**
+   * Attempts to backtrack thru the string after encountering a dead end
+   * at some given position. Returns false if no more possible strings 
+   * can match.
+   * 
+   * @param position current position in the input String
+   * @return true if more possible solutions exist for the DFA
+   */
+  private boolean backtrack(int position) {
+    while (position > 0) {
+      char nextChar = utf16.result[position - 1];
+      // if a character is U+FFFF its a dead-end too,
+      // because there is no higher character in UTF-16 sort order.
+      if (nextChar != '\uFFFF') {
+        nextChar++;
+        utf16.result[position - 1] = nextChar;
+        utf16.setLength(position);
+        return true;
+      }
+      position--;
+    }
+    return false; /* all solutions exhausted */
+  }
+  
+  /**
+   * if the suffix starts with a low surrogate, remove it.
+   * This won't be quite as efficient, but can be converted to valid UTF-8
+   * 
+   * This isn't nearly as complex as cleanupPosition, because its not 
+   * going to use this suffix to walk any path thru the terms.
+   * 
+   */
+  private String getValidUTF16Suffix(String suffix) {
+    if (suffix != null && suffix.length() > 0 &&
+        Character.isLowSurrogate(suffix.charAt(0)))
+      return suffix.substring(1);
+    else
+      return suffix;
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,105 @@
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonProvider;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A fast regular expression query based on the
+ * {@link org.apache.lucene.util.automaton} package.
+ * <ul>
+ * <li>Comparisons are <a
+ * href="http://tusker.org/regex/regex_benchmark.html">fast</a>
+ * <li>The term dictionary is enumerated in an intelligent way, to avoid
+ * comparisons. See {@link AutomatonQuery} for more details.
+ * </ul>
+ * <p>
+ * The supported syntax is documented in the {@link RegExp} class.
+ * Note this might be different than other regular expression implementations.
+ * For some alternatives with different syntax, look under contrib/regex
+ * </p>
+ * <p>
+ * Note this query can be slow, as it needs to iterate over many terms. In order
+ * to prevent extremely slow RegexpQueries, a Regexp term should not start with
+ * the expression <code>.*</code>
+ * 
+ * @see RegExp
+ */
+public class RegexpQuery extends AutomatonQuery {
+  /**
+   * A provider that provides no named automata
+   */
+  private static AutomatonProvider defaultProvider = new AutomatonProvider() {
+    public Automaton getAutomaton(String name) throws IOException {
+      return null;
+    }
+  };
+  
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * <p>
+   * By default, all regular expression features are enabled.
+   * </p>
+   * 
+   * @param term regular expression.
+   */
+  public RegexpQuery(Term term) {
+    this(term, RegExp.ALL);
+  }
+  
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * 
+   * @param term regular expression.
+   * @param flags optional RegExp features from {@link RegExp}
+   */
+  public RegexpQuery(Term term, int flags) {
+    this(term, flags, defaultProvider);
+  }
+  
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   * 
+   * @param term regular expression.
+   * @param flags optional RegExp features from {@link RegExp}
+   * @param provider custom AutomatonProvider for named automata
+   */
+  public RegexpQuery(Term term, int flags, AutomatonProvider provider) {
+    super(term, new RegExp(term.text(), flags).toAutomaton(provider));
+  }
+  
+  /** Prints a user-readable version of this query. */
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+    if (!term.field().equals(field)) {
+      buffer.append(term.field());
+      buffer.append(":");
+    }
+    buffer.append(term.text());
+    buffer.append(ToStringUtils.boost(getBoost()));
+    return buffer.toString();
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/RegexpQuery.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardQuery.java?rev=888891&r1=888890&r2=888891&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardQuery.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardQuery.java Wed Dec  9 17:45:58 2009
@@ -19,68 +19,69 @@
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
 /** Implements the wildcard search query. Supported wildcards are <code>*</code>, which
  * matches any character sequence (including the empty one), and <code>?</code>,
  * which matches any single character. Note this query can be slow, as it
  * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
- * a Wildcard term should not start with one of the wildcards <code>*</code> or
- * <code>?</code>.
+ * a Wildcard term should not start with the wildcard <code>*</code>
  * 
  * <p>This query uses the {@link
  * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}
  * rewrite method.
  *
- * @see WildcardTermEnums */
-public class WildcardQuery extends MultiTermQuery {
-  private boolean termContainsWildcard;
-  private boolean termIsPrefix;
-  protected Term term;
-    
+ * @see AutomatonQuery
+ */
+public class WildcardQuery extends AutomatonQuery {
+  /** String equality with support for wildcards */
+  public static final char WILDCARD_STRING = '*';
+
+  /** Char equality with support for wildcards */
+  public static final char WILDCARD_CHAR = '?';
+
+  /**
+   * Constructs a query for terms matching <code>term</code>. 
+   */
   public WildcardQuery(Term term) {
-    super(term.field());
-    this.term = term;
-    String text = term.text();
-    this.termContainsWildcard = (text.indexOf('*') != -1)
-        || (text.indexOf('?') != -1);
-    this.termIsPrefix = termContainsWildcard 
-        && (text.indexOf('?') == -1) 
-        && (text.indexOf('*') == text.length() - 1);
+    super(term, toAutomaton(term));
   }
   
-  @Override
-  protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
-    if (termIsPrefix) {
-      final String text = getTerm().text();
-      final Term t = getTerm().createTerm(text.substring(0,text.length()-1));
-      if (t.text().length() == 0) {
-        final Terms terms = reader.fields().terms(getField());
-        return (terms != null) ? terms.iterator() : new EmptyTermsEnum();
+  /**
+   * Convert Lucene wildcard syntax into an automaton.
+   */
+  static Automaton toAutomaton(Term wildcardquery) {
+    List<Automaton> automata = new ArrayList<Automaton>();
+    
+    String wildcardText = wildcardquery.text();
+    
+    for (int i = 0; i < wildcardText.length(); i++) {
+      final char c = wildcardText.charAt(i);
+      switch(c) {
+        case WILDCARD_STRING: 
+          automata.add(BasicAutomata.makeAnyString());
+          break;
+        case WILDCARD_CHAR:
+          automata.add(BasicAutomata.makeAnyChar());
+          break;
+        default:
+          automata.add(BasicAutomata.makeChar(c));
       }
-      return new PrefixTermsEnum(reader, t);
     }
-    if (termContainsWildcard)
-      return new WildcardTermsEnum(reader, getTerm());
-    else
-      return new SingleTermsEnum(reader, getTerm());
+    
+    return BasicOperations.concatenate(automata);
   }
   
   @Override @Deprecated
   protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-    if (termIsPrefix) {
-      final String text = getTerm().text();
-      final Term t = getTerm().createTerm(text.substring(0,text.length()-1));
-      return new PrefixTermEnum(reader, t);
-    }
-    if (termContainsWildcard)
-      return new WildcardTermEnum(reader, getTerm());
-    else
-      return new SingleTermEnum(reader, getTerm());
+    return new WildcardTermEnum(reader, term);
   }
   
   /**
@@ -89,7 +90,7 @@
   public Term getTerm() {
     return term;
   }
-
+  
   /** Prints a user-readable version of this query. */
   @Override
   public String toString(String field) {
@@ -102,30 +103,4 @@
     buffer.append(ToStringUtils.boost(getBoost()));
     return buffer.toString();
   }
-
-  @Override
-  public int hashCode() {
-    final int prime = 31;
-    int result = super.hashCode();
-    result = prime * result + ((term == null) ? 0 : term.hashCode());
-    return result;
-  }
-
-  @Override
-  public boolean equals(Object obj) {
-    if (this == obj)
-      return true;
-    if (!super.equals(obj))
-      return false;
-    if (getClass() != obj.getClass())
-      return false;
-    WildcardQuery other = (WildcardQuery) obj;
-    if (term == null) {
-      if (other.term != null)
-        return false;
-    } else if (!term.equals(other.term))
-      return false;
-    return true;
-  }
-
 }

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermEnum.java?rev=888891&r1=888890&r2=888891&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermEnum.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/search/WildcardTermEnum.java Wed Dec  9 17:45:58 2009
@@ -28,7 +28,7 @@
  * <p>
  * Term enumerations are always ordered by Term.compareTo().  Each term in
  * the enumeration is greater than all that precede it.
- * @deprecated Please use {@link WildcardTermsEnum} instead.
+ * @deprecated Please use {@link AutomatonTermsEnum} instead.
  */
 @Deprecated
 public class WildcardTermEnum extends FilteredTermEnum {
@@ -93,8 +93,8 @@
    * String equality with support for wildcards
    ********************************************/
 
-  public static final char WILDCARD_STRING = '*';
-  public static final char WILDCARD_CHAR = '?';
+  public static final char WILDCARD_STRING = WildcardQuery.WILDCARD_STRING;
+  public static final char WILDCARD_CHAR = WildcardQuery.WILDCARD_CHAR;
 
   /**
    * Determines if a word matches a wildcard pattern.

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=888891&r1=888890&r2=888891&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/UnicodeUtil.java Wed Dec  9 17:45:58 2009
@@ -374,36 +374,53 @@
    * @return next valid UTF-16 String in UTF-16 order
    */
   public static String nextValidUTF16String(String s) {
-    final int size = s.length();
+    if (validUTF16String(s))
+        return s;
+    else {
+      UTF16Result chars = new UTF16Result();
+      chars.copyText(s);
+      nextValidUTF16String(chars);
+      return new String(chars.result, 0, chars.length);
+    }
+  }
+  
+  public static void nextValidUTF16String(UTF16Result s) {
+    final int size = s.length;
     for (int i = 0; i < size; i++) {
-      char ch = s.charAt(i);
+      char ch = s.result[i];
       if (ch >= UnicodeUtil.UNI_SUR_HIGH_START
           && ch <= UnicodeUtil.UNI_SUR_HIGH_END) {
         if (i < size - 1) {
           i++;
-          char nextCH = s.charAt(i);
+          char nextCH = s.result[i];
           if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START
               && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) {
             // Valid surrogate pair
           } else
           // Unmatched high surrogate
-            if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) // SMP not enumerated 
-              return s.substring(0, i) + 
-                (char) UnicodeUtil.UNI_SUR_LOW_START;
-            else // SMP already enumerated
-              return s.substring(0, i - 1) + 
-                (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
-        } else
+            if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated
+              s.setLength(i + 1);
+              s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;             
+              return;
+            } else { // SMP already enumerated
+              s.setLength(i);
+              s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);              
+              return;
+            }
+        } else {
         // Unmatched high surrogate in final position, SMP not yet enumerated
-        return s + (char) UnicodeUtil.UNI_SUR_LOW_START;
+          s.setLength(i + 2);
+          s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START;
+          return;
+        }
       } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START
-          && ch <= UnicodeUtil.UNI_SUR_LOW_END)
+          && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
       // Unmatched low surrogate, SMP already enumerated
-      return s.substring(0, i) + 
-        (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+        s.setLength(i + 1);
+        s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
+        return;
+      }
     }
-    
-    return s;
   }
   
   // Only called from assert
@@ -460,7 +477,7 @@
       return false;
     }
   }
-
+  */
   public static final boolean validUTF16String(String s) {
     final int size = s.length();
     for(int i=0;i<size;i++) {
@@ -505,5 +522,4 @@
 
     return true;
   }
-  */
 }

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,748 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with regular expression operations.
+ * <p>
+ * Class invariants:
+ * <ul>
+ * <li>An automaton is either represented explicitly (with {@link State} and
+ * {@link Transition} objects) or with a singleton string (see
+ * {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton
+ * is known to accept exactly one string. (Implicitly, all states and
+ * transitions of an automaton are reachable from its initial state.)
+ * <li>Automata are always reduced (see {@link #reduce()}) and have no
+ * transitions to dead states (see {@link #removeDeadTransitions()}).
+ * <li>If an automaton is nondeterministic, then {@link #isDeterministic()}
+ * returns false (but the converse is not required).
+ * <li>Automata provided as input to operations are generally assumed to be
+ * disjoint.
+ * </ul>
+ * <p>
+ * If the states or transitions are manipulated manually, the
+ * {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods
+ * should be used afterwards to restore representation invariants that are
+ * assumed by the built-in automata operations.
+ * 
+ * <p>
+ * <font color="#FF0000"> WARNING: The status of the <b>Automaton</b> feature is
+ * experimental. The APIs introduced here might change in the future and will
+ * not be supported anymore in such a case.</font>
+ */
+public class Automaton implements Serializable, Cloneable {
+  
+  static final long serialVersionUID = 10001;
+  
+  /**
+   * Minimize using Hopcroft's O(n log n) algorithm. This is regarded as one of
+   * the most generally efficient algorithms that exist.
+   * 
+   * @see #setMinimization(int)
+   */
+  public static final int MINIMIZE_HOPCROFT = 2;
+  
+  /** Selects minimization algorithm (default: <code>MINIMIZE_HOPCROFT</code>). */
+  static int minimization = MINIMIZE_HOPCROFT;
+  
+  /** Initial state of this automaton. */
+  State initial;
+  
+  /**
+   * If true, then this automaton is definitely deterministic (i.e., there are
+   * no choices for any run, but a run may crash).
+   */
+  boolean deterministic;
+  
+  /** Extra data associated with this automaton. */
+  transient Object info;
+  
+  /**
+   * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}
+   */
+  int hash_code;
+  
+  /** Singleton string. Null if not applicable. */
+  String singleton;
+  
+  /** Minimize always flag. */
+  static boolean minimize_always = false;
+  
+  /**
+   * Selects whether operations may modify the input automata (default:
+   * <code>false</code>).
+   */
+  static boolean allow_mutation = false;
+  
+  /**
+   * Constructs a new automaton that accepts the empty language. Using this
+   * constructor, automata can be constructed manually from {@link State} and
+   * {@link Transition} objects.
+   * 
+   * @see #setInitialState(State)
+   * @see State
+   * @see Transition
+   */
+  public Automaton() {
+    initial = new State();
+    deterministic = true;
+    singleton = null;
+  }
+  
+  boolean isDebug() {
+    return System.getProperty("dk.brics.automaton.debug") != null;
+  }
+  
+  /**
+   * Selects minimization algorithm (default: <code>MINIMIZE_HOPCROFT</code>).
+   * 
+   * @param algorithm minimization algorithm
+   */
+  static public void setMinimization(int algorithm) {
+    minimization = algorithm;
+  }
+  
+  /**
+   * Sets or resets minimize always flag. If this flag is set, then
+   * {@link MinimizationOperations#minimize(Automaton)} will automatically be
+   * invoked after all operations that otherwise may produce non-minimal
+   * automata. By default, the flag is not set.
+   * 
+   * @param flag if true, the flag is set
+   */
+  static public void setMinimizeAlways(boolean flag) {
+    minimize_always = flag;
+  }
+  
+  /**
+   * Sets or resets allow mutate flag. If this flag is set, then all automata
+   * operations may modify automata given as input; otherwise, operations will
+   * always leave input automata languages unmodified. By default, the flag is
+   * not set.
+   * 
+   * @param flag if true, the flag is set
+   * @return previous value of the flag
+   */
+  static public boolean setAllowMutate(boolean flag) {
+    boolean b = allow_mutation;
+    allow_mutation = flag;
+    return b;
+  }
+  
+  /**
+   * Returns the state of the allow mutate flag. If this flag is set, then all
+   * automata operations may modify automata given as input; otherwise,
+   * operations will always leave input automata languages unmodified. By
+   * default, the flag is not set.
+   * 
+   * @return current value of the flag
+   */
+  static boolean getAllowMutate() {
+    return allow_mutation;
+  }
+  
+  void checkMinimizeAlways() {
+    if (minimize_always) MinimizationOperations.minimize(this);
+  }
+  
+  boolean isSingleton() {
+    return singleton != null;
+  }
+  
+  /**
+   * Returns the singleton string for this automaton. An automaton that accepts
+   * exactly one string <i>may</i> be represented in singleton mode. In that
+   * case, this method may be used to obtain the string.
+   * 
+   * @return string, null if this automaton is not in singleton mode.
+   */
+  public String getSingleton() {
+    return singleton;
+  }
+  
+  /**
+   * Sets initial state.
+   * 
+   * @param s state
+   */
+  public void setInitialState(State s) {
+    initial = s;
+    singleton = null;
+  }
+  
+  /**
+   * Gets initial state.
+   * 
+   * @return state
+   */
+  public State getInitialState() {
+    expandSingleton();
+    return initial;
+  }
+  
+  /**
+   * Returns deterministic flag for this automaton.
+   * 
+   * @return true if the automaton is definitely deterministic, false if the
+   *         automaton may be nondeterministic
+   */
+  public boolean isDeterministic() {
+    return deterministic;
+  }
+  
+  /**
+   * Sets deterministic flag for this automaton. This method should (only) be
+   * used if automata are constructed manually.
+   * 
+   * @param deterministic true if the automaton is definitely deterministic,
+   *          false if the automaton may be nondeterministic
+   */
+  public void setDeterministic(boolean deterministic) {
+    this.deterministic = deterministic;
+  }
+  
+  /**
+   * Associates extra information with this automaton.
+   * 
+   * @param info extra information
+   */
+  public void setInfo(Object info) {
+    this.info = info;
+  }
+  
+  /**
+   * Returns extra information associated with this automaton.
+   * 
+   * @return extra information
+   * @see #setInfo(Object)
+   */
+  public Object getInfo() {
+    return info;
+  }
+  
+  /**
+   * Returns the set of states that are reachable from the initial state.
+   * 
+   * @return set of {@link State} objects
+   */
+  public Set<State> getStates() {
+    expandSingleton();
+    Set<State> visited;
+    if (isDebug()) visited = new LinkedHashSet<State>();
+    else visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(initial);
+    visited.add(initial);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      Collection<Transition> tr;
+      if (isDebug()) tr = s.getSortedTransitions(false);
+      else tr = s.transitions;
+      for (Transition t : tr)
+        if (!visited.contains(t.to)) {
+          visited.add(t.to);
+          worklist.add(t.to);
+        }
+    }
+    return visited;
+  }
+  
+  /**
+   * Returns the set of reachable accept states.
+   * 
+   * @return set of {@link State} objects
+   */
+  public Set<State> getAcceptStates() {
+    expandSingleton();
+    HashSet<State> accepts = new HashSet<State>();
+    HashSet<State> visited = new HashSet<State>();
+    LinkedList<State> worklist = new LinkedList<State>();
+    worklist.add(initial);
+    visited.add(initial);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      if (s.accept) accepts.add(s);
+      for (Transition t : s.transitions)
+        if (!visited.contains(t.to)) {
+          visited.add(t.to);
+          worklist.add(t.to);
+        }
+    }
+    return accepts;
+  }
+  
+  /**
+   * Assigns consecutive numbers to the given states.
+   */
+  static void setStateNumbers(Set<State> states) {
+    int number = 0;
+    for (State s : states)
+      s.number = number++;
+  }
+  
+  /**
+   * Adds transitions to explicit crash state to ensure that transition function
+   * is total.
+   */
+  void totalize() {
+    State s = new State();
+    s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+        s));
+    for (State p : getStates()) {
+      int maxi = Character.MIN_VALUE;
+      for (Transition t : p.getSortedTransitions(false)) {
+        if (t.min > maxi) p.transitions.add(new Transition((char) maxi,
+            (char) (t.min - 1), s));
+        if (t.max + 1 > maxi) maxi = t.max + 1;
+      }
+      if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition(
+          (char) maxi, Character.MAX_VALUE, s));
+    }
+  }
+  
+  /**
+   * Restores representation invariant. This method must be invoked before any
+   * built-in automata operation is performed if automaton states or transitions
+   * are manipulated manually.
+   * 
+   * @see #setDeterministic(boolean)
+   */
+  public void restoreInvariant() {
+    removeDeadTransitions();
+  }
+  
+  /**
+   * Reduces this automaton. An automaton is "reduced" by combining overlapping
+   * and adjacent edge intervals with same destination.
+   */
+  public void reduce() {
+    if (isSingleton()) return;
+    Set<State> states = getStates();
+    setStateNumbers(states);
+    for (State s : states) {
+      List<Transition> st = s.getSortedTransitions(true);
+      s.resetTransitions();
+      State p = null;
+      int min = -1, max = -1;
+      for (Transition t : st) {
+        if (p == t.to) {
+          if (t.min <= max + 1) {
+            if (t.max > max) max = t.max;
+          } else {
+            if (p != null) s.transitions.add(new Transition((char) min,
+                (char) max, p));
+            min = t.min;
+            max = t.max;
+          }
+        } else {
+          if (p != null) s.transitions.add(new Transition((char) min,
+              (char) max, p));
+          p = t.to;
+          min = t.min;
+          max = t.max;
+        }
+      }
+      if (p != null) s.transitions
+          .add(new Transition((char) min, (char) max, p));
+    }
+  }
+  
+  /**
+   * Returns sorted array of all interval start points.
+   */
+  char[] getStartPoints() {
+    Set<Character> pointset = new HashSet<Character>();
+    for (State s : getStates()) {
+      pointset.add(Character.MIN_VALUE);
+      for (Transition t : s.transitions) {
+        pointset.add(t.min);
+        if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1));
+      }
+    }
+    char[] points = new char[pointset.size()];
+    int n = 0;
+    for (Character m : pointset)
+      points[n++] = m;
+    Arrays.sort(points);
+    return points;
+  }
+  
+  /**
+   * Returns the set of live states. A state is "live" if an accept state is
+   * reachable from it.
+   * 
+   * @return set of {@link State} objects
+   */
+  public Set<State> getLiveStates() {
+    expandSingleton();
+    return getLiveStates(getStates());
+  }
+  
+  private Set<State> getLiveStates(Set<State> states) {
+    HashMap<State,Set<State>> map = new HashMap<State,Set<State>>();
+    for (State s : states)
+      map.put(s, new HashSet<State>());
+    for (State s : states)
+      for (Transition t : s.transitions)
+        map.get(t.to).add(s);
+    Set<State> live = new HashSet<State>(getAcceptStates());
+    LinkedList<State> worklist = new LinkedList<State>(live);
+    while (worklist.size() > 0) {
+      State s = worklist.removeFirst();
+      for (State p : map.get(s))
+        if (!live.contains(p)) {
+          live.add(p);
+          worklist.add(p);
+        }
+    }
+    return live;
+  }
+  
+  /**
+   * Removes transitions to dead states and calls {@link #reduce()} and
+   * {@link #clearHashCode()}. (A state is "dead" if no accept state is
+   * reachable from it.)
+   */
+  public void removeDeadTransitions() {
+    clearHashCode();
+    if (isSingleton()) return;
+    Set<State> states = getStates();
+    Set<State> live = getLiveStates(states);
+    for (State s : states) {
+      Set<Transition> st = s.transitions;
+      s.resetTransitions();
+      for (Transition t : st)
+        if (live.contains(t.to)) s.transitions.add(t);
+    }
+    reduce();
+  }
+  
+  /**
+   * Returns a sorted array of transitions for each state (and sets state
+   * numbers).
+   */
+  static Transition[][] getSortedTransitions(Set<State> states) {
+    setStateNumbers(states);
+    Transition[][] transitions = new Transition[states.size()][];
+    for (State s : states)
+      transitions[s.number] = s.getSortedTransitionArray(false);
+    return transitions;
+  }
+  
+  /**
+   * Expands singleton representation to normal representation. Does nothing if
+   * not in singleton representation.
+   */
+  public void expandSingleton() {
+    if (isSingleton()) {
+      State p = new State();
+      initial = p;
+      for (int i = 0; i < singleton.length(); i++) {
+        State q = new State();
+        p.transitions.add(new Transition(singleton.charAt(i), q));
+        p = q;
+      }
+      p.accept = true;
+      deterministic = true;
+      singleton = null;
+    }
+  }
+  
+  /**
+   * Returns the number of states in this automaton.
+   */
+  public int getNumberOfStates() {
+    if (isSingleton()) return singleton.length() + 1;
+    return getStates().size();
+  }
+  
+  /**
+   * Returns the number of transitions in this automaton. This number is counted
+   * as the total number of edges, where one edge may be a character interval.
+   */
+  public int getNumberOfTransitions() {
+    if (isSingleton()) return singleton.length();
+    int c = 0;
+    for (State s : getStates())
+      c += s.transitions.size();
+    return c;
+  }
+  
+  /**
+   * Returns true if the language of this automaton is equal to the language of
+   * the given automaton. Implemented using <code>hashCode</code> and
+   * <code>subsetOf</code>.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) return true;
+    if (!(obj instanceof Automaton)) return false;
+    Automaton a = (Automaton) obj;
+    if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton);
+    return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a)
+        && BasicOperations.subsetOf(a, this);
+  }
+  
+  /**
+   * Returns hash code for this automaton. The hash code is based on the number
+   * of states and transitions in the minimized automaton. Invoking this method
+   * may involve minimizing the automaton.
+   */
+  @Override
+  public int hashCode() {
+    if (hash_code == 0) MinimizationOperations.minimize(this);
+    return hash_code;
+  }
+  
+  /**
+   * Must be invoked when the stored hash code may no longer be valid.
+   */
+  void clearHashCode() {
+    hash_code = 0;
+  }
+  
+  /**
+   * Returns a string representation of this automaton.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    if (isSingleton()) {
+      b.append("singleton: ");
+      for (char c : singleton.toCharArray())
+        Transition.appendCharString(c, b);
+      b.append("\n");
+    } else {
+      Set<State> states = getStates();
+      setStateNumbers(states);
+      b.append("initial state: ").append(initial.number).append("\n");
+      for (State s : states)
+        b.append(s.toString());
+    }
+    return b.toString();
+  }
+  
+  /**
+   * Returns <a href="http://www.research.att.com/sw/tools/graphviz/"
+   * target="_top">Graphviz Dot</a> representation of this automaton.
+   */
+  public String toDot() {
+    StringBuilder b = new StringBuilder("digraph Automaton {\n");
+    b.append("  rankdir = LR;\n");
+    Set<State> states = getStates();
+    setStateNumbers(states);
+    for (State s : states) {
+      b.append("  ").append(s.number);
+      if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n");
+      else b.append(" [shape=circle,label=\"\"];\n");
+      if (s == initial) {
+        b.append("  initial [shape=plaintext,label=\"\"];\n");
+        b.append("  initial -> ").append(s.number).append("\n");
+      }
+      for (Transition t : s.transitions) {
+        b.append("  ").append(s.number);
+        t.appendDot(b);
+      }
+    }
+    return b.append("}\n").toString();
+  }
+  
+  /**
+   * Returns a clone of this automaton, expands if singleton.
+   */
+  Automaton cloneExpanded() {
+    Automaton a = clone();
+    a.expandSingleton();
+    return a;
+  }
+  
+  /**
+   * Returns a clone of this automaton unless <code>allow_mutation</code> is
+   * set, expands if singleton.
+   */
+  Automaton cloneExpandedIfRequired() {
+    if (allow_mutation) {
+      expandSingleton();
+      return this;
+    } else return cloneExpanded();
+  }
+  
+  /**
+   * Returns a clone of this automaton.
+   */
+  @Override
+  public Automaton clone() {
+    try {
+      Automaton a = (Automaton) super.clone();
+      if (!isSingleton()) {
+        HashMap<State,State> m = new HashMap<State,State>();
+        Set<State> states = getStates();
+        for (State s : states)
+          m.put(s, new State());
+        for (State s : states) {
+          State p = m.get(s);
+          p.accept = s.accept;
+          if (s == initial) a.initial = p;
+          for (Transition t : s.transitions)
+            p.transitions.add(new Transition(t.min, t.max, m.get(t.to)));
+        }
+      }
+      return a;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+  
+  /**
+   * Returns a clone of this automaton, or this automaton itself if
+   * <code>allow_mutation</code> flag is set.
+   */
+  Automaton cloneIfRequired() {
+    if (allow_mutation) return this;
+    else return clone();
+  }
+  
+  /**
+   * See {@link BasicOperations#concatenate(Automaton, Automaton)}.
+   */
+  public Automaton concatenate(Automaton a) {
+    return BasicOperations.concatenate(this, a);
+  }
+  
+  /**
+   * See {@link BasicOperations#concatenate(List)}.
+   */
+  static public Automaton concatenate(List<Automaton> l) {
+    return BasicOperations.concatenate(l);
+  }
+  
+  /**
+   * See {@link BasicOperations#optional(Automaton)}.
+   */
+  public Automaton optional() {
+    return BasicOperations.optional(this);
+  }
+  
+  /**
+   * See {@link BasicOperations#repeat(Automaton)}.
+   */
+  public Automaton repeat() {
+    return BasicOperations.repeat(this);
+  }
+  
+  /**
+   * See {@link BasicOperations#repeat(Automaton, int)}.
+   */
+  public Automaton repeat(int min) {
+    return BasicOperations.repeat(this, min);
+  }
+  
+  /**
+   * See {@link BasicOperations#repeat(Automaton, int, int)}.
+   */
+  public Automaton repeat(int min, int max) {
+    return BasicOperations.repeat(this, min, max);
+  }
+  
+  /**
+   * See {@link BasicOperations#complement(Automaton)}.
+   */
+  public Automaton complement() {
+    return BasicOperations.complement(this);
+  }
+  
+  /**
+   * See {@link BasicOperations#minus(Automaton, Automaton)}.
+   */
+  public Automaton minus(Automaton a) {
+    return BasicOperations.minus(this, a);
+  }
+  
+  /**
+   * See {@link BasicOperations#intersection(Automaton, Automaton)}.
+   */
+  public Automaton intersection(Automaton a) {
+    return BasicOperations.intersection(this, a);
+  }
+  
+  /**
+   * See {@link BasicOperations#subsetOf(Automaton, Automaton)}.
+   */
+  public boolean subsetOf(Automaton a) {
+    return BasicOperations.subsetOf(this, a);
+  }
+  
+  /**
+   * See {@link BasicOperations#union(Automaton, Automaton)}.
+   */
+  public Automaton union(Automaton a) {
+    return BasicOperations.union(this, a);
+  }
+  
+  /**
+   * See {@link BasicOperations#union(Collection)}.
+   */
+  static public Automaton union(Collection<Automaton> l) {
+    return BasicOperations.union(l);
+  }
+  
+  /**
+   * See {@link BasicOperations#determinize(Automaton)}.
+   */
+  public void determinize() {
+    BasicOperations.determinize(this);
+  }
+  
+  /**
+   * See {@link BasicOperations#isEmptyString(Automaton)}.
+   */
+  public boolean isEmptyString() {
+    return BasicOperations.isEmptyString(this);
+  }
+  
+  /**
+   * See {@link MinimizationOperations#minimize(Automaton)}. Returns the
+   * automaton being given as argument.
+   */
+  public static Automaton minimize(Automaton a) {
+    MinimizationOperations.minimize(a);
+    return a;
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/Automaton.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,53 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+
+/**
+ * Automaton provider for <code>RegExp.</code>
+ * {@link RegExp#toAutomaton(AutomatonProvider)}
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+public interface AutomatonProvider {
+  
+  /**
+   * Returns automaton of the given name.
+   * 
+   * @param name automaton name
+   * @return automaton
+   * @throws IOException if errors occur
+   */
+  public Automaton getAutomaton(String name) throws IOException;
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/AutomatonProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,482 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Construction of basic automata.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+final public class BasicAutomata {
+  // used by getWhitespaceAutomaton to match basic whitespace
+  private static final Automaton ws = Automaton.minimize(BasicAutomata
+      .makeCharSet(" \t\n\r").repeat());
+  
+  private BasicAutomata() {}
+  
+  /**
+   * Returns a new (deterministic) automaton with the empty language.
+   */
+  public static Automaton makeEmpty() {
+    Automaton a = new Automaton();
+    State s = new State();
+    a.initial = s;
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts only the empty string.
+   */
+  public static Automaton makeEmptyString() {
+    Automaton a = new Automaton();
+    a.singleton = "";
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts all strings.
+   */
+  public static Automaton makeAnyString() {
+    Automaton a = new Automaton();
+    State s = new State();
+    a.initial = s;
+    s.accept = true;
+    s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+        s));
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts any single character.
+   */
+  public static Automaton makeAnyChar() {
+    return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE);
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts a single character of
+   * the given value.
+   */
+  public static Automaton makeChar(char c) {
+    Automaton a = new Automaton();
+    a.singleton = Character.toString(c);
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts a single char whose
+   * value is in the given interval (including both end points).
+   */
+  public static Automaton makeCharRange(char min, char max) {
+    if (min == max) return makeChar(min);
+    Automaton a = new Automaton();
+    State s1 = new State();
+    State s2 = new State();
+    a.initial = s1;
+    s2.accept = true;
+    if (min <= max) s1.transitions.add(new Transition(min, max, s2));
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts a single character in
+   * the given set.
+   */
+  public static Automaton makeCharSet(String set) {
+    if (set.length() == 1) return makeChar(set.charAt(0));
+    Automaton a = new Automaton();
+    State s1 = new State();
+    State s2 = new State();
+    a.initial = s1;
+    s2.accept = true;
+    for (int i = 0; i < set.length(); i++)
+      s1.transitions.add(new Transition(set.charAt(i), s2));
+    a.deterministic = true;
+    a.reduce();
+    return a;
+  }
+  
+  /**
+   * Constructs sub-automaton corresponding to decimal numbers of length
+   * x.substring(n).length().
+   */
+  private static State anyOfRightLength(String x, int n) {
+    State s = new State();
+    if (x.length() == n) s.setAccept(true);
+    else s.addTransition(new Transition('0', '9', anyOfRightLength(x, n + 1)));
+    return s;
+  }
+  
+  /**
+   * Constructs sub-automaton corresponding to decimal numbers of value at least
+   * x.substring(n) and length x.substring(n).length().
+   */
+  private static State atLeast(String x, int n, Collection<State> initials,
+      boolean zeros) {
+    State s = new State();
+    if (x.length() == n) s.setAccept(true);
+    else {
+      if (zeros) initials.add(s);
+      char c = x.charAt(n);
+      s.addTransition(new Transition(c, atLeast(x, n + 1, initials, zeros
+          && c == '0')));
+      if (c < '9') s.addTransition(new Transition((char) (c + 1), '9',
+          anyOfRightLength(x, n + 1)));
+    }
+    return s;
+  }
+  
+  /**
+   * Constructs sub-automaton corresponding to decimal numbers of value at most
+   * x.substring(n) and length x.substring(n).length().
+   */
+  private static State atMost(String x, int n) {
+    State s = new State();
+    if (x.length() == n) s.setAccept(true);
+    else {
+      char c = x.charAt(n);
+      s.addTransition(new Transition(c, atMost(x, (char) n + 1)));
+      if (c > '0') s.addTransition(new Transition('0', (char) (c - 1),
+          anyOfRightLength(x, n + 1)));
+    }
+    return s;
+  }
+  
+  /**
+   * Constructs sub-automaton corresponding to decimal numbers of value between
+   * x.substring(n) and y.substring(n) and of length x.substring(n).length()
+   * (which must be equal to y.substring(n).length()).
+   */
+  private static State between(String x, String y, int n,
+      Collection<State> initials, boolean zeros) {
+    State s = new State();
+    if (x.length() == n) s.setAccept(true);
+    else {
+      if (zeros) initials.add(s);
+      char cx = x.charAt(n);
+      char cy = y.charAt(n);
+      if (cx == cy) s.addTransition(new Transition(cx, between(x, y, n + 1,
+          initials, zeros && cx == '0')));
+      else { // cx<cy
+        s.addTransition(new Transition(cx, atLeast(x, n + 1, initials, zeros
+            && cx == '0')));
+        s.addTransition(new Transition(cy, atMost(y, n + 1)));
+        if (cx + 1 < cy) s.addTransition(new Transition((char) (cx + 1),
+            (char) (cy - 1), anyOfRightLength(x, n + 1)));
+      }
+    }
+    return s;
+  }
+  
+  /**
+   * Returns a new automaton that accepts strings representing decimal
+   * non-negative integers in the given interval.
+   * 
+   * @param min minimal value of interval
+   * @param max maximal value of inverval (both end points are included in the
+   *          interval)
+   * @param digits if >0, use fixed number of digits (strings must be prefixed
+   *          by 0's to obtain the right length) - otherwise, the number of
+   *          digits is not fixed
+   * @exception IllegalArgumentException if min>max or if numbers in the
+   *              interval cannot be expressed with the given fixed number of
+   *              digits
+   */
+  public static Automaton makeInterval(int min, int max, int digits)
+      throws IllegalArgumentException {
+    Automaton a = new Automaton();
+    String x = Integer.toString(min);
+    String y = Integer.toString(max);
+    if (min > max || (digits > 0 && y.length() > digits)) throw new IllegalArgumentException();
+    int d;
+    if (digits > 0) d = digits;
+    else d = y.length();
+    StringBuilder bx = new StringBuilder();
+    for (int i = x.length(); i < d; i++)
+      bx.append('0');
+    bx.append(x);
+    x = bx.toString();
+    StringBuilder by = new StringBuilder();
+    for (int i = y.length(); i < d; i++)
+      by.append('0');
+    by.append(y);
+    y = by.toString();
+    Collection<State> initials = new ArrayList<State>();
+    a.initial = between(x, y, 0, initials, digits <= 0);
+    if (digits <= 0) {
+      ArrayList<StatePair> pairs = new ArrayList<StatePair>();
+      for (State p : initials)
+        if (a.initial != p) pairs.add(new StatePair(a.initial, p));
+      BasicOperations.addEpsilons(a, pairs);
+      a.initial.addTransition(new Transition('0', a.initial));
+      a.deterministic = false;
+    } else a.deterministic = true;
+    a.checkMinimizeAlways();
+    return a;
+  }
+  
+  /**
+   * Returns a new (deterministic) automaton that accepts the single given
+   * string.
+   */
+  public static Automaton makeString(String s) {
+    Automaton a = new Automaton();
+    a.singleton = s;
+    a.deterministic = true;
+    return a;
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing nonnegative integers
+   * that are not larger than the given value.
+   * 
+   * @param n string representation of maximum value
+   */
+  public static Automaton makeMaxInteger(String n) {
+    int i = 0;
+    while (i < n.length() && n.charAt(i) == '0')
+      i++;
+    StringBuilder b = new StringBuilder();
+    b.append("0*(0|");
+    if (i < n.length()) b.append("[0-9]{1," + (n.length() - i - 1) + "}|");
+    maxInteger(n.substring(i), 0, b);
+    b.append(")");
+    return Automaton.minimize((new RegExp(b.toString())).toAutomaton());
+  }
+  
+  private static void maxInteger(String n, int i, StringBuilder b) {
+    b.append('(');
+    if (i < n.length()) {
+      char c = n.charAt(i);
+      if (c != '0') b.append("[0-" + (char) (c - 1) + "][0-9]{"
+          + (n.length() - i - 1) + "}|");
+      b.append(c);
+      maxInteger(n, i + 1, b);
+    }
+    b.append(')');
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing nonnegative integers
+   * that are not less that the given value.
+   * 
+   * @param n string representation of minimum value
+   */
+  public static Automaton makeMinInteger(String n) {
+    int i = 0;
+    while (i + 1 < n.length() && n.charAt(i) == '0')
+      i++;
+    StringBuilder b = new StringBuilder();
+    b.append("0*");
+    minInteger(n.substring(i), 0, b);
+    b.append("[0-9]*");
+    return Automaton.minimize((new RegExp(b.toString())).toAutomaton());
+  }
+  
+  private static void minInteger(String n, int i, StringBuilder b) {
+    b.append('(');
+    if (i < n.length()) {
+      char c = n.charAt(i);
+      if (c != '9') b.append("[" + (char) (c + 1) + "-9][0-9]{"
+          + (n.length() - i - 1) + "}|");
+      b.append(c);
+      minInteger(n, i + 1, b);
+    }
+    b.append(')');
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing decimal numbers that
+   * can be written with at most the given number of digits. Surrounding
+   * whitespace is permitted.
+   * 
+   * @param i max number of necessary digits
+   */
+  public static Automaton makeTotalDigits(int i) {
+    return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?0*([0-9]{0," + i
+        + "}|((([0-9]\\.*){0," + i + "})&@\\.@)0*)[ \t\n\r]*")).toAutomaton());
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing decimal numbers that
+   * can be written with at most the given number of digits in the fraction
+   * part. Surrounding whitespace is permitted.
+   * 
+   * @param i max number of necessary fraction digits
+   */
+  public static Automaton makeFractionDigits(int i) {
+    return Automaton.minimize((new RegExp("[ \t\n\r]*[-+]?[0-9]+(\\.[0-9]{0,"
+        + i + "}0*)?[ \t\n\r]*")).toAutomaton());
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing the given integer.
+   * Surrounding whitespace is permitted.
+   * 
+   * @param value string representation of integer
+   */
+  public static Automaton makeIntegerValue(String value) {
+    boolean minus = false;
+    int i = 0;
+    while (i < value.length()) {
+      char c = value.charAt(i);
+      if (c == '-') minus = true;
+      if (c >= '1' && c <= '9') break;
+      i++;
+    }
+    StringBuilder b = new StringBuilder();
+    b.append(value.substring(i));
+    if (b.length() == 0) b.append("0");
+    Automaton s;
+    if (minus) s = makeChar('-');
+    else s = makeChar('+').optional();
+    Automaton ws = getWhitespaceAutomaton();
+    return Automaton.minimize(ws.concatenate(
+        s.concatenate(makeChar('0').repeat()).concatenate(
+            makeString(b.toString()))).concatenate(ws));
+  }
+  
+  /**
+   * Constructs automaton that accept strings representing the given decimal
+   * number. Surrounding whitespace is permitted.
+   * 
+   * @param value string representation of decimal number
+   */
+  public static Automaton makeDecimalValue(String value) {
+    boolean minus = false;
+    int i = 0;
+    while (i < value.length()) {
+      char c = value.charAt(i);
+      if (c == '-') minus = true;
+      if ((c >= '1' && c <= '9') || c == '.') break;
+      i++;
+    }
+    StringBuilder b1 = new StringBuilder();
+    StringBuilder b2 = new StringBuilder();
+    int p = value.indexOf('.', i);
+    if (p == -1) b1.append(value.substring(i));
+    else {
+      b1.append(value.substring(i, p));
+      i = value.length() - 1;
+      while (i > p) {
+        char c = value.charAt(i);
+        if (c >= '1' && c <= '9') break;
+        i--;
+      }
+      b2.append(value.substring(p + 1, i + 1));
+    }
+    if (b1.length() == 0) b1.append("0");
+    Automaton s;
+    if (minus) s = makeChar('-');
+    else s = makeChar('+').optional();
+    Automaton d;
+    if (b2.length() == 0) d = makeChar('.')
+        .concatenate(makeChar('0').repeat(1)).optional();
+    else d = makeChar('.').concatenate(makeString(b2.toString())).concatenate(
+        makeChar('0').repeat());
+    Automaton ws = getWhitespaceAutomaton();
+    return Automaton.minimize(ws.concatenate(
+        s.concatenate(makeChar('0').repeat()).concatenate(
+            makeString(b1.toString())).concatenate(d)).concatenate(ws));
+  }
+  
+  /**
+   * Constructs deterministic automaton that matches strings that contain the
+   * given substring.
+   */
+  public static Automaton makeStringMatcher(String s) {
+    Automaton a = new Automaton();
+    State[] states = new State[s.length() + 1];
+    states[0] = a.initial;
+    for (int i = 0; i < s.length(); i++)
+      states[i + 1] = new State();
+    State f = states[s.length()];
+    f.accept = true;
+    f.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+        f));
+    for (int i = 0; i < s.length(); i++) {
+      Set<Character> done = new HashSet<Character>();
+      char c = s.charAt(i);
+      states[i].transitions.add(new Transition(c, states[i + 1]));
+      done.add(c);
+      for (int j = i; j >= 1; j--) {
+        char d = s.charAt(j - 1);
+        if (!done.contains(d)
+            && s.substring(0, j - 1).equals(s.substring(i - j + 1, i))) {
+          states[i].transitions.add(new Transition(d, states[j]));
+          done.add(d);
+        }
+      }
+      char[] da = new char[done.size()];
+      int h = 0;
+      for (char w : done)
+        da[h++] = w;
+      Arrays.sort(da);
+      int from = Character.MIN_VALUE;
+      int k = 0;
+      while (from <= Character.MAX_VALUE) {
+        while (k < da.length && da[k] == from) {
+          k++;
+          from++;
+        }
+        if (from <= Character.MAX_VALUE) {
+          int to = Character.MAX_VALUE;
+          if (k < da.length) {
+            to = da[k] - 1;
+            k++;
+          }
+          states[i].transitions.add(new Transition((char) from, (char) to,
+              states[0]));
+          from = to + 2;
+        }
+      }
+    }
+    a.deterministic = true;
+    return a;
+  }
+  
+  private static Automaton getWhitespaceAutomaton() {
+    return ws;
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message