lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r888891 [2/3] - in /lucene/java/branches/flex_1458: ./ src/java/org/apache/lucene/search/ src/java/org/apache/lucene/util/ src/java/org/apache/lucene/util/automaton/ src/test/org/apache/lucene/search/
Date Wed, 09 Dec 2009 17:45:59 GMT
Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,624 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Basic automata operations.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+final public class BasicOperations {
+  
+  private BasicOperations() {}
+  
+  /**
+   * Returns an automaton that accepts the concatenation of the languages of the
+   * given automata.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  static public Automaton concatenate(Automaton a1, Automaton a2) {
+    if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata
+        .makeString(a1.singleton + a2.singleton);
+    if (a1 == a2) {
+      a1 = a1.cloneExpanded();
+      a2 = a2.cloneExpanded();
+    } else {
+      a1 = a1.cloneExpandedIfRequired();
+      a2 = a2.cloneExpandedIfRequired();
+    }
+    for (State s : a1.getAcceptStates()) {
+      s.accept = false;
+      s.addEpsilon(a2.initial);
+    }
+    a1.deterministic = false;
+    a1.clearHashCode();
+    a1.checkMinimizeAlways();
+    return a1;
+  }
+  
+  /**
+   * Returns an automaton that accepts the concatenation of the languages of the
+   * given automata.
+   * <p>
+   * Complexity: linear in total number of states.
+   */
+  static public Automaton concatenate(List<Automaton> l) {
+    if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+    boolean all_singleton = true;
+    for (Automaton a : l)
+      if (!a.isSingleton()) {
+        all_singleton = false;
+        break;
+      }
+    if (all_singleton) {
+      StringBuilder b = new StringBuilder();
+      for (Automaton a : l)
+        b.append(a.singleton);
+      return BasicAutomata.makeString(b.toString());
+    } else {
+      for (Automaton a : l)
+        if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty();
+      Set<Integer> ids = new HashSet<Integer>();
+      for (Automaton a : l)
+        ids.add(System.identityHashCode(a));
+      boolean has_aliases = ids.size() != l.size();
+      Automaton b = l.get(0);
+      if (has_aliases) b = b.cloneExpanded();
+      else b = b.cloneExpandedIfRequired();
+      Set<State> ac = b.getAcceptStates();
+      boolean first = true;
+      for (Automaton a : l)
+        if (first) first = false;
+        else {
+          if (a.isEmptyString()) continue;
+          Automaton aa = a;
+          if (has_aliases) aa = aa.cloneExpanded();
+          else aa = aa.cloneExpandedIfRequired();
+          Set<State> ns = aa.getAcceptStates();
+          for (State s : ac) {
+            s.accept = false;
+            s.addEpsilon(aa.initial);
+            if (s.accept) ns.add(s);
+          }
+          ac = ns;
+        }
+      b.deterministic = false;
+      b.clearHashCode();
+      b.checkMinimizeAlways();
+      return b;
+    }
+  }
+  
+  /**
+   * Returns an automaton that accepts the union of the empty string and the
+   * language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  static public Automaton optional(Automaton a) {
+    a = a.cloneExpandedIfRequired();
+    State s = new State();
+    s.addEpsilon(a.initial);
+    s.accept = true;
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+  
+  /**
+   * Returns an automaton that accepts the Kleene star (zero or more
+   * concatenated repetitions) of the language of the given automaton. Never
+   * modifies the input automaton language.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  static public Automaton repeat(Automaton a) {
+    a = a.cloneExpanded();
+    State s = new State();
+    s.accept = true;
+    s.addEpsilon(a.initial);
+    for (State p : a.getAcceptStates())
+      p.addEpsilon(s);
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+  
+  /**
+   * Returns an automaton that accepts <code>min</code> or more concatenated
+   * repetitions of the language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states and in <code>min</code>.
+   */
+  static public Automaton repeat(Automaton a, int min) {
+    if (min == 0) return repeat(a);
+    List<Automaton> as = new ArrayList<Automaton>();
+    while (min-- > 0)
+      as.add(a);
+    as.add(repeat(a));
+    return concatenate(as);
+  }
+  
+  /**
+   * Returns an automaton that accepts between <code>min</code> and
+   * <code>max</code> (including both) concatenated repetitions of the language
+   * of the given automaton.
+   * <p>
+   * Complexity: linear in number of states and in <code>min</code> and
+   * <code>max</code>.
+   */
+  static public Automaton repeat(Automaton a, int min, int max) {
+    if (min > max) return BasicAutomata.makeEmpty();
+    max -= min;
+    a.expandSingleton();
+    Automaton b;
+    if (min == 0) b = BasicAutomata.makeEmptyString();
+    else if (min == 1) b = a.clone();
+    else {
+      List<Automaton> as = new ArrayList<Automaton>();
+      while (min-- > 0)
+        as.add(a);
+      b = concatenate(as);
+    }
+    if (max > 0) {
+      Automaton d = a.clone();
+      while (--max > 0) {
+        Automaton c = a.clone();
+        for (State p : c.getAcceptStates())
+          p.addEpsilon(d.initial);
+        d = c;
+      }
+      for (State p : b.getAcceptStates())
+        p.addEpsilon(d.initial);
+      b.deterministic = false;
+      b.clearHashCode();
+      b.checkMinimizeAlways();
+    }
+    return b;
+  }
+  
+  /**
+   * Returns a (deterministic) automaton that accepts the complement of the
+   * language of the given automaton.
+   * <p>
+   * Complexity: linear in number of states (if already deterministic).
+   */
+  static public Automaton complement(Automaton a) {
+    a = a.cloneExpandedIfRequired();
+    a.determinize();
+    a.totalize();
+    for (State p : a.getStates())
+      p.accept = !p.accept;
+    a.removeDeadTransitions();
+    return a;
+  }
+  
+  /**
+   * Returns a (deterministic) automaton that accepts the intersection of the
+   * language of <code>a1</code> and the complement of the language of
+   * <code>a2</code>. As a side-effect, the automata may be determinized, if not
+   * already deterministic.
+   * <p>
+   * Complexity: quadratic in number of states (if already deterministic).
+   */
+  static public Automaton minus(Automaton a1, Automaton a2) {
+    if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata
+        .makeEmpty();
+    if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired();
+    if (a1.isSingleton()) {
+      if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty();
+      else return a1.cloneIfRequired();
+    }
+    return intersection(a1, a2.complement());
+  }
+  
+  /**
+   * Returns an automaton that accepts the intersection of the languages of the
+   * given automata. Never modifies the input automata languages.
+   * <p>
+   * Complexity: quadratic in number of states.
+   */
+  static public Automaton intersection(Automaton a1, Automaton a2) {
+    if (a1.isSingleton()) {
+      if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
+      else return BasicAutomata.makeEmpty();
+    }
+    if (a2.isSingleton()) {
+      if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
+      else return BasicAutomata.makeEmpty();
+    }
+    if (a1 == a2) return a1.cloneIfRequired();
+    Transition[][] transitions1 = Automaton
+        .getSortedTransitions(a1.getStates());
+    Transition[][] transitions2 = Automaton
+        .getSortedTransitions(a2.getStates());
+    Automaton c = new Automaton();
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+    HashMap<StatePair,StatePair> newstates = new HashMap<StatePair,StatePair>();
+    StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
+    worklist.add(p);
+    newstates.put(p, p);
+    while (worklist.size() > 0) {
+      p = worklist.removeFirst();
+      p.s.accept = p.s1.accept && p.s2.accept;
+      Transition[] t1 = transitions1[p.s1.number];
+      Transition[] t2 = transitions2[p.s2.number];
+      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+        while (b2 < t2.length && t2[b2].max < t1[n1].min)
+          b2++;
+        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
+          if (t2[n2].max >= t1[n1].min) {
+            StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+            StatePair r = newstates.get(q);
+            if (r == null) {
+              q.s = new State();
+              worklist.add(q);
+              newstates.put(q, q);
+              r = q;
+            }
+            char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+            char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+            p.s.transitions.add(new Transition(min, max, r.s));
+          }
+      }
+    }
+    c.deterministic = a1.deterministic && a2.deterministic;
+    c.removeDeadTransitions();
+    c.checkMinimizeAlways();
+    return c;
+  }
+  
+  /**
+   * Returns true if the language of <code>a1</code> is a subset of the language
+   * of <code>a2</code>. As a side-effect, <code>a2</code> is determinized if
+   * not already marked as deterministic.
+   * <p>
+   * Complexity: quadratic in number of states.
+   */
+  public static boolean subsetOf(Automaton a1, Automaton a2) {
+    if (a1 == a2) return true;
+    if (a1.isSingleton()) {
+      if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
+      return BasicOperations.run(a2, a1.singleton);
+    }
+    a2.determinize();
+    Transition[][] transitions1 = Automaton
+        .getSortedTransitions(a1.getStates());
+    Transition[][] transitions2 = Automaton
+        .getSortedTransitions(a2.getStates());
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
+    HashSet<StatePair> visited = new HashSet<StatePair>();
+    StatePair p = new StatePair(a1.initial, a2.initial);
+    worklist.add(p);
+    visited.add(p);
+    while (worklist.size() > 0) {
+      p = worklist.removeFirst();
+      if (p.s1.accept && !p.s2.accept) return false;
+      Transition[] t1 = transitions1[p.s1.number];
+      Transition[] t2 = transitions2[p.s2.number];
+      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
+        while (b2 < t2.length && t2[b2].max < t1[n1].min)
+          b2++;
+        int min1 = t1[n1].min, max1 = t1[n1].max;
+        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
+          if (t2[n2].min > min1) return false;
+          if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1;
+          else {
+            min1 = Character.MAX_VALUE;
+            max1 = Character.MIN_VALUE;
+          }
+          StatePair q = new StatePair(t1[n1].to, t2[n2].to);
+          if (!visited.contains(q)) {
+            worklist.add(q);
+            visited.add(q);
+          }
+        }
+        if (min1 <= max1) return false;
+      }
+    }
+    return true;
+  }
+  
+  /**
+   * Returns an automaton that accepts the union of the languages of the given
+   * automata.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  public static Automaton union(Automaton a1, Automaton a2) {
+    if ((a1.isSingleton() && a2.isSingleton() && a1.singleton
+        .equals(a2.singleton))
+        || a1 == a2) return a1.cloneIfRequired();
+    if (a1 == a2) {
+      a1 = a1.cloneExpanded();
+      a2 = a2.cloneExpanded();
+    } else {
+      a1 = a1.cloneExpandedIfRequired();
+      a2 = a2.cloneExpandedIfRequired();
+    }
+    State s = new State();
+    s.addEpsilon(a1.initial);
+    s.addEpsilon(a2.initial);
+    a1.initial = s;
+    a1.deterministic = false;
+    a1.clearHashCode();
+    a1.checkMinimizeAlways();
+    return a1;
+  }
+  
+  /**
+   * Returns an automaton that accepts the union of the languages of the given
+   * automata.
+   * <p>
+   * Complexity: linear in number of states.
+   */
+  public static Automaton union(Collection<Automaton> l) {
+    Set<Integer> ids = new HashSet<Integer>();
+    for (Automaton a : l)
+      ids.add(System.identityHashCode(a));
+    boolean has_aliases = ids.size() != l.size();
+    State s = new State();
+    for (Automaton b : l) {
+      if (BasicOperations.isEmpty(b)) continue;
+      Automaton bb = b;
+      if (has_aliases) bb = bb.cloneExpanded();
+      else bb = bb.cloneExpandedIfRequired();
+      s.addEpsilon(bb.initial);
+    }
+    Automaton a = new Automaton();
+    a.initial = s;
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+    return a;
+  }
+  
+  /**
+   * Determinizes the given automaton.
+   * <p>
+   * Complexity: exponential in number of states.
+   */
+  public static void determinize(Automaton a) {
+    if (a.deterministic || a.isSingleton()) return;
+    Set<State> initialset = new HashSet<State>();
+    initialset.add(a.initial);
+    determinize(a, initialset);
+  }
+  
+  /**
+   * Determinizes the given automaton using the given set of initial states.
+   */
+  static void determinize(Automaton a, Set<State> initialset) {
+    char[] points = a.getStartPoints();
+    // subset construction
+    Map<Set<State>,Set<State>> sets = new HashMap<Set<State>,Set<State>>();
+    LinkedList<Set<State>> worklist = new LinkedList<Set<State>>();
+    Map<Set<State>,State> newstate = new HashMap<Set<State>,State>();
+    sets.put(initialset, initialset);
+    worklist.add(initialset);
+    a.initial = new State();
+    newstate.put(initialset, a.initial);
+    while (worklist.size() > 0) {
+      Set<State> s = worklist.removeFirst();
+      State r = newstate.get(s);
+      for (State q : s)
+        if (q.accept) {
+          r.accept = true;
+          break;
+        }
+      for (int n = 0; n < points.length; n++) {
+        Set<State> p = new HashSet<State>();
+        for (State q : s)
+          for (Transition t : q.transitions)
+            if (t.min <= points[n] && points[n] <= t.max) p.add(t.to);
+        if (!sets.containsKey(p)) {
+          sets.put(p, p);
+          worklist.add(p);
+          newstate.put(p, new State());
+        }
+        State q = newstate.get(p);
+        char min = points[n];
+        char max;
+        if (n + 1 < points.length) max = (char) (points[n + 1] - 1);
+        else max = Character.MAX_VALUE;
+        r.transitions.add(new Transition(min, max, q));
+      }
+    }
+    a.deterministic = true;
+    a.removeDeadTransitions();
+  }
+  
+  /**
+   * Adds epsilon transitions to the given automaton. This method adds extra
+   * character interval transitions that are equivalent to the given set of
+   * epsilon transitions.
+   * 
+   * @param pairs collection of {@link StatePair} objects representing pairs of
+   *          source/destination states where epsilon transitions should be
+   *          added
+   */
+  public static void addEpsilons(Automaton a, Collection<StatePair> pairs) {
+    a.expandSingleton();
+    HashMap<State,HashSet<State>> forward = new HashMap<State,HashSet<State>>();
+    HashMap<State,HashSet<State>> back = new HashMap<State,HashSet<State>>();
+    for (StatePair p : pairs) {
+      HashSet<State> to = forward.get(p.s1);
+      if (to == null) {
+        to = new HashSet<State>();
+        forward.put(p.s1, to);
+      }
+      to.add(p.s2);
+      HashSet<State> from = back.get(p.s2);
+      if (from == null) {
+        from = new HashSet<State>();
+        back.put(p.s2, from);
+      }
+      from.add(p.s1);
+    }
+    // calculate epsilon closure
+    LinkedList<StatePair> worklist = new LinkedList<StatePair>(pairs);
+    HashSet<StatePair> workset = new HashSet<StatePair>(pairs);
+    while (!worklist.isEmpty()) {
+      StatePair p = worklist.removeFirst();
+      workset.remove(p);
+      HashSet<State> to = forward.get(p.s2);
+      HashSet<State> from = back.get(p.s1);
+      if (to != null) {
+        for (State s : to) {
+          StatePair pp = new StatePair(p.s1, s);
+          if (!pairs.contains(pp)) {
+            pairs.add(pp);
+            forward.get(p.s1).add(s);
+            back.get(s).add(p.s1);
+            worklist.add(pp);
+            workset.add(pp);
+            if (from != null) {
+              for (State q : from) {
+                StatePair qq = new StatePair(q, p.s1);
+                if (!workset.contains(qq)) {
+                  worklist.add(qq);
+                  workset.add(qq);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // add transitions
+    for (StatePair p : pairs)
+      p.s1.addEpsilon(p.s2);
+    a.deterministic = false;
+    a.clearHashCode();
+    a.checkMinimizeAlways();
+  }
+  
+  /**
+   * Returns true if the given automaton accepts the empty string and nothing
+   * else.
+   */
+  public static boolean isEmptyString(Automaton a) {
+    if (a.isSingleton()) return a.singleton.length() == 0;
+    else return a.initial.accept && a.initial.transitions.isEmpty();
+  }
+  
+  /**
+   * Returns true if the given automaton accepts no strings.
+   */
+  public static boolean isEmpty(Automaton a) {
+    if (a.isSingleton()) return false;
+    return !a.initial.accept && a.initial.transitions.isEmpty();
+  }
+  
+  /**
+   * Returns true if the given automaton accepts all strings.
+   */
+  public static boolean isTotal(Automaton a) {
+    if (a.isSingleton()) return false;
+    if (a.initial.accept && a.initial.transitions.size() == 1) {
+      Transition t = a.initial.transitions.iterator().next();
+      return t.to == a.initial && t.min == Character.MIN_VALUE
+          && t.max == Character.MAX_VALUE;
+    }
+    return false;
+  }
+  
+  /**
+   * Returns true if the given string is accepted by the automaton.
+   * <p>
+   * Complexity: linear in the length of the string.
+   * <p>
+   * <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
+   */
+  public static boolean run(Automaton a, String s) {
+    if (a.isSingleton()) return s.equals(a.singleton);
+    if (a.deterministic) {
+      State p = a.initial;
+      for (int i = 0; i < s.length(); i++) {
+        State q = p.step(s.charAt(i));
+        if (q == null) return false;
+        p = q;
+      }
+      return p.accept;
+    } else {
+      Set<State> states = a.getStates();
+      Automaton.setStateNumbers(states);
+      LinkedList<State> pp = new LinkedList<State>();
+      LinkedList<State> pp_other = new LinkedList<State>();
+      BitSet bb = new BitSet(states.size());
+      BitSet bb_other = new BitSet(states.size());
+      pp.add(a.initial);
+      ArrayList<State> dest = new ArrayList<State>();
+      boolean accept = a.initial.accept;
+      for (int i = 0; i < s.length(); i++) {
+        char c = s.charAt(i);
+        accept = false;
+        pp_other.clear();
+        bb_other.clear();
+        for (State p : pp) {
+          dest.clear();
+          p.step(c, dest);
+          for (State q : dest) {
+            if (q.accept) accept = true;
+            if (!bb_other.get(q.number)) {
+              bb_other.set(q.number);
+              pp_other.add(q);
+            }
+          }
+        }
+        LinkedList<State> tp = pp;
+        pp = pp_other;
+        pp_other = tp;
+        BitSet tb = bb;
+        bb = bb_other;
+        bb_other = tb;
+      }
+      return accept;
+    }
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/BasicOperations.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,278 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.Set;
+
+/**
+ * Operations for minimizing automata.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+final public class MinimizationOperations {
+  
+  private MinimizationOperations() {}
+  
+  /**
+   * Minimizes (and determinizes if not already deterministic) the given
+   * automaton.
+   * 
+   * @see Automaton#setMinimization(int)
+   */
+  public static void minimize(Automaton a) {
+    if (!a.isSingleton()) {
+      minimizeHopcroft(a);
+    }
+    // recompute hash code
+    a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
+    if (a.hash_code == 0) a.hash_code = 1;
+  }
+  
+  private static <T> void initialize(ArrayList<T> list, int size) {
+    for (int i = 0; i < size; i++)
+      list.add(null);
+  }
+  
+  /**
+   * Minimizes the given automaton using Hopcroft's algorithm.
+   */
+  public static void minimizeHopcroft(Automaton a) {
+    a.determinize();
+    Set<Transition> tr = a.initial.getTransitions();
+    if (tr.size() == 1) {
+      Transition t = tr.iterator().next();
+      if (t.to == a.initial && t.min == Character.MIN_VALUE
+          && t.max == Character.MAX_VALUE) return;
+    }
+    a.totalize();
+    // make arrays for numbered states and effective alphabet
+    Set<State> ss = a.getStates();
+    State[] states = new State[ss.size()];
+    int number = 0;
+    for (State q : ss) {
+      states[number] = q;
+      q.number = number++;
+    }
+    char[] sigma = a.getStartPoints();
+    // initialize data structures
+    ArrayList<ArrayList<LinkedList<State>>> reverse = new ArrayList<ArrayList<LinkedList<State>>>();
+    for (int q = 0; q < states.length; q++) {
+      ArrayList<LinkedList<State>> v = new ArrayList<LinkedList<State>>();
+      initialize(v, sigma.length);
+      reverse.add(v);
+    }
+    boolean[][] reverse_nonempty = new boolean[states.length][sigma.length];
+    ArrayList<LinkedList<State>> partition = new ArrayList<LinkedList<State>>();
+    initialize(partition, states.length);
+    int[] block = new int[states.length];
+    StateList[][] active = new StateList[states.length][sigma.length];
+    StateListNode[][] active2 = new StateListNode[states.length][sigma.length];
+    LinkedList<IntPair> pending = new LinkedList<IntPair>();
+    boolean[][] pending2 = new boolean[sigma.length][states.length];
+    ArrayList<State> split = new ArrayList<State>();
+    boolean[] split2 = new boolean[states.length];
+    ArrayList<Integer> refine = new ArrayList<Integer>();
+    boolean[] refine2 = new boolean[states.length];
+    ArrayList<ArrayList<State>> splitblock = new ArrayList<ArrayList<State>>();
+    initialize(splitblock, states.length);
+    for (int q = 0; q < states.length; q++) {
+      splitblock.set(q, new ArrayList<State>());
+      partition.set(q, new LinkedList<State>());
+      for (int x = 0; x < sigma.length; x++) {
+        reverse.get(q).set(x, new LinkedList<State>());
+        active[q][x] = new StateList();
+      }
+    }
+    // find initial partition and reverse edges
+    for (int q = 0; q < states.length; q++) {
+      State qq = states[q];
+      int j;
+      if (qq.accept) j = 0;
+      else j = 1;
+      partition.get(j).add(qq);
+      block[qq.number] = j;
+      for (int x = 0; x < sigma.length; x++) {
+        char y = sigma[x];
+        State p = qq.step(y);
+        reverse.get(p.number).get(x).add(qq);
+        reverse_nonempty[p.number][x] = true;
+      }
+    }
+    // initialize active sets
+    for (int j = 0; j <= 1; j++)
+      for (int x = 0; x < sigma.length; x++)
+        for (State qq : partition.get(j))
+          if (reverse_nonempty[qq.number][x]) active2[qq.number][x] = active[j][x]
+              .add(qq);
+    // initialize pending
+    for (int x = 0; x < sigma.length; x++) {
+      int a0 = active[0][x].size;
+      int a1 = active[1][x].size;
+      int j;
+      if (a0 <= a1) j = 0;
+      else j = 1;
+      pending.add(new IntPair(j, x));
+      pending2[x][j] = true;
+    }
+    // process pending until fixed point
+    int k = 2;
+    while (!pending.isEmpty()) {
+      IntPair ip = pending.removeFirst();
+      int p = ip.n1;
+      int x = ip.n2;
+      pending2[x][p] = false;
+      // find states that need to be split off their blocks
+      for (StateListNode m = active[p][x].first; m != null; m = m.next)
+        for (State s : reverse.get(m.q.number).get(x))
+          if (!split2[s.number]) {
+            split2[s.number] = true;
+            split.add(s);
+            int j = block[s.number];
+            splitblock.get(j).add(s);
+            if (!refine2[j]) {
+              refine2[j] = true;
+              refine.add(j);
+            }
+          }
+      // refine blocks
+      for (int j : refine) {
+        if (splitblock.get(j).size() < partition.get(j).size()) {
+          LinkedList<State> b1 = partition.get(j);
+          LinkedList<State> b2 = partition.get(k);
+          for (State s : splitblock.get(j)) {
+            b1.remove(s);
+            b2.add(s);
+            block[s.number] = k;
+            for (int c = 0; c < sigma.length; c++) {
+              StateListNode sn = active2[s.number][c];
+              if (sn != null && sn.sl == active[j][c]) {
+                sn.remove();
+                active2[s.number][c] = active[k][c].add(s);
+              }
+            }
+          }
+          // update pending
+          for (int c = 0; c < sigma.length; c++) {
+            int aj = active[j][c].size;
+            int ak = active[k][c].size;
+            if (!pending2[c][j] && 0 < aj && aj <= ak) {
+              pending2[c][j] = true;
+              pending.add(new IntPair(j, c));
+            } else {
+              pending2[c][k] = true;
+              pending.add(new IntPair(k, c));
+            }
+          }
+          k++;
+        }
+        for (State s : splitblock.get(j))
+          split2[s.number] = false;
+        refine2[j] = false;
+        splitblock.get(j).clear();
+      }
+      split.clear();
+      refine.clear();
+    }
+    // make a new state for each equivalence class, set initial state
+    State[] newstates = new State[k];
+    for (int n = 0; n < newstates.length; n++) {
+      State s = new State();
+      newstates[n] = s;
+      for (State q : partition.get(n)) {
+        if (q == a.initial) a.initial = s;
+        s.accept = q.accept;
+        s.number = q.number; // select representative
+        q.number = n;
+      }
+    }
+    // build transitions and set acceptance
+    for (int n = 0; n < newstates.length; n++) {
+      State s = newstates[n];
+      s.accept = states[s.number].accept;
+      for (Transition t : states[s.number].transitions)
+        s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number]));
+    }
+    a.removeDeadTransitions();
+  }
+  
+  static class IntPair {
+    
+    int n1, n2;
+    
+    IntPair(int n1, int n2) {
+      this.n1 = n1;
+      this.n2 = n2;
+    }
+  }
+  
+  static class StateList {
+    
+    int size;
+    
+    StateListNode first, last;
+    
+    StateListNode add(State q) {
+      return new StateListNode(q, this);
+    }
+  }
+  
+  static class StateListNode {
+    
+    State q;
+    
+    StateListNode next, prev;
+    
+    StateList sl;
+    
+    StateListNode(State q, StateList sl) {
+      this.q = q;
+      this.sl = sl;
+      if (sl.size++ == 0) sl.first = sl.last = this;
+      else {
+        sl.last.next = this;
+        prev = sl.last;
+        sl.last = this;
+      }
+    }
+    
+    void remove() {
+      sl.size--;
+      if (sl.first == this) sl.first = next;
+      else prev.next = next;
+      if (sl.last == this) sl.last = prev;
+      else next.prev = prev;
+    }
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,1003 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Regular Expression extension to <code>Automaton</code>.
+ * <p>
+ * Regular expressions are built from the following abstract syntax:
+ * <p>
+ * <table border=0>
+ * <tr>
+ * <td><i>regexp</i></td>
+ * <td>::=</td>
+ * <td><i>unionexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>unionexp</i></td>
+ * <td>::=</td>
+ * <td><i>interexp</i>&nbsp;<tt><b>|</b></tt>&nbsp;<i>unionexp</i></td>
+ * <td>(union)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>interexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>interexp</i></td>
+ * <td>::=</td>
+ * <td><i>concatexp</i>&nbsp;<tt><b>&amp;</b></tt>&nbsp;<i>interexp</i></td>
+ * <td>(intersection)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>concatexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>concatexp</i></td>
+ * <td>::=</td>
+ * <td><i>repeatexp</i>&nbsp;<i>concatexp</i></td>
+ * <td>(concatenation)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>repeatexp</i></td>
+ * <td>::=</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>?</b></tt></td>
+ * <td>(zero or one occurrence)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>*</b></tt></td>
+ * <td>(zero or more occurrences)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>+</b></tt></td>
+ * <td>(one or more occurrences)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>{</b><i>n</i><b>}</b></tt></td>
+ * <td>(<tt><i>n</i></tt> occurrences)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>{</b><i>n</i><b>,}</b></tt></td>
+ * <td>(<tt><i>n</i></tt> or more occurrences)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>repeatexp</i>&nbsp;<tt><b>{</b><i>n</i><b>,</b><i>m</i><b>}</b></tt></td>
+ * <td>(<tt><i>n</i></tt> to <tt><i>m</i></tt> occurrences, including both)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>complexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>complexp</i></td>
+ * <td>::=</td>
+ * <td><tt><b>~</b></tt>&nbsp;<i>complexp</i></td>
+ * <td>(complement)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>charclassexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>charclassexp</i></td>
+ * <td>::=</td>
+ * <td><tt><b>[</b></tt>&nbsp;<i>charclasses</i>&nbsp;<tt><b>]</b></tt></td>
+ * <td>(character class)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>[^</b></tt>&nbsp;<i>charclasses</i>&nbsp;<tt><b>]</b></tt></td>
+ * <td>(negated character class)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>simpleexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>charclasses</i></td>
+ * <td>::=</td>
+ * <td><i>charclass</i>&nbsp;<i>charclasses</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>charclass</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>charclass</i></td>
+ * <td>::=</td>
+ * <td><i>charexp</i>&nbsp;<tt><b>-</b></tt>&nbsp;<i>charexp</i></td>
+ * <td>(character range, including end-points)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><i>charexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>simpleexp</i></td>
+ * <td>::=</td>
+ * <td><i>charexp</i></td>
+ * <td></td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>.</b></tt></td>
+ * <td>(any single character)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>#</b></tt></td>
+ * <td>(the empty language)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>@</b></tt></td>
+ * <td>(any string)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>"</b></tt>&nbsp;&lt;Unicode string without double-quotes&gt;&nbsp; <tt><b>"</b></tt></td>
+ * <td>(a string)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>(</b></tt>&nbsp;<tt><b>)</b></tt></td>
+ * <td>(the empty string)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>(</b></tt>&nbsp;<i>unionexp</i>&nbsp;<tt><b>)</b></tt></td>
+ * <td>(precedence override)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>&lt;</b></tt>&nbsp;&lt;identifier&gt;&nbsp;<tt><b>&gt;</b></tt></td>
+ * <td>(named automaton)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>&lt;</b><i>n</i>-<i>m</i><b>&gt;</b></tt></td>
+ * <td>(numerical interval)</td>
+ * <td><small>[OPTIONAL]</small></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td><i>charexp</i></td>
+ * <td>::=</td>
+ * <td>&lt;Unicode character&gt;</td>
+ * <td>(a single non-reserved character)</td>
+ * <td></td>
+ * </tr>
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><tt><b>\</b></tt>&nbsp;&lt;Unicode character&gt;&nbsp;</td>
+ * <td>(a single character)</td>
+ * <td></td>
+ * </tr>
+ * </table>
+ * <p>
+ * The productions marked <small>[OPTIONAL]</small> are only allowed if
+ * specified by the syntax flags passed to the <code>RegExp</code> constructor.
+ * The reserved characters used in the (enabled) syntax must be escaped with
+ * backslash (<tt><b>\</b></tt>) or double-quotes (<tt><b>"..."</b></tt>). (In
+ * contrast to other regexp syntaxes, this is required also in character
+ * classes.) Be aware that dash (<tt><b>-</b></tt>) has a special meaning in
+ * <i>charclass</i> expressions. An identifier is a string not containing right
+ * angle bracket (<tt><b>&gt;</b></tt>) or dash (<tt><b>-</b></tt>). Numerical
+ * intervals are specified by non-negative decimal integers and include both end
+ * points, and if <tt><i>n</i></tt> and <tt><i>m</i></tt> have the same number
+ * of digits, then the conforming strings must have that length (i.e. prefixed
+ * by 0's).
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+public class RegExp {
+  
+  enum Kind {
+    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+  }
+  
+  /**
+   * Syntax flag, enables intersection (<tt>&amp;</tt>).
+   */
+  public static final int INTERSECTION = 0x0001;
+  
+  /**
+   * Syntax flag, enables complement (<tt>~</tt>).
+   */
+  public static final int COMPLEMENT = 0x0002;
+  
+  /**
+   * Syntax flag, enables empty language (<tt>#</tt>).
+   */
+  public static final int EMPTY = 0x0004;
+  
+  /**
+   * Syntax flag, enables anystring (<tt>@</tt>).
+   */
+  public static final int ANYSTRING = 0x0008;
+  
+  /**
+   * Syntax flag, enables named automata (<tt>&lt;</tt>identifier<tt>&gt;</tt>).
+   */
+  public static final int AUTOMATON = 0x0010;
+  
+  /**
+   * Syntax flag, enables numerical intervals (
+   * <tt>&lt;<i>n</i>-<i>m</i>&gt;</tt>).
+   */
+  public static final int INTERVAL = 0x0020;
+  
+  /**
+   * Syntax flag, enables all optional regexp syntax.
+   */
+  public static final int ALL = 0xffff;
+  
+  /**
+   * Syntax flag, enables no optional regexp syntax.
+   */
+  public static final int NONE = 0x0000;
+  
+  private static boolean allow_mutation = false;
+  
+  Kind kind;
+  RegExp exp1, exp2;
+  String s;
+  char c;
+  int min, max, digits;
+  char from, to;
+  
+  String b;
+  int flags;
+  int pos;
+  
+  RegExp() {}
+  
+  /**
+   * Constructs new <code>RegExp</code> from a string. Same as
+   * <code>RegExp(s, ALL)</code>.
+   * 
+   * @param s regexp string
+   * @exception IllegalArgumentException if an error occured while parsing the
+   *              regular expression
+   */
+  public RegExp(String s) throws IllegalArgumentException {
+    this(s, ALL);
+  }
+  
+  /**
+   * Constructs new <code>RegExp</code> from a string.
+   * 
+   * @param s regexp string
+   * @param syntax_flags boolean 'or' of optional syntax constructs to be
+   *          enabled
+   * @exception IllegalArgumentException if an error occured while parsing the
+   *              regular expression
+   */
+  public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+    b = s;
+    flags = syntax_flags;
+    RegExp e;
+    if (s.length() == 0) e = makeString("");
+    else {
+      e = parseUnionExp();
+      if (pos < b.length()) throw new IllegalArgumentException(
+          "end-of-string expected at position " + pos);
+    }
+    kind = e.kind;
+    exp1 = e.exp1;
+    exp2 = e.exp2;
+    this.s = e.s;
+    c = e.c;
+    min = e.min;
+    max = e.max;
+    digits = e.digits;
+    from = e.from;
+    to = e.to;
+    b = null;
+  }
+  
+  /**
+   * Constructs new <code>Automaton</code> from this <code>RegExp</code>. Same
+   * as <code>toAutomaton(null)</code> (empty automaton map).
+   */
+  public Automaton toAutomaton() {
+    return toAutomatonAllowMutate(null, null);
+  }
+  
+  /**
+   * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The
+   * constructed automaton is minimal and deterministic and has no transitions
+   * to dead states.
+   * 
+   * @param automaton_provider provider of automata for named identifiers
+   * @exception IllegalArgumentException if this regular expression uses a named
+   *              identifier that is not available from the automaton provider
+   */
+  public Automaton toAutomaton(AutomatonProvider automaton_provider)
+      throws IllegalArgumentException {
+    return toAutomatonAllowMutate(null, automaton_provider);
+  }
+  
+  /**
+   * Constructs new <code>Automaton</code> from this <code>RegExp</code>. The
+   * constructed automaton is minimal and deterministic and has no transitions
+   * to dead states.
+   * 
+   * @param automata a map from automaton identifiers to automata (of type
+   *          <code>Automaton</code>).
+   * @exception IllegalArgumentException if this regular expression uses a named
+   *              identifier that does not occur in the automaton map
+   */
+  public Automaton toAutomaton(Map<String,Automaton> automata)
+      throws IllegalArgumentException {
+    return toAutomatonAllowMutate(automata, null);
+  }
+  
+  /**
+   * Sets or resets allow mutate flag. If this flag is set, then automata
+   * construction uses mutable automata, which is slightly faster but not thread
+   * safe. By default, the flag is not set.
+   * 
+   * @param flag if true, the flag is set
+   * @return previous value of the flag
+   */
+  public boolean setAllowMutate(boolean flag) {
+    boolean b = allow_mutation;
+    allow_mutation = flag;
+    return b;
+  }
+  
+  private Automaton toAutomatonAllowMutate(Map<String,Automaton> automata,
+      AutomatonProvider automaton_provider) throws IllegalArgumentException {
+    boolean b = false;
+    if (allow_mutation) b = Automaton.setAllowMutate(true); // thread unsafe
+    Automaton a = toAutomaton(automata, automaton_provider);
+    if (allow_mutation) Automaton.setAllowMutate(b);
+    return a;
+  }
+  
+  private Automaton toAutomaton(Map<String,Automaton> automata,
+      AutomatonProvider automaton_provider) throws IllegalArgumentException {
+    List<Automaton> list;
+    Automaton a = null;
+    switch (kind) {
+      case REGEXP_UNION:
+        list = new ArrayList<Automaton>();
+        findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
+        findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
+        a = BasicOperations.union(list);
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_CONCATENATION:
+        list = new ArrayList<Automaton>();
+        findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
+            automaton_provider);
+        findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
+            automaton_provider);
+        a = BasicOperations.concatenate(list);
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_INTERSECTION:
+        a = exp1.toAutomaton(automata, automaton_provider).intersection(
+            exp2.toAutomaton(automata, automaton_provider));
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_OPTIONAL:
+        a = exp1.toAutomaton(automata, automaton_provider).optional();
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_REPEAT:
+        a = exp1.toAutomaton(automata, automaton_provider).repeat();
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_REPEAT_MIN:
+        a = exp1.toAutomaton(automata, automaton_provider).repeat(min);
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_REPEAT_MINMAX:
+        a = exp1.toAutomaton(automata, automaton_provider).repeat(min, max);
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_COMPLEMENT:
+        a = exp1.toAutomaton(automata, automaton_provider).complement();
+        MinimizationOperations.minimize(a);
+        break;
+      case REGEXP_CHAR:
+        a = BasicAutomata.makeChar(c);
+        break;
+      case REGEXP_CHAR_RANGE:
+        a = BasicAutomata.makeCharRange(from, to);
+        break;
+      case REGEXP_ANYCHAR:
+        a = BasicAutomata.makeAnyChar();
+        break;
+      case REGEXP_EMPTY:
+        a = BasicAutomata.makeEmpty();
+        break;
+      case REGEXP_STRING:
+        a = BasicAutomata.makeString(s);
+        break;
+      case REGEXP_ANYSTRING:
+        a = BasicAutomata.makeAnyString();
+        break;
+      case REGEXP_AUTOMATON:
+        Automaton aa = null;
+        if (automata != null) aa = automata.get(s);
+        if (aa == null && automaton_provider != null) try {
+          aa = automaton_provider.getAutomaton(s);
+        } catch (IOException e) {
+          throw new IllegalArgumentException(e);
+        }
+        if (aa == null) throw new IllegalArgumentException("'" + s
+            + "' not found");
+        a = aa.clone(); // always clone here (ignore allow_mutate)
+        break;
+      case REGEXP_INTERVAL:
+        a = BasicAutomata.makeInterval(min, max, digits);
+        break;
+    }
+    return a;
+  }
+  
+  private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
+      Map<String,Automaton> automata, AutomatonProvider automaton_provider) {
+    if (exp.kind == kind) {
+      findLeaves(exp.exp1, kind, list, automata, automaton_provider);
+      findLeaves(exp.exp2, kind, list, automata, automaton_provider);
+    } else list.add(exp.toAutomaton(automata, automaton_provider));
+  }
+  
+  /**
+   * Constructs string from parsed regular expression.
+   */
+  @Override
+  public String toString() {
+    return toStringBuilder(new StringBuilder()).toString();
+  }
+  
+  StringBuilder toStringBuilder(StringBuilder b) {
+    switch (kind) {
+      case REGEXP_UNION:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("|");
+        exp2.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_CONCATENATION:
+        exp1.toStringBuilder(b);
+        exp2.toStringBuilder(b);
+        break;
+      case REGEXP_INTERSECTION:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("&");
+        exp2.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_OPTIONAL:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append(")?");
+        break;
+      case REGEXP_REPEAT:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append(")*");
+        break;
+      case REGEXP_REPEAT_MIN:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("){").append(min).append(",}");
+        break;
+      case REGEXP_REPEAT_MINMAX:
+        b.append("(");
+        exp1.toStringBuilder(b);
+        b.append("){").append(min).append(",").append(max).append("}");
+        break;
+      case REGEXP_COMPLEMENT:
+        b.append("~(");
+        exp1.toStringBuilder(b);
+        b.append(")");
+        break;
+      case REGEXP_CHAR:
+        b.append("\\").append(c);
+        break;
+      case REGEXP_CHAR_RANGE:
+        b.append("[\\").append(from).append("-\\").append(to).append("]");
+        break;
+      case REGEXP_ANYCHAR:
+        b.append(".");
+        break;
+      case REGEXP_EMPTY:
+        b.append("#");
+        break;
+      case REGEXP_STRING:
+        b.append("\"").append(s).append("\"");
+        break;
+      case REGEXP_ANYSTRING:
+        b.append("@");
+        break;
+      case REGEXP_AUTOMATON:
+        b.append("<").append(s).append(">");
+        break;
+      case REGEXP_INTERVAL:
+        String s1 = Integer.toString(min);
+        String s2 = Integer.toString(max);
+        b.append("<");
+        if (digits > 0) for (int i = s1.length(); i < digits; i++)
+          b.append('0');
+        b.append(s1).append("-");
+        if (digits > 0) for (int i = s2.length(); i < digits; i++)
+          b.append('0');
+        b.append(s2).append(">");
+        break;
+    }
+    return b;
+  }
+  
+  /**
+   * Returns set of automaton identifiers that occur in this regular expression.
+   */
+  public Set<String> getIdentifiers() {
+    HashSet<String> set = new HashSet<String>();
+    getIdentifiers(set);
+    return set;
+  }
+  
+  void getIdentifiers(Set<String> set) {
+    switch (kind) {
+      case REGEXP_UNION:
+      case REGEXP_CONCATENATION:
+      case REGEXP_INTERSECTION:
+        exp1.getIdentifiers(set);
+        exp2.getIdentifiers(set);
+        break;
+      case REGEXP_OPTIONAL:
+      case REGEXP_REPEAT:
+      case REGEXP_REPEAT_MIN:
+      case REGEXP_REPEAT_MINMAX:
+      case REGEXP_COMPLEMENT:
+        exp1.getIdentifiers(set);
+        break;
+      case REGEXP_AUTOMATON:
+        set.add(s);
+        break;
+      default:
+    }
+  }
+  
+  static RegExp makeUnion(RegExp exp1, RegExp exp2) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_UNION;
+    r.exp1 = exp1;
+    r.exp2 = exp2;
+    return r;
+  }
+  
+  static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+    if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
+        exp1, exp2);
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CONCATENATION;
+    if (exp1.kind == Kind.REGEXP_CONCATENATION
+        && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
+        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
+      r.exp1 = exp1.exp1;
+      r.exp2 = makeString(exp1.exp2, exp2);
+    } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
+        && exp2.kind == Kind.REGEXP_CONCATENATION
+        && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
+      r.exp1 = makeString(exp1, exp2.exp1);
+      r.exp2 = exp2.exp2;
+    } else {
+      r.exp1 = exp1;
+      r.exp2 = exp2;
+    }
+    return r;
+  }
+  
+  static private RegExp makeString(RegExp exp1, RegExp exp2) {
+    StringBuilder b = new StringBuilder();
+    if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
+    else b.append(exp1.c);
+    if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
+    else b.append(exp2.c);
+    return makeString(b.toString());
+  }
+  
+  static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_INTERSECTION;
+    r.exp1 = exp1;
+    r.exp2 = exp2;
+    return r;
+  }
+  
+  static RegExp makeOptional(RegExp exp) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_OPTIONAL;
+    r.exp1 = exp;
+    return r;
+  }
+  
+  static RegExp makeRepeat(RegExp exp) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT;
+    r.exp1 = exp;
+    return r;
+  }
+  
+  static RegExp makeRepeat(RegExp exp, int min) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT_MIN;
+    r.exp1 = exp;
+    r.min = min;
+    return r;
+  }
+  
+  static RegExp makeRepeat(RegExp exp, int min, int max) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_REPEAT_MINMAX;
+    r.exp1 = exp;
+    r.min = min;
+    r.max = max;
+    return r;
+  }
+  
+  static RegExp makeComplement(RegExp exp) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_COMPLEMENT;
+    r.exp1 = exp;
+    return r;
+  }
+  
+  static RegExp makeChar(char c) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CHAR;
+    r.c = c;
+    return r;
+  }
+  
+  static RegExp makeCharRange(char from, char to) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_CHAR_RANGE;
+    r.from = from;
+    r.to = to;
+    return r;
+  }
+  
+  static RegExp makeAnyChar() {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_ANYCHAR;
+    return r;
+  }
+  
+  static RegExp makeEmpty() {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_EMPTY;
+    return r;
+  }
+  
+  static RegExp makeString(String s) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_STRING;
+    r.s = s;
+    return r;
+  }
+  
+  static RegExp makeAnyString() {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_ANYSTRING;
+    return r;
+  }
+  
+  static RegExp makeAutomaton(String s) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_AUTOMATON;
+    r.s = s;
+    return r;
+  }
+  
+  static RegExp makeInterval(int min, int max, int digits) {
+    RegExp r = new RegExp();
+    r.kind = Kind.REGEXP_INTERVAL;
+    r.min = min;
+    r.max = max;
+    r.digits = digits;
+    return r;
+  }
+  
+  private boolean peek(String s) {
+    return more() && s.indexOf(b.charAt(pos)) != -1;
+  }
+  
+  private boolean match(char c) {
+    if (pos >= b.length()) return false;
+    if (b.charAt(pos) == c) {
+      pos++;
+      return true;
+    }
+    return false;
+  }
+  
+  private boolean more() {
+    return pos < b.length();
+  }
+  
+  private char next() throws IllegalArgumentException {
+    if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
+    return b.charAt(pos++);
+  }
+  
+  private boolean check(int flag) {
+    return (flags & flag) != 0;
+  }
+  
+  final RegExp parseUnionExp() throws IllegalArgumentException {
+    RegExp e = parseInterExp();
+    if (match('|')) e = makeUnion(e, parseUnionExp());
+    return e;
+  }
+  
+  final RegExp parseInterExp() throws IllegalArgumentException {
+    RegExp e = parseConcatExp();
+    if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+        parseInterExp());
+    return e;
+  }
+  
+  final RegExp parseConcatExp() throws IllegalArgumentException {
+    RegExp e = parseRepeatExp();
+    if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
+        e, parseConcatExp());
+    return e;
+  }
+  
+  final RegExp parseRepeatExp() throws IllegalArgumentException {
+    RegExp e = parseComplExp();
+    while (peek("?*+{")) {
+      if (match('?')) e = makeOptional(e);
+      else if (match('*')) e = makeRepeat(e);
+      else if (match('+')) e = makeRepeat(e, 1);
+      else if (match('{')) {
+        int start = pos;
+        while (peek("0123456789"))
+          next();
+        if (start == pos) throw new IllegalArgumentException(
+            "integer expected at position " + pos);
+        int n = Integer.parseInt(b.substring(start, pos));
+        int m = -1;
+        if (match(',')) {
+          start = pos;
+          while (peek("0123456789"))
+            next();
+          if (start != pos) m = Integer.parseInt(b.substring(start, pos));
+        } else m = n;
+        if (!match('}')) throw new IllegalArgumentException(
+            "expected '}' at position " + pos);
+        if (m == -1) e = makeRepeat(e, n);
+        else e = makeRepeat(e, n, m);
+      }
+    }
+    return e;
+  }
+  
+  final RegExp parseComplExp() throws IllegalArgumentException {
+    if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+    else return parseCharClassExp();
+  }
+  
+  final RegExp parseCharClassExp() throws IllegalArgumentException {
+    if (match('[')) {
+      boolean negate = false;
+      if (match('^')) negate = true;
+      RegExp e = parseCharClasses();
+      if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+      if (!match(']')) throw new IllegalArgumentException(
+          "expected ']' at position " + pos);
+      return e;
+    } else return parseSimpleExp();
+  }
+  
+  final RegExp parseCharClasses() throws IllegalArgumentException {
+    RegExp e = parseCharClass();
+    while (more() && !peek("]"))
+      e = makeUnion(e, parseCharClass());
+    return e;
+  }
+  
+  final RegExp parseCharClass() throws IllegalArgumentException {
+    char c = parseCharExp();
+    if (match('-')) return makeCharRange(c, parseCharExp());
+    else return makeChar(c);
+  }
+  
+  final RegExp parseSimpleExp() throws IllegalArgumentException {
+    if (match('.')) return makeAnyChar();
+    else if (check(EMPTY) && match('#')) return makeEmpty();
+    else if (check(ANYSTRING) && match('@')) return makeAnyString();
+    else if (match('"')) {
+      int start = pos;
+      while (more() && !peek("\""))
+        next();
+      if (!match('"')) throw new IllegalArgumentException(
+          "expected '\"' at position " + pos);
+      return makeString(b.substring(start, pos - 1));
+    } else if (match('(')) {
+      if (match(')')) return makeString("");
+      RegExp e = parseUnionExp();
+      if (!match(')')) throw new IllegalArgumentException(
+          "expected ')' at position " + pos);
+      return e;
+    } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) {
+      int start = pos;
+      while (more() && !peek(">"))
+        next();
+      if (!match('>')) throw new IllegalArgumentException(
+          "expected '>' at position " + pos);
+      String s = b.substring(start, pos - 1);
+      int i = s.indexOf('-');
+      if (i == -1) {
+        if (!check(AUTOMATON)) throw new IllegalArgumentException(
+            "interval syntax error at position " + (pos - 1));
+        return makeAutomaton(s);
+      } else {
+        if (!check(INTERVAL)) throw new IllegalArgumentException(
+            "illegal identifier at position " + (pos - 1));
+        try {
+          if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) throw new NumberFormatException();
+          String smin = s.substring(0, i);
+          String smax = s.substring(i + 1, s.length());
+          int imin = Integer.parseInt(smin);
+          int imax = Integer.parseInt(smax);
+          int digits;
+          if (smin.length() == smax.length()) digits = smin.length();
+          else digits = 0;
+          if (imin > imax) {
+            int t = imin;
+            imin = imax;
+            imax = t;
+          }
+          return makeInterval(imin, imax, digits);
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException(
+              "interval syntax error at position " + (pos - 1));
+        }
+      }
+    } else return makeChar(parseCharExp());
+  }
+  
+  final char parseCharExp() throws IllegalArgumentException {
+    match('\\');
+    return next();
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RegExp.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,215 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * Finite-state automaton with fast run operation.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+public final class RunAutomaton implements Serializable {
+  
+  static final long serialVersionUID = 20001;
+  
+  final int size;
+  final boolean[] accept;
+  final int initial;
+  final int[] transitions; // delta(state,c) = transitions[state*points.length +
+                     // getCharClass(c)]
+  final char[] points; // char interval start points
+  final int[] classmap; // map from char number to class class
+  
+  /**
+   * Returns a string representation of this automaton.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("initial state: ").append(initial).append("\n");
+    for (int i = 0; i < size; i++) {
+      b.append("state " + i);
+      if (accept[i]) b.append(" [accept]:\n");
+      else b.append(" [reject]:\n");
+      for (int j = 0; j < points.length; j++) {
+        int k = transitions[i * points.length + j];
+        if (k != -1) {
+          char min = points[j];
+          char max;
+          if (j + 1 < points.length) max = (char) (points[j + 1] - 1);
+          else max = Character.MAX_VALUE;
+          b.append(" ");
+          Transition.appendCharString(min, b);
+          if (min != max) {
+            b.append("-");
+            Transition.appendCharString(max, b);
+          }
+          b.append(" -> ").append(k).append("\n");
+        }
+      }
+    }
+    return b.toString();
+  }
+  
+  /**
+   * Returns number of states in automaton.
+   */
+  public int getSize() {
+    return size;
+  }
+  
+  /**
+   * Returns acceptance status for given state.
+   */
+  public boolean isAccept(int state) {
+    return accept[state];
+  }
+  
+  /**
+   * Returns initial state.
+   */
+  public int getInitialState() {
+    return initial;
+  }
+  
+  /**
+   * Returns array of character class interval start points. The array should
+   * not be modified by the caller.
+   */
+  public char[] getCharIntervals() {
+    return points.clone();
+  }
+  
+  /**
+   * Gets character class of given char.
+   */
+  int getCharClass(char c) {
+    return SpecialOperations.findIndex(c, points);
+  }
+  
+  /**
+   * Constructs a new <code>RunAutomaton</code> from a deterministic
+   * <code>Automaton</code>.
+   * 
+   * @param a an automaton
+   */
+  public RunAutomaton(Automaton a) {
+    a.determinize();
+    points = a.getStartPoints();
+    Set<State> states = a.getStates();
+    Automaton.setStateNumbers(states);
+    initial = a.initial.number;
+    size = states.size();
+    accept = new boolean[size];
+    transitions = new int[size * points.length];
+    for (int n = 0; n < size * points.length; n++)
+      transitions[n] = -1;
+    for (State s : states) {
+      int n = s.number;
+      accept[n] = s.accept;
+      for (int c = 0; c < points.length; c++) {
+        State q = s.step(points[c]);
+        if (q != null) transitions[n * points.length + c] = q.number;
+      }
+    }
+    /*
+     * Set alphabet table for optimal run performance.
+     */
+    classmap = new int[Character.MAX_VALUE + 1];
+    int i = 0;
+    for (int j = 0; j <= Character.MAX_VALUE; j++) {
+      if (i + 1 < points.length && j == points[i + 1]) i++;
+      classmap[j] = i;
+    }
+  }
+  
+  /**
+   * Returns the state obtained by reading the given char from the given state.
+   * Returns -1 if not obtaining any such state. (If the original
+   * <code>Automaton</code> had no dead states, -1 is returned here if and only
+   * if a dead state is entered in an equivalent automaton with a total
+   * transition function.)
+   */
+  public int step(int state, char c) {
+    return transitions[state * points.length + classmap[c]];
+  }
+  
+  /**
+   * Returns true if the given string is accepted by this automaton.
+   */
+  public boolean run(String s) {
+    int p = initial;
+    int l = s.length();
+    for (int i = 0; i < l; i++) {
+      p = step(p, s.charAt(i));
+      if (p == -1) return false;
+    }
+    return accept[p];
+  }
+  
+  /**
+   * Returns true if the given string is accepted by this automaton
+   */
+  public boolean run(char[] s, int offset, int length) {
+    int p = initial;
+    int l = offset + length;
+    for (int i = offset; i < l; i++) {
+      p = step(p, s[i]);
+      if (p == -1) return false;
+    }
+    return accept[p];
+  }
+  
+  /**
+   * Returns the length of the longest accepted run of the given string starting
+   * at the given offset.
+   * 
+   * @param s the string
+   * @param offset offset into <code>s</code> where the run starts
+   * @return length of the longest accepted run, -1 if no run is accepted
+   */
+  public int run(String s, int offset) {
+    int p = initial;
+    int l = s.length();
+    int max = -1;
+    for (int r = 0; offset <= l; offset++, r++) {
+      if (accept[p]) max = r;
+      if (offset == l) break;
+      p = step(p, s.charAt(offset));
+      if (p == -1) break;
+    }
+    return max;
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,182 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Special automata operations.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+final public class SpecialOperations {
+  
+  private SpecialOperations() {}
+  
+  /**
+   * Finds the largest entry whose value is less than or equal to c, or 0 if
+   * there is no such entry.
+   */
+  static int findIndex(char c, char[] points) {
+    int a = 0;
+    int b = points.length;
+    while (b - a > 1) {
+      int d = (a + b) >>> 1;
+      if (points[d] > c) b = d;
+      else if (points[d] < c) a = d;
+      else return d;
+    }
+    return a;
+  }
+  
+  /**
+   * Returns true if the language of this automaton is finite.
+   */
+  public static boolean isFinite(Automaton a) {
+    if (a.isSingleton()) return true;
+    return isFinite(a.initial, new HashSet<State>());
+  }
+  
+  /**
+   * Checks whether there is a loop containing s. (This is sufficient since
+   * there are never transitions to dead states.)
+   */
+  private static boolean isFinite(State s, HashSet<State> path) {
+    path.add(s);
+    for (Transition t : s.transitions)
+      if (path.contains(t.to) || !isFinite(t.to, path)) return false;
+    path.remove(s);
+    return true;
+  }
+  
+  /**
+   * Returns the longest string that is a prefix of all accepted strings and
+   * visits each state at most once.
+   * 
+   * @return common prefix
+   */
+  public static String getCommonPrefix(Automaton a) {
+    if (a.isSingleton()) return a.singleton;
+    StringBuilder b = new StringBuilder();
+    HashSet<State> visited = new HashSet<State>();
+    State s = a.initial;
+    boolean done;
+    do {
+      done = true;
+      visited.add(s);
+      if (!s.accept && s.transitions.size() == 1) {
+        Transition t = s.transitions.iterator().next();
+        if (t.min == t.max && !visited.contains(t.to)) {
+          b.append(t.min);
+          s = t.to;
+          done = false;
+        }
+      }
+    } while (!done);
+    return b.toString();
+  }
+  
+  /**
+   * Returns the longest string that is a suffix of all accepted strings and
+   * visits each state at most once.
+   * 
+   * @return common suffix
+   */
+  public static String getCommonSuffix(Automaton a) {
+    if (a.isSingleton()) // if singleton, the suffix is the string itself.
+      return a.singleton;
+    
+    // reverse the language of the automaton, then reverse its common prefix.
+    Automaton r = a.clone();
+    reverse(r);
+    r.determinize();
+    return reverseUnicode3(SpecialOperations.getCommonPrefix(r));
+  }
+  
+  /**
+   * Reverses the language of the given (non-singleton) automaton while returning
+   * the set of new initial states.
+   */
+  private static Set<State> reverse(Automaton a) {
+    a.expandSingleton();
+    // reverse all edges
+    HashMap<State, HashSet<Transition>> m = new HashMap<State, HashSet<Transition>>();
+    Set<State> states = a.getStates();
+    Set<State> accept = a.getAcceptStates();
+    for (State r : states) {
+      m.put(r, new HashSet<Transition>());
+      r.accept = false;
+    }
+    for (State r : states)
+      for (Transition t : r.getTransitions())
+        m.get(t.to).add(new Transition(t.min, t.max, r));
+    for (State r : states)
+      r.transitions = m.get(r);
+    // make new initial+final states
+    a.initial.accept = true;
+    a.initial = new State();
+    for (State r : accept)
+      a.initial.addEpsilon(r); // ensures that all initial states are reachable
+    a.deterministic = false;
+    return accept;
+  }
+  
+  /**
+   * Intentionally use a unicode 3 reverse.
+   * This is because we are only going to reverse it again...
+   */
+  private static String reverseUnicode3( final String input ){
+    char[] charInput = input.toCharArray();
+    reverseUnicode3(charInput, 0, charInput.length);
+    return new String(charInput);
+  }
+  
+  /**
+   * Intentionally use a unicode 3 reverse.
+   * This is because it is only used by getCommonSuffix(),
+   * which will reverse the entire FSM using code unit reversal,
+   * so we must then reverse its common prefix back using the 
+   * same code unit reversal.
+   */
+  private static void reverseUnicode3(char[] buffer, int start, int len){
+    if (len <= 1) return;
+    int num = len>>1;
+    for (int i = start; i < ( start + num ); i++) {
+      char c = buffer[i];
+      buffer[i] = buffer[start * 2 + len - i - 1];
+      buffer[start * 2 + len - i - 1] = c;
+    }
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,214 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * <tt>Automaton</tt> state.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+public class State implements Serializable, Comparable<State> {
+  
+  static final long serialVersionUID = 30001;
+  
+  boolean accept;
+  Set<Transition> transitions;
+  
+  int number;
+
+  int id;
+  static int next_id;
+  
+  /**
+   * Constructs a new state. Initially, the new state is a reject state.
+   */
+  public State() {
+    resetTransitions();
+    id = next_id++;
+  }
+  
+  /**
+   * Resets transition set.
+   */
+  final void resetTransitions() {
+    transitions = new HashSet<Transition>();
+  }
+  
+  /**
+   * Returns the set of outgoing transitions. Subsequent changes are reflected
+   * in the automaton.
+   * 
+   * @return transition set
+   */
+  public Set<Transition> getTransitions() {
+    return transitions;
+  }
+  
+  /**
+   * Adds an outgoing transition.
+   * 
+   * @param t transition
+   */
+  public void addTransition(Transition t) {
+    transitions.add(t);
+  }
+  
+  /**
+   * Sets acceptance for this state.
+   * 
+   * @param accept if true, this state is an accept state
+   */
+  public void setAccept(boolean accept) {
+    this.accept = accept;
+  }
+  
+  /**
+   * Returns acceptance status.
+   * 
+   * @return true is this is an accept state
+   */
+  public boolean isAccept() {
+    return accept;
+  }
+  
+  /**
+   * Performs lookup in transitions, assuming determinism.
+   * 
+   * @param c character to look up
+   * @return destination state, null if no matching outgoing transition
+   * @see #step(char, Collection)
+   */
+  public State step(char c) {
+    for (Transition t : transitions)
+      if (t.min <= c && c <= t.max) return t.to;
+    return null;
+  }
+  
+  /**
+   * Performs lookup in transitions, allowing nondeterminism.
+   * 
+   * @param c character to look up
+   * @param dest collection where destination states are stored
+   * @see #step(char)
+   */
+  public void step(char c, Collection<State> dest) {
+    for (Transition t : transitions)
+      if (t.min <= c && c <= t.max) dest.add(t.to);
+  }
+  
+  void addEpsilon(State to) {
+    if (to.accept) accept = true;
+    for (Transition t : to.transitions)
+      transitions.add(t);
+  }
+  
+  /**
+   * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse
+   * max)
+   */
+  public Transition[] getSortedTransitionArray(boolean to_first) {
+    Transition[] e = transitions.toArray(new Transition[transitions.size()]);
+    Arrays.sort(e, new TransitionComparator(to_first));
+    return e;
+  }
+  
+  /**
+   * Returns sorted list of outgoing transitions.
+   * 
+   * @param to_first if true, order by (to, min, reverse max); otherwise (min,
+   *          reverse max, to)
+   * @return transition list
+   */
+  public List<Transition> getSortedTransitions(boolean to_first) {
+    return Arrays.asList(getSortedTransitionArray(to_first));
+  }
+  
+  
+  /**
+   * Return this state's number. 
+   * <p>
+   * Expert: Will be useless unless {@link Automaton#setStateNumbers(Set)}
+   * has been called first to number the states.
+   * @return the number
+   */
+  public int getNumber() {
+    return number;
+  }
+  
+  /**
+   * Returns string describing this state. Normally invoked via
+   * {@link Automaton#toString()}.
+   */
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("state ").append(number);
+    if (accept) b.append(" [accept]");
+    else b.append(" [reject]");
+    b.append(":\n");
+    for (Transition t : transitions)
+      b.append("  ").append(t.toString()).append("\n");
+    return b.toString();
+  }
+  
+  /**
+   * Compares this object with the specified object for order. States are
+   * ordered by the time of construction.
+   */
+  public int compareTo(State s) {
+    return s.id - id;
+  }
+  
+  /**
+   * See {@link java.lang.Object#equals(java.lang.Object)}.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    return super.equals(obj);
+  }
+  
+  /**
+   * See {@link java.lang.Object#hashCode()}.
+   */
+  @Override
+  public int hashCode() {
+    return super.hashCode();
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/State.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java?rev=888891&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java (added)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java Wed Dec  9 17:45:58 2009
@@ -0,0 +1,104 @@
+/*
+ * dk.brics.automaton
+ * 
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.apache.lucene.util.automaton;
+
+/**
+ * Pair of states.
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the <b>Automaton</b> feature is experimental.
+ * The APIs introduced here might change in the future and will not be
+ * supported anymore in such a case.</font>
+ */
+public class StatePair {
+  State s;
+  State s1;
+  State s2;
+  
+  StatePair(State s, State s1, State s2) {
+    this.s = s;
+    this.s1 = s1;
+    this.s2 = s2;
+  }
+  
+  /**
+   * Constructs a new state pair.
+   * 
+   * @param s1 first state
+   * @param s2 second state
+   */
+  public StatePair(State s1, State s2) {
+    this.s1 = s1;
+    this.s2 = s2;
+  }
+  
+  /**
+   * Returns first component of this pair.
+   * 
+   * @return first state
+   */
+  public State getFirstState() {
+    return s1;
+  }
+  
+  /**
+   * Returns second component of this pair.
+   * 
+   * @return second state
+   */
+  public State getSecondState() {
+    return s2;
+  }
+  
+  /**
+   * Checks for equality.
+   * 
+   * @param obj object to compare with
+   * @return true if <tt>obj</tt> represents the same pair of states as this
+   *         pair
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj instanceof StatePair) {
+      StatePair p = (StatePair) obj;
+      return p.s1 == s1 && p.s2 == s2;
+    } else return false;
+  }
+  
+  /**
+   * Returns hash code.
+   * 
+   * @return hash code
+   */
+  @Override
+  public int hashCode() {
+    return s1.hashCode() + s2.hashCode();
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/java/org/apache/lucene/util/automaton/StatePair.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message