lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1128870 [3/3] - in /lucene/dev/branches/branch_3x/lucene: ./ backwards/src/test/org/apache/lucene/index/ backwards/src/test/org/apache/lucene/store/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/store/ src/java/org/apache/lu...
Date Sun, 29 May 2011 12:30:15 GMT
Added: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java?rev=1128870&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java (added)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/UpToTwoPositiveIntOutputs.java Sun May 29 12:30:14 2011
@@ -0,0 +1,224 @@
+package org.apache.lucene.util.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * Holds one or two longs for each input term.  If it's a
+ * single output, Long is returned; else, TwoLongs.  Order
+ * is preseved in the TwoLongs case, ie .first is the first
+ * input/output added to Builder, and .second is the
+ * second.  You cannot store 0 output with this (that's
+ * reserved to mean "no output")!
+ *
+ * NOTE: the resulting FST is not guaranteed to be minimal!
+ * See {@link Builder}.
+ *
+ * @lucene.experimental
+ */
+
+public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {
+
+  public final static class TwoLongs {
+    final long first;
+    final long second;
+
+    public TwoLongs(long first, long second) {
+      this.first = first;
+      this.second = second;
+      assert first >= 0;
+      assert second >= 0;
+    }
+
+    @Override
+    public String toString() {
+      return "TwoLongs:" + first + "," + second;
+    }
+
+    @Override
+    public boolean equals(Object _other) {
+      if (_other instanceof TwoLongs) {
+        final TwoLongs other = (TwoLongs) _other;
+        return first == other.first && second == other.second;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public int hashCode() {
+      return (int) ((first^(first>>>32)) ^ (second^(second>>32)));
+    }
+  }
+  
+  private final static Long NO_OUTPUT = new Long(0);
+
+  private final boolean doShare;
+
+  private final static UpToTwoPositiveIntOutputs singletonShare = new UpToTwoPositiveIntOutputs(true);
+  private final static UpToTwoPositiveIntOutputs singletonNoShare = new UpToTwoPositiveIntOutputs(false);
+
+  private UpToTwoPositiveIntOutputs(boolean doShare) {
+    this.doShare = doShare;
+  }
+
+  public static UpToTwoPositiveIntOutputs getSingleton(boolean doShare) {
+    return doShare ? singletonShare : singletonNoShare;
+  }
+
+  public Long get(long v) {
+    if (v == 0) {
+      return NO_OUTPUT;
+    } else {
+      return Long.valueOf(v);
+    }
+  }
+
+  public TwoLongs get(long first, long second) {
+    return new TwoLongs(first, second);
+  }
+
+  @Override
+  public Long common(Object _output1, Object _output2) {
+    assert valid(_output1, false);
+    assert valid(_output2, false);
+    final Long output1 = (Long) _output1;
+    final Long output2 = (Long) _output2;
+    if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
+      return NO_OUTPUT;
+    } else if (doShare) {
+      assert output1 > 0;
+      assert output2 > 0;
+      return Math.min(output1, output2);
+    } else if (output1.equals(output2)) {
+      return output1;
+    } else {
+      return NO_OUTPUT;
+    }
+  }
+
+  @Override
+  public Long subtract(Object _output, Object _inc) {
+    assert valid(_output, false);
+    assert valid(_inc, false);
+    final Long output = (Long) _output;
+    final Long inc = (Long) _inc;
+    assert output >= inc;
+
+    if (inc == NO_OUTPUT) {
+      return output;
+    } else if (output.equals(inc)) {
+      return NO_OUTPUT;
+    } else {
+      return output - inc;
+    }
+  }
+
+  @Override
+  public Object add(Object _prefix, Object _output) {
+    assert valid(_prefix, false);
+    assert valid(_output, true);
+    final Long prefix = (Long) _prefix;
+    if (_output instanceof Long) {
+      final Long output = (Long) _output;
+      if (prefix == NO_OUTPUT) {
+        return output;
+      } else if (output == NO_OUTPUT) {
+        return prefix;
+      } else {
+        return prefix + output;
+      }
+    } else {
+      final TwoLongs output = (TwoLongs) _output;
+      final long v = prefix;
+      return new TwoLongs(output.first + v, output.second + v);
+    }
+  }
+
+  @Override
+  public void write(Object _output, DataOutput out) throws IOException {
+    assert valid(_output, true);
+    if (_output instanceof Long) {
+      final Long output = (Long) _output;
+      out.writeVLong(output<<1);
+    } else {
+      final TwoLongs output = (TwoLongs) _output;
+      out.writeVLong((output.first<<1) | 1);
+      out.writeVLong(output.second);
+    }
+  }
+
+  @Override
+  public Object read(DataInput in) throws IOException {
+    final long code = in.readVLong();
+    if ((code & 1) == 0) {
+      // single long
+      final long v = code >>> 1;
+      if (v == 0) {
+        return NO_OUTPUT;
+      } else {
+        return Long.valueOf(v);
+      }
+    } else {
+      // two longs
+      final long first = code >>> 1;
+      final long second = in.readVLong();
+      return new TwoLongs(first, second);
+    }
+  }
+
+  private boolean valid(Long o) {
+    assert o != null;
+    assert o instanceof Long;
+    assert o == NO_OUTPUT || o > 0;
+    return true;
+  }
+
+  // Used only by assert
+  private boolean valid(Object _o, boolean allowDouble) {
+    if (!allowDouble) {
+      assert _o instanceof Long;
+      return valid((Long) _o);
+    } else if (_o instanceof TwoLongs) {
+      return true;
+    } else {
+      return valid((Long) _o);
+    }
+  }
+
+  @Override
+  public Object getNoOutput() {
+    return NO_OUTPUT;
+  }
+
+  @Override
+  public String outputToString(Object output) {
+    return output.toString();
+  }
+
+  @Override
+  public Object merge(Object first, Object second) {
+    assert valid(first, false);
+    assert valid(second, false);
+    return new TwoLongs((Long) first, (Long) second);
+  }
+}

Added: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1128870&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java (added)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java Sun May 29 12:30:14 2011
@@ -0,0 +1,328 @@
+package org.apache.lucene.util.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+/** Static helper methods
+ *
+ * @lucene.experimental */
+public final class Util {
+  private Util() {
+  }
+
+  /** Looks up the output for this input, or null if the
+   *  input is not accepted. FST must be
+   *  INPUT_TYPE.BYTE4. */
+  public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
+    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
+
+    // TODO: would be nice not to alloc this on every lookup
+    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+    // Accumulate output as we go
+    final T NO_OUTPUT = fst.outputs.getNoOutput();
+    T output = NO_OUTPUT;
+    for(int i=0;i<input.length;i++) {
+      if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        output = fst.outputs.add(output, arc.output);
+      }
+    }
+
+    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
+      return null;
+    } else if (arc.output != NO_OUTPUT) {
+      return fst.outputs.add(output, arc.output);
+    } else {
+      return output;
+    }
+  }
+
+  /** Logically casts input to UTF32 ints then looks up the output
+   *  or null if the input is not accepted.  FST must be
+   *  INPUT_TYPE.BYTE4.  */
+  public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
+    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
+
+    // TODO: would be nice not to alloc this on every lookup
+    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+    int charIdx = offset;
+    final int charLimit = offset + length;
+
+    // Accumulate output as we go
+    final T NO_OUTPUT = fst.outputs.getNoOutput();
+    T output = NO_OUTPUT;
+    while(charIdx < charLimit) {
+      final int utf32 = Character.codePointAt(input, charIdx);
+      charIdx += Character.charCount(utf32);
+
+      if (fst.findTargetArc(utf32, arc, arc) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        output = fst.outputs.add(output, arc.output);
+      }
+    }
+
+    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
+      return null;
+    } else if (arc.output != NO_OUTPUT) {
+      return fst.outputs.add(output, arc.output);
+    } else {
+      return output;
+    }
+  }
+
+
+  /** Logically casts input to UTF32 ints then looks up the output
+   *  or null if the input is not accepted.  FST must be
+   *  INPUT_TYPE.BYTE4.  */
+  public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
+    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
+    
+    // TODO: would be nice not to alloc this on every lookup
+    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+    int charIdx = 0;
+    final int charLimit = input.length();
+
+    // Accumulate output as we go
+    final T NO_OUTPUT = fst.outputs.getNoOutput();
+    T output = NO_OUTPUT;
+
+    while(charIdx < charLimit) {
+      final int utf32 = Character.codePointAt(input, charIdx);
+      charIdx += Character.charCount(utf32);
+
+      if (fst.findTargetArc(utf32, arc, arc) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        output = fst.outputs.add(output, arc.output);
+      }
+    }
+
+    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
+      return null;
+    } else if (arc.output != NO_OUTPUT) {
+      return fst.outputs.add(output, arc.output);
+    } else {
+      return output;
+    }
+  }
+
+  /** Looks up the output for this input, or null if the
+   *  input is not accepted */
+  public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
+    assert fst.inputType == FST.INPUT_TYPE.BYTE1;
+
+    // TODO: would be nice not to alloc this on every lookup
+    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+    // Accumulate output as we go
+    final T NO_OUTPUT = fst.outputs.getNoOutput();
+    T output = NO_OUTPUT;
+    for(int i=0;i<input.length;i++) {
+      if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        output = fst.outputs.add(output, arc.output);
+      }
+    }
+
+    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
+      return null;
+    } else if (arc.output != NO_OUTPUT) {
+      return fst.outputs.add(output, arc.output);
+    } else {
+      return output;
+    }
+  }
+  
+  /**
+   * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
+   * for visualization. Example of use:
+   * 
+   * <pre>
+   * PrintStream ps = new PrintStream(&quot;out.dot&quot;);
+   * fst.toDot(ps);
+   * ps.close();
+   * </pre>
+   * 
+   * and then, from command line:
+   * 
+   * <pre>
+   * dot -Tpng -o out.png out.dot
+   * </pre>
+   * 
+   * <p>
+   * Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
+   * 
+   * @param sameRank
+   *          If <code>true</code>, the resulting <code>dot</code> file will try
+   *          to order states in layers of breadth-first traversal. This may
+   *          mess up arcs, but makes the output FST's structure a bit clearer.
+   * 
+   * @param labelStates
+   *          If <code>true</code> states will have labels equal to their offsets in their
+   *          binary format. Expands the graph considerably. 
+   * 
+   * @see "http://www.graphviz.org/"
+   */
+  public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) 
+    throws IOException {    
+    final String expandedNodeColor = "blue";
+
+    // This is the start arc in the automaton (from the epsilon state to the first state 
+    // with outgoing transitions.
+    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
+
+    // A queue of transitions to consider for the next level.
+    final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();
+
+    // A queue of transitions to consider when processing the next level.
+    final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
+    nextLevelQueue.add(startArc);
+    
+    // A list of states on the same level (for ranking).
+    final List<Integer> sameLevelStates = new ArrayList<Integer>();
+
+    // A bitset of already seen states (target offset).
+    final BitSet seen = new BitSet();
+    seen.set(startArc.target);
+
+    // Shape for states.
+    final String stateShape = "circle";
+
+    // Emit DOT prologue.
+    out.write("digraph FST {\n");
+    out.write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
+
+    if (!labelStates) {
+      out.write("  node [shape=circle, width=.2, height=.2, style=filled]\n");      
+    }
+
+    emitDotState(out, "initial", "point", "white", "");
+    emitDotState(out, Integer.toString(startArc.target), stateShape, 
+        fst.isExpandedTarget(startArc) ? expandedNodeColor : null, 
+        "");
+    out.write("  initial -> " + startArc.target + "\n");
+
+    final T NO_OUTPUT = fst.outputs.getNoOutput();
+    int level = 0;
+
+    while (!nextLevelQueue.isEmpty()) {
+      // we could double buffer here, but it doesn't matter probably.
+      thisLevelQueue.addAll(nextLevelQueue);
+      nextLevelQueue.clear();
+
+      level++;
+      out.write("\n  // Transitions and states at level: " + level + "\n");
+      while (!thisLevelQueue.isEmpty()) {
+        final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
+        
+        if (fst.targetHasArcs(arc)) {
+          // scan all arcs
+          final int node = arc.target;
+          fst.readFirstTargetArc(arc, arc);
+          
+          while (true) {
+            // Emit the unseen state and add it to the queue for the next level.
+            if (arc.target >= 0 && !seen.get(arc.target)) {
+              final boolean isExpanded = fst.isExpandedTarget(arc);
+              emitDotState(out, Integer.toString(arc.target), stateShape, 
+                  isExpanded ?  expandedNodeColor : null, 
+                  labelStates ? Integer.toString(arc.target) : ""); 
+              seen.set(arc.target);
+              nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
+              sameLevelStates.add(arc.target);
+            }
+
+            String outs;
+            if (arc.output != NO_OUTPUT) {
+              outs = "/" + fst.outputs.outputToString(arc.output);
+            } else {
+              outs = "";
+            }
+
+            final String cl;
+            if (arc.label == FST.END_LABEL) {
+              cl = "~";
+            } else {
+              cl = printableLabel(arc.label);
+            }
+
+            out.write("  " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");
+            
+            // Break the loop if we're on the last arc of this state.
+            if (arc.isLast()) {
+              break;
+            }
+            fst.readNextArc(arc);
+          }
+        }
+      }
+
+      // Emit state ranking information.
+      if (sameRank && sameLevelStates.size() > 1) {
+        out.write("  {rank=same; ");
+        for (int state : sameLevelStates) {
+          out.write(state + "; ");
+        }
+        out.write(" }\n");
+      }
+      sameLevelStates.clear();                
+    }
+
+    // Emit terminating state (always there anyway).
+    out.write("  -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
+    out.write("  {rank=sink; -1 }\n");
+    
+    out.write("}\n");
+    out.flush();
+  }
+
+  /**
+   * Emit a single state in the <code>dot</code> language. 
+   */
+  private static void emitDotState(Writer out, String name, String shape,
+      String color, String label) throws IOException {
+    out.write("  " + name 
+        + " [" 
+        + (shape != null ? "shape=" + shape : "") + " "
+        + (color != null ? "color=" + color : "") + " "
+        + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
+        + "]\n");
+  }
+
+  /**
+   * Ensures an arc's label is indeed printable (dot uses US-ASCII). 
+   */
+  private static String printableLabel(int label) {
+    if (label >= 0x20 && label <= 0x7d) {
+      return Character.toString((char) label);
+    } else {
+      return "0x" + Integer.toHexString(label);
+    }
+  }
+}

Added: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/package.html?rev=1128870&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/package.html (added)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/package.html Sun May 29 12:30:14 2011
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Finite state transducers
+</body>
+</html>

Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java?rev=1128870&r1=1128869&r2=1128870&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/store/MockIndexOutputWrapper.java Sun May 29 12:30:14 2011
@@ -154,7 +154,7 @@ public class MockIndexOutputWrapper exte
   }
 
   @Override
-  public void copyBytes(IndexInput input, long numBytes) throws IOException {
+  public void copyBytes(DataInput input, long numBytes) throws IOException {
     delegate.copyBytes(input, numBytes);
     // TODO: we may need to check disk full here as well
     dir.maybeThrowDeterministicException();

Modified: lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java?rev=1128870&r1=1128869&r2=1128870&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test-framework/org/apache/lucene/util/ThrottledIndexOutput.java Sun May 29 12:30:14 2011
@@ -18,7 +18,7 @@ package org.apache.lucene.util;
  */
 import java.io.IOException;
 
-import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexOutput;
 
 public class ThrottledIndexOutput extends IndexOutput {
@@ -141,7 +141,7 @@ public class ThrottledIndexOutput extend
   }
 
   @Override
-  public void copyBytes(IndexInput input, long numBytes) throws IOException {
+  public void copyBytes(DataInput input, long numBytes) throws IOException {
     delegate.copyBytes(input, numBytes);
   }
 }

Added: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1128870&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (added)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Sun May 29 12:30:14 2011
@@ -0,0 +1,1565 @@
+package org.apache.lucene.util.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.*;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.fst.FST.Arc;
+
+public class TestFSTs extends LuceneTestCase {
+
+  private MockDirectoryWrapper dir;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = newDirectory();
+    dir.setPreventDoubleWrite(false);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    dir.close();
+    super.tearDown();
+  }
+
+  private static BytesRef toBytesRef(IntsRef ir) {
+    BytesRef br = new BytesRef(ir.length);
+    for(int i=0;i<ir.length;i++) {
+      int x = ir.ints[ir.offset+i];
+      assert x >= 0 && x <= 255;
+      br.bytes[i] = (byte) x;
+    }
+    br.length = ir.length;
+    return br;
+  }
+
+  private static IntsRef toIntsRef(String s, int inputMode) {
+    return toIntsRef(s, inputMode, new IntsRef(10));
+  }
+
+  private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
+    if (inputMode == 0) {
+      // utf8
+      return toIntsRef(new BytesRef(s), ir);
+    } else {
+      // utf32
+      return toIntsRefUTF32(s, ir);
+    }
+  }
+
+  private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
+    final int charLength = s.length();
+    int charIdx = 0;
+    int intIdx = 0;
+    while(charIdx < charLength) {
+      if (intIdx == ir.ints.length) {
+        ir.grow(intIdx+1);
+      }
+      final int utf32 = s.codePointAt(charIdx);
+      ir.ints[intIdx] = utf32;
+      charIdx += Character.charCount(utf32);
+      intIdx++;
+    }
+    ir.length = intIdx;
+    return ir;
+  }
+
+  private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
+    if (br.length > ir.ints.length) {
+      ir.grow(br.length);
+    }
+    for(int i=0;i<br.length;i++) {
+      ir.ints[i] = br.bytes[br.offset+i]&0xFF;
+    }
+    ir.length = br.length;
+    return ir;
+  }
+
+  public void testBasicFSA() throws IOException {
+    String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
+    String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
+    IntsRef[] terms = new IntsRef[strings.length];
+    IntsRef[] terms2 = new IntsRef[strings2.length];
+    for(int inputMode=0;inputMode<2;inputMode++) {
+      if (VERBOSE) {
+        System.out.println("TEST: inputMode=" + inputModeToString(inputMode));
+      }
+
+      for(int idx=0;idx<strings.length;idx++) {
+        terms[idx] = toIntsRef(strings[idx], inputMode);
+      }
+      for(int idx=0;idx<strings2.length;idx++) {
+        terms2[idx] = toIntsRef(strings2[idx], inputMode);
+      }
+      Arrays.sort(terms2);
+
+      doTest(inputMode, terms);
+    
+      // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms):
+
+      // FSA
+      {
+        final Outputs<Object> outputs = NoOutputs.getSingleton();
+        final Object NO_OUTPUT = outputs.getNoOutput();      
+        final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms2.length);
+        for(IntsRef term : terms2) {
+          pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
+        }
+        FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        assertNotNull(fst);
+        assertEquals(22, fst.getNodeCount());
+        assertEquals(27, fst.getArcCount());
+      }
+
+      // FST ord pos int
+      {
+        final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+        final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
+        for(int idx=0;idx<terms2.length;idx++) {
+          pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
+        }
+        final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        assertNotNull(fst);
+        assertEquals(22, fst.getNodeCount());
+        assertEquals(27, fst.getArcCount());
+      }
+
+      // FST byte sequence ord
+      {
+        final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+        final BytesRef NO_OUTPUT = outputs.getNoOutput();      
+        final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms2.length);
+        for(int idx=0;idx<terms2.length;idx++) {
+          final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
+          pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
+        }
+        final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        assertNotNull(fst);
+        assertEquals(24, fst.getNodeCount());
+        assertEquals(30, fst.getArcCount());
+      }
+    }
+  }
+
+  private static String simpleRandomString(Random r) {
+    final int end = r.nextInt(10);
+    if (end == 0) {
+      // allow 0 length
+      return "";
+    }
+    final char[] buffer = new char[end];
+    for (int i = 0; i < end; i++) {
+      buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+    }
+    return new String(buffer, 0, end);
+  }
+
+  // given set of terms, test the different outputs for them
+  private void doTest(int inputMode, IntsRef[] terms) throws IOException {
+    Arrays.sort(terms);
+
+    // NoOutputs (simple FSA)
+    {
+      final Outputs<Object> outputs = NoOutputs.getSingleton();
+      final Object NO_OUTPUT = outputs.getNoOutput();      
+      final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
+      for(IntsRef term : terms) {
+        pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
+      }
+      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // PositiveIntOutput (ord)
+    {
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+      final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+      for(int idx=0;idx<terms.length;idx++) {
+        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
+      }
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // PositiveIntOutput (random monotonically increasing positive number)
+    {
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+      final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+      long lastOutput = 0;
+      for(int idx=0;idx<terms.length;idx++) {
+        final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
+        lastOutput = value;
+        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
+      }
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // PositiveIntOutput (random positive number)
+    {
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+      final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+      for(int idx=0;idx<terms.length;idx++) {
+        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
+      }
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // Pair<ord, (random monotonically increasing positive number>
+    {
+      final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean());
+      final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean());
+      final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
+      final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>>(terms.length);
+      long lastOutput = 0;
+      for(int idx=0;idx<terms.length;idx++) {
+        final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
+        lastOutput = value;
+        pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
+                                                                         outputs.get(o1.get(idx),
+                                                                                     o2.get(value))));
+      }
+      new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // Sequence-of-bytes
+    {
+      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+      final BytesRef NO_OUTPUT = outputs.getNoOutput();      
+      final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms.length);
+      for(int idx=0;idx<terms.length;idx++) {
+        final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
+        pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
+      }
+      new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // Sequence-of-ints
+    {
+      final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+      final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<FSTTester.InputOutput<IntsRef>>(terms.length);
+      for(int idx=0;idx<terms.length;idx++) {
+        final String s = Integer.toString(idx);
+        final IntsRef output = new IntsRef(s.length());
+        output.length = s.length();
+        for(int idx2=0;idx2<output.length;idx2++) {
+          output.ints[idx2] = s.charAt(idx2);
+        }
+        pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
+      }
+      new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+
+    // Up to two positive ints, shared, generally but not
+    // monotonically increasing
+    {
+      if (VERBOSE) {
+        System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
+      }
+      final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
+      final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
+      long lastOutput = 0;
+      for(int idx=0;idx<terms.length;idx++) {
+        // Sometimes go backwards
+        long value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+        while(value < 0) {
+          value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+        }
+        final Object output;
+        if (random.nextInt(5) == 3) {
+          long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+          while(value2 < 0) {
+            value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+          }
+          output = outputs.get(value, value2);
+        } else {
+          output = outputs.get(value);
+        }
+        pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
+      }
+      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+    }
+  }
+
+  private static class FSTTester<T> {
+
+    final Random random;
+    final List<InputOutput<T>> pairs;
+    final int inputMode;
+    final Outputs<T> outputs;
+    final Directory dir;
+
+    public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
+      this.random = random;
+      this.dir = dir;
+      this.inputMode = inputMode;
+      this.pairs = pairs;
+      this.outputs = outputs;
+    }
+
+    private static class InputOutput<T> implements Comparable<InputOutput<T>> {
+      public final IntsRef input;
+      public final T output;
+
+      public InputOutput(IntsRef input, T output) {
+        this.input = input;
+        this.output = output;
+      }
+
+      public int compareTo(InputOutput<T> other) {
+        if (other instanceof InputOutput) {
+          return input.compareTo((other).input);
+        } else {
+          throw new IllegalArgumentException();
+        }
+      }
+    }
+
+    public void doTest() throws IOException {
+      // no pruning
+      doTest(0, 0);
+
+      if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
+        // simple pruning
+        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
+        
+        // leafy pruning
+        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
+      }
+    }
+
+    // runs the term, returning the output, or null if term
+    // isn't accepted.  if prefixLength is non-null it must be
+    // length 1 int array; prefixLength[0] is set to the length
+    // of the term prefix that matches
+    private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
+      assert prefixLength == null || prefixLength.length == 1;
+      final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+      final T NO_OUTPUT = fst.outputs.getNoOutput();
+      T output = NO_OUTPUT;
+
+      for(int i=0;i<=term.length;i++) {
+        final int label;
+        if (i == term.length) {
+          label = FST.END_LABEL;
+        } else {
+          label = term.ints[term.offset+i];
+        }
+        //System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
+        if (fst.findTargetArc(label, arc, arc) == null) {
+          if (prefixLength != null) {
+            prefixLength[0] = i;
+            return output;
+          } else {
+            return null;
+          }
+        }
+        output = fst.outputs.add(output, arc.output);
+      }
+
+      if (prefixLength != null) {
+        prefixLength[0] = term.length;
+      }
+
+      return output;
+    }
+
+    private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
+      FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+      final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
+      in.length = 0;
+      in.offset = 0;
+      final T NO_OUTPUT = fst.outputs.getNoOutput();
+      T output = NO_OUTPUT;
+
+      while(true) {
+        // read all arcs:
+        fst.readFirstTargetArc(arc, arc);
+        arcs.add(new FST.Arc<T>().copyFrom(arc));
+        while(!arc.isLast()) {
+          fst.readNextArc(arc);
+          arcs.add(new FST.Arc<T>().copyFrom(arc));
+        }
+      
+        // pick one
+        arc = arcs.get(random.nextInt(arcs.size()));
+        arcs.clear();
+
+        // accumulate output
+        output = fst.outputs.add(output, arc.output);
+
+        // append label
+        if (arc.label == FST.END_LABEL) {
+          break;
+        }
+
+        if (in.ints.length == in.length) {
+          in.grow(1+in.length);
+        }
+        in.ints[in.length++] = arc.label;
+      }
+
+      return output;
+    }
+
+
+    FST<T> doTest(int prune1, int prune2) throws IOException {
+      if (VERBOSE) {
+        System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
+      }
+
+      final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
+                                                prune1, prune2,
+                                                prune1==0 && prune2==0, outputs);
+
+      for(InputOutput<T> pair : pairs) {
+        if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
+          final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
+          final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
+          @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
+          builderObject.add(pair.input, _outputs.get(twoLongs.first));
+          builderObject.add(pair.input, _outputs.get(twoLongs.second));
+        } else {
+          builder.add(pair.input, pair.output);
+        }
+      }
+      FST<T> fst = builder.finish();
+
+      if (random.nextBoolean() && fst != null) {
+        IndexOutput out = dir.createOutput("fst.bin");
+        fst.save(out);
+        out.close();
+        IndexInput in = dir.openInput("fst.bin");
+        try {
+          fst = new FST<T>(in, outputs);
+        } finally {
+          in.close();
+          dir.deleteFile("fst.bin");
+        }
+      }
+
+      if (VERBOSE && pairs.size() <= 20 && fst != null) {
+        Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+        Util.toDot(fst, w, false, false);
+        w.close();
+        System.out.println("SAVED out.dot");
+      }
+
+      if (VERBOSE) {
+        if (fst == null) {
+          System.out.println("  fst has 0 nodes (fully pruned)");
+        } else {
+          System.out.println("  fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
+        }
+      }
+
+      if (prune1 == 0 && prune2 == 0) {
+        verifyUnPruned(inputMode, fst);
+      } else {
+        verifyPruned(inputMode, fst, prune1, prune2);
+      }
+
+      return fst;
+    }
+
+    // FST is complete
+    private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
+
+      if (pairs.size() == 0) {
+        assertNull(fst);
+        return;
+      }
+
+      if (VERBOSE) {
+        System.out.println("TEST: now verify " + pairs.size() + " terms");
+        for(InputOutput<T> pair : pairs) {
+          assertNotNull(pair);
+          assertNotNull(pair.input);
+          assertNotNull(pair.output);
+          System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
+        }
+      }
+
+      assertNotNull(fst);
+
+      // visit valid paris in order -- make sure all words
+      // are accepted, and FSTEnum's next() steps through
+      // them correctly
+      if (VERBOSE) {
+        System.out.println("TEST: check valid terms/next()");
+      }
+      {
+        IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+        for(InputOutput<T> pair : pairs) {
+          IntsRef term = pair.input;
+          if (VERBOSE) {
+            System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
+          }
+          Object output = run(fst, term, null);
+
+          assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
+          assertEquals(pair.output, output);
+
+          // verify enum's next
+          IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
+          assertNotNull(t);
+          assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
+          assertEquals(pair.output, t.output);
+        }
+        assertNull(fstEnum.next());
+      }
+
+      final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
+      for(InputOutput<T> pair : pairs) {
+        termsMap.put(pair.input, pair.output);
+      }
+
+      // find random matching word and make sure it's valid
+      if (VERBOSE) {
+        System.out.println("TEST: verify random accepted terms");
+      }
+      final IntsRef scratch = new IntsRef(10);
+      for(int iter=0;iter<500*RANDOM_MULTIPLIER;iter++) {
+        T output = randomAcceptedWord(fst, scratch);
+        assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
+        assertEquals(termsMap.get(scratch), output);
+      }
+    
+      // test IntsRefFSTEnum.seek:
+      if (VERBOSE) {
+        System.out.println("TEST: verify seek");
+      }
+      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+      for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+        if (VERBOSE) {
+          System.out.println("TEST: iter=" + iter);
+        }
+        if (random.nextBoolean()) {
+          // seek to term that doesn't exist:
+          while(true) {
+            final IntsRef term = toIntsRef(getRandomString(), inputMode);
+            int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
+            if (pos < 0) {
+              pos = -(pos+1);
+              // ok doesn't exist
+              //System.out.println("  seek " + inputToString(inputMode, term));
+              final IntsRefFSTEnum.InputOutput<T> seekResult;
+              if (random.nextBoolean()) {
+                if (VERBOSE) {
+                  System.out.println("  do non-exist seekFloor term=" + inputToString(inputMode, term));
+                }
+                seekResult = fstEnum.seekFloor(term);
+                pos--;
+              } else {
+                if (VERBOSE) {
+                  System.out.println("  do non-exist seekCeil term=" + inputToString(inputMode, term));
+                }
+                seekResult = fstEnum.seekCeil(term);
+              }
+
+              if (pos != -1 && pos < pairs.size()) {
+                //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
+                assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
+                if (VERBOSE) {
+                  System.out.println("    got " + inputToString(inputMode, seekResult.input));
+                }
+                assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
+                assertEquals(pairs.get(pos).output, seekResult.output);
+              } else {
+                // seeked before start or beyond end
+                //System.out.println("seek=" + seekTerm);
+                assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
+                if (VERBOSE) {
+                  System.out.println("    got null");
+                }
+              }
+
+              break;
+            }
+          }
+        } else {
+          // seek to term that does exist:
+          InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
+          final IntsRefFSTEnum.InputOutput<T> seekResult;
+          if (random.nextBoolean()) {
+            if (VERBOSE) {
+              System.out.println("  do exists seekFloor " + inputToString(inputMode, pair.input));
+            }
+            seekResult = fstEnum.seekFloor(pair.input);
+          } else {
+            if (VERBOSE) {
+              System.out.println("  do exists seekCeil " + inputToString(inputMode, pair.input));
+            }
+            seekResult = fstEnum.seekCeil(pair.input);
+          }
+          assertNotNull(seekResult);
+          assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
+          assertEquals(pair.output, seekResult.output);
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("TEST: mixed next/seek");
+      }
+
+      // test mixed next/seek
+      for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+        if (VERBOSE) {
+          System.out.println("TEST: iter " + iter);
+        }
+        // reset:
+        fstEnum = new IntsRefFSTEnum<T>(fst);
+        int upto = -1;
+        while(true) {
+          boolean isDone = false;
+          if (upto == pairs.size()-1 || random.nextBoolean()) {
+            // next
+            upto++;
+            if (VERBOSE) {
+              System.out.println("  do next");
+            }
+            isDone = fstEnum.next() == null;
+          } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
+            int attempt = 0;
+            for(;attempt<10;attempt++) {
+              IntsRef term = toIntsRef(getRandomString(), inputMode);
+              if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
+                int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
+                assert pos < 0;
+                upto = -(pos+1);
+
+                if (random.nextBoolean()) {
+                  upto--;
+                  assertTrue(upto != -1);
+                  if (VERBOSE) {
+                    System.out.println("  do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
+                  }
+                  isDone = fstEnum.seekFloor(term) == null;
+                } else {
+                  if (VERBOSE) {
+                    System.out.println("  do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
+                  }
+                  isDone = fstEnum.seekCeil(term) == null;
+                }
+
+                break;
+              }
+            }
+            if (attempt == 10) {
+              continue;
+            }
+            
+          } else {
+            final int inc = random.nextInt(pairs.size() - upto - 1);
+            upto += inc;
+            if (upto == -1) {
+              upto = 0;
+            }
+
+            if (random.nextBoolean()) {
+              if (VERBOSE) {
+                System.out.println("  do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
+              }
+              isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
+            } else {
+              if (VERBOSE) {
+                System.out.println("  do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
+              }
+              isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
+            }
+          }
+          if (VERBOSE) {
+            if (!isDone) {
+              System.out.println("    got " + inputToString(inputMode, fstEnum.current().input));
+            } else {
+              System.out.println("    got null");
+            }
+          }
+
+          if (upto == pairs.size()) {
+            assertTrue(isDone);
+            break;
+          } else {
+            assertFalse(isDone);
+            assertEquals(pairs.get(upto).input, fstEnum.current().input);
+            assertEquals(pairs.get(upto).output, fstEnum.current().output);
+
+            /*
+            if (upto < pairs.size()-1) {
+              int tryCount = 0;
+              while(tryCount < 10) {
+                final IntsRef t = toIntsRef(getRandomString(), inputMode);
+                if (pairs.get(upto).input.compareTo(t) < 0) {
+                  final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
+                  if (VERBOSE) {
+                    System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
+                  }
+                  assertEquals(expected, fstEnum.beforeNext(t));
+                  break;
+                }
+                tryCount++;
+              }
+            }
+            */
+          }
+        }
+      }
+    }
+
+    private static class CountMinOutput<T> {
+      int count;
+      T output;
+      T finalOutput;
+      boolean isLeaf = true;
+      boolean isFinal;
+    }
+
+    // FST is pruned
+    private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
+
+      if (VERBOSE) {
+        System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
+        for(InputOutput<T> pair : pairs) {
+          System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
+        }
+      }
+
+      // To validate the FST, we brute-force compute all prefixes
+      // in the terms, matched to their "common" outputs, prune that
+      // set according to the prune thresholds, then assert the FST
+      // matches that same set.
+
+      // NOTE: Crazy RAM intensive!!
+
+      //System.out.println("TEST: tally prefixes");
+
+      // build all prefixes
+      final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
+      final IntsRef scratch = new IntsRef(10);
+      for(InputOutput<T> pair: pairs) {
+        scratch.copy(pair.input);
+        for(int idx=0;idx<=pair.input.length;idx++) {
+          scratch.length = idx;
+          CountMinOutput<T> cmo = prefixes.get(scratch);
+          if (cmo == null) {
+            cmo = new CountMinOutput<T>();
+            cmo.count = 1;
+            cmo.output = pair.output;
+            prefixes.put(new IntsRef(scratch), cmo);
+          } else {
+            cmo.count++;
+            cmo.output = outputs.common(cmo.output, pair.output);
+          }
+          if (idx == pair.input.length) {
+            cmo.isFinal = true;
+            cmo.finalOutput = cmo.output;
+          }
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("TEST: now prune");
+      }
+
+      // prune 'em
+      final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
+      while(it.hasNext()) {
+        Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
+        final IntsRef prefix = ent.getKey();
+        final CountMinOutput<T> cmo = ent.getValue();
+        if (VERBOSE) {
+          System.out.println("  term=" + inputToString(inputMode, prefix) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
+        }
+        final boolean keep;
+        if (prune1 > 0) {
+          keep = cmo.count >= prune1;
+        } else {
+          assert prune2 > 0;
+          if (prune2 > 1 && cmo.count >= prune2) {
+            keep = true;
+          } else if (prefix.length > 0) {
+            // consult our parent
+            scratch.length = prefix.length-1;
+            System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
+            final CountMinOutput<T> cmo2 = prefixes.get(scratch);
+            //System.out.println("    parent count = " + (cmo2 == null ? -1 : cmo2.count));
+            keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
+          } else if (cmo.count >= prune2) {
+            keep = true;
+          } else {
+            keep = false;
+          }
+        }
+
+        if (!keep) {
+          it.remove();
+          //System.out.println("    remove");
+        } else {
+          // clear isLeaf for all ancestors
+          //System.out.println("    keep");
+          scratch.copy(prefix);
+          scratch.length--;
+          while(scratch.length >= 0) {
+            final CountMinOutput<T> cmo2 = prefixes.get(scratch);
+            if (cmo2 != null) {
+              //System.out.println("    clear isLeaf " + inputToString(inputMode, scratch));
+              cmo2.isLeaf = false;
+            }
+            scratch.length--;
+          }
+        }
+      }
+
+      //System.out.println("TEST: after prune");
+      /*
+        for(Map.Entry<BytesRef,CountMinOutput> ent : prefixes.entrySet()) {
+        System.out.println("  " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
+        if (ent.getValue().isFinal) {
+        System.out.println("    finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
+        }
+        }
+      */
+
+      if (prefixes.size() <= 1) {
+        assertNull(fst);
+        return;
+      }
+
+      assertNotNull(fst);
+
+      // make sure FST only enums valid prefixes
+      if (VERBOSE) {
+        System.out.println("TEST: check pruned enum");
+      }
+      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+      IntsRefFSTEnum.InputOutput<T> current;
+      while((current = fstEnum.next()) != null) {
+        if (VERBOSE) {
+          System.out.println("  fstEnum.next term=" + inputToString(inputMode, current.input) + " output=" + outputs.outputToString(current.output));
+        }
+        final CountMinOutput cmo = prefixes.get(current.input);
+        assertNotNull(cmo);
+        assertTrue(cmo.isLeaf || cmo.isFinal);
+        //if (cmo.isFinal && !cmo.isLeaf) {
+        if (cmo.isFinal) {
+          assertEquals(cmo.finalOutput, current.output);
+        } else {
+          assertEquals(cmo.output, current.output);
+        }
+      }
+
+      // make sure all non-pruned prefixes are present in the FST
+      if (VERBOSE) {
+        System.out.println("TEST: verify all prefixes");
+      }
+      final int[] stopNode = new int[1];
+      for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
+        if (ent.getKey().length > 0) {
+          final CountMinOutput<T> cmo = ent.getValue();
+          final T output = run(fst, ent.getKey(), stopNode);
+          if (VERBOSE) {
+            System.out.println("TEST: verify term=" + inputToString(inputMode, ent.getKey()) + " output=" + outputs.outputToString(cmo.output));
+          }
+          // if (cmo.isFinal && !cmo.isLeaf) {
+          if (cmo.isFinal) {
+            assertEquals(cmo.finalOutput, output);
+          } else {
+            assertEquals(cmo.output, output);
+          }
+          assertEquals(ent.getKey().length, stopNode[0]);
+        }
+      }
+    }
+  }
+
+  public void testRandomWords() throws IOException {
+    testRandomWords(1000, 5 * RANDOM_MULTIPLIER);
+    //testRandomWords(20, 100);
+  }
+
+  private String inputModeToString(int mode) {
+    if (mode == 0) {
+      return "utf8";
+    } else {
+      return "utf32";
+    }
+  }
+
+  private void testRandomWords(int maxNumWords, int numIter) throws IOException {
+    for(int iter=0;iter<numIter;iter++) {
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter " + iter);
+      }
+      for(int inputMode=0;inputMode<2;inputMode++) {
+        final int numWords = random.nextInt(maxNumWords+1);
+        Set<IntsRef> termsSet = new HashSet<IntsRef>();
+        IntsRef[] terms = new IntsRef[numWords];
+        while(termsSet.size() < numWords) {
+          final String term = getRandomString();
+          termsSet.add(toIntsRef(term, inputMode));
+        }
+        doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
+      }
+    }
+  }
+
+  static String getRandomString() {
+    final String term;
+    if (random.nextBoolean()) {
+      term = _TestUtil.randomRealisticUnicodeString(random);
+    } else {
+      // we want to mix in limited-alphabet symbols so
+      // we get more sharing of the nodes given how few
+      // terms we are testing...
+      term = simpleRandomString(random);
+    }
+    return term;
+  }
+
+  @Nightly
+  public void testBigSet() throws IOException {
+    testRandomWords(50000, RANDOM_MULTIPLIER);
+  }
+
+  private static String inputToString(int inputMode, IntsRef term) {
+    if (inputMode == 0) {
+      // utf8
+      return toBytesRef(term).utf8ToString() + " " + term;
+    } else {
+      // utf32
+      return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
+    }
+  }
+
+  private static IntsRef toIntsRef(String s) {
+    final int charCount = s.length();
+    IntsRef ir = new IntsRef(charCount);
+    for(int charIDX=0;charIDX<charCount;charIDX++) {
+      ir.ints[charIDX] = s.charAt(charIDX);
+    }
+    ir.length = charCount;
+    return ir;
+  }
+
+  private static String toString(IntsRef ints) {
+    char[] chars = new char[ints.length];
+    for(int charIDX=0;charIDX<ints.length;charIDX++) {
+      final int ch = ints.ints[ints.offset+charIDX];
+      assertTrue(ch >= 0 && ch < 65536);
+      chars[charIDX] = (char) ch;
+    }
+    return new String(chars);
+  }
+
+  // Build FST for all unique terms in the test line docs
+  // file, up until a time limit
+  public void testRealTerms() throws Exception {
+
+    /*
+    if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
+      // no
+      CodecProvider.getDefault().setDefaultFieldCodec("Standard");
+    }
+    */
+
+    final LineFileDocs docs = new LineFileDocs(random);
+    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 100 : 1;
+    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
+    final File tempDir = _TestUtil.getTempDir("fstlines");
+    final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
+    final IndexWriter writer = new IndexWriter(dir, conf);
+    writer.setInfoStream(VERBOSE ? System.out : null);
+    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC * 1000;
+    Document doc;
+    int docCount = 0;
+    while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
+      writer.addDocument(doc);
+      docCount++;
+    }
+    IndexReader r = IndexReader.open(writer, true);
+    writer.close();
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, outputs);
+
+    boolean storeOrd = false;
+    if (VERBOSE) {
+      if (storeOrd) {
+        System.out.println("FST stores ord");
+      } else {
+        System.out.println("FST stores docFreq");
+      }
+    }
+    TermEnum termEnum = r.terms(new Term("body", ""));
+    if (VERBOSE) {
+      System.out.println("TEST: got termEnum=" + termEnum);
+    }
+    int ord = 0;
+    while(true) {
+      final Term term = termEnum.term();
+      if (term == null || !"body".equals(term.field())) {
+        break;
+      }
+
+      // No ord in 3.x:
+      /*
+      if (ord == 0) {
+        try {
+          termsEnum.ord();
+        } catch (UnsupportedOperationException uoe) {
+          if (VERBOSE) {
+            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
+          }
+          storeOrd = false;
+        }
+      }
+      */
+      final int output;
+      if (storeOrd) {
+        output = ord;
+      } else {
+        output = termEnum.docFreq();
+      }
+      //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
+      builder.add(toIntsRef(term.text()), outputs.get(output));
+      ord++;
+      if (ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
+        System.out.println(ord + " terms...");
+      }
+      termEnum.next();
+    }
+    final FST<Long> fst = builder.finish();
+    if (VERBOSE) {
+      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
+    }
+
+    if (ord > 0) {
+      // Now confirm BytesRefFSTEnum and TermEnum act the
+      // same:
+      final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+      for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) {
+        final String randomTerm = getRandomString();
+
+        if (VERBOSE) {
+          System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
+        }
+
+        termEnum = r.terms(new Term("body", randomTerm));
+        final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));
+
+        if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
+          assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
+        } else {
+          assertSame(termEnum, fstEnum, storeOrd);
+          for(int nextIter=0;nextIter<10;nextIter++) {
+            if (VERBOSE) {
+              System.out.println("TEST: next");
+              //if (storeOrd) {
+              //System.out.println("  ord=" + termEnum.ord());
+              //}
+            }
+            termEnum.next();
+            if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
+              if (VERBOSE) {
+                System.out.println("  term=" + termEnum.term());
+              }
+              assertNotNull(fstEnum.next());
+              assertSame(termEnum, fstEnum, storeOrd);
+            } else {
+              if (VERBOSE) {
+                System.out.println("  end!");
+              }
+              IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
+              if (nextResult != null) {
+                System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
+                fail();
+              }
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    r.close();
+    dir.close();
+  }
+
+  private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception {
+    if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
+      if (fstEnum.current() != null) {
+        fail("fstEnum.current().input=" + toString(fstEnum.current().input));
+      }
+    } else {
+      assertNotNull(fstEnum.current());
+      assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input));
+      if (storeOrd) {
+        // fst stored the ord
+        // No ord in 3.x
+        // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue());
+      } else {
+        // fst stored the docFreq
+        assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
+      }
+    }
+  }
+
+  private static abstract class VisitTerms<T> {
+    private final String dirOut;
+    private final String wordsFileIn;
+    private int inputMode;
+    private final Outputs<T> outputs;
+    private final Builder<T> builder;
+
+    public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
+      this.dirOut = dirOut;
+      this.wordsFileIn = wordsFileIn;
+      this.inputMode = inputMode;
+      this.outputs = outputs;
+      
+      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs);
+    }
+
+    protected abstract T getOutput(IntsRef input, int ord) throws IOException;
+
+    public void run(int limit, boolean verify) throws IOException {
+      BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+      try {
+        final IntsRef intsRef = new IntsRef(10);
+        long tStart = System.currentTimeMillis();
+        int ord = 0;
+        while(true) {
+          String w = is.readLine();
+          if (w == null) {
+            break;
+          }
+          toIntsRef(w, inputMode, intsRef);
+          builder.add(intsRef,
+                      getOutput(intsRef, ord));
+
+          ord++;
+          if (ord % 500000 == 0) {
+            System.out.println(
+                String.format(Locale.ENGLISH, 
+                    "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
+          }
+          if (ord >= limit) {
+            break;
+          }
+        }
+
+        assert builder.getTermCount() == ord;
+        final FST<T> fst = builder.finish();
+        if (fst == null) {
+          System.out.println("FST was fully pruned!");
+          System.exit(0);
+        }
+
+        if (dirOut == null)
+          return;
+
+        System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
+        if (fst.getNodeCount() < 100) {
+          Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+          Util.toDot(fst, w, false, false);
+          w.close();
+          System.out.println("Wrote FST to out.dot");
+        }
+
+        Directory dir = FSDirectory.open(new File(dirOut));
+        IndexOutput out = dir.createOutput("fst.bin");
+        fst.save(out);
+        out.close();
+
+        System.out.println("Saved FST to fst.bin.");
+
+        if (!verify) {
+          return;
+        }
+
+        System.out.println("\nNow verify...");
+
+        is.close();
+        is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+
+        ord = 0;
+        tStart = System.currentTimeMillis();
+        while(true) {
+          String w = is.readLine();
+          if (w == null) {
+            break;
+          }
+          toIntsRef(w, inputMode, intsRef);
+          T expected = getOutput(intsRef, ord);
+          T actual = Util.get(fst, intsRef);
+          if (actual == null) {
+            throw new RuntimeException("unexpected null output on input=" + w);
+          }
+          if (!actual.equals(expected)) {
+            throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+          }
+
+          ord++;
+          if (ord % 500000 == 0) {
+            System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+          }
+          if (ord >= limit) {
+            break;
+          }
+        }
+
+        double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
+        System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+
+      } finally {
+        is.close();
+      }
+    }
+  }
+
+  // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
+  public static void main(String[] args) throws IOException {
+    int prune = 0;
+    int limit = Integer.MAX_VALUE;
+    int inputMode = 0;                             // utf8
+    boolean storeOrds = false;
+    boolean storeDocFreqs = false;
+    boolean verify = true;
+    
+    String wordsFileIn = null;
+    String dirOut = null;
+
+    int idx = 0;
+    while (idx < args.length) {
+      if (args[idx].equals("-prune")) {
+        prune = Integer.valueOf(args[1 + idx]);
+        idx++;
+      } else if (args[idx].equals("-limit")) {
+        limit = Integer.valueOf(args[1 + idx]);
+        idx++;
+      } else if (args[idx].equals("-utf8")) {
+        inputMode = 0;
+      } else if (args[idx].equals("-utf32")) {
+        inputMode = 1;
+      } else if (args[idx].equals("-docFreq")) {
+        storeDocFreqs = true;
+      } else if (args[idx].equals("-ords")) {
+        storeOrds = true;
+      } else if (args[idx].equals("-noverify")) {
+        verify = false;
+      } else if (args[idx].startsWith("-")) {
+        System.err.println("Unrecognized option: " + args[idx]);
+        System.exit(-1);
+      } else {
+        if (wordsFileIn == null) {
+          wordsFileIn = args[idx];
+        } else if (dirOut == null) {
+          dirOut = args[idx];
+        } else {
+          System.err.println("Too many arguments, expected: input [output]");
+          System.exit(-1);
+        }
+      }
+      idx++;
+    }
+    
+    if (wordsFileIn == null) {
+      System.err.println("No input file.");
+      System.exit(-1);
+    }
+
+    // ord benefits from share, docFreqs don't:
+
+    if (storeOrds && storeDocFreqs) {
+      // Store both ord & docFreq:
+      final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
+      final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
+      final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
+      new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+        Random rand;
+        @Override
+        public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
+          if (ord == 0) {
+            rand = new Random(17);
+          }
+          return new PairOutputs.Pair<Long,Long>(o1.get(ord),
+                                                 o2.get(_TestUtil.nextInt(rand, 1, 5000)));
+        }
+      }.run(limit, verify);
+    } else if (storeOrds) {
+      // Store only ords
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+      new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+        @Override
+        public Long getOutput(IntsRef input, int ord) {
+          return outputs.get(ord);
+        }
+      }.run(limit, verify);
+    } else if (storeDocFreqs) {
+      // Store only docFreq
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
+      new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+        Random rand;
+        @Override
+        public Long getOutput(IntsRef input, int ord) {
+          if (ord == 0) {
+            rand = new Random(17);
+          }
+          return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
+        }
+      }.run(limit, verify);
+    } else {
+      // Store nothing
+      final NoOutputs outputs = NoOutputs.getSingleton();
+      final Object NO_OUTPUT = outputs.getNoOutput();
+      new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+        @Override
+        public Object getOutput(IntsRef input, int ord) {
+          return NO_OUTPUT;
+        }
+      }.run(limit, verify);
+    }
+  }
+
+  public void testSingleString() throws Exception {
+    final Outputs<Object> outputs = NoOutputs.getSingleton();
+    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    b.add(new BytesRef("foobar"), outputs.getNoOutput());
+    final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
+    assertNull(fstEnum.seekFloor(new BytesRef("foo")));
+    assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
+  }
+
+  public void testSimple() throws Exception {
+
+    // Get outputs -- passing true means FST will share
+    // (delta code) the outputs.  This should result in
+    // smaller FST if the outputs grow monotonically.  But
+    // if numbers are "random", false should give smaller
+    // final size:
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+    // Build an FST mapping BytesRef -> Long
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+    final BytesRef a = new BytesRef("a");
+    final BytesRef b = new BytesRef("b");
+    final BytesRef c = new BytesRef("c");
+
+    builder.add(a, outputs.get(17));
+    builder.add(b, outputs.get(42));
+    builder.add(c, outputs.get(13824324872317238L));
+
+    final FST<Long> fst = builder.finish();
+
+    assertEquals(13824324872317238L, (long) Util.get(fst, c));
+    assertEquals(42, (long) Util.get(fst, b));
+    assertEquals(17, (long) Util.get(fst, a));
+
+    BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
+    BytesRefFSTEnum.InputOutput<Long> seekResult;
+    seekResult = fstEnum.seekFloor(a);
+    assertNotNull(seekResult);
+    assertEquals(17, (long) seekResult.output);
+
+    // goes to a
+    seekResult = fstEnum.seekFloor(new BytesRef("aa"));
+    assertNotNull(seekResult);
+    assertEquals(17, (long) seekResult.output);
+
+    // goes to b
+    seekResult = fstEnum.seekCeil(new BytesRef("aa"));
+    assertNotNull(seekResult);
+    assertEquals(b, seekResult.input);
+    assertEquals(42, (long) seekResult.output);
+  }
+
+  /**
+   * Test state expansion (array format) on close-to-root states. Creates
+   * synthetic input that has one expanded state on each level.
+   * 
+   * @see "https://issues.apache.org/jira/browse/LUCENE-2933" 
+   */
+  public void testExpandedCloseToRoot() throws Exception {
+    class SyntheticData {
+      FST<Object> compile(String[] lines) throws IOException {
+        final NoOutputs outputs = NoOutputs.getSingleton();
+        final Object nothing = outputs.getNoOutput();
+        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+        int line = 0;
+        final BytesRef term = new BytesRef();
+        while (line < lines.length) {
+          String w = lines[line++];
+          if (w == null) {
+            break;
+          }
+          term.copy(w);
+          b.add(term, nothing);
+        }
+        
+        return b.finish();
+      }
+      
+      void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+          int depth) {
+        if (depth == 0 || from == to) {
+          String seq = b.toString() + "_" + out.size() + "_end";
+          out.add(seq);
+        } else {
+          for (char c = from; c <= to; c++) {
+            b.append(c);
+            generate(out, b, from, c == to ? to : from, depth - 1);
+            b.deleteCharAt(b.length() - 1);
+          }
+        }
+      }
+
+      public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) 
+        throws IOException {
+        if (fst.targetHasArcs(arc)) {
+          int childCount = 0;
+          for (arc = fst.readFirstTargetArc(arc, arc);; 
+               arc = fst.readNextArc(arc), childCount++)
+          {
+            boolean expanded = fst.isExpandedTarget(arc);
+            int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+            assertEquals(
+                expanded,
+                (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && 
+                    children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+                 children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+            if (arc.isLast()) break;
+          }
+
+          return childCount;
+        }
+        return 0;
+      }
+    }
+
+    // Sanity check.
+    assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+    assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+    SyntheticData s = new SyntheticData();
+
+    ArrayList<String> out = new ArrayList<String>();
+    StringBuilder b = new StringBuilder();
+    s.generate(out, b, 'a', 'i', 10);
+    String[] input = out.toArray(new String[out.size()]);
+    Arrays.sort(input);
+    FST<Object> fst = s.compile(input);
+    FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+    s.verifyStateAndBelow(fst, arc, 1);
+  }
+
+  // Make sure raw FST can differentiate between final vs
+  // non-final end nodes
+  public void testNonFinalStopNodes() throws Exception {
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+    final Long nothing = outputs.getNoOutput();
+    final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+    final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+
+    final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
+
+    // Add final stop node
+    {
+      final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
+      node.isFinal = true;
+      rootNode.addArc('a', node);
+      final Builder.CompiledNode frozen = new Builder.CompiledNode();
+      frozen.address = fst.addNode(node);
+      rootNode.arcs[0].nextFinalOutput = outputs.get(17);
+      rootNode.arcs[0].isFinal = true;
+      rootNode.arcs[0].output = nothing;
+      rootNode.arcs[0].target = frozen;
+    }
+
+    // Add non-final stop node
+    {
+      final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
+      rootNode.addArc('b', node);
+      final Builder.CompiledNode frozen = new Builder.CompiledNode();
+      frozen.address = fst.addNode(node);
+      rootNode.arcs[1].nextFinalOutput = nothing;
+      rootNode.arcs[1].output = outputs.get(42);
+      rootNode.arcs[1].target = frozen;
+    }
+
+    fst.finish(fst.addNode(rootNode));
+    
+    checkStopNodes(fst, outputs);
+
+    // Make sure it still works after save/load:
+    Directory dir = newDirectory();
+    IndexOutput out = dir.createOutput("fst");
+    fst.save(out);
+    out.close();
+
+    IndexInput in = dir.openInput("fst");
+    final FST<Long> fst2 = new FST<Long>(in, outputs);
+    checkStopNodes(fst2, outputs);
+    in.close();
+    dir.close();
+  }
+
+  private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
+    final Long nothing = outputs.getNoOutput();
+    FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());
+    assertEquals(nothing, startArc.output);
+    assertEquals(nothing, startArc.nextFinalOutput);
+
+    FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>());
+    assertEquals('a', arc.label);
+    assertEquals(17, arc.nextFinalOutput.longValue());
+    assertTrue(arc.isFinal());
+
+    arc = fst.readNextArc(arc);
+    assertEquals('b', arc.label);
+    assertFalse(arc.isFinal());
+    assertEquals(42, arc.output.longValue());
+  }
+}



Mime
View raw message