lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dwe...@apache.org
Subject svn commit: r1209265 - in /lucene/dev/trunk: lucene/src/java/org/apache/lucene/util/fst/ modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ modules/suggest/src/test/org/apache/lucene/search/suggest/ modules/suggest/src/test/org/apache/lucen...
Date Thu, 01 Dec 2011 21:49:41 GMT
Author: dweiss
Date: Thu Dec  1 21:49:27 2011
New Revision: 1209265

URL: http://svn.apache.org/viewvc?rev=1209265&view=rev
Log:
SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8,
external sorting (on disk) prevents OOMs even with large data sets
(the bottleneck is now FST construction), code cleanups and API cleanups.

Added:
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java
      - copied, changed from r1209076, lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java
Removed:
    lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java
Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
    lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java?rev=1209265&r1=1209264&r2=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/fst/FST.java Thu Dec  1 21:49:27 2011
@@ -17,12 +17,22 @@ package org.apache.lucene.util.fst;
  * limitations under the License.
  */
 
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.Builder.UnCompiledNode;
 
 // TODO: if FST is pure prefix trie we can do a more compact
@@ -334,6 +344,43 @@ public class FST<T> {
     out.writeVInt(bytes.length);
     out.writeBytes(bytes, 0, bytes.length);
   }
+  
+  /**
+   * Writes an automaton to a file. 
+   */
+  public void save(final File file) throws IOException {
+    boolean success = false;
+    OutputStream os = new BufferedOutputStream(new FileOutputStream(file));
+    try {
+      save(new OutputStreamDataOutput(os));
+      success = true;
+    } finally { 
+      if (success) { 
+        IOUtils.close(os);
+      } else {
+        IOUtils.closeWhileHandlingException(os); 
+      }
+    }
+  }
+
+  /**
+   * Reads an automaton from a file. 
+   */
+  public static <T> FST<T> read(File file, Outputs<T> outputs) throws IOException {
+    InputStream is = new BufferedInputStream(new FileInputStream(file));
+    boolean success = false;
+    try {
+      FST<T> fst = new FST<T>(new InputStreamDataInput(is), outputs);
+      success = true;
+      return fst;
+    } finally {
+      if (success) { 
+        IOUtils.close(is);
+      } else {
+        IOUtils.closeWhileHandlingException(is); 
+      }
+    }
+  }
 
   private void writeLabel(int v) throws IOException {
     assert v >= 0: "v=" + v;

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,29 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
+ * of this interface will be called in a single-threaded scenario.  
+ */
+public interface BytesRefSorter {
+  /**
+   * Adds a single suggestion entry (possibly compound with its bucket).
+   * 
+   * @throws IOException If an I/O exception occurs.
+   * @throws IllegalStateException If an addition attempt is performed after
+   * a call to {@link #iterator()} has been made.
+   */
+  void add(BytesRef utf8) throws IOException, IllegalStateException;
+
+  /**
+   * Sorts the entries added in {@link #add(BytesRef)} and returns 
+   * an iterator over all sorted entries.
+   * 
+   * @throws IOException If an I/O exception occurs.
+   */
+  Iterator<BytesRef> iterator() throws IOException;
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,105 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.*;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Builds and iterates over sequences stored on disk.
+ */
+public class ExternalRefSorter implements BytesRefSorter, Closeable {
+  private final Sort sort;
+  private Sort.ByteSequencesWriter writer;
+  private File input;
+  private File sorted; 
+
+  /**
+   * Will buffer all sequences to a temporary file and then sort (all on-disk).
+   */
+  public ExternalRefSorter(Sort sort) throws IOException {
+    this.sort = sort;
+    this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
+    this.writer = new Sort.ByteSequencesWriter(input);
+  }
+
+  @Override
+  public void add(BytesRef utf8) throws IOException {
+    if (writer == null)
+      throw new IllegalStateException();
+    writer.write(utf8);
+  }
+
+  @Override
+  public Iterator<BytesRef> iterator() throws IOException {
+    if (sorted == null) {
+      closeWriter();
+
+      sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
+      sort.sort(input, sorted);
+
+      input.delete();
+      input = null;
+    }
+
+    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
+  }
+
+  private void closeWriter() throws IOException {
+    if (writer != null) {
+      writer.close();
+      writer = null;
+    }
+  }
+
+  /**
+   * Removes any written temporary files.
+   */
+  @Override
+  public void close() throws IOException {
+    try {
+      closeWriter();
+    } finally {
+      if (input != null) input.delete();
+      if (sorted != null) sorted.delete();
+    }
+  }
+
+  /**
+   * Iterate over byte refs in a file.
+   */
+  class ByteSequenceIterator implements Iterator<BytesRef> {
+    private ByteSequencesReader reader;
+    private byte[] next;
+
+    public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
+      this.reader = reader;
+      this.next = reader.read();
+    }
+
+    @Override
+    public boolean hasNext() {
+      return next != null;
+    }
+    
+    @Override
+    public BytesRef next() {
+      if (next == null) throw new NoSuchElementException();
+      BytesRef r = new BytesRef(next);
+      try {
+        next = reader.read();
+        if (next == null) {
+          reader.close();
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+      return r;
+    }
+
+    @Override
+    public void remove() { throw new UnsupportedOperationException(); }
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,381 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FST.Arc;
+
+/**
+ * Finite state automata based implementation of "autocomplete" functionality.
+ * 
+ * @see FSTCompletionBuilder
+ */
+
+// TODO: we could store exact weights as outputs from the FST (int4 encoded
+// floats). This would provide exact outputs from this method and to some
+// degree allowed post-sorting on a more fine-grained weight.
+
+// TODO: support for Analyzers (infix suggestions, synonyms?)
+
+public class FSTCompletion {
+  /**
+   * A single completion for a given key.
+   */
+  public static final class Completion implements Comparable<Completion> {
+    public final BytesRef utf8;
+    public final int bucket;
+
+    Completion(BytesRef key, int bucket) {
+      this.utf8 = BytesRef.deepCopyOf(key);
+      this.bucket = bucket;
+    }
+
+    @Override
+    public String toString() {
+      return utf8.utf8ToString() + "/" + bucket;
+    }
+
+    /** @see BytesRef#compareTo(BytesRef) */
+    public int compareTo(Completion o) {
+      return this.utf8.compareTo(o.utf8);
+    }
+  }
+
+  /** 
+   * Default number of buckets.
+   */
+  public static final int DEFAULT_BUCKETS = 10;
+
+  /**
+   * An empty result. Keep this an {@link ArrayList} to keep all the returned
+   * lists of single type (monomorphic calls).
+   */
+  private static final ArrayList<Completion> EMPTY_RESULT = new ArrayList<Completion>();
+
+  /**
+   * Finite state automaton encoding all the lookup terms. See class notes for
+   * details.
+   */
+  private final FST<Object> automaton;
+
+  /**
+   * An array of arcs leaving the root automaton state and encoding weights of
+   * all completions in their sub-trees.
+   */
+  private final Arc<Object>[] rootArcs;
+
+  /**
+   * @see #FSTCompletion(FST, boolean, boolean)
+   */
+  private boolean exactFirst;
+
+  /**
+   * @see #FSTCompletion(FST, boolean, boolean)
+   */
+  private boolean higherWeightsFirst;
+
+  /**
+   * @param automaton
+   *          Automaton with completions. See {@link FSTCompletionBuilder}.
+   * @param higherWeightsFirst
+   *          Return most popular suggestions first. This is the default
+   *          behavior for this implementation. Setting it to <code>false</code>
+   *          has no effect (use constant term weights to sort alphabetically
+   *          only).
+   * @param exactFirst
+   *          Find and push an exact match to the first position of the result
+   *          list if found.
+   */
+  @SuppressWarnings("unchecked")
+  public FSTCompletion(FST<Object> automaton, boolean higherWeightsFirst, boolean exactFirst) {
+    this.automaton = automaton;
+    if (automaton != null) {
+      this.rootArcs = cacheRootArcs(automaton);
+    } else {
+      this.rootArcs = new Arc[0];
+    }
+    this.higherWeightsFirst = higherWeightsFirst;
+    this.exactFirst = exactFirst;
+  }
+
+  /**
+   * Defaults to higher weights first and exact first.
+   * @see #FSTCompletion(FST, boolean, boolean)
+   */
+  public FSTCompletion(FST<Object> automaton) {
+    this(automaton, true, true);
+  }
+
+  /**
+   * Cache the root node's output arcs starting with completions with the
+   * highest weights.
+   */
+  @SuppressWarnings({"all"})
+  private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
+    try {
+      List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
+      Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
+      automaton.readFirstTargetArc(arc, arc);
+      while (true) {
+        rootArcs.add(new Arc<Object>().copyFrom(arc));
+        if (arc.isLast()) break;
+        automaton.readNextArc(arc);
+      }
+      
+      Collections.reverse(rootArcs); // we want highest weights first.
+      return rootArcs.toArray(new Arc[rootArcs.size()]);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Returns the first exact match by traversing root arcs, starting from the
+   * arc <code>rootArcIndex</code>.
+   * 
+   * @param rootArcIndex
+   *          The first root arc index in {@link #rootArcs} to consider when
+   *          matching.
+   * 
+   * @param utf8
+   *          The sequence of utf8 bytes to follow.
+   * 
+   * @return Returns the bucket number of the match or <code>null</code> if no
+   *         match was found.
+   */
+  private Integer getExactMatchStartingFromRootArc(
+      int rootArcIndex, BytesRef utf8) {
+    // Get the UTF-8 bytes representation of the input key.
+    try {
+      final FST.Arc<Object> scratch = new FST.Arc<Object>();
+      for (; rootArcIndex < rootArcs.length; rootArcIndex++) {
+        final FST.Arc<Object> rootArc = rootArcs[rootArcIndex];
+        final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
+        
+        // Descend into the automaton using the key as prefix.
+        if (descendWithPrefix(arc, utf8)) {
+          automaton.readFirstTargetArc(arc, arc);
+          if (arc.label == FST.END_LABEL) {
+            // Normalize prefix-encoded weight.
+            return rootArc.label;
+          }
+        }
+      }
+    } catch (IOException e) {
+      // Should never happen, but anyway.
+      throw new RuntimeException(e);
+    }
+    
+    // No match.
+    return null;
+  }
+  
+  /**
+   * Lookup suggestions to <code>key</code>.
+   * 
+   * @param key
+   *          The prefix to which suggestions should be sought.
+   * @param num
+   *          At most this number of suggestions will be returned.
+   * @return Returns the suggestions, sorted by their approximated weight first
+   *         (decreasing) and then alphabetically (UTF-8 codepoint order).
+   */
+  public List<Completion> lookup(String key, int num) {
+    if (key.length() == 0 || automaton == null) {
+      return EMPTY_RESULT;
+    }
+
+    try {
+      BytesRef keyUtf8 = new BytesRef(key);
+      if (!higherWeightsFirst && rootArcs.length > 1) {
+        // We could emit a warning here (?). An optimal strategy for
+        // alphabetically sorted
+        // suggestions would be to add them with a constant weight -- this saves
+        // unnecessary
+        // traversals and sorting.
+        return lookupSortedAlphabetically(keyUtf8, num);
+      } else {
+        return lookupSortedByWeight(keyUtf8, num, false);
+      }
+    } catch (IOException e) {
+      // Should never happen, but anyway.
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Lookup suggestions sorted alphabetically <b>if weights are not
+   * constant</b>. This is a workaround: in general, use constant weights for
+   * alphabetically sorted result.
+   */
+  private List<Completion> lookupSortedAlphabetically(BytesRef key, int num)
+      throws IOException {
+    // Greedily get num results from each weight branch.
+    List<Completion> res = lookupSortedByWeight(key, num, true);
+
+    // Sort and trim.
+    Collections.sort(res);
+    if (res.size() > num) {
+      res = res.subList(0, num);
+    }
+    return res;
+  }
+
+  /**
+   * Lookup suggestions sorted by weight (descending order).
+   * 
+   * @param collectAll
+   *          If <code>true</code>, the routine terminates immediately when
+   *          <code>num</code> suggestions have been collected. If
+   *          <code>false</code>, it will collect suggestions from all weight
+   *          arcs (needed for {@link #lookupSortedAlphabetically}.
+   */
+  private ArrayList<Completion> lookupSortedByWeight(BytesRef key, 
+      int num, boolean collectAll) throws IOException {
+    // Don't overallocate the results buffers. This also serves the purpose of
+    // allowing the user of this class to request all matches using Integer.MAX_VALUE as
+    // the number of results.
+    final ArrayList<Completion> res = new ArrayList<Completion>(Math.min(10, num));
+
+    final BytesRef output = BytesRef.deepCopyOf(key);
+    for (int i = 0; i < rootArcs.length; i++) {
+      final FST.Arc<Object> rootArc = rootArcs[i];
+      final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
+
+      // Descend into the automaton using the key as prefix.
+      if (descendWithPrefix(arc, key)) {
+        // A subgraph starting from the current node has the completions
+        // of the key prefix. The arc we're at is the last key's byte,
+        // so we will collect it too.
+        output.length = key.length - 1;
+        if (collect(res, num, rootArc.label, output, arc) && !collectAll) {
+          // We have enough suggestions to return immediately. Keep on looking
+          // for an
+          // exact match, if requested.
+          if (exactFirst) {
+            if (!checkExistingAndReorder(res, key)) {
+              Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
+              if (exactMatchBucket != null) {
+                // Insert as the first result and truncate at num.
+                while (res.size() >= num) {
+                  res.remove(res.size() - 1);
+                }
+                res.add(0, new Completion(key, exactMatchBucket));
+              }
+            }
+          }
+          break;
+        }
+      }
+    }
+    return res;
+  }
+  
+  /**
+   * Checks if the list of
+   * {@link org.apache.lucene.search.suggest.Lookup.LookupResult}s already has a
+   * <code>key</code>. If so, reorders that
+   * {@link org.apache.lucene.search.suggest.Lookup.LookupResult} to the first
+   * position.
+   * 
+   * @return Returns <code>true<code> if and only if <code>list</code> contained
+   *         <code>key</code>.
+   */
+  private boolean checkExistingAndReorder(ArrayList<Completion> list, BytesRef key) {
+    // We assume list does not have duplicates (because of how the FST is created).
+    for (int i = list.size(); --i >= 0;) {
+      if (key.equals(list.get(i).utf8)) {
+        // Key found. Unless already at i==0, remove it and push up front so
+        // that the ordering
+        // remains identical with the exception of the exact match.
+        list.add(0, list.remove(i));
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  /**
+   * Descend along the path starting at <code>arc</code> and going through bytes
+   * in the argument.
+   * 
+   * @param arc
+   *          The starting arc. This argument is modified in-place.
+   * @param utf8
+   *          The term to descend along.
+   * @return If <code>true</code>, <code>arc</code> will be set to the arc
+   *         matching last byte of <code>term</code>. <code>false</code> is
+   *         returned if no such prefix exists.
+   */
+  private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
+      throws IOException {
+    final int max = utf8.offset + utf8.length;
+    for (int i = utf8.offset; i < max; i++) {
+      if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
+        // No matching prefixes, return an empty result.
+        return false;
+      }
+    }
+    return true;
+  }
+  
+  /**
+   * Recursive collect lookup results from the automaton subgraph starting at
+   * <code>arc</code>.
+   * 
+   * @param num
+   *          Maximum number of results needed (early termination).
+   */
+  private boolean collect(List<Completion> res, int num, int bucket,
+      BytesRef output, Arc<Object> arc) throws IOException {
+    if (output.length == output.bytes.length) {
+      output.bytes = ArrayUtil.grow(output.bytes);
+    }
+    assert output.offset == 0;
+    output.bytes[output.length++] = (byte) arc.label;
+    
+    automaton.readFirstTargetArc(arc, arc);
+    while (true) {
+      if (arc.label == FST.END_LABEL) {
+        res.add(new Completion(output, bucket));
+        if (res.size() >= num) return true;
+      } else {
+        int save = output.length;
+        if (collect(res, num, bucket, output, new Arc<Object>().copyFrom(arc))) {
+          return true;
+        }
+        output.length = save;
+      }
+      
+      if (arc.isLast()) {
+        break;
+      }
+      automaton.readNextArc(arc);
+    }
+    return false;
+  }
+
+  /**
+   * Returns the bucket count (discretization thresholds).
+   */
+  public int getBucketCount() {
+    return rootArcs.length;
+  }
+
+  /**
+   * Returns the bucket assigned to a given key (if found) or <code>null</code> if
+   * no exact match exists.
+   */
+  public Integer getBucket(String key) {
+    return getExactMatchStartingFromRootArc(0, new BytesRef(key));
+  }
+
+  /**
+   * Returns the internal automaton.
+   */
+  public FST<Object> getFST() {
+    return automaton;
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,233 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.fst.*;
+
+/**
+ * Finite state automata based implementation of "autocomplete" functionality.
+ * 
+ * <h2>Implementation details</h2>
+ * 
+ * <p>
+ * The construction step in {@link #finalize()} works as follows:
+ * <ul>
+ * <li>A set of input terms and their buckets is given.</li>
+ * <li>All terms in the input are prefixed with a synthetic pseudo-character
+ * (code) of the weight bucket the term fell into. For example a term
+ * <code>abc</code> with a discretized weight equal '1' would become
+ * <code>1abc</code>.</li>
+ * <li>The terms are then sorted by their raw value of UTF-8 character values
+ * (including the synthetic bucket code in front).</li>
+ * <li>A finite state automaton ({@link FST}) is constructed from the input. The
+ * root node has arcs labeled with all possible weights. We cache all these
+ * arcs, highest-weight first.</li>
+ * </ul>
+ * 
+ * <p>
+ * At runtime, in {@link FSTCompletion#lookup(String, int)}, 
+ * the automaton is utilized as follows:
+ * <ul>
+ * <li>For each possible term weight encoded in the automaton (cached arcs from
+ * the root above), starting with the highest one, we descend along the path of
+ * the input key. If the key is not a prefix of a sequence in the automaton
+ * (path ends prematurely), we exit immediately -- no completions.</li>
+ * <li>Otherwise, we have found an internal automaton node that ends the key.
+ * <b>The entire subautomaton (all paths) starting from this node form the key's
+ * completions.</b> We start the traversal of this subautomaton. Every time we
+ * reach a final state (arc), we add a single suggestion to the list of results
+ * (the weight of this suggestion is constant and equal to the root path we
+ * started from). The tricky part is that because automaton edges are sorted and
+ * we scan depth-first, we can terminate the entire procedure as soon as we
+ * collect enough suggestions the user requested.</li>
+ * <li>In case the number of suggestions collected in the step above is still
+ * insufficient, we proceed to the next (smaller) weight leaving the root node
+ * and repeat the same algorithm again.</li>
+ * </ul>
+ * 
+ * <h2>Runtime behavior and performance characteristic</h2>
+ * 
+ * The algorithm described above is optimized for finding suggestions to short
+ * prefixes in a top-weights-first order. This is probably the most common use
+ * case: it allows presenting suggestions early and sorts them by the global
+ * frequency (and then alphabetically).
+ * 
+ * <p>
+ * If there is an exact match in the automaton, it is returned first on the
+ * results list (even with by-weight sorting).
+ * 
+ * <p>
+ * Note that the maximum lookup time for <b>any prefix</b> is the time of
+ * descending to the subtree, plus traversal of the subtree up to the number of
+ * requested suggestions (because they are already presorted by weight on the
+ * root level and alphabetically at any node level).
+ * 
+ * <p>
+ * To order alphabetically only (no ordering by priorities), use identical term
+ * weights for all terms. Alphabetical suggestions are returned even if
+ * non-constant weights are used, but the algorithm for doing this is
+ * suboptimal.
+ * 
+ * <p>
+ * "alphabetically" in any of the documentation above indicates UTF-8
+ * representation order, nothing else.
+ * 
+ * <p>
+ * <b>NOTE</b>: the FST file format is experimental and subject to suddenly
+ * change, requiring you to rebuild the FST suggest index.
+ * 
+ * @see FSTCompletion
+ */
+public class FSTCompletionBuilder {
+  /** 
+   * Default number of buckets.
+   */
+  public static final int DEFAULT_BUCKETS = 10;
+
+  /**
+   * The number of separate buckets for weights (discretization). The more
+   * buckets, the more fine-grained term weights (priorities) can be assigned.
+   * The speed of lookup will not decrease for prefixes which have
+   * highly-weighted completions (because these are filled-in first), but will
+   * decrease significantly for low-weighted terms (but these should be
+   * infrequent, so it is all right).
+   * 
+   * <p>
+   * The number of buckets must be within [1, 255] range.
+   */
+  private final int buckets;
+
+  /**
+   * Finite state automaton encoding all the lookup terms. See class notes for
+   * details.
+   */
+  FST<Object> automaton;
+
+  /**
+   * FST construction require re-sorting the input. This is the class that
+   * collects all the input entries, their weights and then provides sorted
+   * order.
+   */
+  private final BytesRefSorter sorter;
+  
+  /**
+   * Scratch buffer for {@link #add(BytesRef, int)}.
+   */
+  private final BytesRef scratch = new BytesRef();
+
+  /**
+   * Max tail sharing length.
+   */
+  private final int shareMaxTailLength;
+
+  /**
+   * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
+   * promoted to first position and {@link InMemorySorter}.
+   */
+  public FSTCompletionBuilder() {
+    this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
+  }
+
+  /**
+   * @param buckets
+   *          The number of buckets for weight discretization. Buckets are used
+   *          in {@link #add(BytesRef, int)} and must be smaller than the number
+   *          given here.
+   *          
+   * @param sorter
+   *          {@link BytesRefSorter} used for re-sorting input for the automaton.
+   *          For large inputs, use on-disk sorting implementations. The sorter
+   *          is closed automatically in {@link #build()} if it implements
+   *          {@link Closeable}.
+   *          
+   * @param shareMaxTailLength
+   *          Max shared suffix sharing length.
+   *          
+   *          See the description of this parameter in {@link Builder}'s constructor.
+   *          In general, for very large inputs you'll want to construct a non-minimal
+   *          automaton which will be larger, but the construction will take far less ram.
+   *          For minimal automata, set it to {@link Integer#MAX_VALUE}.
+   */
+  public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
+    if (buckets < 1 || buckets > 255) {
+      throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: "
+          + buckets);
+    }
+    
+    if (sorter == null) throw new IllegalArgumentException(
+        "BytesRefSorter must not be null.");
+    
+    this.sorter = sorter;
+    this.buckets = buckets;
+    this.shareMaxTailLength = shareMaxTailLength;
+  }
+
+  /**
+   * Appends a single suggestion and its weight to the internal buffers.
+   * 
+   * @param utf8
+   *          The suggestion (utf8 representation) to be added. The content is
+   *          copied and the object can be reused.
+   * @param bucket
+   *          The bucket to place this suggestion in. Must be non-negative and
+   *          smaller than the number of buckets passed in the constructor.
+   *          Higher numbers indicate suggestions that should be presented
+   *          before suggestions placed in smaller buckets.
+   */
+  public void add(BytesRef utf8, int bucket) throws IOException {
+    if (bucket < 0 || bucket >= buckets) {
+      throw new IllegalArgumentException(
+          "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
+    }
+    
+    if (scratch.bytes.length < utf8.length + 1) {
+      scratch.grow(utf8.length + 10);
+    }
+    
+    scratch.length = 1;
+    scratch.bytes[0] = (byte) bucket;
+    scratch.append(utf8);
+    sorter.add(scratch);
+  }
+
+  /**
+   * Builds the final automaton from a list of added entries. This method may
+   * take a longer while as it needs to build the automaton.
+   */
+  public FSTCompletion build() throws IOException {
+    this.automaton = buildAutomaton(sorter);
+
+    if (sorter instanceof Closeable) {
+      ((Closeable) sorter).close();
+    }
+
+    return new FSTCompletion(automaton);
+  }
+
+  /**
+   * Builds the final automaton from a list of entries.
+   */
+  private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
+    // Build the automaton.
+    final Outputs<Object> outputs = NoOutputs.getSingleton();
+    final Object empty = outputs.getNoOutput();
+    final Builder<Object> builder = new Builder<Object>(
+        FST.INPUT_TYPE.BYTE1, 0, 0, true, true, 
+        shareMaxTailLength, outputs, null);
+    
+    BytesRef scratch = new BytesRef();
+    int count = 0;
+    for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
+      BytesRef entry = i.next();
+      if (scratch.compareTo(entry) != 0) {
+        builder.add(entry, empty);
+        scratch.copyBytes(entry);
+      }
+    }
+    
+    return count == 0 ? null : builder.finish();
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,251 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
+import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.*;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.NoOutputs;
+
+/**
+ * An adapter from {@link Lookup} API to {@link FSTCompletion}.
+ * 
+ * <p>This adapter differs from {@link FSTCompletion} in that it attempts
+ * to discretize any "weights" as passed from in {@link TermFreqIterator#freq()}
+ * to match the number of buckets. For the rationale for bucketing, see
+ * {@link FSTCompletion}.
+ * 
+ * <p><b>Note:</b>Discretization requires an additional sorting pass.
+ * 
+ * <p>The range of weights for bucketing/ discretization is determined 
+ * by sorting the input by weight and then dividing into
+ * equal ranges. Then, scores within each range are assigned to that bucket. 
+ * 
+ * <p>Note that this means that even large differences in weights may be lost 
+ * during automaton construction, but the overall distinction between "classes"
+ * of weights will be preserved regardless of the distribution of weights. 
+ * 
+ * <p>For fine-grained control over which weights are assigned to which buckets,
+ * use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
+ * 
+ * @see FSTCompletion
+ */
+public class FSTCompletionLookup extends Lookup {
+  /**
+   * Shared tail length for conflating in the created automaton. Setting this
+   * to larger values ({@link Integer#MAX_VALUE}) will create smaller (or minimal) 
+   * automata at the cost of RAM for keeping nodes hash in the {@link FST}. 
+   *  
+   * <p>Empirical pick.
+   */
+  private final static int sharedTailLength = 5;
+
+  /**
+   * File name for the automaton.
+   * 
+   * @see #store(File)
+   * @see #load(File)
+   */
+  private static final String FILENAME = "fst.bin";
+
+  private int buckets;
+  private boolean exactMatchFirst;
+
+  /**
+   * Automaton used for completions with higher weights reordering.
+   */
+  private FSTCompletion higherWeightsCompletion;
+
+  /**
+   * Automaton used for normal completions.
+   */
+  private FSTCompletion normalCompletion;
+
+  /*
+   * 
+   */
+  public FSTCompletionLookup() {
+    this(FSTCompletion.DEFAULT_BUCKETS, true);
+  }
+
+  /*
+   * 
+   */
+  public FSTCompletionLookup(FSTCompletion completion, int buckets, boolean exactMatchFirst) {
+    this(buckets, exactMatchFirst);
+    this.normalCompletion = new FSTCompletion(
+        completion.getFST(), false, exactMatchFirst);
+    this.higherWeightsCompletion =  new FSTCompletion(
+        completion.getFST(), true, exactMatchFirst);
+  }
+
+  /*
+   * 
+   */
+  public FSTCompletionLookup(int buckets, boolean exactMatchFirst) {
+    this.buckets = buckets;
+    this.exactMatchFirst = exactMatchFirst;
+  }
+
+  /*
+   * 
+   */
+  @Override
+  public void build(TermFreqIterator tfit) throws IOException {
+    File tempInput = File.createTempFile(
+        FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
+    File tempSorted = File.createTempFile(
+        FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());
+
+    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
+    Sort.ByteSequencesReader reader = null;
+
+    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
+    // If negative floats are allowed some trickery needs to be done to find their byte order.
+    boolean success = false;
+    try {
+      BytesRef tmp1 = new BytesRef();
+      byte [] buffer = new byte [0];
+      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+      while (tfit.hasNext()) {
+        String key = tfit.next();
+        UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1);
+
+        if (tmp1.length + 4 >= buffer.length) {
+          buffer = ArrayUtil.grow(buffer, tmp1.length + 4);
+        }
+
+        output.reset(buffer);
+        output.writeInt(FloatMagic.toSortable(tfit.freq()));
+        output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length);
+        writer.write(buffer, 0, output.getPosition());
+      }
+      writer.close();
+
+      // We don't know the distribution of scores and we need to bucket them, so we'll sort
+      // and divide into equal buckets.
+      SortInfo info = new Sort().sort(tempInput, tempSorted);
+      tempInput.delete();
+      FSTCompletionBuilder builder = new FSTCompletionBuilder(
+          buckets, new ExternalRefSorter(new Sort()), sharedTailLength);
+
+      final int inputLines = info.lines;
+      reader = new Sort.ByteSequencesReader(tempSorted);
+      long line = 0;
+      int previousBucket = 0;
+      float previousScore = 0;
+      ByteArrayDataInput input = new ByteArrayDataInput();
+      BytesRef tmp2 = new BytesRef();
+      while (reader.read(tmp1)) {
+        input.reset(tmp1.bytes);
+        float currentScore = FloatMagic.fromSortable(input.readInt());
+
+        int bucket;
+        if (line > 0 && currentScore == previousScore) {
+          bucket = previousBucket;
+        } else {
+          bucket = (int) (line * buckets / inputLines);
+        }
+        previousScore = currentScore;
+        previousBucket = bucket;
+
+        // Only append the input, discard the weight.
+        tmp2.bytes = tmp1.bytes;
+        tmp2.offset = input.getPosition();
+        tmp2.length = tmp1.length - input.getPosition();
+        builder.add(tmp2, bucket);
+
+        line++;
+      }
+
+      // The two FSTCompletions share the same automaton.
+      this.higherWeightsCompletion = builder.build();
+      this.normalCompletion = new FSTCompletion(
+          higherWeightsCompletion.getFST(), false, exactMatchFirst);
+      
+      success = true;
+    } finally {
+      if (success) 
+        IOUtils.close(reader, writer);
+      else 
+        IOUtils.closeWhileHandlingException(reader, writer);
+
+      tempInput.delete();
+      tempSorted.delete();
+    }
+  }
+
+  @Override
+  public List<LookupResult> lookup(String key, boolean higherWeightsFirst, int num) {
+    final List<Completion> completions;
+    if (higherWeightsFirst) {
+      completions = higherWeightsCompletion.lookup(key, num);
+    } else {
+      completions = normalCompletion.lookup(key, num);
+    }
+    
+    final ArrayList<LookupResult> results = new ArrayList<LookupResult>(completions.size());
+    for (Completion c : completions) {
+      results.add(new LookupResult(c.utf8.utf8ToString(), c.bucket));
+    }
+    return results;
+  }
+
+  @Override
+  public boolean add(String key, Object value) {
+    // Not supported.
+    return false;
+  }
+
+  @Override
+  public Float get(String key) {
+    Integer bucket = normalCompletion.getBucket(key);
+    if (bucket == null)
+      return null;
+    else
+      return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
+  }
+
+  /**
+   * Deserialization from disk.
+   */
+  @Override
+  public synchronized boolean load(File storeDir) throws IOException {
+    File data = new File(storeDir, FILENAME);
+    if (!data.exists() || !data.canRead()) {
+      return false;
+    }
+
+    this.higherWeightsCompletion = new FSTCompletion(
+        FST.read(data, NoOutputs.getSingleton()));
+    this.normalCompletion = new FSTCompletion(
+        higherWeightsCompletion.getFST(), false, exactMatchFirst);
+
+    return true;
+  }
+
+  /**
+   * Serialization to disk.
+   */
+  @Override
+  public synchronized boolean store(File storeDir) throws IOException {
+    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
+      return false;
+    }
+
+    if (this.normalCompletion == null) 
+      return false;
+
+    normalCompletion.getFST().save(new File(storeDir, FILENAME));
+    return true;
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,58 @@
+package org.apache.lucene.search.suggest.fst;
+
+import org.apache.lucene.util.NumericUtils;
+
+/**
+ * Converts normalized float representations ({@link Float#floatToIntBits(float)})
+ * into integers that are directly sortable in int4 representation (or unsigned values or
+ * after promoting to a long with higher 32-bits zeroed).
+ */
+class FloatMagic {
+  /**
+   * Convert a float to a directly sortable unsigned integer. For sortable signed
+   * integers, see {@link NumericUtils#floatToSortableInt(float)}.
+   */
+  public static int toSortable(float f) {
+    return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
+  }
+
+  /**
+   * Back from {@link #toSortable(float)} to float.
+   */
+  public static float fromSortable(int v) {
+    return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
+  }
+
+  /**
+   * Convert float bits to directly sortable bits. 
+   * Normalizes all NaNs to canonical form.
+   */
+  static int floatBitsToUnsignedOrdered(int v) {
+    // Canonicalize NaN ranges. I assume this check will be faster here than 
+    // (v == v) == false on the FPU? We don't distinguish between different
+    // flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess
+    // in Java this doesn't matter much anyway.
+    if ((v & 0x7fffffff) > 0x7f800000) {
+      // Apply the logic below to a canonical "quiet NaN"
+      return 0x7fc00000 ^ 0x80000000;
+    }
+
+    if (v < 0) {
+      // Reverse the order of negative values and push them before positive values.
+      return ~v;
+    } else {
+      // Shift positive values after negative, but before NaNs, they're sorted already.
+      return v ^ 0x80000000;
+    }
+  }
+
+  /**
+   * Back from {@link #floatBitsToUnsignedOrdered(int)}.
+   */
+  static int unsignedOrderedToFloatBits(int v) {
+    if (v < 0)
+      return v & ~0x80000000;
+    else
+      return ~v; 
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,28 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.util.*;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * An {@link BytesRefSorter} that keeps all the entries in memory.
+ */
+public final class InMemorySorter implements BytesRefSorter {
+  // TODO: use a single byte[] to back up all entries?
+  private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();
+  
+  private boolean closed = false;
+
+  @Override
+  public void add(BytesRef utf8) {
+    if (closed) throw new IllegalStateException();
+    refs.add(BytesRef.deepCopyOf(utf8));
+  }
+
+  @Override
+  public Iterator<BytesRef> iterator() {
+    closed = true;
+    Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
+    return Collections.unmodifiableCollection(refs).iterator();
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java (added)
+++ lucene/dev/trunk/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,483 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.util.PriorityQueue;
+
+// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
+// memory (alignments) and make GC very happy.
+// 
+// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
+// yields any improvement first.
+
+/**
+ * On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
+ * fields:
+ * <ul>
+ *   <li>(two bytes) length of the following byte array,
+ *   <li>exactly the above count of bytes for the sequence to be sorted.
+ * </ul>
+ * 
+ * @see #sort(File, File)
+ */
+public final class Sort {
+  public final static int MB = 1024 * 1024;
+  public final static int GB = MB * 1024;
+  
+  /**
+   * Minimum recommended buffer size for sorting.
+   */
+  public final static int MIN_BUFFER_SIZE_MB = 32;
+
+  /**
+   * Maximum number of temporary files before doing an intermediate merge.
+   */
+  public final static int MAX_TEMPFILES = 128;
+
+  /**
+   * Minimum slot buffer expansion.
+   */
+  private final static int MIN_EXPECTED_GROWTH = 1000;
+
+  /** 
+   * A bit more descriptive unit for constructors.
+   * 
+   * @see #automatic()
+   * @see #megabytes(int)
+   */
+  public static final class BufferSize {
+    final int bytes;
+  
+    private BufferSize(long bytes) {
+      if (bytes > Integer.MAX_VALUE) {
+        throw new IllegalArgumentException("Buffer too large for Java ("
+            + (Integer.MAX_VALUE / MB) + "mb max): " + bytes);
+      }
+  
+      this.bytes = (int) bytes;
+    }
+  
+    public static BufferSize megabytes(int mb) {
+      return new BufferSize(mb * MB);
+    }
+  
+    /** 
+     * Approximately half of the currently available free heap, but no less
+     * than {@link #MIN_BUFFER_SIZE_MB}.
+     */
+    public static BufferSize automatic() {
+      long freeHeap = Runtime.getRuntime().freeMemory();
+      return new BufferSize(Math.min(MIN_BUFFER_SIZE_MB * MB, freeHeap / 2));
+    }
+  }
+
+  /**
+   * byte[] in unsigned byte order.
+   */
+  static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
+    public int compare(byte[] left, byte[] right) {
+      final int max = Math.min(left.length, right.length);
+      for (int i = 0, j = 0; i < max; i++, j++) {
+        int diff = (left[i]  & 0xff) - (right[j] & 0xff); 
+        if (diff != 0) 
+          return diff;
+      }
+      return left.length - right.length;
+    }
+  };
+
+  /**
+   * Sort info (debugging mostly).
+   */
+  public class SortInfo {
+    public int tempMergeFiles;
+    public int mergeRounds;
+    public int lines;
+    public long mergeTime;
+    public long sortTime;
+    public long totalTime;
+    public long readTime;
+    public final long bufferSize = ramBufferSize.bytes;
+    
+    @Override
+    public String toString() {
+      return String.format(Locale.ENGLISH,
+          "time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
+          totalTime / 1000.0d, readTime / 1000.0d, sortTime / 1000.0d, mergeTime / 1000.0d,
+          lines, tempMergeFiles, mergeRounds,
+          (double) bufferSize / MB);
+    }
+  }
+
+  private final static byte [][] EMPTY = new byte [0][];
+
+  private final BufferSize ramBufferSize;
+  private final File tempDirectory;
+
+  private byte [][] buffer = new byte [0][];
+  private SortInfo sortInfo;
+  private int maxTempFiles;
+
+  /**
+   * Defaults constructor.
+   * 
+   * @see #defaultTempDir()
+   * @see BufferSize#automatic()
+   */
+  public Sort() throws IOException {
+    this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
+  }
+
+  /**
+   * All-details constructor.
+   */
+  public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
+    if (ramBufferSize.bytes < 1024 * 1024 / 2) {
+      // Half-meg buffer is the absolute minimum.
+      throw new IllegalArgumentException("At least 0.5MB RAM buffer is needed: "
+          + ramBufferSize.bytes);
+    }
+    
+    if (maxTempfiles < 2) {
+      throw new IllegalArgumentException("maxTempFiles must be >= 2");
+    }
+
+    this.ramBufferSize = ramBufferSize;
+    this.tempDirectory = tempDirectory;
+    this.maxTempFiles = maxTempfiles;
+  }
+
+  /** 
+   * Sort input to output, explicit hint for the buffer size. The amount of allocated
+   * memory may deviate from the hint (may be smaller or larger).  
+   */
+  public SortInfo sort(File input, File output) throws IOException {
+    sortInfo = new SortInfo();
+    sortInfo.totalTime = System.currentTimeMillis();
+
+    output.delete();
+
+    ArrayList<File> merges = new ArrayList<File>();
+    ByteSequencesReader is = new ByteSequencesReader(input);
+    boolean success = false;
+    try {
+      int lines = 0;
+      while ((lines = readPartition(is)) > 0) {                    
+        merges.add(sortPartition(lines));
+        sortInfo.tempMergeFiles++;
+        sortInfo.lines += lines;
+
+        // Handle intermediate merges.
+        if (merges.size() == maxTempFiles) {
+          File intermediate = File.createTempFile("sort", "intermediate", tempDirectory);
+          mergePartitions(merges, intermediate);
+          for (File file : merges) {
+            file.delete();
+          }
+          merges.clear();
+          merges.add(intermediate);
+          sortInfo.tempMergeFiles++;
+        }
+      }
+      success = true;
+    } finally {
+      if (success)
+        IOUtils.close(is);
+      else
+        IOUtils.closeWhileHandlingException(is);
+    }
+
+    // One partition, try to rename or copy if unsuccessful.
+    if (merges.size() == 1) {     
+      // If simple rename doesn't work this means the output is
+      // on a different volume or something. Copy the input then.
+      if (!merges.get(0).renameTo(output)) {
+        copy(merges.get(0), output);
+      }
+    } else { 
+      // otherwise merge the partitions with a priority queue.                  
+      mergePartitions(merges, output);                            
+      for (File file : merges) {
+        file.delete();
+      }
+    }
+
+    sortInfo.totalTime = (System.currentTimeMillis() - sortInfo.totalTime); 
+    return sortInfo;
+  }
+
+  /**
+   * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible
+   * or not available, an IOException is thrown
+   */
+  public static File defaultTempDir() throws IOException {
+    String tempDirPath = System.getProperty("java.io.tmpdir");
+    if (tempDirPath == null) 
+      throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");
+
+    File tempDirectory = new File(tempDirPath);
+    if (!tempDirectory.exists() || !tempDirectory.canWrite()) {
+      throw new IOException("Java's temporary folder not present or writeable?: " 
+          + tempDirectory.getAbsolutePath());
+    }
+    return tempDirectory;
+  }
+
+  /**
+   * Copies one file to another.
+   */
+  private static void copy(File file, File output) throws IOException {
+    // 64kb copy buffer (empirical pick).
+    byte [] buffer = new byte [16 * 1024];
+    InputStream is = null;
+    OutputStream os = null;
+    try {
+      is = new FileInputStream(file);
+      os = new FileOutputStream(output);
+      int length;
+      while ((length = is.read(buffer)) > 0) {
+        os.write(buffer, 0, length);
+      }
+    } finally {
+      IOUtils.close(is, os);
+    }
+  }
+
+  /** Sort a single partition in-memory. */
+  protected File sortPartition(int len) throws IOException {
+    byte [][] data = this.buffer;
+    File tempFile = File.createTempFile("sort", "partition", tempDirectory);
+
+    long start = System.currentTimeMillis();
+    Arrays.sort(data, 0, len, unsignedByteOrderComparator);
+    sortInfo.sortTime += (System.currentTimeMillis() - start);
+    
+    ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
+    try {
+      for (int i = 0; i < len; i++) {
+        assert data[i].length <= Short.MAX_VALUE;
+        out.write(data[i]);
+      }
+      out.close();
+
+      // Clean up the buffer for the next partition.
+      this.buffer = EMPTY;
+      return tempFile;
+    } finally {
+      IOUtils.close(out);
+    }
+  }
+
+  /** Merge a list of sorted temporary files (partitions) into an output file */
+  void mergePartitions(List<File> merges, File outputFile) throws IOException {
+    long start = System.currentTimeMillis();
+
+    ByteSequencesWriter out = new ByteSequencesWriter(outputFile);
+
+    PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
+      protected boolean lessThan(FileAndTop a, FileAndTop b) {
+        return a.current.compareTo(b.current) < 0;
+      }
+    };
+
+    ByteSequencesReader [] streams = new ByteSequencesReader [merges.size()];
+    try {
+      // Open streams and read the top for each file
+      for (int i = 0; i < merges.size(); i++) {
+        streams[i] = new ByteSequencesReader(merges.get(i));
+        byte line[] = streams[i].read();
+        if (line != null) {
+          queue.insertWithOverflow(new FileAndTop(i, line));
+        }
+      }
+  
+      // Unix utility sort() uses ordered array of files to pick the next line from, updating
+      // it as it reads new lines. The PQ used here is a more elegant solution and has 
+      // a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
+      // so it shouldn't make much of a difference (didn't check).
+      FileAndTop top;
+      while ((top = queue.top()) != null) {
+        out.write(top.current);
+        if (!streams[top.fd].read(top.current)) {
+          queue.pop();
+        } else {
+          queue.updateTop();
+        }
+      }
+  
+      sortInfo.mergeTime += System.currentTimeMillis() - start;
+      sortInfo.mergeRounds++;
+    } finally {
+      // The logic below is: if an exception occurs in closing out, it has a priority over exceptions
+      // happening in closing streams.
+      try {
+        IOUtils.close(streams);
+      } finally {
+        IOUtils.close(out);
+      }
+    }
+  }
+
+  /** Read in a single partition of data */
+  int readPartition(ByteSequencesReader reader) throws IOException {
+    long start = System.currentTimeMillis();
+
+    // We will be reallocating from scratch.
+    Arrays.fill(this.buffer, null);
+
+    int bytesLimit = this.ramBufferSize.bytes;
+    byte [][] data = this.buffer;
+    byte[] line;
+    int linesRead = 0;
+    while ((line = reader.read()) != null) {
+      if (linesRead + 1 >= data.length) {
+        data = Arrays.copyOf(data,
+            ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH, 
+                RamUsageEstimator.NUM_BYTES_OBJECT_REF));
+      }
+      data[linesRead++] = line;
+
+      // Account for the created objects.
+      // (buffer slots do not account to buffer size.) 
+      bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
+      if (bytesLimit < 0) {
+        break;
+      }
+    }
+    this.buffer = data;
+
+    sortInfo.readTime += (System.currentTimeMillis() - start);
+    return linesRead;
+  }
+
+  static class FileAndTop {
+    final int fd;
+    final BytesRef current;
+
+    FileAndTop(int fd, byte [] firstLine) {
+      this.fd = fd;
+      this.current = new BytesRef(firstLine);
+    }
+  }
+
+  /**
+   * Utility class to emit length-prefixed byte[] entries to an output stream for sorting.
+   * Complementary to {@link ByteSequencesReader}.
+   */
+  public static class ByteSequencesWriter implements Closeable {
+    private final DataOutput os;
+
+    public ByteSequencesWriter(File file) throws IOException {
+      this(new DataOutputStream(
+          new BufferedOutputStream(
+              new FileOutputStream(file))));
+    }
+
+    public ByteSequencesWriter(DataOutput os) {
+      this.os = os;
+    }
+
+    public void write(BytesRef ref) throws IOException {
+      assert ref != null;
+      write(ref.bytes, ref.offset, ref.length);
+    }
+
+    public void write(byte [] bytes) throws IOException {
+      write(bytes, 0, bytes.length);
+    }
+
+    public void write(byte [] bytes, int off, int len) throws IOException {
+      assert bytes != null;
+      assert off >= 0 && off + len <= bytes.length;
+      assert len >= 0;
+      os.writeShort(len);
+      os.write(bytes, off, len);
+    }        
+    
+    /**
+     * Closes the provided {@link DataOutput} if it is {@link Closeable}.
+     */
+    @Override
+    public void close() throws IOException {
+      if (os instanceof Closeable) {
+        ((Closeable) os).close();
+      }
+    }    
+  }
+
+  /**
+   * Utility class to read length-prefixed byte[] entries from an input.
+   * Complementary to {@link ByteSequencesWriter}.
+   */
+  public static class ByteSequencesReader implements Closeable {
+    private final DataInput is;
+
+    public ByteSequencesReader(File file) throws IOException {
+      this(new DataInputStream(
+          new BufferedInputStream(
+              new FileInputStream(file))));
+    }
+
+    public ByteSequencesReader(DataInput is) {
+      this.is = is;
+    }
+
+    /**
+     * Reads the next entry into the provided {@link BytesRef}. The internal
+     * storage is resized if needed.
+     * 
+     * @return Returns <code>false</code> if EOF occurred when trying to read
+     * the header of the next sequence. Returns <code>true</code> otherwise.
+     * @throws EOFException if the file ends before the full sequence is read.
+     */
+    public boolean read(BytesRef ref) throws IOException {
+      short length;
+      try {
+        length = is.readShort();
+      } catch (EOFException e) {
+        return false;
+      }
+
+      ref.grow(length);
+      ref.offset = 0;
+      ref.length = length;
+      is.readFully(ref.bytes, 0, length);
+      return true;
+    }
+
+    /**
+     * Reads the next entry and returns it if successful.
+     * 
+     * @see #read(BytesRef)
+     * 
+     * @return Returns <code>null</code> if EOF occurred before the next entry
+     * could be read.
+     * @throws EOFException if the file ends before the full sequence is read.
+     */
+    public byte[] read() throws IOException {
+      short length;
+      try {
+        length = is.readShort();
+      } catch (EOFException e) {
+        return null;
+      }
+
+      assert length >= 0 : "Sanity: sequence length < 0: " + length;
+      byte [] result = new byte [length];
+      is.readFully(result);
+      return result;
+    }
+
+    /**
+     * Closes the provided {@link DataInput} if it is {@link Closeable}.
+     */
+    @Override
+    public void close() throws IOException {
+      if (is instanceof Closeable) {
+        ((Closeable) is).close();
+      }
+    }
+  }  
+}
\ No newline at end of file

Modified: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java?rev=1209265&r1=1209264&r2=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (original)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java Thu Dec  1 21:49:27 2011
@@ -29,10 +29,9 @@ import java.util.Locale;
 import java.util.Random;
 import java.util.concurrent.Callable;
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.*;
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
 import org.apache.lucene.search.suggest.tst.TSTLookup;
 
@@ -48,7 +47,7 @@ public class LookupBenchmarkTest extends
   private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
       JaspellLookup.class, 
       TSTLookup.class,
-      FSTLookup.class);
+      FSTCompletionLookup.class);
 
   private final static int rounds = 15;
   private final static int warmup = 5;

Modified: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java?rev=1209265&r1=1209264&r2=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java (original)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java Thu Dec  1 21:49:27 2011
@@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest
 import java.io.File;
 
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
 import org.apache.lucene.search.suggest.tst.TSTLookup;
 import org.apache.lucene.util.LuceneTestCase;
@@ -51,9 +51,9 @@ public class PersistenceTest extends Luc
   }
 
   public void testFSTPersistence() throws Exception {
-    runTest(FSTLookup.class, false);
+    runTest(FSTCompletionLookup.class, false);
   }
-  
+
   private void runTest(Class<? extends Lookup> lookupClass,
       boolean supportsExactWeights) throws Exception {
 

Added: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java (added)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,44 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.util.Iterator;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+public class BytesRefSortersTest extends LuceneTestCase {
+  @Test
+  public void testExternalRefSorter() throws Exception {
+    check(new ExternalRefSorter(new Sort()));
+  }
+
+  @Test
+  public void testInMemorySorter() throws Exception {
+    check(new InMemorySorter());
+  }
+
+  private void check(BytesRefSorter sorter) throws Exception {
+    for (int i = 0; i < 100; i++) {
+      byte [] current = new byte [random.nextInt(256)];
+      random.nextBytes(current);
+      sorter.add(new BytesRef(current));
+    }
+
+    // Create two iterators and check that they're aligned with each other.
+    Iterator<BytesRef> i1 = sorter.iterator();
+    Iterator<BytesRef> i2 = sorter.iterator();
+    
+    // Verify sorter contract.
+    try {
+      sorter.add(new BytesRef(new byte [1]));
+      fail("expected contract violation.");
+    } catch (IllegalStateException e) {
+      // Expected.
+    }
+
+    while (i1.hasNext() && i2.hasNext()) {
+      assertEquals(i1.next(), i2.next());
+    }
+    assertEquals(i1.hasNext(), i2.hasNext());
+  }  
+}

Copied: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java (from r1209076, lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java?p2=lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java&p1=lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java&r1=1209076&r2=1209265&rev=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java (original)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java Thu Dec  1 21:49:27 2011
@@ -17,40 +17,38 @@ package org.apache.lucene.search.suggest
  * limitations under the License.
  */
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-import java.util.Random;
+import java.util.*;
 
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
-import org.apache.lucene.search.suggest.fst.FSTLookup;
-import org.apache.lucene.util.LuceneTestCase;
-
-import org.apache.lucene.search.suggest.LookupBenchmarkTest;
-import org.apache.lucene.search.suggest.TermFreq;
-import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.search.suggest.*;
+import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
+import org.apache.lucene.util.*;
 
 /**
- * Unit tests for {@link FSTLookup}.
+ * Unit tests for {@link FSTCompletion}.
  */
-public class FSTLookupTest extends LuceneTestCase {
+public class FSTCompletionTest extends LuceneTestCase {
   public static TermFreq tf(String t, float v) {
     return new TermFreq(t, v);
   }
 
-  private FSTLookup lookup;
+  private FSTCompletion completion;
+  private FSTCompletion completionAlphabetical;
 
   public void setUp() throws Exception {
     super.setUp();
 
-    lookup = new FSTLookup();
-    lookup.build(new TermFreqArrayIterator(evalKeys()));
+    FSTCompletionBuilder builder = new FSTCompletionBuilder();
+    for (TermFreq tf : evalKeys()) {
+      builder.add(new BytesRef(tf.term), (int) tf.v);
+    }
+    completion = builder.build();
+    completionAlphabetical = new FSTCompletion(completion.getFST(), false, true);
   }
 
   private TermFreq[] evalKeys() {
     final TermFreq[] keys = new TermFreq[] {
-        tf("one", 0.5f),
+        tf("one", 0),
         tf("oneness", 1),
         tf("onerous", 1),
         tf("onesimus", 1),
@@ -64,103 +62,152 @@ public class FSTLookupTest extends Lucen
         tf("foundation", 1),
         tf("fourblah", 1),
         tf("fourteen", 1),
-        tf("four", 0.5f),
-        tf("fourier", 0.5f),
-        tf("fourty", 0.5f),
+        tf("four", 0f),
+        tf("fourier", 0f),
+        tf("fourty", 0f),
         tf("xo", 1),
       };
     return keys;
   }
 
   public void testExactMatchHighPriority() throws Exception {
-    assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
+    assertMatchEquals(completion.lookup("two", 1),
+        "two/1.0");
   }
 
   public void testExactMatchLowPriority() throws Exception {
-    assertMatchEquals(lookup.lookup("one", true, 2), 
+    assertMatchEquals(completion.lookup("one", 2), 
         "one/0.0",
         "oneness/1.0");
   }
+  
+  public void testExactMatchReordering() throws Exception {
+    // Check reordering of exact matches. 
+    assertMatchEquals(completion.lookup("four", 4), 
+        "four/0.0",
+        "fourblah/1.0",
+        "fourteen/1.0",
+        "fourier/0.0");    
+  }
 
   public void testRequestedCount() throws Exception {
     // 'one' is promoted after collecting two higher ranking results.
-    assertMatchEquals(lookup.lookup("one", true, 2), 
-        "one/0.0", 
-        "oneness/1.0");
-
-    // 'one' is at the top after collecting all alphabetical results. 
-    assertMatchEquals(lookup.lookup("one", false, 2), 
+    assertMatchEquals(completion.lookup("one", 2), 
         "one/0.0", 
         "oneness/1.0");
 
     // 'four' is collected in a bucket and then again as an exact match. 
-    assertMatchEquals(lookup.lookup("four", true, 2), 
+    assertMatchEquals(completion.lookup("four", 2), 
         "four/0.0", 
         "fourblah/1.0");
 
     // Check reordering of exact matches. 
-    assertMatchEquals(lookup.lookup("four", true, 4), 
+    assertMatchEquals(completion.lookup("four", 4), 
         "four/0.0",
         "fourblah/1.0",
         "fourteen/1.0",
         "fourier/0.0");
 
-    lookup = new FSTLookup(10, false);
-    lookup.build(new TermFreqArrayIterator(evalKeys()));
+    // 'one' is at the top after collecting all alphabetical results.
+    assertMatchEquals(completionAlphabetical.lookup("one", 2), 
+        "one/0.0", 
+        "oneness/1.0");
     
     // 'one' is not promoted after collecting two higher ranking results.
-    assertMatchEquals(lookup.lookup("one", true, 2),  
+    FSTCompletion noPromotion = new FSTCompletion(completion.getFST(), true, false);
+    assertMatchEquals(noPromotion.lookup("one", 2),  
         "oneness/1.0",
         "onerous/1.0");
 
     // 'one' is at the top after collecting all alphabetical results. 
-    assertMatchEquals(lookup.lookup("one", false, 2), 
+    assertMatchEquals(completionAlphabetical.lookup("one", 2), 
         "one/0.0", 
         "oneness/1.0");
   }
 
   public void testMiss() throws Exception {
-    assertMatchEquals(lookup.lookup("xyz", true, 1));
+    assertMatchEquals(completion.lookup("xyz", 1));
   }
 
   public void testAlphabeticWithWeights() throws Exception {
-    assertEquals(0, lookup.lookup("xyz", false, 1).size());
+    assertEquals(0, completionAlphabetical.lookup("xyz", 1).size());
   }
 
   public void testFullMatchList() throws Exception {
-    assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
+    assertMatchEquals(completion.lookup("one", Integer.MAX_VALUE),
         "oneness/1.0", 
         "onerous/1.0",
         "onesimus/1.0", 
         "one/0.0");
   }
 
+  public void testThreeByte() throws Exception {
+    String key = new String(new byte[] {
+        (byte) 0xF0, (byte) 0xA4, (byte) 0xAD, (byte) 0xA2}, "UTF-8");
+    FSTCompletionBuilder builder = new FSTCompletionBuilder();
+    builder.add(new BytesRef(key), 0);
+
+    FSTCompletion lookup = builder.build();
+    List<Completion> result = lookup.lookup(key, 1);
+    assertEquals(1, result.size());
+  }
+
+  public void testLargeInputConstantWeights() throws Exception {
+    FSTCompletionLookup lookup = new FSTCompletionLookup(10, true);
+    
+    Random r = random;
+    List<TermFreq> keys = new ArrayList<TermFreq>();
+    for (int i = 0; i < 5000; i++) {
+      keys.add(new TermFreq(_TestUtil.randomSimpleString(r), -1.0f));
+    }
+
+    lookup.build(new TermFreqArrayIterator(keys));
+
+    // All the weights were constant, so all returned buckets must be constant, whatever they
+    // are.
+    Float previous = null; 
+    for (TermFreq tf : keys) {
+      Float current = lookup.get(tf.term);
+      if (previous != null) {
+        assertEquals(previous, current);
+      }
+      previous = current;
+    }
+  }  
+
+  @Nightly
   public void testMultilingualInput() throws Exception {
     List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
 
-    lookup = new FSTLookup();
+    FSTCompletionLookup lookup = new FSTCompletionLookup();
     lookup.build(new TermFreqArrayIterator(input));
 
     for (TermFreq tf : input) {
       assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
       assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
     }
+
+    List<LookupResult> result = lookup.lookup("wit", true, 5);
+    assertEquals(5, result.size());
+    assertTrue(result.get(0).key.equals("wit"));  // exact match.
+    assertTrue(result.get(1).key.equals("with")); // highest count.
   }
 
   public void testEmptyInput() throws Exception {
-    lookup = new FSTLookup();
-    lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
-    
-    assertMatchEquals(lookup.lookup("", true, 10));
+    completion = new FSTCompletionBuilder().build();
+    assertMatchEquals(completion.lookup("", 10));
   }
 
+  @Nightly
   public void testRandom() throws Exception {
     List<TermFreq> freqs = new ArrayList<TermFreq>();
     Random rnd = random;
-    for (int i = 0; i < 5000; i++) {
-      freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
+    for (int i = 0; i < 2500 + rnd.nextInt(2500); i++) {
+      float weight = rnd.nextFloat() * 100; 
+      freqs.add(new TermFreq("" + rnd.nextLong(), weight));
     }
-    lookup = new FSTLookup();
+
+    FSTCompletionLookup lookup = new FSTCompletionLookup();
     lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
 
     for (TermFreq tf : freqs) {
@@ -174,12 +221,13 @@ public class FSTLookupTest extends Lucen
     }
   }
 
-  private void assertMatchEquals(List<LookupResult> res, String... expected) {
+  private void assertMatchEquals(List<Completion> res, String... expected) {
     String [] result = new String [res.size()];
-    for (int i = 0; i < res.size(); i++)
+    for (int i = 0; i < res.size(); i++) {
       result[i] = res.get(i).toString();
-    
-    if (!Arrays.equals(expected, result)) {
+    }
+
+    if (!Arrays.equals(stripScore(expected), stripScore(result))) {
       int colLen = Math.max(maxLen(expected), maxLen(result));
       
       StringBuilder b = new StringBuilder();
@@ -196,6 +244,14 @@ public class FSTLookupTest extends Lucen
     }
   }
 
+  private String[] stripScore(String[] expected) {
+    String [] result = new String [expected.length];
+    for (int i = 0; i < result.length; i++) {
+      result[i] = expected[i].replaceAll("\\/[0-9\\.]+", "");
+    }
+    return result;
+  }
+
   private int maxLen(String[] result) {
     int len = 0;
     for (String s : result)

Added: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java (added)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,119 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.util.*;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.NumericUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class FloatMagicTest extends LuceneTestCase {
+  public void testFloatMagic() {
+    ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(new Float [] {
+        Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
+        Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
+        Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
+        Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
+        Float.POSITIVE_INFINITY,
+        Float.MAX_VALUE,
+        100f,
+        0f,
+        0.1f,
+        Float.MIN_VALUE,
+        Float.NaN,
+        -0.0f,
+        -Float.MIN_VALUE,
+        -0.1f,
+        -1f,
+        -10f,
+        Float.NEGATIVE_INFINITY }));
+
+    // Sort them using juc.
+    Collections.sort(floats);
+    
+    // Convert to sortable int4 representation (as long to have an unsigned sort).
+    long [] int4 = new long [floats.size()];
+    for (int i = 0; i < floats.size(); i++) {
+      int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;
+
+      System.out.println(
+          String.format("raw %8s sortable %8s %8s numutils %8s %s",
+              Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
+              Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
+              Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
+              Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
+              floats.get(i)));
+    }
+
+    // Sort and compare. Should be identical order.
+    Arrays.sort(int4);
+    ArrayList<Float> backFromFixed = new ArrayList<Float>();
+    for (int i = 0; i < int4.length; i++) {
+      backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
+    }
+
+    for (int i = 0; i < int4.length; i++) {
+      System.out.println(
+          floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
+    }
+    
+    assertEquals(floats, backFromFixed);
+  }
+
+  @Ignore("Once checked, valid forever?") @Test
+  public void testRoundTripFullRange() {
+    int i = 0;
+    do {
+      float f = Float.intBitsToFloat(i);
+      float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));
+      
+      if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
+        throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2); 
+      }
+
+      if ((i & 0xffffff) == 0) {
+        System.out.println(Integer.toHexString(i));
+      }
+      
+      i++;
+    } while (i != 0);
+  }
+  
+  @Ignore("Once checked, valid forever?") @Test
+  public void testIncreasingFullRange() {
+    // -infinity ... -0.0
+    for (int i = 0xff800000; i != 0x80000000; i--) {
+      checkSmaller(i, i - 1); 
+    }
+    
+    // -0.0 +0.0
+    checkSmaller(0x80000000, 0);
+
+    // +0.0 ... +infinity
+    for (int i = 0; i != 0x7f800000; i++) {
+      checkSmaller(i, i + 1); 
+    }
+
+    // All other are NaNs and should be after positive infinity.
+    final long infinity = toSortableL(Float.POSITIVE_INFINITY);
+    for (int i = 0x7f800001; i != 0x7fffffff; i++) {
+      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
+    }
+    for (int i = 0xff800001; i != 0xffffffff; i++) {
+      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
+    }
+  }
+
+  private long toSortableL(float f) {
+    return FloatMagic.toSortable(f) & 0xffffffffL;
+  }
+
+  private void checkSmaller(int i1, int i2) {
+    float f1 = Float.intBitsToFloat(i1);
+    float f2 = Float.intBitsToFloat(i2);
+    if (f1 > f2) {
+      throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
+    }
+    assertTrue(toSortableL(f1) < toSortableL(f2));
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java (added)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,43 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.*;
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Try to build a suggester from a large data set. The input is a simple text
+ * file, newline-delimited.
+ */
+public class LargeInputFST {
+  public static void main(String[] args) throws IOException {
+    File input = new File("/home/dweiss/tmp/shuffled.dict");
+
+    int buckets = 20;
+    int shareMaxTail = 10;
+
+    ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
+    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);
+
+    BufferedReader reader = new BufferedReader(
+        new InputStreamReader(
+            new FileInputStream(input), "UTF-8"));
+    
+    BytesRef scratch = new BytesRef();
+    String line;
+    int count = 0;
+    while ((line = reader.readLine()) != null) {
+      scratch.copyChars(line);
+      builder.add(scratch, count % buckets);
+      if ((count++ % 100000) == 0) {
+        System.err.println("Line: " + count);
+      }
+    }
+
+    System.out.println("Building FSTCompletion.");
+    FSTCompletion completion = builder.build();
+
+    File fstFile = new File("completion.fst");
+    System.out.println("Done. Writing automaton: " + fstFile.getAbsolutePath());
+    completion.getFST().save(fstFile);
+  }
+}

Added: lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java?rev=1209265&view=auto
==============================================================================
--- lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java (added)
+++ lucene/dev/trunk/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java Thu Dec  1 21:49:27 2011
@@ -0,0 +1,126 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
+import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
+import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
+import org.apache.lucene.util.*;
+import org.junit.*;
+
+/**
+ * Tests for on-disk merge sorting.
+ */
+public class TestSort extends LuceneTestCase {
+  private File tempDir;
+
+  @Before
+  public void prepareTempDir() throws IOException {
+    tempDir = _TestUtil.getTempDir("mergesort");
+    _TestUtil.rmDir(tempDir);
+    tempDir.mkdirs();
+  }
+  
+  @After
+  public void cleanup() throws IOException {
+    if (tempDir != null)
+      _TestUtil.rmDir(tempDir);
+  }
+
+  @Test
+  public void testEmpty() throws Exception {
+    checkSort(new Sort(), new byte [][] {});
+  }
+
+  @Test
+  public void testSingleLine() throws Exception {
+    checkSort(new Sort(), new byte [][] {
+        "Single line only.".getBytes("UTF-8")
+    });
+  }
+
+  @Test
+  public void testIntermediateMerges() throws Exception {
+    // Sort 20 mb worth of data with 1mb buffer, binary merging.
+    SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2), 
+        generateRandom(Sort.MB * 20));
+    assertTrue(info.mergeRounds > 10);
+  }
+
+  @Test
+  public void testSmallRandom() throws Exception {
+    // Sort 20 mb worth of data with 1mb buffer.
+    SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), 
+        generateRandom(Sort.MB * 20));
+    assertEquals(1, sortInfo.mergeRounds);
+  }
+
+  @Test @Nightly
+  public void testLargerRandom() throws Exception {
+    // Sort 100MB worth of data with 15mb buffer.
+    checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), 
+        generateRandom(Sort.MB * 100));
+  }
+
+  private byte[][] generateRandom(int howMuchData) {
+    ArrayList<byte[]> data = new ArrayList<byte[]>(); 
+    while (howMuchData > 0) {
+      byte [] current = new byte [random.nextInt(256)];
+      random.nextBytes(current);
+      data.add(current);
+      howMuchData -= current.length;
+    }
+    byte [][] bytes = data.toArray(new byte[data.size()][]);
+    return bytes;
+  }
+
+  /**
+   * Check sorting data on an instance of {@link Sort}.
+   */
+  private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
+    File unsorted = writeAll("unsorted", data);
+
+    Arrays.sort(data, Sort.unsignedByteOrderComparator);
+    File golden = writeAll("golden", data);
+
+    File sorted = new File(tempDir, "sorted");
+    SortInfo sortInfo = sort.sort(unsorted, sorted);
+    System.out.println("Input size [MB]: " + unsorted.length() / (1024 * 1024));
+    System.out.println(sortInfo);
+
+    assertFilesIdentical(golden, sorted);
+    return sortInfo;
+  }
+
+  /**
+   * Make sure two files are byte-byte identical.
+   */
+  private void assertFilesIdentical(File golden, File sorted) throws IOException {
+    assertEquals(golden.length(), sorted.length());
+
+    byte [] buf1 = new byte [64 * 1024 * 1024];
+    byte [] buf2 = new byte [64 * 1024 * 1024];
+    int len;
+    DataInputStream is1 = new DataInputStream(new FileInputStream(golden));
+    DataInputStream is2 = new DataInputStream(new FileInputStream(sorted));
+    while ((len = is1.read(buf1)) > 0) {
+      is2.readFully(buf2, 0, len);
+      for (int i = 0; i < len; i++) {
+        assertEquals(buf1[i], buf2[i]);
+      }
+    }
+    IOUtils.close(is1, is2);
+  }
+
+  private File writeAll(String name, byte[][] data) throws IOException {
+    File file = new File(tempDir, name);
+    ByteSequencesWriter w = new Sort.ByteSequencesWriter(file);
+    for (byte [] datum : data) {
+      w.write(datum);
+    }
+    w.close();
+    return file;
+  }
+}

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1209265&r1=1209264&r2=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Dec  1 21:49:27 2011
@@ -199,6 +199,11 @@ New Features
 Optimizations
 ----------------------
 
+* SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8, 
+  external sorting (on disk) prevents OOMs even with large data sets
+  (the bottleneck is now FST construction), code cleanups and API cleanups.
+  (Dawid Weiss)
+
 * SOLR-1875: Per-segment field faceting for single valued string fields.
   Enable with facet.method=fcs, control the number of threads used with
   the "threads" local param on the facet.field param.  This algorithm will

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java?rev=1209265&r1=1209264&r2=1209265&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/FSTLookupFactory.java Thu Dec  1 21:49:27 2011
@@ -18,16 +18,15 @@ package org.apache.solr.spelling.suggest
  */
 
 import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.fst.FSTLookup;
+import org.apache.lucene.search.suggest.fst.*;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.spelling.suggest.LookupFactory;
 
 /**
- * Factory for {@link FSTLookup}
+ * Factory for {@link FSTCompletionLookup}
  */
 public class FSTLookupFactory extends LookupFactory {
-
   /**
    * The number of separate buckets for weights (discretization). The more buckets,
    * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
@@ -55,6 +54,6 @@ public class FSTLookupFactory extends Lo
     ? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
     : true;
 
-    return new FSTLookup(buckets, exactMatchFirst);
+    return new FSTCompletionLookup(buckets, exactMatchFirst);
   }
 }



Mime
View raw message