lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [5/8] Porting Lucene.Net.Suggest (still not compiling)
Date Mon, 15 Sep 2014 22:24:52 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
new file mode 100644
index 0000000..bb1061f
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
@@ -0,0 +1,792 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Codecs.Lucene46;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Directory = Lucene.Net.Store.Directory;
+using Version = System.Version;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    // TODO:
+    //   - a PostingsFormat that stores super-high-freq terms as
+    //     a bitset should be a win for the prefix terms?
+    //     (LUCENE-5052)
+    //   - we could offer a better integration with
+    //     DocumentDictionary and NRT?  so that your suggester
+    //     "automatically" keeps in sync w/ your index
+
+    /// <summary>
+    /// Analyzes the input text and then suggests matches based
+    ///  on prefix matches to any tokens in the indexed text.
+    ///  This also highlights the tokens that match.
+    /// 
+    ///  <para>This suggester supports payloads.  Matches are sorted only
+    ///  by the suggest weight; it would be nice to support
+    ///  blended score + weight sort in the future.  This means
+    ///  this suggester best applies when there is a strong
+    ///  a-priori ranking of all the suggestions.
+    /// 
+    /// </para>
+    ///  <para>This suggester supports contexts, however the
+    ///  contexts must be valid utf8 (arbitrary binary terms will
+    ///  not work).
+    /// 
+    /// @lucene.experimental 
+    /// </para>
+    /// </summary>
+
+    public class AnalyzingInfixSuggester : Lookup, IDisposable
+    {
+
+        /// <summary>
+        /// Field name used for the indexed text. </summary>
+        protected internal const string TEXT_FIELD_NAME = "text";
+
+        /// <summary>
+        /// Field name used for the indexed text, as a
+        ///  StringField, for exact lookup. 
+        /// </summary>
+        protected internal const string EXACT_TEXT_FIELD_NAME = "exacttext";
+
+        /// <summary>
+        /// Field name used for the indexed context, as a
+        ///  StringField and a SortedSetDVField, for filtering. 
+        /// </summary>
+        protected internal const string CONTEXTS_FIELD_NAME = "contexts";
+
+        /// <summary>
+        /// Analyzer used at search time </summary>
+        protected internal readonly Analyzer queryAnalyzer;
+        /// <summary>
+        /// Analyzer used at index time </summary>
+        protected internal readonly Analyzer indexAnalyzer;
+        internal readonly Version matchVersion;
+        private readonly Directory dir;
+        internal readonly int minPrefixChars;
+
+        /// <summary>
+        /// Used for ongoing NRT additions/updates. </summary>
+        private IndexWriter writer;
+
+        /// <summary>
+        /// <seealso cref="IndexSearcher"/> used for lookups. </summary>
+        protected internal SearcherManager searcherMgr;
+
+        /// <summary>
+        /// Default minimum number of leading characters before
+        ///  PrefixQuery is used (4). 
+        /// </summary>
+        public const int DEFAULT_MIN_PREFIX_CHARS = 4;
+
+        /// <summary>
+        /// How we sort the postings and search results. </summary>
+        private static readonly Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));
+
+        /// <summary>
+        /// Create a new instance, loading from a previously built
+        ///  AnalyzingInfixSuggester directory, if it exists.  This directory must be
+        ///  private to the infix suggester (i.e., not an external
+        ///  Lucene index).  Note that <seealso cref="#close"/>
+        ///  will also close the provided directory. 
+        /// </summary>
+        public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer analyzer)
+            : this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS)
+        {
+        }
+
+        /// <summary>
+        /// Create a new instance, loading from a previously built
+        ///  AnalyzingInfixSuggester directory, if it exists.  This directory must be
+        ///  private to the infix suggester (i.e., not an external
+        ///  Lucene index).  Note that <seealso cref="#close"/>
+        ///  will also close the provided directory.
+        /// </summary>
+        ///  <param name="minPrefixChars"> Minimum number of leading characters
+        ///     before PrefixQuery is used (default 4).
+        ///     Prefixes shorter than this are indexed as character
+        ///     ngrams (increasing index size but making lookups
+        ///     faster). </param>
+        public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars)
+        {
+
+            if (minPrefixChars < 0)
+            {
+                throw new System.ArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
+            }
+
+            this.queryAnalyzer = queryAnalyzer;
+            this.indexAnalyzer = indexAnalyzer;
+            this.matchVersion = matchVersion;
+            this.dir = dir;
+            this.minPrefixChars = minPrefixChars;
+
+            if (DirectoryReader.IndexExists(dir))
+            {
+                // Already built; open it:
+                writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, IndexWriterConfig.OpenMode.APPEND));
+                searcherMgr = new SearcherManager(writer, true, null);
+            }
+        }
+
+        /// <summary>
+        /// Override this to customize index settings, e.g. which
+        ///  codec to use. 
+        /// </summary>
+        protected internal virtual IndexWriterConfig GetIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode)
+        {
+            IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
+            iwc.Codec = new Lucene46Codec();
+            iwc.OpenMode = openMode;
+
+            // This way all merged segments will be sorted at
+            // merge time, allow for per-segment early termination
+            // when those segments are searched:
+            iwc.MergePolicy = new SortingMergePolicy(iwc.MergePolicy, SORT);
+
+            return iwc;
+        }
+
+        /// <summary>
+        /// Subclass can override to choose a specific {@link
+        ///  Directory} implementation. 
+        /// </summary>
+        protected internal virtual Directory GetDirectory(File path)
+        {
+            return FSDirectory.Open(path);
+        }
+
+        public override void Build(InputIterator iter)
+        {
+
+            if (searcherMgr != null)
+            {
+                searcherMgr.Dispose();
+                searcherMgr = null;
+            }
+
+            if (writer != null)
+            {
+                writer.Dispose();
+                writer = null;
+            }
+
+            AtomicReader r = null;
+            bool success = false;
+            try
+            {
+                // First pass: build a temporary normal Lucene index,
+                // just indexing the suggestions as they iterate:
+                writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, IndexWriterConfig.OpenMode.CREATE));
+                //long t0 = System.nanoTime();
+
+                // TODO: use threads?
+                BytesRef text;
+                while ((text = iter.Next()) != null)
+                {
+                    BytesRef payload;
+                    if (iter.HasPayloads)
+                    {
+                        payload = iter.Payload;
+                    }
+                    else
+                    {
+                        payload = null;
+                    }
+
+                    Add(text, iter.Contexts, iter.Weight, payload);
+                }
+
+                //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
+
+                searcherMgr = new SearcherManager(writer, true, null);
+                success = true;
+            }
+            finally
+            {
+                if (success)
+                {
+                    IOUtils.Close(r);
+                }
+                else
+                {
+                    IOUtils.CloseWhileHandlingException(writer, r);
+                    writer = null;
+                }
+            }
+        }
+
+        private Analyzer GramAnalyzer
+        {
+            get
+            {
+                return new AnalyzerWrapperAnonymousInnerClassHelper(this, Analyzer.PER_FIELD_REUSE_STRATEGY);
+            }
+        }
+
+        private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper
+        {
+            private readonly AnalyzingInfixSuggester outerInstance;
+
+            public AnalyzerWrapperAnonymousInnerClassHelper(AnalyzingInfixSuggester outerInstance, UnknownType PER_FIELD_REUSE_STRATEGY)
+                : base(PER_FIELD_REUSE_STRATEGY)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            protected override Analyzer GetWrappedAnalyzer(string fieldName)
+            {
+                return outerInstance.indexAnalyzer;
+            }
+
+            protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
+            {
+                if (fieldName.Equals("textgrams") && outerInstance.minPrefixChars > 0)
+                {
+                    return new TokenStreamComponents(components.Tokenizer, new EdgeNGramTokenFilter(outerInstance.matchVersion, components.TokenStream, 1, outerInstance.minPrefixChars));
+                }
+                else
+                {
+                    return components;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Adds a new suggestion.  Be sure to use <seealso cref="#update"/>
+        ///  instead if you want to replace a previous suggestion.
+        ///  After adding or updating a batch of new suggestions,
+        ///  you must call <seealso cref="#refresh"/> in the end in order to
+        ///  see the suggestions in <seealso cref="#lookup"/> 
+        /// </summary>
+        public virtual void Add(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+        {
+            writer.AddDocument(BuildDocument(text, contexts, weight, payload));
+        }
+
+        /// <summary>
+        /// Updates a previous suggestion, matching the exact same
+        ///  text as before.  Use this to change the weight or
+        ///  payload of an already added suggstion.  If you know
+        ///  this text is not already present you can use {@link
+        ///  #add} instead.  After adding or updating a batch of
+        ///  new suggestions, you must call <seealso cref="#refresh"/> in the
+        ///  end in order to see the suggestions in <seealso cref="#lookup"/> 
+        /// </summary>
+        public virtual void Update(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+        {
+            writer.UpdateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.Utf8ToString()), BuildDocument(text, contexts, weight, payload));
+        }
+
+        private Document BuildDocument(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+        {
+            string textString = text.Utf8ToString();
+            var ft = TextFieldType;
+            var doc = new Document
+		{
+		    new Field(TEXT_FIELD_NAME, textString, ft),
+		    new Field("textgrams", textString, ft),
+		    new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO),
+		    new BinaryDocValuesField(TEXT_FIELD_NAME, text),
+		    new NumericDocValuesField("weight", weight)
+		};
+            if (payload != null)
+            {
+                doc.Add(new BinaryDocValuesField("payloads", payload));
+            }
+            if (contexts != null)
+            {
+                foreach (BytesRef context in contexts)
+                {
+                    // TODO: if we had a BinaryTermField we could fix
+                    // this "must be valid ut8f" limitation:
+                    doc.Add(new StringField(CONTEXTS_FIELD_NAME, context.Utf8ToString(), Field.Store.NO));
+                    doc.Add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
+                }
+            }
+            return doc;
+        }
+
+        /// <summary>
+        /// Reopens the underlying searcher; it's best to "batch
+        ///  up" many additions/updates, and then call refresh
+        ///  once in the end. 
+        /// </summary>
+        public virtual void Refresh()
+        {
+            searcherMgr.MaybeRefreshBlocking();
+        }
+
+        /// <summary>
+        /// Subclass can override this method to change the field type of the text field
+        /// e.g. to change the index options
+        /// </summary>
+        protected internal virtual FieldType TextFieldType
+        {
+            get
+            {
+                var ft = new FieldType(TextField.TYPE_NOT_STORED);
+                ft.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
+                ft.OmitNorms = true;
+
+                return ft;
+            }
+        }
+
+        public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+        {
+            return Lookup(key, contexts, num, true, true);
+        }
+
+        /// <summary>
+        /// Lookup, without any context.
+        /// </summary>
+        public virtual IList<LookupResult> Lookup(string key, int num, bool allTermsRequired, bool doHighlight)
+        {
+            return Lookup(key, null, num, allTermsRequired, doHighlight);
+        }
+
+        /// <summary>
+        /// This is called if the last token isn't ended
+        ///  (e.g. user did not type a space after it).  Return an
+        ///  appropriate Query clause to add to the BooleanQuery. 
+        /// </summary>
+        protected internal virtual Query GetLastTokenQuery(string token)
+        {
+            if (token.Length < minPrefixChars)
+            {
+                // The leading ngram was directly indexed:
+                return new TermQuery(new Term("textgrams", token));
+            }
+
+            return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
+        }
+
+        /// <summary>
+        /// Retrieve suggestions, specifying whether all terms
+        ///  must match ({@code allTermsRequired}) and whether the hits
+        ///  should be highlighted ({@code doHighlight}). 
+        /// </summary>
+        public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
+        {
+
+            if (searcherMgr == null)
+            {
+                throw new InvalidOperationException("suggester was not built");
+            }
+
+            BooleanClause.Occur occur;
+            if (allTermsRequired)
+            {
+                occur = BooleanClause.Occur.MUST;
+            }
+            else
+            {
+                occur = BooleanClause.Occur.SHOULD;
+            }
+
+            TokenStream ts = null;
+            BooleanQuery query;
+            var matchedTokens = new HashSet<string>();
+            string prefixToken = null;
+
+            try
+            {
+                ts = queryAnalyzer.TokenStream("", new StringReader(key));
+
+                //long t0 = System.currentTimeMillis();
+                ts.Reset();
+                var termAtt = ts.AddAttribute<CharTermAttribute>();
+                var offsetAtt = ts.AddAttribute<OffsetAttribute>();
+                string lastToken = null;
+                query = new BooleanQuery();
+                int maxEndOffset = -1;
+                matchedTokens = new HashSet<string>();
+                while (ts.IncrementToken())
+                {
+                    if (lastToken != null)
+                    {
+                        matchedTokens.Add(lastToken);
+                        query.Add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
+                    }
+                    lastToken = termAtt.ToString();
+                    if (lastToken != null)
+                    {
+                        maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
+                    }
+                }
+                ts.End();
+
+                if (lastToken != null)
+                {
+                    Query lastQuery;
+                    if (maxEndOffset == offsetAtt.EndOffset())
+                    {
+                        // Use PrefixQuery (or the ngram equivalent) when
+                        // there was no trailing discarded chars in the
+                        // string (e.g. whitespace), so that if query does
+                        // not end with a space we show prefix matches for
+                        // that token:
+                        lastQuery = GetLastTokenQuery(lastToken);
+                        prefixToken = lastToken;
+                    }
+                    else
+                    {
+                        // Use TermQuery for an exact match if there were
+                        // trailing discarded chars (e.g. whitespace), so
+                        // that if query ends with a space we only show
+                        // exact matches for that term:
+                        matchedTokens.Add(lastToken);
+                        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
+                    }
+                    if (lastQuery != null)
+                    {
+                        query.Add(lastQuery, occur);
+                    }
+                }
+
+                if (contexts != null)
+                {
+                    BooleanQuery sub = new BooleanQuery();
+                    query.Add(sub, BooleanClause.Occur.MUST);
+                    foreach (BytesRef context in contexts)
+                    {
+                        // NOTE: we "should" wrap this in
+                        // ConstantScoreQuery, or maybe send this as a
+                        // Filter instead to search, but since all of
+                        // these are MUST'd, the change to the score won't
+                        // affect the overall ranking.  Since we indexed
+                        // as DOCS_ONLY, the perf should be the same
+                        // either way (no freq int[] blocks to decode):
+
+                        // TODO: if we had a BinaryTermField we could fix
+                        // this "must be valid ut8f" limitation:
+                        sub.Add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context.Utf8ToString())), BooleanClause.Occur.SHOULD);
+                    }
+                }
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(ts);
+            }
+
+            // TODO: we could allow blended sort here, combining
+            // weight w/ score.  Now we ignore score and sort only
+            // by weight:
+
+            Query finalQuery = FinishQuery(query, allTermsRequired);
+
+            //System.out.println("finalQuery=" + query);
+
+            // Sort by weight, descending:
+            TopFieldCollector c = TopFieldCollector.Create(SORT, num, true, false, false, false);
+
+            // We sorted postings by weight during indexing, so we
+            // only retrieve the first num hits now:
+            Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
+            IndexSearcher searcher = searcherMgr.Acquire();
+            IList<LookupResult> results = null;
+            try
+            {
+                //System.out.println("got searcher=" + searcher);
+                searcher.Search(finalQuery, c2);
+
+                TopFieldDocs hits = (TopFieldDocs)c.TopDocs();
+
+                // Slower way if postings are not pre-sorted by weight:
+                // hits = searcher.search(query, null, num, SORT);
+                results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
+            }
+            finally
+            {
+                searcherMgr.Release(searcher);
+            }
+
+            //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
+            //System.out.println(results);
+
+            return results;
+        }
+
+        /// <summary>
+        /// Create the results based on the search hits.
+        /// Can be overridden by subclass to add particular behavior (e.g. weight transformation) </summary>
+        /// <exception cref="IOException"> If there are problems reading fields from the underlying Lucene index. </exception>
+        protected internal virtual IList<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, string charSequence, bool doHighlight, HashSet<string> matchedTokens, string prefixToken)
+        {
+
+            BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
+
+            // This will just be null if app didn't pass payloads to build():
+            // TODO: maybe just stored fields?  they compress...
+            BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
+            IList<AtomicReaderContext> leaves = searcher.IndexReader.Leaves();
+            IList<LookupResult> results = new List<LookupResult>();
+            BytesRef scratch = new BytesRef();
+            for (int i = 0; i < hits.ScoreDocs.Length; i++)
+            {
+                FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
+                textDV.Get(fd.Doc, scratch);
+                string text = scratch.Utf8ToString();
+                long score = (long?)fd.Fields[0];
+
+                BytesRef payload;
+                if (payloadsDV != null)
+                {
+                    payload = new BytesRef();
+                    payloadsDV.Get(fd.Doc, payload);
+                }
+                else
+                {
+                    payload = null;
+                }
+
+                // Must look up sorted-set by segment:
+                int segment = ReaderUtil.SubIndex(fd.Doc, leaves);
+                SortedSetDocValues contextsDV = leaves[segment].Reader().GetSortedSetDocValues(CONTEXTS_FIELD_NAME);
+                HashSet<BytesRef> contexts;
+                if (contextsDV != null)
+                {
+                    contexts = new HashSet<BytesRef>();
+                    contextsDV.Document = fd.Doc - leaves[segment].DocBase;
+                    long ord;
+                    while ((ord = contextsDV.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
+                    {
+                        BytesRef context = new BytesRef();
+                        contextsDV.LookupOrd(ord, context);
+                        contexts.Add(context);
+                    }
+                }
+                else
+                {
+                    contexts = null;
+                }
+
+                LookupResult result;
+
+                if (doHighlight)
+                {
+                    object highlightKey = Highlight(text, matchedTokens, prefixToken);
+                    result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload, contexts);
+                }
+                else
+                {
+                    result = new LookupResult(text, score, payload, contexts);
+                }
+
+                results.Add(result);
+            }
+
+            return results;
+        }
+
+        /// <summary>
+        /// Subclass can override this to tweak the Query before
+        ///  searching. 
+        /// </summary>
+        protected internal virtual Query FinishQuery(BooleanQuery bq, bool allTermsRequired)
+        {
+            return bq;
+        }
+
+        /// <summary>
+        /// Override this method to customize the Object
+        ///  representing a single highlighted suggestions; the
+        ///  result is set on each {@link
+        ///  LookupResult#highlightKey} member. 
+        /// </summary>
+        protected internal virtual object Highlight(string text, HashSet<string> matchedTokens, string prefixToken)
+        {
+            TokenStream ts = queryAnalyzer.TokenStream("text", new StringReader(text));
+            try
+            {
+                var termAtt = ts.AddAttribute<CharTermAttribute>();
+                var offsetAtt = ts.AddAttribute<OffsetAttribute>();
+                ts.Reset();
+                var sb = new StringBuilder();
+                int upto = 0;
+                while (ts.IncrementToken())
+                {
+                    string token = termAtt.ToString();
+                    int startOffset = offsetAtt.StartOffset();
+                    int endOffset = offsetAtt.EndOffset();
+                    if (upto < startOffset)
+                    {
+                        addNonMatch(sb, text.Substring(upto, startOffset - upto));
+                        upto = startOffset;
+                    }
+                    else if (upto > startOffset)
+                    {
+                        continue;
+                    }
+
+                    if (matchedTokens.Contains(token))
+                    {
+                        // Token matches.
+                        AddWholeMatch(sb, text.Substring(startOffset, endOffset - startOffset), token);
+                        upto = endOffset;
+                    }
+                    else if (prefixToken != null && token.StartsWith(prefixToken, StringComparison.Ordinal))
+                    {
+                        AddPrefixMatch(sb, text.Substring(startOffset, endOffset - startOffset), token, prefixToken);
+                        upto = endOffset;
+                    }
+                }
+                ts.End();
+                int endOffset = offsetAtt.EndOffset();
+                if (upto < endOffset)
+                {
+                    addNonMatch(sb, text.Substring(upto));
+                }
+                return sb.ToString();
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(ts);
+            }
+        }
+
+        /// <summary>
+        /// Called while highlighting a single result, to append a
+        ///  non-matching chunk of text from the suggestion to the
+        ///  provided fragments list. </summary>
+        ///  <param name="sb"> The {@code StringBuilder} to append to </param>
+        ///  <param name="text"> The text chunk to add </param>
+        protected internal virtual void addNonMatch(StringBuilder sb, string text)
+        {
+            sb.Append(text);
+        }
+
+        /// <summary>
+        /// Called while highlighting a single result, to append
+        ///  the whole matched token to the provided fragments list. </summary>
+        ///  <param name="sb"> The {@code StringBuilder} to append to </param>
+        ///  <param name="surface"> The surface form (original) text </param>
+        ///  <param name="analyzed"> The analyzed token corresponding to the surface form text </param>
+        protected internal virtual void AddWholeMatch(StringBuilder sb, string surface, string analyzed)
+        {
+            sb.Append("<b>");
+            sb.Append(surface);
+            sb.Append("</b>");
+        }
+
+        /// <summary>
+        /// Called while highlighting a single result, to append a
+        ///  matched prefix token, to the provided fragments list. </summary>
+        ///  <param name="sb"> The {@code StringBuilder} to append to </param>
+        ///  <param name="surface"> The fragment of the surface form
+        ///        (indexed during <seealso cref="#build"/>, corresponding to
+        ///        this match </param>
+        ///  <param name="analyzed"> The analyzed token that matched </param>
+        ///  <param name="prefixToken"> The prefix of the token that matched </param>
+        protected internal virtual void AddPrefixMatch(StringBuilder sb, string surface, string analyzed, string prefixToken)
+        {
+            // TODO: apps can try to invert their analysis logic
+            // here, e.g. downcase the two before checking prefix:
+            sb.Append("<b>");
+            sb.Append(surface.Substring(0, prefixToken.Length));
+            sb.Append("</b>");
+            if (prefixToken.Length < surface.Length)
+            {
+                sb.Append(surface.Substring(prefixToken.Length));
+            }
+        }
+
+        public override bool Store(DataOutput @in)
+        {
+            return false;
+        }
+
+        public override bool Load(DataInput @out)
+        {
+            return false;
+        }
+
+        public void Dispose()
+        {
+            if (searcherMgr != null)
+            {
+                searcherMgr.Dispose();
+                searcherMgr = null;
+            }
+            if (writer != null)
+            {
+                writer.Dispose();
+                dir.Dispose();
+                writer = null;
+            }
+        }
+
+        public override long SizeInBytes()
+        {
+            long mem = RamUsageEstimator.ShallowSizeOf(this);
+            try
+            {
+                if (searcherMgr != null)
+                {
+                    IndexSearcher searcher = searcherMgr.Acquire();
+                    try
+                    {
+                        foreach (AtomicReaderContext context in searcher.IndexReader.Leaves())
+                        {
+                            AtomicReader reader = FilterAtomicReader.Unwrap(context.Reader());
+                            if (reader is SegmentReader)
+                            {
+                                mem += ((SegmentReader)context.reader()).RamBytesUsed();
+                            }
+                        }
+                    }
+                    finally
+                    {
+                        searcherMgr.Release(searcher);
+                    }
+                }
+                return mem;
+            }
+            catch (IOException ioe)
+            {
+                throw new Exception(ioe);
+            }
+        }
+
+        public override long Count
+        {
+            get
+            {
+                IndexSearcher searcher = searcherMgr.Acquire();
+                try
+                {
+                    return searcher.IndexReader.NumDocs();
+                }
+                finally
+                {
+                    searcherMgr.Release(searcher);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
new file mode 100644
index 0000000..885b992
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
@@ -0,0 +1,1093 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using Lucene.Net.Analysis;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// Suggester that first analyzes the surface form, adds the
+    /// analyzed form to a weighted FST, and then does the same
+    /// thing at lookup time.  This means lookup is based on the
+    /// analyzed form while suggestions are still the surface
+    /// form(s).
+    /// 
+    /// <para>
+    /// This can result in powerful suggester functionality.  For
+    /// example, if you use an analyzer removing stop words, 
+    /// then the partial text "ghost chr..." could see the
+    /// suggestion "The Ghost of Christmas Past". Note that
+    /// position increments MUST NOT be preserved for this example
+    /// to work, so you should call the constructor with 
+    /// <code>preservePositionIncrements</code> parameter set to 
+    /// false
+    /// 
+    /// </para>
+    /// <para>
+    /// If SynonymFilter is used to map wifi and wireless network to
+    /// hotspot then the partial text "wirele..." could suggest
+    /// "wifi router".  Token normalization like stemmers, accent
+    /// removal, etc., would allow suggestions to ignore such
+    /// variations.
+    /// 
+    /// </para>
+    /// <para>
+    /// When two matching suggestions have the same weight, they
+    /// are tie-broken by the analyzed form.  If their analyzed
+    /// form is the same then the order is undefined.
+    /// 
+    /// </para>
+    /// <para>
+    /// There are some limitations:
+    /// <ul>
+    /// 
+    ///   <li> A lookup from a query like "net" in English won't
+    ///        be any different than "net " (ie, user added a
+    ///        trailing space) because analyzers don't reflect
+    ///        when they've seen a token separator and when they
+    ///        haven't.
+    /// 
+    ///   <li> If you're using {@code StopFilter}, and the user will
+    ///        type "fast apple", but so far all they've typed is
+    ///        "fast a", again because the analyzer doesn't convey whether
+    ///        it's seen a token separator after the "a",
+    ///        {@code StopFilter} will remove that "a" causing
+    ///        far more matches than you'd expect.
+    /// 
+    ///   <li> Lookups with the empty string return no results
+    ///        instead of all results.
+    /// </ul>
+    /// 
+    /// @lucene.experimental
+    /// </para>
+    /// </summary>
+    public class AnalyzingSuggester : Lookup
+    {
+
+        /// <summary>
+        /// FST<Weight,Surface>: 
+        ///  input is the analyzed form, with a null byte between terms
+        ///  weights are encoded as costs: (Integer.MAX_VALUE-weight)
+        ///  surface is the original, unanalyzed form.
+        /// </summary>
+        private FST<PairOutputs.Pair<long?, BytesRef>> fst = null;
+
+        /// <summary>
+        /// Analyzer that will be used for analyzing suggestions at
+        /// index time.
+        /// </summary>
+        private readonly Analyzer indexAnalyzer;
+
+        /// <summary>
+        /// Analyzer that will be used for analyzing suggestions at
+        /// query time.
+        /// </summary>
+        private readonly Analyzer queryAnalyzer;
+
+        /// <summary>
+        /// True if exact match suggestions should always be returned first.
+        /// </summary>
+        private readonly bool exactFirst;
+
+        /// <summary>
+        /// True if separator between tokens should be preserved.
+        /// </summary>
+        private readonly bool preserveSep;
+
+        /// <summary>
+        /// Include this flag in the options parameter to {@link
+        ///  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean)} to always
+        ///  return the exact match first, regardless of score.  This
+        ///  has no performance impact but could result in
+        ///  low-quality suggestions. 
+        /// </summary>
+        public const int EXACT_FIRST = 1;
+
+        /// <summary>
+        /// Include this flag in the options parameter to {@link
+        ///  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean)} to preserve
+        ///  token separators when matching. 
+        /// </summary>
+        public const int PRESERVE_SEP = 2;
+
+        /// <summary>
+        /// Represents the separation between tokens, if
+        ///  PRESERVE_SEP was specified 
+        /// </summary>
+        private const int SEP_LABEL = '\u001F';
+
+        /// <summary>
+        /// Marks end of the analyzed input and start of dedup
+        ///  byte. 
+        /// </summary>
+        private const int END_BYTE = 0x0;
+
+        /// <summary>
+        /// Maximum number of dup surface forms (different surface
+        ///  forms for the same analyzed form). 
+        /// </summary>
+        private readonly int maxSurfaceFormsPerAnalyzedForm;
+
+        /// <summary>
+        /// Maximum graph paths to index for a single analyzed
+        ///  surface form.  This only matters if your analyzer
+        ///  makes lots of alternate paths (e.g. contains
+        ///  SynonymFilter). 
+        /// </summary>
+        private readonly int maxGraphExpansions;
+
+        /// <summary>
+        /// Highest number of analyzed paths we saw for any single
+        ///  input surface form.  For analyzers that never create
+        ///  graphs this will always be 1. 
+        /// </summary>
+        private int maxAnalyzedPathsForOneInput;
+
+        private bool hasPayloads;
+
+        private const int PAYLOAD_SEP = '\u001f';
+
+        /// <summary>
+        /// Whether position holes should appear in the automaton. </summary>
+        private bool preservePositionIncrements;
+
+        /// <summary>
+        /// Number of entries the lookup was built with </summary>
+        private long count = 0;
+
+        /// <summary>
+        /// Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean)
+        /// AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
+        /// PRESERVE_SEP, 256, -1, true)}
+        /// </summary>
+        public AnalyzingSuggester(Analyzer analyzer)
+            : this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)
+        {
+        }
+
+        /// <summary>
+        /// Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean)
+        /// AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
+        /// PRESERVE_SEP, 256, -1, true)}
+        /// </summary>
+        public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
+            : this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)
+        {
+        }
+
+        /// <summary>
+        /// Creates a new suggester.
+        /// </summary>
+        /// <param name="indexAnalyzer"> Analyzer that will be used for
+        ///   analyzing suggestions while building the index. </param>
+        /// <param name="queryAnalyzer"> Analyzer that will be used for
+        ///   analyzing query text during lookup </param>
+        /// <param name="options"> see <seealso cref="#EXACT_FIRST"/>, <seealso cref="#PRESERVE_SEP"/> </param>
+        /// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of
+        ///   surface forms to keep for a single analyzed form.
+        ///   When there are too many surface forms we discard the
+        ///   lowest weighted ones. </param>
+        /// <param name="maxGraphExpansions"> Maximum number of graph paths
+        ///   to expand from the analyzed form.  Set this to -1 for
+        ///   no limit. </param>
+        /// <param name="preservePositionIncrements"> Whether position holes
+        ///   should appear in the automata </param>
+        public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options,
+            int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements)
+        {
+            this.indexAnalyzer = indexAnalyzer;
+            this.queryAnalyzer = queryAnalyzer;
+            if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0)
+            {
+                throw new System.ArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " +
+                                                   options);
+            }
+            this.exactFirst = (options & EXACT_FIRST) != 0;
+            this.preserveSep = (options & PRESERVE_SEP) != 0;
+
+            // NOTE: this is just an implementation limitation; if
+            // somehow this is a problem we could fix it by using
+            // more than one byte to disambiguate ... but 256 seems
+            // like it should be way more then enough.
+            if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256)
+            {
+                throw new System.ArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " +
+                                                   maxSurfaceFormsPerAnalyzedForm + ")");
+            }
+            this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
+
+            if (maxGraphExpansions < 1 && maxGraphExpansions != -1)
+            {
+                throw new System.ArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " +
+                                                   maxGraphExpansions + ")");
+            }
+            this.maxGraphExpansions = maxGraphExpansions;
+            this.preservePositionIncrements = preservePositionIncrements;
+        }
+
+        /// <summary>
+        /// Returns byte size of the underlying FST. </summary>
+        public override long SizeInBytes()
+        {
+            return fst == null ? 0 : fst.SizeInBytes();
+        }
+
+        private void copyDestTransitions(State from, State to, IList<Transition> transitions)
+        {
+            if (to.Accept)
+            {
+                from.Accept = true;
+            }
+            foreach (Transition t in to.Transitions)
+            {
+                transitions.Add(t);
+            }
+        }
+
+        // Replaces SEP with epsilon or remaps them if
+        // we were asked to preserve them:
+        private void ReplaceSep(Automaton a)
+        {
+
+            State[] states = a.NumberedStates;
+
+            // Go in reverse topo sort so we know we only have to
+            // make one pass:
+            for (int stateNumber = states.Length - 1; stateNumber >= 0; stateNumber--)
+            {
+                State state = states[stateNumber];
+                IList<Transition> newTransitions = new List<Transition>();
+                foreach (Transition t in state.Transitions)
+                {
+                    Debug.Assert(t.Min == t.Max);
+                    if (t.Min == TokenStreamToAutomaton.POS_SEP)
+                    {
+                        if (preserveSep)
+                        {
+                            // Remap to SEP_LABEL:
+                            newTransitions.Add(new Transition(SEP_LABEL, t.Dest));
+                        }
+                        else
+                        {
+                            copyDestTransitions(state, t.Dest, newTransitions);
+                            a.Deterministic = false;
+                        }
+                    }
+                    else if (t.Min == TokenStreamToAutomaton.HOLE)
+                    {
+
+                        // Just remove the hole: there will then be two
+                        // SEP tokens next to each other, which will only
+                        // match another hole at search time.  Note that
+                        // it will also match an empty-string token ... if
+                        // that's somehow a problem we can always map HOLE
+                        // to a dedicated byte (and escape it in the
+                        // input).
+                        copyDestTransitions(state, t.Dest, newTransitions);
+                        a.Deterministic = false;
+                    }
+                    else
+                    {
+                        newTransitions.Add(t);
+                    }
+                }
+                state.Transitions = newTransitions.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Used by subclass to change the lookup automaton, if
+        ///  necessary. 
+        /// </summary>
+        protected internal virtual Automaton convertAutomaton(Automaton a)
+        {
+            return a;
+        }
+
+        internal virtual TokenStreamToAutomaton TokenStreamToAutomaton
+        {
+            get
+            {
+                var tsta = new TokenStreamToAutomaton();
+                tsta.PreservePositionIncrements = preservePositionIncrements;
+                return tsta;
+            }
+        }
+
+        private class AnalyzingComparator : IComparer<BytesRef>
+        {
+
+            internal readonly bool hasPayloads;
+
+            public AnalyzingComparator(bool hasPayloads)
+            {
+                this.hasPayloads = hasPayloads;
+            }
+
+            internal readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
+            internal readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
+            internal readonly BytesRef scratchA = new BytesRef();
+            internal readonly BytesRef scratchB = new BytesRef();
+
+            public virtual int Compare(BytesRef a, BytesRef b)
+            {
+
+                // First by analyzed form:
+                readerA.Reset(a.Bytes, a.Offset, a.Length);
+                scratchA.Length = readerA.ReadShort();
+                scratchA.Bytes = a.Bytes;
+                scratchA.Offset = readerA.Position;
+
+                readerB.Reset(b.Bytes, b.Offset, b.Length);
+                scratchB.Bytes = b.Bytes;
+                scratchB.Length = readerB.ReadShort();
+                scratchB.Offset = readerB.Position;
+
+                int cmp = scratchA.CompareTo(scratchB);
+                if (cmp != 0)
+                {
+                    return cmp;
+                }
+                readerA.SkipBytes(scratchA.Length);
+                readerB.SkipBytes(scratchB.Length);
+
+                // Next by cost:
+                long aCost = readerA.ReadInt();
+                long bCost = readerB.ReadInt();
+                Debug.Assert(DecodeWeight(aCost) >= 0);
+                Debug.Assert(DecodeWeight(bCost) >= 0);
+                if (aCost < bCost)
+                {
+                    return -1;
+                }
+                else if (aCost > bCost)
+                {
+                    return 1;
+                }
+
+                // Finally by surface form:
+                if (hasPayloads)
+                {
+                    scratchA.Length = readerA.ReadShort();
+                    scratchB.Length = readerB.ReadShort();
+                    scratchA.Offset = readerA.Position;
+                    scratchB.Offset = readerB.Position;
+                }
+                else
+                {
+                    scratchA.Offset = readerA.Position;
+                    scratchB.Offset = readerB.Position;
+                    scratchA.Length = a.Length - scratchA.Offset;
+                    scratchB.Length = b.Length - scratchB.Offset;
+                }
+
+                return scratchA.CompareTo(scratchB);
+            }
+        }
+
+        public override void Build(InputIterator iterator)
+        {
+            if (iterator.HasContexts)
+            {
+                throw new System.ArgumentException("this suggester doesn't support contexts");
+            }
+            string prefix = this.GetType().Name;
+            var directory = OfflineSorter.DefaultTempDir();
+            var tempInput = File.CreateTempFile(prefix, ".input", directory);
+            var tempSorted = File.CreateTempFile(prefix, ".sorted", directory);
+
+            hasPayloads = iterator.HasPayloads;
+
+            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+            OfflineSorter.ByteSequencesReader reader = null;
+            var scratch = new BytesRef();
+
+            TokenStreamToAutomaton ts2a = TokenStreamToAutomaton;
+
+            bool success = false;
+            count = 0;
+            sbyte[] buffer = new sbyte[8];
+            try
+            {
+                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+                BytesRef surfaceForm;
+
+                while ((surfaceForm = iterator.Next()) != null)
+                {
+                    HashSet<IntsRef> paths = ToFiniteStrings(surfaceForm, ts2a);
+
+                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);
+
+                    foreach (IntsRef path in paths)
+                    {
+
+                        Util.Fst.Util.ToBytesRef(path, scratch);
+
+                        // length of the analyzed text (FST input)
+                        if (scratch.Length > short.MaxValue - 2)
+                        {
+                            throw new System.ArgumentException("cannot handle analyzed forms > " + (short.MaxValue - 2) +
+                                                               " in length (got " + scratch.length + ")");
+                        }
+                        short analyzedLength = (short) scratch.Length;
+
+                        // compute the required length:
+                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
+                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;
+
+                        BytesRef payload;
+
+                        if (hasPayloads)
+                        {
+                            if (surfaceForm.Length > (short.MaxValue - 2))
+                            {
+                                throw new ArgumentException("cannot handle surface form > " + (short.MaxValue - 2) +
+                                                            " in length (got " + surfaceForm.Length + ")");
+                            }
+                            payload = iterator.Payload;
+                            // payload + surfaceLength (short)
+                            requiredLength += payload.Length + 2;
+                        }
+                        else
+                        {
+                            payload = null;
+                        }
+
+                        buffer = ArrayUtil.Grow(buffer, requiredLength);
+
+                        output.Reset(buffer);
+
+                        output.WriteShort(analyzedLength);
+
+                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);
+
+                        output.WriteInt(EncodeWeight(iterator.Weight));
+
+                        if (hasPayloads)
+                        {
+                            for (int i = 0; i < surfaceForm.Length; i++)
+                            {
+                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
+                                {
+                                    throw new ArgumentException(
+                                        "surface form cannot contain unit separator character U+001F; this character is reserved");
+                                }
+                            }
+                            output.WriteShort((short) surfaceForm.Length);
+                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
+                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
+                        }
+                        else
+                        {
+                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
+                        }
+
+                        Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);
+
+                        writer.Write(buffer, 0, output.Position);
+                    }
+                    count++;
+                }
+                writer.Dispose();
+
+                // Sort all input/output pairs (required by FST.Builder):
+                (new OfflineSorter(new AnalyzingComparator(hasPayloads))).Sort(tempInput, tempSorted);
+
+                // Free disk space:
+                tempInput.Delete();
+
+                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
+
+                PairOutputs<long?, BytesRef> outputs = new PairOutputs<long?, BytesRef>(PositiveIntOutputs.Singleton,
+                    ByteSequenceOutputs.Singleton);
+                Builder<PairOutputs.Pair<long?, BytesRef>> builder =
+                    new Builder<PairOutputs.Pair<long?, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+
+                // Build FST:
+                BytesRef previousAnalyzed = null;
+                BytesRef analyzed = new BytesRef();
+                BytesRef surface = new BytesRef();
+                IntsRef scratchInts = new IntsRef();
+                var input = new ByteArrayDataInput();
+
+                // Used to remove duplicate surface forms (but we
+                // still index the hightest-weight one).  We clear
+                // this when we see a new analyzed form, so it cannot
+                // grow unbounded (at most 256 entries):
+                var seenSurfaceForms = new HashSet<BytesRef>();
+
+                var dedup = 0;
+                while (reader.Read(scratch))
+                {
+                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
+                    short analyzedLength = input.ReadShort();
+                    analyzed.Grow(analyzedLength + 2);
+                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
+                    analyzed.Length = analyzedLength;
+
+                    long cost = input.ReadInt();
+
+                    surface.Bytes = scratch.Bytes;
+                    if (hasPayloads)
+                    {
+                        surface.Length = input.ReadShort();
+                        surface.Offset = input.Position;
+                    }
+                    else
+                    {
+                        surface.Offset = input.Position;
+                        surface.Length = scratch.Length - surface.Offset;
+                    }
+
+                    if (previousAnalyzed == null)
+                    {
+                        previousAnalyzed = new BytesRef();
+                        previousAnalyzed.CopyBytes(analyzed);
+                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+                    }
+                    else if (analyzed.Equals(previousAnalyzed))
+                    {
+                        dedup++;
+                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
+                        {
+                            // More than maxSurfaceFormsPerAnalyzedForm
+                            // dups: skip the rest:
+                            continue;
+                        }
+                        if (seenSurfaceForms.Contains(surface))
+                        {
+                            continue;
+                        }
+                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+                    }
+                    else
+                    {
+                        dedup = 0;
+                        previousAnalyzed.CopyBytes(analyzed);
+                        seenSurfaceForms.Clear();
+                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+                    }
+
+                    // TODO: I think we can avoid the extra 2 bytes when
+                    // there is no dup (dedup==0), but we'd have to fix
+                    // the exactFirst logic ... which would be sort of
+                    // hairy because we'd need to special case the two
+                    // (dup/not dup)...
+
+                    // NOTE: must be byte 0 so we sort before whatever
+                    // is next
+                    analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
+                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (sbyte) dedup;
+                    analyzed.Length += 2;
+
+                    Util.Fst.Util.ToIntsRef(analyzed, scratchInts);
+                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
+                    if (!hasPayloads)
+                    {
+                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
+                    }
+                    else
+                    {
+                        int payloadOffset = input.Position + surface.Length;
+                        int payloadLength = scratch.Length - payloadOffset;
+                        BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
+                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
+                        br.Bytes[surface.Length] = PAYLOAD_SEP;
+                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
+                        br.Length = br.Bytes.Length;
+                        builder.Add(scratchInts, outputs.NewPair(cost, br));
+                    }
+                }
+                fst = builder.Finish();
+
+                //Util.dotToFile(fst, "/tmp/suggest.dot");
+
+                success = true;
+            }
+            finally
+            {
+                if (success)
+                {
+                    IOUtils.Close(reader, writer);
+                }
+                else
+                {
+                    IOUtils.CloseWhileHandlingException(reader, writer);
+                }
+
+                tempInput.Delete();
+                tempSorted.Delete();
+            }
+        }
+
+        public override bool Store(DataOutput output)
+        {
+            output.WriteVLong(count);
+            if (fst == null)
+            {
+                return false;
+            }
+
+            fst.Save(output);
+            output.WriteVInt(maxAnalyzedPathsForOneInput);
+            output.WriteByte((sbyte) (hasPayloads ? 1 : 0));
+            return true;
+        }
+
+        public override bool Load(DataInput input)
+        {
+            count = input.ReadVLong();
+            this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.Singleton, ByteSequenceOutputs.Singleton));
+            maxAnalyzedPathsForOneInput = input.ReadVInt();
+            hasPayloads = input.ReadByte() == 1;
+            return true;
+        }
+
+        private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
+        {
+            LookupResult result;
+            if (hasPayloads)
+            {
+                int sepIndex = -1;
+                for (int i = 0; i < output2.Length; i++)
+                {
+                    if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
+                    {
+                        sepIndex = i;
+                        break;
+                    }
+                }
+                Debug.Assert(sepIndex != -1);
+                spare.Grow(sepIndex);
+
+                int payloadLen = output2.Length - sepIndex - 1;
+                UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
+                BytesRef payload = new BytesRef(payloadLen);
+                Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
+                payload.Length = payloadLen;
+                result = new LookupResult(spare.ToString(), DecodeWeight(output1.Value), payload);
+            }
+            else
+            {
+                spare.Grow(output2.Length);
+                UnicodeUtil.UTF8toUTF16(output2, spare);
+                result = new LookupResult(spare.ToString(), DecodeWeight(output1.Value));
+            }
+
+            return result;
+        }
+
+        private bool SameSurfaceForm(BytesRef key, BytesRef output2)
+        {
+            if (hasPayloads)
+            {
+                // output2 has at least PAYLOAD_SEP byte:
+                if (key.Length >= output2.Length)
+                {
+                    return false;
+                }
+                for (int i = 0; i < key.Length; i++)
+                {
+                    if (key.Bytes[key.Offset + i] != output2.Bytes[output2.Offset + i])
+                    {
+                        return false;
+                    }
+                }
+                return output2.Bytes[output2.Offset + key.Length] == PAYLOAD_SEP;
+            }
+            else
+            {
+                return key.BytesEquals(output2);
+            }
+        }
+
+        public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+        {
+            Debug.Assert(num > 0);
+
+            if (onlyMorePopular)
+            {
+                throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
+            }
+            if (contexts != null)
+            {
+                throw new System.ArgumentException("this suggester doesn't support contexts");
+            }
+            if (fst == null)
+            {
+                return Collections.emptyList();
+            }
+
+            //System.out.println("lookup key=" + key + " num=" + num);
+            for (var i = 0; i < key.Length; i++)
+            {
+                if (key[i] == 0x1E)
+                {
+                    throw new ArgumentException(
+                        "lookup key cannot contain HOLE character U+001E; this character is reserved");
+                }
+                if (key[i] == 0x1F)
+                {
+                    throw new ArgumentException(
+                        "lookup key cannot contain unit separator character U+001F; this character is reserved");
+                }
+            }
+
+            var utf8Key = new BytesRef(key);
+            try
+            {
+
+                Automaton lookupAutomaton = ToLookupAutomaton(key);
+
+                var spare = new CharsRef();
+
+                //System.out.println("  now intersect exactFirst=" + exactFirst);
+
+                // Intersect automaton w/ suggest wFST and get all
+                // prefix starting nodes & their outputs:
+                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
+
+                //System.out.println("  prefixPaths: " + prefixPaths.size());
+
+                FST.BytesReader bytesReader = fst.BytesReader;
+
+                FST.Arc<PairOutputs.Pair<long?, BytesRef>> scratchArc = new FST.Arc<PairOutputs.Pair<long?, BytesRef>>();
+
+                IList<LookupResult> results = new List<LookupResult>();
+
+                IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> prefixPaths =
+                    FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
+
+                if (exactFirst)
+                {
+
+                    int count = 0;
+                    foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+                    {
+                        if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
+                        {
+                            // This node has END_BYTE arc leaving, meaning it's an
+                            // "exact" match:
+                            count++;
+                        }
+                    }
+
+                    // Searcher just to find the single exact only
+                    // match, if present:
+                    Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>> searcher;
+                    searcher = new Util.TopNSearcher<>(fst, count*maxSurfaceFormsPerAnalyzedForm,
+                        count*maxSurfaceFormsPerAnalyzedForm, weightComparator);
+
+                    // NOTE: we could almost get away with only using
+                    // the first start node.  The only catch is if
+                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
+                    // pruned our exact match from one of these nodes
+                    // ...:
+                    foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+                    {
+                        if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
+                        {
+                            // This node has END_BYTE arc leaving, meaning it's an
+                            // "exact" match:
+                            searcher.AddStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.Output), false,
+                                path.input);
+                        }
+                    }
+
+                    Util.TopResults<PairOutputs.Pair<long?, BytesRef>> completions = searcher.Search();
+                    Debug.Assert(completions.IsComplete);
+
+                    // NOTE: this is rather inefficient: we enumerate
+                    // every matching "exactly the same analyzed form"
+                    // path, and then do linear scan to see if one of
+                    // these exactly matches the input.  It should be
+                    // possible (though hairy) to do something similar
+                    // to getByOutput, since the surface form is encoded
+                    // into the FST output, so we more efficiently hone
+                    // in on the exact surface-form match.  Still, I
+                    // suspect very little time is spent in this linear
+                    // seach: it's bounded by how many prefix start
+                    // nodes we have and the
+                    // maxSurfaceFormsPerAnalyzedForm:
+                    foreach (Util.Result<PairOutputs.Pair<long?, BytesRef>> completion in completions)
+                    {
+                        BytesRef output2 = completion.Output.output2;
+                        if (SameSurfaceForm(utf8Key, output2))
+                        {
+                            results.Add(GetLookupResult(completion.Output.output1, output2, spare));
+                            break;
+                        }
+                    }
+
+                    if (results.Count == num)
+                    {
+                        // That was quick:
+                        return results;
+                    }
+                }
+
+                Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>> searcher;
+                searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
+                    num*maxAnalyzedPathsForOneInput, weightComparator, utf8Key, results);
+
+                prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
+
+                foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+                {
+                    searcher.AddStartPaths(path.fstNode, path.output, true, path.input);
+                }
+
+                Util.TopResults<PairOutputs.Pair<long?, BytesRef>> completions = searcher.search();
+                Debug.Assert(completions.IsComplete);
+
+                foreach (Util.Result<PairOutputs.Pair<long?, BytesRef>> completion in completions)
+                {
+
+                    LookupResult result = GetLookupResult(completion.Output.output1, completion.Output.output2, spare);
+
+                    // TODO: for fuzzy case would be nice to return
+                    // how many edits were required
+
+                    //System.out.println("    result=" + result);
+                    results.Add(result);
+
+                    if (results.Count == num)
+                    {
+                        // In the exactFirst=true case the search may
+                        // produce one extra path
+                        break;
+                    }
+                }
+
+                return results;
+            }
+            catch (IOException bogus)
+            {
+                throw new Exception(bogus);
+            }
+        }
+
+        private class TopNSearcherAnonymousInnerClassHelper : Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>>
+        {
+            private readonly AnalyzingSuggester outerInstance;
+
+            private BytesRef utf8Key;
+            private IList<LookupResult> results;
+
+            public TopNSearcherAnonymousInnerClassHelper<T1> 
+        (
+            private AnalyzingSuggester outerInstance, FST
+        <
+            private T1 
+        >
+            private org.apache.lucene.search.suggest.fst 
+        ,
+            private UnknownType size,
+            private int num, UnknownType
+            private weightComparator 
+        ,
+            private BytesRef utf8Key, IList
+        <
+            private LookupResult 
+        >
+            private results 
+        ) : base(
+            private org.apache.lucene.search.suggest.fst 
+        ,
+            private size 
+        ,
+            num* outerInstance.maxAnalyzedPathsForOneInput 
+        ,
+            private weightComparator 
+        )
+        {
+            this.outerInstance = outerInstance;
+            this.utf8Key = utf8Key;
+            this.results = results;
+            seen = new HashSet<>();
+        }
+
+            private readonly HashSet<BytesRef> seen;
+
+            protected internal override bool AcceptResult(IntsRef input, PairOutputs.Pair<long?, BytesRef> output)
+            {
+
+                // Dedup: when the input analyzes to a graph we
+                // can get duplicate surface forms:
+                if (seen.Contains(output.Output2))
+                {
+                    return false;
+                }
+                seen.Add(output.Output2);
+
+                if (!outerInstance.exactFirst)
+                {
+                    return true;
+                }
+                else
+                {
+                    // In exactFirst mode, don't accept any paths
+                    // matching the surface form since that will
+                    // create duplicate results:
+                    if (outerInstance.sameSurfaceForm(utf8Key, output.Output2))
+                    {
+                        // We found exact match, which means we should
+                        // have already found it in the first search:
+                        Debug.Assert(results.Count == 1);
+                        return false;
+                    }
+                    else
+                    {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        public override long Count
+        {
+            get { return count; }
+        }
+
+        /// <summary>
+        /// Returns all prefix paths to initialize the search.
+        /// </summary>
+        protected internal virtual IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> GetFullPrefixPaths(
+            IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton,
+            FST<PairOutputs.Pair<long?, BytesRef>> fst)
+        {
+            return prefixPaths;
+        }
+
+        internal HashSet<IntsRef> ToFiniteStrings(BytesRef surfaceForm, TokenStreamToAutomaton ts2a)
+        {
+            // Analyze surface form:
+            Automaton automaton = null;
+            TokenStream ts = indexAnalyzer.TokenStream("", surfaceForm.Utf8ToString());
+            try
+            {
+
+                // Create corresponding automaton: labels are bytes
+                // from each analyzed token, with byte 0 used as
+                // separator between tokens:
+                automaton = ts2a.ToAutomaton(ts);
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(ts);
+            }
+
+            ReplaceSep(automaton);
+            automaton = convertAutomaton(automaton);
+
+            Debug.Assert(SpecialOperations.IsFinite(automaton));
+
+            // Get all paths from the automaton (there can be
+            // more than one path, eg if the analyzer created a
+            // graph using SynFilter or WDF):
+
+            // TODO: we could walk & add simultaneously, so we
+            // don't have to alloc [possibly biggish]
+            // intermediate HashSet in RAM:
+            return SpecialOperations.GetFiniteStrings(automaton, maxGraphExpansions);
+        }
+
+        internal Automaton ToLookupAutomaton(string key)
+        {
+            // TODO: is there a Reader from a CharSequence?
+            // Turn tokenstream into automaton:
+            Automaton automaton = null;
+            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
+            try
+            {
+                automaton = (TokenStreamToAutomaton).ToAutomaton(ts);
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(ts);
+            }
+
+            // TODO: we could use the end offset to "guess"
+            // whether the final token was a partial token; this
+            // would only be a heuristic ... but maybe an OK one.
+            // This way we could eg differentiate "net" from "net ",
+            // which we can't today...
+
+            ReplaceSep(automaton);
+
+            // TODO: we can optimize this somewhat by determinizing
+            // while we convert
+            BasicOperations.Determinize(automaton);
+            return automaton;
+        }
+
+
+
+        /// <summary>
+        /// Returns the weight associated with an input string,
+        /// or null if it does not exist.
+        /// </summary>
+        public virtual object Get(string key)
+        {
+            throw new NotSupportedException();
+        }
+
+        /// <summary>
+        /// cost -> weight </summary>
+        private static int DecodeWeight(long encoded)
+        {
+            return (int) (int.MaxValue - encoded);
+        }
+
+        /// <summary>
+        /// weight -> cost </summary>
+        private static int EncodeWeight(long value)
+        {
+            if (value < 0 || value > int.MaxValue)
+            {
+                throw new System.NotSupportedException("cannot encode value: " + value);
+            }
+            return int.MaxValue - (int) value;
+        }
+
+        internal static readonly IComparer<PairOutputs.Pair<long?, BytesRef>> weightComparator =
+            new ComparatorAnonymousInnerClassHelper();
+
+        private class ComparatorAnonymousInnerClassHelper : IComparer<PairOutputs.Pair<long?, BytesRef>>
+        {
+            public ComparatorAnonymousInnerClassHelper()
+            {
+            }
+
+            public virtual int Compare(PairOutputs.Pair<long?, BytesRef> left, PairOutputs.Pair<long?, BytesRef> right)
+            {
+                return left.Output1.CompareTo(right.Output1);
+            }
+        }
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
new file mode 100644
index 0000000..df82828
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
@@ -0,0 +1,316 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+using Version = System.Version;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    // TODO:
+    // - allow to use the search score
+
+    /// <summary>
+    /// Extension of the AnalyzingInfixSuggester which transforms the weight
+    /// after search to take into account the position of the searched term into
+    /// the indexed text.
+    /// Please note that it increases the number of elements searched and applies the
+    /// ponderation after. It might be costly for long suggestions.
+    /// 
+    /// @lucene.experimental
+    /// </summary>
+    public class BlendedInfixSuggester : AnalyzingInfixSuggester
+    {
+
+        /// <summary>
+        /// Coefficient used for linear blending
+        /// </summary>
+        protected internal static double LINEAR_COEF = 0.10;
+
+        /// <summary>
+        /// Default factor
+        /// </summary>
+        public static int DEFAULT_NUM_FACTOR = 10;
+
+        /// <summary>
+        /// Factor to multiply the number of searched elements
+        /// </summary>
+        private readonly int numFactor;
+
+        /// <summary>
+        /// Type of blender used by the suggester
+        /// </summary>
+        private readonly BlenderType blenderType;
+
+        /// <summary>
+        /// The different types of blender.
+        /// </summary>
+        public enum BlenderType
+        {
+            /// <summary>
+            /// Application dependent; override {@link
+            ///  #calculateCoefficient} to compute it. 
+            /// </summary>
+            CUSTOM,
+            /// <summary>
+            /// weight*(1 - 0.10*position) </summary>
+            POSITION_LINEAR,
+            /// <summary>
+            /// weight/(1+position) </summary>
+            POSITION_RECIPROCAL,
+            // TODO:
+            //SCORE
+        }
+
+        /// <summary>
+        /// Create a new instance, loading from a previously built
+        /// directory, if it exists.
+        /// </summary>
+        public BlendedInfixSuggester(Version matchVersion, Directory dir, Analyzer analyzer)
+            : base(matchVersion, dir, analyzer)
+        {
+            this.blenderType = BlenderType.POSITION_LINEAR;
+            this.numFactor = DEFAULT_NUM_FACTOR;
+        }
+
+        /// <summary>
+        /// Create a new instance, loading from a previously built
+        /// directory, if it exists.
+        /// </summary>
+        /// <param name="blenderType"> Type of blending strategy, see BlenderType for more precisions </param>
+        /// <param name="numFactor">   Factor to multiply the number of searched elements before ponderate </param>
+        /// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
+        public BlendedInfixSuggester(Version matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
+            : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
+        {
+            this.blenderType = blenderType;
+            this.numFactor = numFactor;
+        }
+
+        public override IList<Lookup.LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+        {
+            // here we multiply the number of searched element by the defined factor
+            return base.Lookup(key, contexts, onlyMorePopular, num * numFactor);
+        }
+
+        public IList<Lookup.LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
+        {
+            // here we multiply the number of searched element by the defined factor
+            return base.Lookup(key, contexts, num * numFactor, allTermsRequired, doHighlight);
+        }
+
+        protected internal override FieldType TextFieldType
+        {
+            get
+            {
+                FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+                ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+                ft.StoreTermVectors = true;
+                ft.StoreTermVectorPositions = true;
+                ft.OmitNorms = true;
+
+                return ft;
+            }
+        }
+
+        protected internal override IList<Lookup.LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits,
+            int num, string key, bool doHighlight, HashSet<string> matchedTokens, string prefixToken)
+        {
+
+            BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
+            Debug.Assert(textDV != null);
+
+            // This will just be null if app didn't pass payloads to build():
+            // TODO: maybe just stored fields?  they compress...
+            BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
+
+            SortedSet<Lookup.LookupResult> results = new SortedSet<Lookup.LookupResult>(LOOKUP_COMP);
+
+            // we reduce the num to the one initially requested
+            int actualNum = num / numFactor;
+
+            BytesRef scratch = new BytesRef();
+            for (int i = 0; i < hits.ScoreDocs.Length; i++)
+            {
+                FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
+
+                textDV.Get(fd.Doc, scratch);
+                string text = scratch.Utf8ToString();
+                long weight = (long?)fd.Fields[0];
+
+                BytesRef payload;
+                if (payloadsDV != null)
+                {
+                    payload = new BytesRef();
+                    payloadsDV.Get(fd.Doc, payload);
+                }
+                else
+                {
+                    payload = null;
+                }
+
+                double coefficient;
+                if (text.StartsWith(key.ToString(), StringComparison.Ordinal))
+                {
+                    // if hit starts with the key, we don't change the score
+                    coefficient = 1;
+                }
+                else
+                {
+                    coefficient = CreateCoefficient(searcher, fd.Doc, matchedTokens, prefixToken);
+                }
+
+                long score = (long)(weight * coefficient);
+
+                LookupResult result;
+                if (doHighlight)
+                {
+                    object highlightKey = Highlight(text, matchedTokens, prefixToken);
+                    result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload);
+                }
+                else
+                {
+                    result = new LookupResult(text, score, payload);
+                }
+
+                BoundedTreeAdd(results, result, actualNum);
+            }
+
+            return new List<LookupResult>(results.DescendingSet());
+        }
+
+        /// <summary>
+        /// Add an element to the tree respecting a size limit
+        /// </summary>
+        /// <param name="results"> the tree to add in </param>
+        /// <param name="result"> the result we try to add </param>
+        /// <param name="num"> size limit </param>
+        private static void BoundedTreeAdd(SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
+        {
+
+            if (results.Count >= num)
+            {
+                if (results.Min.value < result.value)
+                {
+                    results.PollFirst();
+                }
+                else
+                {
+                    return;
+                }
+            }
+
+            results.Add(result);
+        }
+
+        /// <summary>
+        /// Create the coefficient to transform the weight.
+        /// </summary>
+        /// <param name="doc"> id of the document </param>
+        /// <param name="matchedTokens"> tokens found in the query </param>
+        /// <param name="prefixToken"> unfinished token in the query </param>
+        /// <returns> the coefficient </returns>
+        /// <exception cref="IOException"> If there are problems reading term vectors from the underlying Lucene index. </exception>
+        private double CreateCoefficient(IndexSearcher searcher, int doc, HashSet<string> matchedTokens, string prefixToken)
+        {
+
+            Terms tv = searcher.IndexReader.GetTermVector(doc, TEXT_FIELD_NAME);
+            TermsEnum it = tv.Iterator(TermsEnum.EMPTY);
+
+            int? position = int.MaxValue;
+            BytesRef term;
+            // find the closest token position
+            while ((term = it.Next()) != null)
+            {
+
+                string docTerm = term.Utf8ToString();
+
+                if (matchedTokens.Contains(docTerm) || docTerm.StartsWith(prefixToken, StringComparison.Ordinal))
+                {
+
+                    DocsAndPositionsEnum docPosEnum = it.DocsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
+                    docPosEnum.NextDoc();
+
+                    // use the first occurrence of the term
+                    int p = docPosEnum.NextPosition();
+                    if (p < position)
+                    {
+                        position = p;
+                    }
+                }
+            }
+
+            // create corresponding coefficient based on position
+            return CalculateCoefficient(position.Value);
+        }
+
+        /// <summary>
+        /// Calculate the weight coefficient based on the position of the first matching word.
+        /// Subclass should override it to adapt it to particular needs </summary>
+        /// <param name="position"> of the first matching word in text </param>
+        /// <returns> the coefficient </returns>
+        protected internal virtual double CalculateCoefficient(int position)
+        {
+
+            double coefficient;
+            switch (blenderType)
+            {
+                case BlendedInfixSuggester.BlenderType.POSITION_LINEAR:
+                    coefficient = 1 - LINEAR_COEF * position;
+                    break;
+
+                case BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL:
+                    coefficient = 1.0 / (position + 1);
+                    break;
+
+                default:
+                    coefficient = 1;
+                    break;
+            }
+
+            return coefficient;
+        }
+
+        private static IComparer<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparator();
+
+        private class LookUpComparator : IComparer<Lookup.LookupResult>
+        {
+
+            public virtual int Compare(Lookup.LookupResult o1, Lookup.LookupResult o2)
+            {
+                // order on weight
+                if (o1.value > o2.value)
+                {
+                    return 1;
+                }
+                else if (o1.value < o2.value)
+                {
+                    return -1;
+                }
+
+                // otherwise on alphabetic order
+                return CHARSEQUENCE_COMPARATOR.Compare(o1.key, o2.key);
+            }
+        }
+    }
+}
\ No newline at end of file


Mime
View raw message