lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thow...@apache.org
Subject [Lucene.Net] svn commit: r1147514 [2/3] - in /incubator/lucene.net/trunk: src/contrib/Analyzers/ src/contrib/Analyzers/Miscellaneous/ src/contrib/Analyzers/Payloads/ src/contrib/Analyzers/Shingle/ src/contrib/Analyzers/Shingle/Codec/ src/contrib/Analyzers/Shingle/M...
Date Sun, 17 Jul 2011 02:46:03 GMT
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,643 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analyzers.Miscellaneous;
+using Lucene.Net.Analyzers.Shingle.Codec;
+using Lucene.Net.Analyzers.Shingle.Matrix;
+using Lucene.Net.Analyzers.Util;
+using FlagsAttribute = Lucene.Net.Analysis.Tokenattributes.FlagsAttribute;
+
+namespace Lucene.Net.Analyzers.Shingle
+{
+    /// <summary>
+    /// <p>A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream.
+    /// In other words, it creates combinations of tokens as a single token.</p>
+    ///
+    /// <p>For example, the sentence "please divide this sentence into shingles"
+    /// might be tokenized into shingles "please divide", "divide this",
+    /// "this sentence", "sentence into", and "into shingles".</p>
+    ///
+    /// <p>Using a shingle filter at index and query time can in some instances
+    /// be used to replace phrase queries, especially those with 0 slop.</p>
+    ///
+    /// <p>Without a spacer character
+    /// it can be used to handle composition and decomposition of words
+    /// such as searching for "multi dimensional" instead of "multidimensional".
+    /// It is a rather common human problem at query time
+    /// in several languages, notably the northern Germanic branch.</p>
+    ///
+    /// <p>Shingles are amongst many things also known to solve problems
+    /// in spell checking, language detection and document clustering.</p>
+    ///
+    /// <p>This filter is backed by a three dimensional column oriented matrix
+    /// used to create permutations of the second dimension, the rows,
+    /// and leaves the third, the z-axis, for multi-token synonyms.</p>
+    ///
+    /// <p>In order to use this filter you need to define a way of positioning
+    /// the input stream tokens in the matrix. This is done using a
+    /// ShingleMatrixFilter.TokenSettingsCodec.
+    /// There are three simple implementations for demonstrational purposes,
+    /// see ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec,
+    /// ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec
+    /// and ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec.</p>
+    ///
+    /// <p>Consider this token matrix:</p>
+    /// <pre>
+    ///  Token[column][row][z-axis]{
+    ///    {{hello}, {greetings, and, salutations}},
+    ///    {{world}, {earth}, {tellus}}
+    ///  };
+    /// </pre>
+    ///
+    /// It would produce the following 2-3 gram sized shingles:
+    ///
+    /// <pre>
+    /// "hello_world"
+    /// "greetings_and"
+    /// "greetings_and_salutations"
+    /// "and_salutations"
+    /// "and_salutations_world"
+    /// "salutations_world"
+    /// "hello_earth"
+    /// "and_salutations_earth"
+    /// "salutations_earth"
+    /// "hello_tellus"
+    /// "and_salutations_tellus"
+    /// "salutations_tellus"
+    ///  </pre>
+    ///
+    /// <p>This implementation can be rather heap demanding
+    /// if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns,
+    /// or if each column contains a great number of rows.</p>
+    ///
+    /// <p>The problem is that in order to avoid producing duplicates
+    /// the filter needs to keep track of any shingle already produced and returned to the consumer.</p>
+    ///
+    /// <p>There is a bit of resource management to handle this
+    /// but it would of course be much better if the filter was written
+    /// so it never created the same shingle more than once in the first place.</p>
+    ///
+    /// <p>The filter also has basic support for calculating weights for the shingles
+    /// based on the weights of the tokens from the input stream, output shingle size, etc.
+    /// See CalculateShingleWeight.
+    /// <p/>
+    /// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+    /// the ones located in org.apache.lucene.analysis.tokenattributes.</p> 
+    /// </summary>
+    public class ShingleMatrixFilter : TokenStream
+    {
+        /// <summary>Character placed between the texts of the token parts in a shingle when none is specified.</summary>
+        public static Char DefaultSpacerCharacter = '_';
+
+        /// <summary>Codec used by the convenience constructors to read token weight and matrix positioning.</summary>
+        public static TokenSettingsCodec DefaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
+
+        /// <summary>Default for whether shingles built solely from the first or last column are suppressed (false).</summary>
+        public static bool IgnoringSinglePrefixOrSuffixShingleByDefault;
+
+        // Attribute handles on this stream; the "_in*" twins below are the same
+        // attributes registered on the input stream and are read in GetNextInputToken.
+        private readonly FlagsAttribute _flagsAtt;
+        private readonly FlagsAttribute _inFlagsAtt;
+
+        private readonly OffsetAttribute _inOffsetAtt;
+        private readonly PayloadAttribute _inPayloadAtt;
+        private readonly PositionIncrementAttribute _inPosIncrAtt;
+        private readonly TermAttribute _inTermAtt;
+        private readonly TypeAttribute _inTypeAtt;
+        private readonly TokenStream _input;
+        private readonly OffsetAttribute _offsetAtt;
+        private readonly PayloadAttribute _payloadAtt;
+        private readonly PositionIncrementAttribute _posIncrAtt;
+
+        // Sentinel instance: returned by ProduceNextToken to signal "call me again".
+        // Compared by reference identity, never by content.
+        private readonly Token _requestNextToken = new Token();
+        private readonly Token _reusableToken = new Token();
+        private readonly TokenSettingsCodec _settingsCodec;
+
+        /// <summary>
+        /// A set containing shingles that has been the result of a call to Next(Token),
+        /// used to avoid producing the same shingle more than once.
+        ///
+        /// <p>
+        /// NOTE: The Java List implementation uses a different equality comparison scheme
+        /// than .NET's generic List. We therefore use a custom IEqualityComparer
+        /// implementation to get the same behaviour.
+        /// </p>
+        /// </summary>
+        private readonly HashSet<List<Token>> _shinglesSeen = new HashSet<List<Token>>(new ListComparer<Token>());
+
+        private readonly TermAttribute _termAtt;
+        private readonly TypeAttribute _typeAtt;
+
+        // All tokens of the current permutation of matrix rows, in order.
+        private List<Token> _currentPermutationTokens;
+
+        // Index to what row a token in _currentPermutationTokens represents
+        private List<Row> _currentPermutationRows;
+
+        private int _currentPermutationTokensStartOffset;
+        private int _currentShingleLength;
+        private MatrixPermutationIterator _permutations;
+
+        // One-token lookahead buffer used by ReadColumn to detect column boundaries.
+        private Token _readColumnBuf;
+
+
+        /// <summary>
+        /// Creates a shingle filter based on a user defined matrix.
+        /// 
+        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
+        /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
+        /// 
+        /// </summary>
+        /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.Next(Token) is called the first time.</param>
+        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
+        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
+        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
+        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
+        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
+        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
+        {
+            Matrix = matrix;
+            MinimumShingleSize = minimumShingleSize;
+            MaximumShingleSize = maximumShingleSize;
+            SpacerCharacter = spacerCharacter;
+            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
+            _settingsCodec = settingsCodec;
+
+            // ReSharper disable DoNotCallOverridableMethodsInConstructor
+            _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+            _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+            _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
+            _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+            _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+            _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+            // ReSharper restore DoNotCallOverridableMethodsInConstructor
+
+            // set the input to be an empty token stream, we already have the data.
+            _input = new EmptyTokenStream();
+
+            _inTermAtt = (TermAttribute) _input.AddAttribute(typeof (TermAttribute));
+            _inPosIncrAtt = (PositionIncrementAttribute) _input.AddAttribute(typeof (PositionIncrementAttribute));
+            _inPayloadAtt = (PayloadAttribute) _input.AddAttribute(typeof (PayloadAttribute));
+            _inOffsetAtt = (OffsetAttribute) _input.AddAttribute(typeof (OffsetAttribute));
+            _inTypeAtt = (TypeAttribute) _input.AddAttribute(typeof (TypeAttribute));
+            _inFlagsAtt = (FlagsAttribute) _input.AddAttribute(typeof (FlagsAttribute));
+        }
+
+        /// <summary>
+        /// Creates a shingle filter using default settings.
+        /// 
+        /// See ShingleMatrixFilter.DefaultSpacerCharacter, 
+        /// ShingleMatrixFilter.IgnoringSinglePrefixOrSuffixShingleByDefault, 
+        /// and ShingleMatrixFilter.DefaultSettingsCodec
+        /// </summary>
+        /// <param name="input">stream from which to construct the matrix</param>
+        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
+        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
+        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize)
+            : this(input, minimumShingleSize, maximumShingleSize, DefaultSpacerCharacter) { }
+
+        /// <summary>
+        /// Creates a shingle filter using default settings.
+        /// 
+        /// See IgnoringSinglePrefixOrSuffixShingleByDefault, and DefaultSettingsCodec
+        /// </summary>
+        /// <param name="input">stream from which to construct the matrix</param>
+        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
+        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
+        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none. </param>
+        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter)
+            : this( input, minimumShingleSize, maximumShingleSize, spacerCharacter, IgnoringSinglePrefixOrSuffixShingleByDefault) { }
+
+        /// <summary>
+        /// Creates a shingle filter using the default {@link TokenSettingsCodec}.
+        /// 
+        /// See DefaultSettingsCodec
+        /// </summary>
+        /// <param name="input">stream from which to construct the matrix</param>
+        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
+        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
+        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
+        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
+        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle)
+            : this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, DefaultSettingsCodec) { }
+
+        /// <summary>
+        /// Creates a shingle filter with ad hoc parameter settings.
+        /// </summary>
+        /// <param name="input">stream from which to construct the matrix</param>
+        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
+        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
+        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
+        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
+        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
+        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
+        {
+            _input = input;
+            MinimumShingleSize = minimumShingleSize;
+            MaximumShingleSize = maximumShingleSize;
+            SpacerCharacter = spacerCharacter;
+            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
+            _settingsCodec = settingsCodec;
+
+            // ReSharper disable DoNotCallOverridableMethodsInConstructor
+            _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+            _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+            _payloadAtt = (PayloadAttribute) AddAttribute(typeof (PayloadAttribute));
+            _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+            _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+            _flagsAtt = (FlagsAttribute) AddAttribute(typeof (FlagsAttribute));
+            // ReSharper restore DoNotCallOverridableMethodsInConstructor
+
+            _inTermAtt = (TermAttribute) input.AddAttribute(typeof (TermAttribute));
+            _inPosIncrAtt = (PositionIncrementAttribute) input.AddAttribute(typeof (PositionIncrementAttribute));
+            _inPayloadAtt = (PayloadAttribute) input.AddAttribute(typeof (PayloadAttribute));
+            _inOffsetAtt = (OffsetAttribute) input.AddAttribute(typeof (OffsetAttribute));
+            _inTypeAtt = (TypeAttribute) input.AddAttribute(typeof (TypeAttribute));
+            _inFlagsAtt = (FlagsAttribute) input.AddAttribute(typeof (FlagsAttribute));
+        }
+
+        /// <summary>Minimum number of tokens in any produced shingle.</summary>
+        public int MinimumShingleSize { get; set; }
+
+        /// <summary>Maximum number of tokens in any produced shingle.</summary>
+        public int MaximumShingleSize { get; set; }
+
+        /// <summary>The matrix backing this filter. Lazily created from the input stream on the first IncrementToken() if not supplied.</summary>
+        public Matrix.Matrix Matrix { get; set; }
+
+        /// <summary>Character placed between token texts in a shingle; null for none.</summary>
+        public Char? SpacerCharacter { get; set; }
+
+        /// <summary>If true, shingles built solely from the first or the last column are not produced.</summary>
+        public bool IsIgnoringSinglePrefixOrSuffixShingle { get; set; }
+
+        /// <summary>
+        /// Clears permutation state and the set of already-seen shingles, and resets
+        /// the input stream. Note that Matrix is intentionally not cleared here; see
+        /// the remarks on the matrix-based constructor.
+        /// </summary>
+        public override void Reset()
+        {
+            _permutations = null;
+            _shinglesSeen.Clear();
+            _input.Reset();
+        }
+
+        /// <summary>
+        /// Produces the next shingle token, lazily filling the matrix from the input
+        /// stream on the first call when no matrix was supplied via the constructor.
+        /// </summary>
+        /// <returns>true if a shingle token was produced; false when exhausted</returns>
+        public override sealed bool IncrementToken()
+        {
+            if (Matrix == null)
+            {
+                Matrix = new Matrix.Matrix();
+
+                // fill matrix with maximumShingleSize columns
+                while (Matrix.Columns.Count < MaximumShingleSize && ReadColumn())
+                {
+                    // this loop looks ugly
+                }
+            }
+
+            // This loop exists in order to avoid recursive calls to the next method
+            // as the complexity of a large matrix
+            // then would require a multi gigabyte sized stack.
+            Token token;
+            do
+            {
+                token = ProduceNextToken(_reusableToken);
+            } while (token == _requestNextToken);
+            
+            if (token == null) 
+                return false;
+
+            ClearAttributes();
+
+            // copy the produced token into this stream's attributes
+            _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength());
+            _posIncrAtt.SetPositionIncrement(token.GetPositionIncrement());
+            _flagsAtt.SetFlags(token.GetFlags());
+            _offsetAtt.SetOffset(token.StartOffset(), token.EndOffset());
+            _typeAtt.SetType(token.Type());
+            _payloadAtt.SetPayload(token.GetPayload());
+
+            return true;
+        }
+
+        /// <summary>
+        /// Advances the input stream one token and copies the input attribute values
+        /// into <paramref name="token"/>.
+        /// </summary>
+        /// <param name="token">token instance to populate</param>
+        /// <returns>the populated token, or null if the input stream is exhausted</returns>
+        private Token GetNextInputToken(Token token)
+        {
+            if (!_input.IncrementToken()) return null;
+
+            token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
+            token.SetPositionIncrement(_inPosIncrAtt.GetPositionIncrement());
+            token.SetFlags(_inFlagsAtt.GetFlags());
+            token.SetOffset(_inOffsetAtt.StartOffset(), _inOffsetAtt.EndOffset());
+            token.SetType(_inTypeAtt.Type());
+            token.SetPayload(_inPayloadAtt.GetPayload());
+            return token;
+        }
+
+        /// <summary>
+        /// Deprecated: Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
+        /// </summary>
+        /// <param name="reusableToken"></param>
+        /// <returns></returns>
+        [Obsolete("The new IncrementToken() and AttributeSource APIs should be used instead.")]
+        public override sealed Token Next(Token reusableToken)
+        {
+            return base.Next(reusableToken);
+        }
+        
+        /// <summary>
+        /// Deprecated: Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.
+        /// </summary>
+        /// <returns></returns>
+        [Obsolete("The returned Token is a \"full private copy\" (not re-used across calls to Next()) but will be slower than calling {@link #Next(Token)} or using the new IncrementToken() method with the new AttributeSource API.")]
+        public override sealed Token Next()
+        {
+            return base.Next();
+        }
+
+        /// <summary>
+        /// This method exists in order to avoid recursive calls to the method
+        /// as the complexity of a fairly small matrix then easily would require
+        /// a gigabyte sized stack per thread.
+        /// </summary>
+        /// <param name="reusableToken">token instance reused for the produced shingle</param>
+        /// <returns>null if exhausted, instance _requestNextToken if one more call is required for an answer, 
+        /// or instance parameter reusableToken.</returns>
+        private Token ProduceNextToken(Token reusableToken)
+        {
+            if (_currentPermutationTokens != null)
+            {
+                _currentShingleLength++;
+
+                if (_currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermutationTokens.Count
+                    && _currentShingleLength <= MaximumShingleSize)
+                {
+                    // it is possible to create at least one more shingle of the current matrix permutation
+
+                    if (IsIgnoringSinglePrefixOrSuffixShingle && 
+                        _currentShingleLength == 1 && 
+                        (_currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
+                    {
+                        // Bug fix: the original port called the obsolete Next() here, which
+                        // re-enters this stream via the backwards compatibility layer and can
+                        // recurse back into this method — exactly what the caller's do/while
+                        // loop exists to prevent. Returning the sentinel instead lets
+                        // IncrementToken() skip this single prefix/suffix shingle iteratively.
+                        return _requestNextToken;
+                    }
+
+                    var termLength = 0;
+
+                    var shingle = new List<Token>();
+
+                    for (int i = 0; i < _currentShingleLength; i++)
+                    {
+                        var shingleToken = _currentPermutationTokens[i + _currentPermutationTokensStartOffset];
+                        termLength += shingleToken.TermLength();
+                        shingle.Add(shingleToken);
+                    }
+                    if (SpacerCharacter != null)
+                        termLength += _currentShingleLength - 1; // room for the spacer characters
+
+                    // only produce shingles that have not already been created
+                    if (!_shinglesSeen.Add(shingle))
+                        return _requestNextToken;
+
+                    // shingle token factory
+                    var sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. ;)
+                    foreach (var shingleToken in shingle)
+                    {
+                        if (SpacerCharacter != null &&  sb.Length > 0)
+                            sb.Append(SpacerCharacter);
+
+                        sb.Append(shingleToken.TermBuffer(), 0, shingleToken.TermLength());
+                    }
+
+                    reusableToken.SetTermBuffer(sb.ToString());
+                    UpdateToken(reusableToken, shingle, _currentPermutationTokensStartOffset, _currentPermutationRows,
+                                _currentPermutationTokens);
+
+                    return reusableToken;
+                }
+
+                // it is NOT possible to create one more shingle of the current matrix permutation
+                if (_currentPermutationTokensStartOffset < _currentPermutationTokens.Count - 1)
+                {
+                    // reset shingle size and move one step to the right in the current tokens permutation
+                    _currentPermutationTokensStartOffset++;
+                    _currentShingleLength = MinimumShingleSize - 1;
+                    return _requestNextToken;
+                }
+
+
+                // todo does this ever occur?
+                if (_permutations == null)
+                    return null;
+
+                if (!_permutations.HasNext())
+                {
+                    // load more data (if available) to the matrix
+
+                    // don't really care, we just read it.
+                    if (_input != null)
+                        ReadColumn();
+
+                    // get rid of resources
+
+                    // delete the first column in the matrix
+                    var deletedColumn = Matrix.Columns[0];
+                    Matrix.Columns.RemoveAt(0);
+
+                    // remove all shingles seen that include any of the tokens from the deleted column.
+                    var deletedColumnTokens = deletedColumn.Rows.SelectMany(row => row.Tokens).ToList();
+                    
+                    // NOTE(review): the unit tests currently don't cover this scenario.
+                    // TODO: Write a unit test to cover this and make sure this is a good port! -thoward
+                    _shinglesSeen.RemoveWhere(shingle => shingle.Any(deletedColumnTokens.Contains));
+
+                    // exhausted
+                    if (Matrix.Columns.Count < MinimumShingleSize)
+                        return null;
+
+                    // create permutations of the matrix as it now looks
+                    _permutations = Matrix.PermutationIterator();
+                }
+
+                NextTokensPermutation();
+                return _requestNextToken;
+            }
+
+            if (_permutations == null)
+                _permutations = Matrix.PermutationIterator();
+
+            if (!_permutations.HasNext())
+                return null;
+
+            NextTokensPermutation();
+
+            return _requestNextToken;
+        }
+
+        /// <summary>
+        /// Get next permutation of row combinations,
+        /// creates a list of all tokens in the rows and
+        /// an index from each such token to what row it exists in.
+        /// Finally resets the current (next) shingle size and offset. 
+        /// </summary>
+        private void NextTokensPermutation()
+        {
+            var rowsPermutation = _permutations.Next();
+            var currentPermutationRows = new List<Row>();
+            var currentPermutationTokens = new List<Token>();
+
+            foreach (var row in rowsPermutation)
+            {
+                foreach (var token in row.Tokens)
+                {
+                    currentPermutationTokens.Add(token);
+                    currentPermutationRows.Add(row);
+                }
+            }
+            _currentPermutationTokens = currentPermutationTokens;
+            _currentPermutationRows = currentPermutationRows;
+
+            _currentPermutationTokensStartOffset = 0;
+            _currentShingleLength = MinimumShingleSize - 1;
+        }
+
+        /// <summary>
+        /// Final touch of a shingle token before it is passed on to the consumer from method {@link #next(org.apache.lucene.analysis.Token)}.
+        /// 
+        /// Calculates and sets type, flags, position increment, start/end offsets and weight.
+        /// </summary>
+        /// <param name="token">Shingle Token</param>
+        /// <param name="shingle">Tokens used to produce the shingle token.</param>
+        /// <param name="currentPermutationStartOffset">Start offset in parameter currentPermutationTokens</param>
+        /// <param name="currentPermutationRows">index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens</param>
+        /// <param name="currentPermuationTokens">tokens of the current permutation of rows in the matrix. </param>
+        public void UpdateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
+        {
+            token.SetType(typeof(ShingleMatrixFilter).Name);
+            token.SetFlags(0);
+            token.SetPositionIncrement(1);
+            // offsets span from the first to the last token of the shingle
+            token.SetStartOffset((shingle[0]).StartOffset());
+            token.SetEndOffset(shingle[shingle.Count - 1].EndOffset());
+
+            _settingsCodec.SetWeight(
+                token, 
+                CalculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)
+                );
+        }
+
+        /// <summary>
+        /// Evaluates the new shingle token weight.
+        /// 
+        /// for (shingle part token in shingle)
+        /// weight +=  shingle part token weight * (1 / sqrt(all shingle part token weights summed))
+        /// 
+        /// This algorithm gives a slightly greater score for longer shingles
+        /// and is rather penalising to great shingle token part weights.
+        /// </summary>
+        /// <param name="shingleToken">token returned to consumer</param>
+        /// <param name="shingle">tokens the tokens used to produce the shingle token.</param>
+        /// <param name="currentPermutationStartOffset">start offset in parameter currentPermutationRows and currentPermutationTokens.</param>
+        /// <param name="currentPermutationRows">an index to what matrix row a token in parameter currentPermutationTokens exist.</param>
+        /// <param name="currentPermuationTokens">all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle.</param>
+        /// <returns>weight to be set for parameter shingleToken </returns>
+        public float CalculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens)
+        {
+            var weights = new double[shingle.Count];
+
+            double total = 0d;
+
+            for (int i = 0; i < weights.Length; i++)
+            {
+                weights[i] = _settingsCodec.GetWeight(shingle[i]);
+                total += weights[i];
+            }
+
+            double factor = 1d/Math.Sqrt(total);
+
+            double weight = weights.Sum(partWeight => partWeight*factor);
+
+            return (float) weight;
+        }
+
+        /// <summary>
+        /// Loads one column from the token stream.
+        /// 
+        /// When the last token has been read from the token stream, the column is
+        /// flagged as the last one (Column.IsLast = true).
+        /// </summary>
+        /// <returns>true if it manage to read one more column from the input token stream</returns>
+        private bool ReadColumn()
+        {
+            Token token;
+
+            // consume the lookahead token buffered by the previous call, if any
+            if (_readColumnBuf != null)
+            {
+                token = _readColumnBuf;
+                _readColumnBuf = null;
+            }
+            else
+            {
+                token = GetNextInputToken(new Token());
+            }
+
+            if (token == null)
+                return false;
+
+            var currentReaderColumn = new Column(Matrix);
+            var currentReaderRow = new Row(currentReaderColumn);
+
+            currentReaderRow.Tokens.AddLast(token);
+
+            // keep reading until the codec says the next token starts a new column
+            // (that token stays buffered in _readColumnBuf for the next call)
+            TokenPositioner tokenPositioner;
+            while ((_readColumnBuf = GetNextInputToken(new Token())) != null &&
+                   (tokenPositioner = _settingsCodec.GetTokenPositioner(_readColumnBuf)) != TokenPositioner.NewColumn)
+            {
+                if (tokenPositioner == TokenPositioner.SameRow)
+                {
+                    currentReaderRow.Tokens.AddLast(_readColumnBuf);
+                }
+                else
+                {
+                    currentReaderRow = new Row(currentReaderColumn);
+                    currentReaderRow.Tokens.AddLast(_readColumnBuf);
+                }
+                _readColumnBuf = null;
+            }
+
+            if (_readColumnBuf == null)
+            {
+                _readColumnBuf = GetNextInputToken(new Token());
+
+                if (_readColumnBuf == null)
+                    currentReaderColumn.IsLast = true;
+            }
+
+            return true;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analyzers.Shingle
+{
+    /// <summary>
+    /// Type-safe enumeration describing where a token is to be placed in the
+    /// shingle matrix: at the start of a new column, at the start of a new row
+    /// within the current column, or appended to the current row.
+    /// </summary>
+    public class TokenPositioner
+    {
+        /// <summary>Token starts a new column.</summary>
+        public static readonly TokenPositioner NewColumn = new TokenPositioner(0);
+
+        /// <summary>Token starts a new row in the current column.</summary>
+        public static readonly TokenPositioner NewRow = new TokenPositioner(1);
+
+        /// <summary>Token is appended to the current row.</summary>
+        public static readonly TokenPositioner SameRow = new TokenPositioner(2);
+
+        /// <summary>Ordinal of this positioner.</summary>
+        public int Index { get; private set; }
+
+        // Private constructor: the three singletons above are the only
+        // instances, so reference comparison (==, !=) is safe for callers.
+        private TokenPositioner(int ordinal)
+        {
+            Index = ordinal;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/Util/FloatHelper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Util/FloatHelper.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Util/FloatHelper.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Util/FloatHelper.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analyzers.Util
+{
+    /// <summary>
+    /// Helper to support some static methods from the Java Float class.
+    /// </summary>
+    public static class FloatHelper
+    {
+        // Java's Float.floatToIntBits collapses every NaN input to this single
+        // canonical bit pattern; floatToRawIntBits does not. Lucene's Java code
+        // relies on the canonicalizing variant.
+        private const int CanonicalNanBits = 0x7FC00000;
+
+        /// <summary>
+        /// Returns the IEEE 754 single-precision bit layout of
+        /// <paramref name="value"/> as an int, mirroring Java's
+        /// <c>Float.floatToIntBits</c>: all NaN inputs map to the canonical
+        /// NaN pattern 0x7FC00000.
+        /// </summary>
+        public static int FloatToIntBits(float value)
+        {
+            // BitConverter alone behaves like floatToRawIntBits and would
+            // leak the platform-specific NaN payload, so canonicalize first.
+            if (float.IsNaN(value))
+                return CanonicalNanBits;
+
+            return BitConverter.ToInt32(BitConverter.GetBytes(value), 0);
+        }
+
+        /// <summary>
+        /// Returns the float whose IEEE 754 bit layout is
+        /// <paramref name="value"/>, mirroring Java's <c>Float.intBitsToFloat</c>.
+        /// </summary>
+        public static float IntBitsToFloat(int value)
+        {
+            return BitConverter.ToSingle(BitConverter.GetBytes(value), 0);
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/Util/ListComparer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Util/ListComparer.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Util/ListComparer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Util/ListComparer.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Lucene.Net.Analyzers.Util
+{
+    /// <summary>
+    /// Equality comparer that treats two lists as equal when they contain
+    /// pairwise-equal items in the same order.
+    /// </summary>
+    public class ListComparer<T>: IEqualityComparer<List<T>> where T : class 
+    {
+        /// <summary>
+        /// True when <paramref name="x"/> and <paramref name="y"/> hold
+        /// pairwise-equal items in the same order (or are the same reference).
+        /// </summary>
+        public bool Equals(List<T> x, List<T> y)
+        {
+            if (ReferenceEquals(x, y))
+                return true;
+
+            if (x == null || y == null)
+                return false;
+
+            // Compare the elements themselves rather than the two hash codes:
+            // equal hashes do not imply equal lists (collisions), and the
+            // IEqualityComparer contract requires Equals to be exact.
+            return x.Count == y.Count && x.SequenceEqual(y);
+        }
+
+        /// <summary>
+        /// Order-sensitive hash combining each item's hash code in the Java
+        /// style (31 * h + itemHash); null items contribute 0.
+        /// </summary>
+        public int GetHashCode(List<T> obj)
+        {
+            return 
+                obj.Aggregate(
+                    1, 
+                    (current, item) => 
+                    31 * current + (item == default(T) ? 0 : item.GetHashCode())
+                    );
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1147514&r1=1147513&r2=1147514&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun Jul 17 02:46:00 2011
@@ -60,11 +60,16 @@
     <Compile Include="AR\TestArabicAnalyzer.cs" />
     <Compile Include="AR\TestArabicNormalizationFilter.cs" />
     <Compile Include="AR\TestArabicStemFilter.cs" />
+    <Compile Include="Miscellaneous\TestPrefixAndSuffixAwareTokenFilter.cs" />
+    <Compile Include="Miscellaneous\TestPrefixAwareTokenFilter.cs" />
     <Compile Include="NGram\TestEdgeNGramTokenFilter.cs" />
     <Compile Include="NGram\TestEdgeNGramTokenizer.cs" />
     <Compile Include="NGram\TestNGramTokenFilter.cs" />
     <Compile Include="NGram\TestNGramTokenizer.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Shingle\ShingleAnalyzerWrapperTest.cs" />
+    <Compile Include="Shingle\ShingleFilterTest.cs" />
+    <Compile Include="Shingle\TestShingleMatrixFilter.cs" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\..\..\src\contrib\Analyzers\Contrib.Analyzers.csproj">

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analyzers.Miscellaneous
+{
+    public class TestPrefixAndSuffixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        // Builds a single token carrying the given term text and offsets.
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var t = new Token(start, offset);
+            t.SetTermBuffer(term);
+            return t;
+        }
+
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            // Wrap a whitespace-tokenized stream with a "^" prefix token and a
+            // "$" suffix token, then check terms and offsets come out in order.
+            var stream = new PrefixAndSuffixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                new WhitespaceTokenizer(new StringReader("hello world")),
+                new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            var expectedTerms = new[] {"^", "hello", "world", "$"};
+            var expectedStartOffsets = new[] {0, 0, 6, 11};
+            var expectedEndOffsets = new[] {0, 5, 11, 11};
+
+            AssertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets);
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analyzers.Miscellaneous
+{
+    public class TestPrefixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        // Builds a single token carrying the given term text and offsets.
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var t = new Token(start, offset);
+            t.SetTermBuffer(term);
+            return t;
+        }
+
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            // Simple case: prefix token "a" followed by main-stream token "b".
+            var simple = new PrefixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("a", 0, 1)),
+                new SingleTokenTokenStream(CreateToken("b", 0, 1)));
+
+            AssertTokenStreamContents(simple,
+                                      new[] {"a", "b"},
+                                      new[] {0, 1},
+                                      new[] {1, 2});
+
+            // Prefix and suffix simulated by nesting two prefix filters.
+            var prefixed = new PrefixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                new WhitespaceTokenizer(new StringReader("hello world")));
+            var suffixed = new PrefixAwareTokenFilter(
+                prefixed,
+                new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            AssertTokenStreamContents(suffixed,
+                                      new[] {"^", "hello", "world", "$"},
+                                      new[] {0, 0, 6, 11},
+                                      new[] {0, 5, 11, 11});
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs?rev=1147514&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs Sun Jul 17 02:46:00 2011
@@ -0,0 +1,293 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.QueryParsers;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using NUnit.Framework;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Analyzers.Shingle
+{
+    /// <summary>
+    /// A test class for ShingleAnalyzerWrapper as regards queries and scoring.
+    /// </summary>
+    public class ShingleAnalyzerWrapperTest : BaseTokenStreamTestCase
+    {
+        // Searcher over the three-document RAM index; assigned by SetUpSearcher
+        // (directly or via QueryParsingTest) before each query-based assertion.
+        public IndexSearcher Searcher;
+
+        /// <summary>
+        /// Set up a new index in RAM with three test phrases and the supplied Analyzer.
+        /// </summary>
+        /// <param name="analyzer">the analyzer to use</param>
+        /// <returns>an indexSearcher on the test index.</returns>
+        public IndexSearcher SetUpSearcher(Analyzer analyzer)
+        {
+            Directory dir = new RAMDirectory();
+            var writer = new IndexWriter(dir, analyzer, true);
+
+            // Doc 0
+            var doc = new Document();
+            doc.Add(new Field("content", "please divide this sentence into shingles",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            // Doc 1
+            doc = new Document();
+            doc.Add(new Field("content", "just another test sentence",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            // Doc 2
+            doc = new Document();
+            doc.Add(new Field("content", "a sentence which contains no test",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            writer.Close();
+
+            return new IndexSearcher(dir);
+        }
+
+        /// <summary>
+        /// Indexes the three test phrases with <paramref name="analyzer"/>, parses
+        /// <paramref name="qs"/> against the "content" field with the same analyzer,
+        /// and returns the search hits.
+        /// </summary>
+        protected Hits QueryParsingTest(Analyzer analyzer, String qs)
+        {
+            Searcher = SetUpSearcher(analyzer);
+
+            var qp = new QueryParser("content", analyzer);
+
+            var q = qp.Parse(qs);
+
+            return Searcher.Search(q);
+        }
+
+        /// <summary>
+        /// Asserts that <paramref name="hits"/> contains exactly the document ids
+        /// in <paramref name="ranks"/>, in that ranked order.
+        /// </summary>
+        protected void CompareRanks(Hits hits, int[] ranks)
+        {
+            Assert.AreEqual(ranks.Length, hits.Length());
+            for (int i = 0; i < ranks.Length; i++)
+            {
+                Assert.AreEqual(ranks[i], hits.Id(i));
+            }
+        }
+
+        /// <summary>
+        /// Will not work on an index without unigrams, since QueryParser automatically tokenizes on whitespace.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper (new WhitespaceAnalyzer(), 2), "test sentence");
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one fails with an exception.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsingFails()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper (new WhitespaceAnalyzer(), 2), "\"this sentence\"");
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one works, actually.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                             (new WhitespaceAnalyzer(), 2),
+                                         "\"test sentence\"");
+            var ranks = new[] {1};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// Same as above, is tokenized without using the analyzer.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperRequiredQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                             (new WhitespaceAnalyzer(), 2),
+                                         "+test +sentence");
+            var ranks = new[] {1, 2};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This shows how to construct a phrase query containing shingles.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new PhraseQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
+            // j accumulates the absolute position of each emitted term.
+            var j = -1;
+
+            var posIncrAtt = (PositionIncrementAttribute) ts.AddAttribute(typeof (PositionIncrementAttribute));
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                j += posIncrAtt.GetPositionIncrement();
+                var termText = termAtt.Term();
+                // Add each term (unigram or shingle) at its absolute position.
+                q.Add(new Term("content", termText), j);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// How to construct a boolean query with shingles. A query like this will
+        /// implicitly score those documents higher that contain the words in the query
+        /// in the right order and adjacent to each other. 
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperBooleanQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new BooleanQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("test sentence"));
+
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                var termText = termAtt.Term();
+                q.Add(new TermQuery(new Term("content", termText)),
+                      BooleanClause.Occur.SHOULD);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            // Two analyses in a row must both produce correct terms, offsets and
+            // position increments when the wrapper reuses its token stream.
+            AssertAnalyzesToReuse(a, "please divide into shingles",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "divide me up again",
+                                  new[] {"divide", "divide me", "me", "me up", "up", "up again", "again"},
+                                  new[] {0, 0, 7, 7, 10, 10, 13},
+                                  new[] {6, 9, 9, 12, 12, 18, 18},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        /// <summary>
+        /// LUCENE-1678 backwards compatibility: a ShingleAnalyzerWrapper subclass
+        /// that overrides TokenStream (see ShingleWrapperSubclassAnalyzer below)
+        /// acts just like a whitespace analyzer.
+        /// </summary>
+        [Test]
+        public void TestLucene1678BwComp()
+        {
+            Analyzer a = new ShingleWrapperSubclassAnalyzer();
+            AssertAnalyzesToReuse(a, "this is a test",
+                                  new[] {"this", "is", "a", "test"},
+                                  new[] {0, 5, 8, 10},
+                                  new[] {4, 7, 9, 14});
+        }
+
+        /// <summary>
+        /// Wraps an analyzer that does not support reuse (it is LetterTokenizer on
+        /// odd invocations, WhitespaceTokenizer on even) and verifies that each
+        /// reuse call reflects the tokenizer actually returned that time — note the
+        /// trailing '.' is kept only on the even (whitespace) invocation.
+        /// </summary>
+        [Test]
+        public void TestWrappedAnalyzerDoesNotReuse()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles.",
+                                          "shingles."
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 28, 28},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        #region Nested type: NonreusableAnalyzer
+
+        /// <summary>
+        /// Analyzer that does not support reuse: LetterTokenizer on odd
+        /// invocations, WhitespaceTokenizer on even.
+        /// </summary>
+        private class NonreusableAnalyzer : Analyzer
+        {
+            private int _invocationCount;
+
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                if (++_invocationCount%2 == 0)
+                    return new WhitespaceTokenizer(reader);
+
+                return new LetterTokenizer(reader);
+            }
+        }
+
+        #endregion
+
+        #region Nested type: ShingleWrapperSubclassAnalyzer
+
+        /// <summary>
+        /// Subclass that acts just like a whitespace analyzer, for testing
+        /// backwards compatibility (LUCENE-1678).
+        /// </summary>
+        private class ShingleWrapperSubclassAnalyzer : ShingleAnalyzerWrapper
+        {
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                return new WhitespaceTokenizer(reader);
+            }
+        } ;
+
+        #endregion
+    }
+}
\ No newline at end of file



Mime
View raw message