lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ccurr...@apache.org
Subject [Lucene.Net] svn commit: r1294875 [5/45] - in /incubator/lucene.net/trunk: ./ build/ build/vs2010/contrib/ build/vs2010/test/ doc/ src/ src/contrib/Analyzers/ src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/ src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cn/ s...
Date Tue, 28 Feb 2012 22:43:28 GMT
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -20,198 +20,269 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
 using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Support;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Nl
 {
-
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2001 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Analyzer for Dutch language. Supports an external list of stopwords (words that
-	/// will not be indexed at all), an external list of exclusions (word that will
-	/// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
-	/// the algorithm (dictionary stemming).
-	/// A default set of stopwords is used unless an alternative list is specified, the
-	/// exclusion list is empty by default. 
-	/// <version>$Id: DutchAnalyzer.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
-	/// </summary>
-	/// <author>Edwin de Jonge</author>
-	public class DutchAnalyzer : Analyzer
-	{
-		/// <summary>
-		/// List of typical german stopwords.
-		/// </summary>
-		public static string[] DUTCH_STOP_WORDS = 
-		{
-       "de","en","van","ik","te","dat","die","in","een",
-       "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
-       "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
-       "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
-       "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
-       "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
-       "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
-       "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
-       "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
-       "uw","iemand","geweest","andere"		
-		};
-		/// <summary>
-		/// Contains the stopwords used with the StopFilter. 
-		/// </summary>
-		private Hashtable stoptable = new Hashtable();
-
-		/// <summary>
-		/// Contains words that should be indexed but not stemmed. 
-		/// </summary>
-		private Hashtable excltable = new Hashtable();
-
-		private Hashtable _stemdict = new Hashtable();
-
-		/// <summary>
-		/// Builds an analyzer. 
-		/// </summary>
-		public DutchAnalyzer()
-		{
-			stoptable = StopFilter.MakeStopSet( DUTCH_STOP_WORDS );
-			_stemdict.Add("fiets","fiets"); //otherwise fiet
-			_stemdict.Add("bromfiets","bromfiets"); //otherwise bromfiet
-			_stemdict.Add("ei","eier"); 
-			_stemdict.Add("kind","kinder");
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public DutchAnalyzer( String[] stopwords )
-		{
-			stoptable = StopFilter.MakeStopSet( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public DutchAnalyzer( Hashtable stopwords )
-		{
-			stoptable = stopwords;
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words. 
-		/// </summary>
-		/// <param name="stopwords"></param>
-		public DutchAnalyzer( FileInfo stopwords )
-		{
-			stoptable = WordlistLoader.GetWordtable( stopwords );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from an array of Strings. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable( String[] exclusionlist )
-		{
-			excltable = StopFilter.MakeStopSet( exclusionlist );
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from a Hashtable. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable( Hashtable exclusionlist )
-		{
-			excltable = exclusionlist;
-		}
-
-		/// <summary>
-		/// Builds an exclusionlist from the words contained in the given file. 
-		/// </summary>
-		/// <param name="exclusionlist"></param>
-		public void SetStemExclusionTable(FileInfo exclusionlist)
-		{
-			excltable = WordlistLoader.GetWordtable(exclusionlist);
-		}
-
-		/// <summary>
-		/// Reads a stemdictionary file , that overrules the stemming algorithm
-		/// This is a textfile that contains per line
-		/// word\tstem
-		/// i.e: tabseperated
-		/// </summary>
-		/// <param name="stemdict"></param>
-		public void SetStemDictionary(FileInfo stemdict)
-		{
-			_stemdict = WordlistLoader.GetStemDict(stemdict);
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided TextReader. 
-		/// </summary>
-		/// <param name="fieldName"></param>
-		/// <param name="reader"></param>
-		/// <returns>A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter</returns>
-		public override TokenStream TokenStream(String fieldName, TextReader reader)
-		{
-			TokenStream result = new StandardTokenizer( reader );
-			result = new StandardFilter( result );
-			result = new StopFilter( result, stoptable );
-			result = new DutchStemFilter( result, excltable, _stemdict);
-			return result;
-		}
-	}
+    /**
+ * {@link Analyzer} for Dutch language. 
+ * <p>
+ * Supports an external list of stopwords (words that
+ * will not be indexed at all), an external list of exclusions (words that will
+ * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
+ * the algorithm (dictionary stemming).
+ * A default set of stopwords is used unless an alternative list is specified, but the
+ * exclusion list is empty by default.
+ * </p>
+ *
+ * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * dependent settings as {@link StandardAnalyzer}.</p>
+ */
+    public class DutchAnalyzer : Analyzer
+    {
+        /**
+         * List of typical Dutch stopwords.
+         * @deprecated use {@link #getDefaultStopSet()} instead
+         */
+        public static readonly String[] DUTCH_STOP_WORDS =
+      {
+        "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
+        "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
+        "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
+        "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
+        "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
+        "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
+        "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
+        "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
+        "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
+        "uw", "iemand", "geweest", "andere"
+      };
+        /**
+         * Returns an unmodifiable instance of the default stop-words set.
+         * @return an unmodifiable instance of the default stop-words set.
+         */
+        public static ISet<string> getDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet
+                .UnmodifiableSet(new CharArraySet(DUTCH_STOP_WORDS, false));
+        }
+
+
+        /**
+         * Contains the stopwords used with the StopFilter.
+         */
+        private readonly ISet<string> stoptable;
+
+        /**
+         * Contains words that should be indexed but not stemmed.
+         */
+        private ISet<string> excltable = new HashSet<string>();
+
+        private IDictionary<String, String> stemdict = new HashMap<String, String>();
+        private readonly Version matchVersion;
+
+        /**
+         * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}) 
+         * and a few default entries for the stem dictionary.
+         * 
+         */
+        public DutchAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+            stemdict.Add("fiets", "fiets"); //otherwise fiet
+            stemdict.Add("bromfiets", "bromfiets"); //otherwise bromfiet
+            stemdict.Add("ei", "eier");
+            stemdict.Add("kind", "kinder");
+        }
+
+        public DutchAnalyzer(Version matchVersion, ISet<string> stopwords)
+            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+        {
+
+        }
+
+        public DutchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionTable)
+        {
+            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionTable));
+            this.matchVersion = matchVersion;
+            SetOverridesTokenStreamMethod<DutchAnalyzer>();
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         *
+         * @param matchVersion
+         * @param stopwords
+         * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+         */
+        public DutchAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         *
+         * @param stopwords
+         * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+         */
+        public DutchAnalyzer(Version matchVersion, HashSet<string> stopwords)
+            : this(matchVersion, (ISet<string>)stopwords)
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         *
+         * @param stopwords
+         * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
+         */
+        public DutchAnalyzer(Version matchVersion, FileInfo stopwords)
+        {
+            // this is completely broken!
+            SetOverridesTokenStreamMethod<DutchAnalyzer>();
+            try
+            {
+                stoptable = WordlistLoader.GetWordSet(stopwords);
+            }
+            catch (IOException e)
+            {
+                // TODO: throw IOException
+                throw new Exception("", e);
+            }
+            this.matchVersion = matchVersion;
+        }
+
+        /**
+         * Builds an exclusionlist from an array of Strings.
+         *
+         * @param exclusionlist
+         * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(params string[] exclusionlist)
+        {
+            excltable = StopFilter.MakeStopSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
+        }
+
+        /**
+         * Builds an exclusionlist from a HashSet.
+         * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(HashSet<string> exclusionlist)
+        {
+            excltable = exclusionlist;
+            SetPreviousTokenStream(null); // force a new stemmer to be created
+        }
+
+        /**
+         * Builds an exclusionlist from the words contained in the given file.
+         * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
+         */
+        public void SetStemExclusionTable(FileInfo exclusionlist)
+        {
+            try
+            {
+                excltable = WordlistLoader.GetWordSet(exclusionlist);
+                SetPreviousTokenStream(null); // force a new stemmer to be created
+            }
+            catch (IOException e)
+            {
+                // TODO: throw IOException
+                throw new Exception("", e);
+            }
+        }
+
+        /**
+         * Reads a stem dictionary file that overrules the stemming algorithm.
+         * This is a text file that contains, per line,
+         * <tt>word<b>\t</b>stem</tt>, i.e. two tab-separated words.
+         */
+        public void SetStemDictionary(FileInfo stemdictFile)
+        {
+            try
+            {
+                stemdict = WordlistLoader.GetStemDict(stemdictFile);
+                SetPreviousTokenStream(null); // force a new stemmer to be created
+            }
+            catch (IOException e)
+            {
+                // TODO: throw IOException
+                throw new Exception(string.Empty, e);
+            }
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the 
+         * provided {@link Reader}.
+         *
+         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+         *   filtered with {@link StandardFilter}, {@link StopFilter}, 
+         *   and {@link DutchStemFilter}
+         */
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
+            result = new StandardFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stoptable);
+            result = new DutchStemFilter(result, excltable, stemdict);
+            return result;
+        }
+
+        class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the 
+         * text in the provided {@link Reader}.
+         *
+         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+         *   filtered with {@link StandardFilter}, {@link StopFilter}, 
+         *   and {@link DutchStemFilter}
+         */
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            if (overridesTokenStreamMethod)
+            {
+                // LUCENE-1678: force fallback to tokenStream() if we
+                // have been subclassed and that subclass overrides
+                // tokenStream but not reusableTokenStream
+                return TokenStream(fieldName, reader);
+            }
+
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new StandardTokenizer(matchVersion, reader);
+                streams.result = new StandardFilter(streams.source);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stoptable);
+                streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -20,167 +20,113 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
 
 namespace Lucene.Net.Analysis.Nl
 {
-
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2001 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// A filter that stems Dutch words. It supports a table of words that should
-	/// not be stemmed at all. The stemmer used can be changed at runtime after the
-	/// filter object is created (as long as it is a DutchStemmer).
-	/// 
-	/// <version>$Id: DutchStemFilter.java,v 1.1 2004/03/09 14:55:08 otis Exp $</version>
-	/// </summary>
-	/// <author>Edwin de Jonge</author>
-	public sealed class DutchStemFilter : TokenFilter
-	{
-		/// <summary>
-		/// The actual token in the input stream.
-		/// </summary>
-		private Token token = null;
-		private DutchStemmer stemmer = null;
-		private Hashtable exclusions = null;
-    
-		public DutchStemFilter( TokenStream _in ) : base(_in)
-		{
-			stemmer = new DutchStemmer();
-		}
-    
-		/// <summary>
-		/// Builds a DutchStemFilter that uses an exclusiontable. 
-		/// </summary>
-		/// <param name="_in"></param>
-		/// <param name="exclusiontable"></param>
-		public DutchStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
-		{
-			exclusions = exclusiontable;
-		}
-
-		/// <summary>
-		/// 
-		/// </summary>
-		/// <param name="_in"></param>
-		/// <param name="exclusiontable"></param>
-		/// <param name="stemdictionary">Dictionary of word stem pairs, that overrule the algorithm</param>
-		public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary): this(_in, exclusiontable)
-		{
-			stemmer.SetStemDictionary(stemdictionary);
-		}
-
-		/// <summary>
-		/// </summary>
-		/// <returns>Returns the next token in the stream, or null at EOS</returns>
-		public override Token Next()
-	
-		{
-			if ( ( token = input.Next() ) == null ) 
-			{
-				return null;
-			}
-				// Check the exclusiontable
-			else if ( exclusions != null && exclusions.Contains( token.TermText() ) ) 
-			{
-				return token;
-			}
-			else 
-			{
-				String s = stemmer.Stem( token.TermText() );
-				// If not stemmed, dont waste the time creating a new token
-				if ( !s.Equals( token.TermText() ) ) 
-				{
-					return new Token( s, token.StartOffset(),
-						token.EndOffset(), token.Type() );
-				}
-				return token;
-			}
-		}
-
-		/// <summary>
-		/// Set a alternative/custom DutchStemmer for this filter. 
-		/// </summary>
-		/// <param name="stemmer"></param>
-		public void SetStemmer( DutchStemmer stemmer )
-		{
-			if ( stemmer != null ) 
-			{
-				this.stemmer = stemmer;
-			}
-		}
-
-		/// <summary>
-		/// Set an alternative exclusion list for this filter. 
-		/// </summary>
-		/// <param name="exclusiontable"></param>
-		public void SetExclusionTable( Hashtable exclusiontable )
-		{
-			exclusions = exclusiontable;
-		}
-
-		/// <summary>
-		/// Set dictionary for stemming, this dictionary overrules the algorithm,
-		/// so you can correct for a particular unwanted word-stem pair.
-		/// </summary>
-		/// <param name="dict"></param>
-		public void SetStemDictionary(Hashtable dict)
-		{
-			if (stemmer != null)
-				stemmer.SetStemDictionary(dict);
-		}
-	}
+    /**
+ * A {@link TokenFilter} that stems Dutch words. 
+ * <p>
+ * It supports a table of words that should
+ * not be stemmed at all. The stemmer used can be changed at runtime after the
+ * filter object is created (as long as it is a {@link DutchStemmer}).
+ * </p>
+ * NOTE: This stemmer does not implement the Snowball algorithm correctly,
+ * specifically doubled consonants. It is recommended that you consider using
+ * the "Dutch" stemmer in the snowball package instead. This stemmer will likely
+ * be deprecated in a future release.
+ */
+    public sealed class DutchStemFilter : TokenFilter
+    {
+        /**
+         * The stemmer applied to each token's term text.
+         */
+        private DutchStemmer stemmer = null;
+        private ISet<string> exclusions = null;
+
+        private TermAttribute termAtt;
+
+        public DutchStemFilter(TokenStream _in)
+            : base(_in)
+        {
+            stemmer = new DutchStemmer();
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /**
+         * Builds a DutchStemFilter that uses an exclusion table.
+         */
+        public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable)
+            : this(_in)
+        {
+            exclusions = exclusiontable;
+        }
+
+        /**
+         * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+         */
+        public DutchStemFilter(TokenStream _in, ISet<string> exclusiontable, IDictionary<string, string> stemdictionary)
+            : this(_in, exclusiontable)
+        {
+            stemmer.SetStemDictionary(stemdictionary);
+        }
+
+        /**
+         * Advances the input stream and stems the term unless it is in the exclusion table; returns false at EOS.
+         */
+        public override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                String term = termAtt.Term();
+
+                // Check the exclusion table.
+                if (exclusions == null || !exclusions.Contains(term))
+                {
+                    String s = stemmer.Stem(term);
+                    // If not stemmed, don't waste the time adjusting the token.
+                    if ((s != null) && !s.Equals(term))
+                        termAtt.SetTermBuffer(s);
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        /**
+         * Set an alternative/custom {@link DutchStemmer} for this filter.
+         */
+        public void SetStemmer(DutchStemmer stemmer)
+        {
+            if (stemmer != null)
+            {
+                this.stemmer = stemmer;
+            }
+        }
+
+        /**
+         * Set an alternative exclusion list for this filter.
+         */
+        public void SetExclusionTable(HashSet<string> exclusiontable)
+        {
+            exclusions = exclusiontable;
+        }
+
+        /**
+         * Set dictionary for stemming, this dictionary overrules the algorithm,
+         * so you can correct for a particular unwanted word-stem pair.
+         */
+        public void SetStemDictionary(IDictionary<string, string> dict)
+        {
+            if (stemmer != null)
+                stemmer.SetStemDictionary(dict);
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs Tue Feb 28 22:43:08 2012
@@ -23,484 +23,440 @@ using System;
 using System.IO;
 using System.Text;
 using System.Collections;
+using System.Collections.Generic;
 
 namespace Lucene.Net.Analysis.Nl
 {
-
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2001 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// A stemmer for Dutch words. The algorithm is an implementation of
-	/// the <see c="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
-	/// algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?): 
-	/// 
-	/// @version   $Id: DutchStemmer.java,v 1.1 2004/03/09 14:55:08 otis Exp $
-	/// </summary>
-	/// <author>Edwin de Jonge (ejne@cbs.nl)</author>
-	public class DutchStemmer
-	{
-		/// <summary>
-		/// Buffer for the terms while stemming them. 
-		/// </summary>
-		private StringBuilder sb = new StringBuilder();
-		private bool _removedE;
-		private Hashtable _stemDict;
-
-
-		private int _R1;
-		private int _R2;
-
-		/// <summary>
-		/// Stemms the given term to an unique <tt>discriminator</tt>.
-		/// </summary>
-		/// <param name="term">The term that should be stemmed.</param>
-		/// <returns>Discriminator for <tt>term</tt></returns>
-		//TODO convert to internal
-		public string Stem( String term )
-		{
-			term = term.ToLower();
-			if ( !IsStemmable( term ) )
-				return term;
-			if (_stemDict != null && _stemDict.Contains(term))
-				return _stemDict[term] as string;
-			// Reset the StringBuilder.
-			sb.Remove(0, sb.Length);
-			sb.Insert(0, term);
-			// Stemming starts here...
-			Substitute(sb);
-			StoreYandI(sb);
-			_R1 = GetRIndex(sb, 0);
-			_R1 = Math.Max(3,_R1);
-			Step1(sb);
-			Step2(sb);
-			_R2 = GetRIndex(sb, _R1);
-			Step3a(sb);
-			Step3b(sb);
-			Step4(sb);
-			ReStoreYandI(sb);
-			return sb.ToString();
-		}
-
-		private bool enEnding(StringBuilder sb)
-		{
-			string[] enend = new string[]{"ene","en"};
-			foreach(string end in enend)
-			{
-				string s = sb.ToString();
-				int index = s.Length - end.Length;
-				if ( s.EndsWith(end) &&
-					  index >= _R1 && 
-					  IsValidEnEnding(sb,index-1) 
-					)
-				{
-					sb.Remove(index, end.Length);
-					UnDouble(sb,index);
-					return true;
-				}
-			}
-			return false;
-		}
-
-
-		private void Step1(StringBuilder sb)
-		{
-			if (_R1 >= sb.Length)
-				return;
-
-			string s = sb.ToString();
-			int lengthR1 = sb.Length - _R1;
-			int index;
-
-			if (s.EndsWith("heden"))
-			{
-				sb.Replace("heden","heid", _R1, lengthR1);
-				return;
-			}
-
-			if (enEnding(sb))
-				return;
-			
-			if (s.EndsWith("se")              && 
-				 (index = s.Length - 2) >= _R1  &&
-				 IsValidSEnding(sb, index -1)
-				)
-			{
-				sb.Remove(index, 2);
-				return;
-			} 
-			if (s.EndsWith("s") && 
-				(index = s.Length - 1) >= _R1  &&
-				IsValidSEnding(sb, index - 1))
-			{
-				sb.Remove(index, 1);
-			}
-		}
-
-		/// <summary>
-		/// Delete suffix e if in R1 and 
-		/// preceded by a non-vowel, and then undouble the ending
-		/// </summary>
-		/// <param name="sb">string being stemmed</param>
-		private void Step2(StringBuilder sb)
-		{
-			_removedE = false;
-			if (_R1 >= sb.Length)
-				return;
-			string s = sb.ToString();
-			int index = s.Length - 1;
-			if ( index >= _R1   && 
-				 s.EndsWith("e") &&
-				 !IsVowel(sb[index-1]))
-			{
-				sb.Remove(index,1);
-				UnDouble(sb);
-				_removedE = true;
-			}
-		}
-
-		/// <summary>
-		/// Delete "heid"
-		/// </summary>
-		/// <param name="sb">string being stemmed</param>
-		private void Step3a(StringBuilder sb)
-		{
-			if (_R2 >= sb.Length)
-				return;
-			string s = sb.ToString();
-			int index = s.Length - 4;
-			if (s.EndsWith("heid")&& index >= _R2 && sb[index - 1] != 'c')
-			{
-				sb.Remove(index,4); //remove heid
-				enEnding(sb);
-			}
-		}
-
-		/// <summary>
-		/// <p>A d-suffix, or derivational suffix, enables a new word, 
-		/// often with a different grammatical category, or with a different 
-		/// sense, to be built from another word. Whether a d-suffix can be 
-		/// attached is discovered not from the rules of grammar, but by 
-		/// referring to a dictionary. So in English, ness can be added to 
-		/// certain adjectives to form corresponding nouns (littleness, 
-		/// kindness, foolishness ...) but not to all adjectives 
-		/// (not for example, to big, cruel, wise ...) d-suffixes can be 
-		/// used to change meaning, often in rather exotic ways.</p>
-		/// Remove "ing", "end", "ig", "lijk", "baar" and "bar"
-		/// </summary>
-		/// <param name="sb">string being stemmed</param>
-		private void Step3b(StringBuilder sb)
-		{
-			if (_R2 >= sb.Length)
-				return;
-			string s = sb.ToString();
-			int index;
-
-			if ((s.EndsWith("end") || s.EndsWith("ing")) &&
-      		 (index = s.Length - 3) >= _R2
-				)
-			{
-				sb.Remove(index,3);
-				if (sb[index - 2] == 'i' && 
-					 sb[index - 1] == 'g')
-				{
-					if (sb[index - 3] != 'e' & index-2 >= _R2)
-					{
-						index -= 2;
-						sb.Remove(index,2);
-					}
-				}
-				else
-				{
-					UnDouble(sb,index);
-				}
-				return;
-			}
-			if ( s.EndsWith("ig")    &&
-				  (index = s.Length - 2) >= _R2
-				)
-			{
-				if (sb[index - 1] != 'e')
-					sb.Remove(index, 2);
-				return;
-			}
-			if (s.EndsWith("lijk") &&
-				 (index = s.Length - 4) >= _R2
-				)
-			{
-				sb.Remove(index, 4);
-				Step2(sb);
-				return;
-			}
-			if (s.EndsWith("baar") &&
-				(index = s.Length - 4) >= _R2
-				)
-			{
-				sb.Remove(index, 4);
-				return;
-			}
-			if (s.EndsWith("bar")  &&
-				 (index = s.Length - 3) >= _R2
-				)
-			{
-				if (_removedE)
-					sb.Remove(index, 3);
-				return;
-			}
-		}
-
-		/// <summary>
-		/// undouble vowel 
-		/// If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod). 
-		/// </summary>
-		/// <param name="sb">string being stemmed</param>
-		private void Step4(StringBuilder sb)
-		{
-			if (sb.Length < 4)
-				return;
-			string end = sb.ToString(sb.Length - 4,4);
-			char c = end[0];
-			char v1 = end[1];
-			char v2 = end[2];
-			char d = end[3];
-			if (v1 == v2    &&
-				 d != 'I'    &&
-				 v1 != 'i'    &&
-				 IsVowel(v1) &&
-				!IsVowel(d)  &&
-				!IsVowel(c))
-			{
-				sb.Remove(sb.Length - 2, 1);
-			}
-		}
-
-		/// <summary>
-		/// Checks if a term could be stemmed.
-		/// </summary>
-		/// <param name="term"></param>
-		/// <returns>true if, and only if, the given term consists in letters.</returns>
-		private bool IsStemmable( String term )
-		{
-			for ( int c = 0; c < term.Length; c++ ) 
-			{
-				if ( !Char.IsLetter(term[c])) return false;
-			}
-			return true;
-		}
-
-		/// <summary>
-		/// Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
-		/// </summary>
-		/// <param name="buffer"></param>
-		private void Substitute( StringBuilder buffer )
-		{
-			for ( int i = 0; i < buffer.Length; i++ ) 
-			{
-				switch (buffer[i])
-				{
-					case 'ä':
-					case 'á':
-					{
-						buffer[i] = 'a';
-						break;
-					}
-					case 'ë':
-					case 'é':
-					{
-						buffer[i] = 'e';
-						break;
-					}
-					case 'ü':
-					case 'ú':
-					{
-						buffer[i] = 'u';
-						break;
-					}
-					case 'ï':
-					case 'i':
-					{
-						buffer[i] = 'i';
-						break;
-					}
-					case 'ö':
-					case 'ó':
-					{
-						buffer[i] = 'o';
-						break;
-					}
-				}
-			}
-		}
-
-//		private bool IsValidSEnding(StringBuilder sb)
-//		{
-//			return  IsValidSEnding(sb,sb.Length - 1);
-//		}
-
-		private bool IsValidSEnding(StringBuilder sb, int index)
-		{
-			char c = sb[index];
-			if (IsVowel(c) || c == 'j')
-				return false;
-			return true;
-		}
-
-//		private bool IsValidEnEnding(StringBuilder sb)
-//		{
-//			return IsValidEnEnding(sb,sb.Length - 1);
-//		}
-
-		private bool IsValidEnEnding(StringBuilder sb, int index)
-		{
-			char c = sb[index];
-			if (IsVowel(c))
-				return false;
-			if (c < 3)
-				return false;
-			// ends with "gem"?
-			if (c == 'm' && sb[index - 2] == 'g' && sb[index-1] == 'e')
-				return false;
-			return true;
-		}
-
-		private void UnDouble(StringBuilder sb)
-		{
-			UnDouble(sb, sb.Length);
-		}
-
-		private void UnDouble(StringBuilder sb, int endIndex)
-		{
-			string s = sb.ToString(0, endIndex);
-			if (s.EndsWith("kk") || s.EndsWith("tt") || s.EndsWith("dd") || s.EndsWith("nn") || s.EndsWith("mm") || s.EndsWith("ff"))
-			{
-				sb.Remove(endIndex-1,1);
-			}
-		}
-
-		private int GetRIndex(StringBuilder sb, int start)
-		{
-			if (start == 0) 
-				start = 1;
-			int i = start;
-			for (; i < sb.Length; i++)
-			{
-				//first non-vowel preceded by a vowel
-				if (!IsVowel(sb[i]) && IsVowel(sb[i-1]))
-				{
-					return i + 1;
-				}
-			}
-			return i + 1;
-		}
-
-		private void StoreYandI(StringBuilder sb)
-		{
-			if (sb[0] == 'y')
-				sb[0] = 'Y';
-			//char c;
-			int last = sb.Length - 1;
-			for (int i = 1; i < last; i++)
-			{
-				switch (sb[i])
-				{
-					case 'i':
-					{
-						if (IsVowel(sb[i-1]) && 
-							IsVowel(sb[i+1])
-							)
-							sb[i] = 'I';
-						break;
-					}
-					case 'y':
-					{
-						if (IsVowel(sb[i-1]))
-							sb[i] = 'Y';
-						break;
-					}
-				}
-			}
-			if (last > 0 && sb[last]=='y' && IsVowel(sb[last-1]))
-				sb[last]='Y';
-		}
-
-		private void ReStoreYandI(StringBuilder sb)
-		{
-			sb.Replace("I","i");
-			sb.Replace("Y","y");
-		}
-
-		private bool IsVowel(char c)
-		{
-			switch (c)
-			{
-				case 'e':
-				case 'a':
-				case 'o':
-				case 'i':
-				case 'u':
-				case 'y':
-				case 'è':
-				{
-					return true;
-				}
-			}
-			return false;
-		}
-
-		internal void SetStemDictionary(Hashtable dict)
-		{
-			_stemDict = dict;
-		}
-	}
+    /**
+     * A stemmer for Dutch words. 
+     * <p>
+     * The algorithm is an implementation of
+     * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
+     * algorithm in Martin Porter's snowball project.
+     * </p>
+     */
+
+    public class DutchStemmer
+    {
+        /**
+         * Buffer for the terms while stemming them.
+         */
+        private StringBuilder sb = new StringBuilder();
+        private bool _removedE;
+        private IDictionary<string, string> _stemDict;
+
+        private int _R1;
+        private int _R2;
+
+        //TODO convert to internal
+        /*
+         * Stems the given term to a unique <tt>discriminator</tt>.
+         *
+         * @param term The term that should be stemmed.
+         * @return Discriminator for <tt>term</tt>
+         */
+        public String Stem(String term)
+        {
+            term = term.ToLower();
+            if (!isStemmable(term))
+                return term;
+            if (_stemDict != null && _stemDict.ContainsKey(term))
+                if (_stemDict[term] is String)
+                    return (String)_stemDict[term];
+                else
+                    return null;
+
+            // Reset the StringBuilder.
+            sb.Clear();
+            sb.Insert(0, term);
+            // Stemming starts here...
+            substitute(sb);
+            storeYandI(sb);
+            _R1 = getRIndex(sb, 0);
+            _R1 = Math.Max(3, _R1);
+            step1(sb);
+            step2(sb);
+            _R2 = getRIndex(sb, _R1);
+            step3a(sb);
+            step3b(sb);
+            step4(sb);
+            reStoreYandI(sb);
+            return sb.ToString();
+        }
+
+        private bool enEnding(StringBuilder sb)
+        {
+            String[] enend = new String[] { "ene", "en" };
+            for (int i = 0; i < enend.Length; i++)
+            {
+                String end = enend[i];
+                String s = sb.ToString();
+                int index = s.Length - end.Length;
+                if (s.EndsWith(end) &&
+                    index >= _R1 &&
+                    isValidEnEnding(sb, index - 1)
+                )
+                {
+                    sb.Remove(index, end.Length);
+                    unDouble(sb, index);
+                    return true;
+                }
+            }
+            return false;
+        }
+
+
+        private void step1(StringBuilder sb)
+        {
+            if (_R1 >= sb.Length)
+                return;
+
+            String s = sb.ToString();
+            int LengthR1 = sb.Length - _R1;
+            int index;
+
+            if (s.EndsWith("heden"))
+            {
+                var toReplace = sb.ToString(_R1, LengthR1).Replace("heden", "heid");
+                sb.Remove(_R1, LengthR1);
+                sb.Insert(_R1, toReplace);
+                return;
+            }
+
+            if (enEnding(sb))
+                return;
+
+            if (s.EndsWith("se") &&
+                (index = s.Length - 2) >= _R1 &&
+                isValidSEnding(sb, index - 1)
+            )
+            {
+                sb.Remove(index, 2);
+                return;
+            }
+            if (s.EndsWith("s") &&
+                (index = s.Length - 1) >= _R1 &&
+                isValidSEnding(sb, index - 1))
+            {
+                sb.Remove(index, 1);
+            }
+        }
+
+        /**
+         * Remove suffix e if in R1 and
+         * preceded by a non-vowel, and then undouble the ending
+         *
+         * @param sb String being stemmed
+         */
+        private void step2(StringBuilder sb)
+        {
+            _removedE = false;
+            if (_R1 >= sb.Length)
+                return;
+            String s = sb.ToString();
+            int index = s.Length - 1;
+            if (index >= _R1 &&
+                s.EndsWith("e") &&
+                !isVowel(sb[index - 1]))
+            {
+                sb.Remove(index, 1);
+                unDouble(sb);
+                _removedE = true;
+            }
+        }
+
+        /**
+         * Remove "heid"
+         *
+         * @param sb String being stemmed
+         */
+        private void step3a(StringBuilder sb)
+        {
+            if (_R2 >= sb.Length)
+                return;
+            String s = sb.ToString();
+            int index = s.Length - 4;
+            if (s.EndsWith("heid") && index >= _R2 && sb[index - 1] != 'c')
+            {
+                sb.Remove(index, 4); //remove heid
+                enEnding(sb);
+            }
+        }
+
+        /**
+         * <p>A d-suffix, or derivational suffix, enables a new word,
+         * often with a different grammatical category, or with a different
+         * sense, to be built from another word. Whether a d-suffix can be
+         * attached is discovered not from the rules of grammar, but by
+         * referring to a dictionary. So in English, ness can be added to
+         * certain adjectives to form corresponding nouns (littleness,
+         * kindness, foolishness ...) but not to all adjectives
+         * (not for example, to big, cruel, wise ...) d-suffixes can be
+         * used to change meaning, often in rather exotic ways.</p>
+         * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
+         *
+         * @param sb String being stemmed
+         */
+        private void step3b(StringBuilder sb)
+        {
+            if (_R2 >= sb.Length)
+                return;
+            String s = sb.ToString();
+            int index = 0;
+
+            if ((s.EndsWith("end") || s.EndsWith("ing")) &&
+                (index = s.Length - 3) >= _R2)
+            {
+                sb.Remove(index, 3);
+                if (sb[index - 2] == 'i' &&
+                    sb[index - 1] == 'g')
+                {
+                    if (sb[index - 3] != 'e' & index - 2 >= _R2)
+                    {
+                        index -= 2;
+                        sb.Remove(index, 2);
+                    }
+                }
+                else
+                {
+                    unDouble(sb, index);
+                }
+                return;
+            }
+            if (s.EndsWith("ig") &&
+                (index = s.Length - 2) >= _R2
+            )
+            {
+                if (sb[index - 1] != 'e')
+                    sb.Remove(index, 2);
+                return;
+            }
+            if (s.EndsWith("lijk") &&
+                (index = s.Length - 4) >= _R2
+            )
+            {
+                sb.Remove(index, 4);
+                step2(sb);
+                return;
+            }
+            if (s.EndsWith("baar") &&
+                (index = s.Length - 4) >= _R2
+            )
+            {
+                sb.Remove(index, 4);
+                return;
+            }
+            if (s.EndsWith("bar") &&
+                (index = s.Length - 3) >= _R2
+            )
+            {
+                if (_removedE)
+                    sb.Remove(index, 3);
+                return;
+            }
+        }
+
+        /**
+         * undouble vowel
+         * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
+         *
+         * @param sb String being stemmed
+         */
+        private void step4(StringBuilder sb)
+        {
+            if (sb.Length < 4)
+                return;
+            String end = sb.ToString(sb.Length - 4, 4);
+            char c = end[0];
+            char v1 = end[1];
+            char v2 = end[2];
+            char d = end[3];
+            if (v1 == v2 &&
+                d != 'I' &&
+                v1 != 'i' &&
+                isVowel(v1) &&
+                !isVowel(d) &&
+                !isVowel(c))
+            {
+                sb.Remove(sb.Length - 2, 1);
+            }
+        }
+
+        /**
+         * Checks if a term could be stemmed.
+         *
+         * @return true if, and only if, the given term consists only of letters.
+         */
+        private bool isStemmable(String term)
+        {
+            for (int c = 0; c < term.Length; c++)
+            {
+                if (!char.IsLetter(term[c])) return false;
+            }
+            return true;
+        }
+
+        /**
+         * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
+         */
+        private void substitute(StringBuilder buffer)
+        {
+            for (int i = 0; i < buffer.Length; i++)
+            {
+                switch (buffer[i])
+                {
+                    case 'ä':
+                    case 'á':
+                        {
+                            buffer[i] = 'a';
+                            break;
+                        }
+                    case 'ë':
+                    case 'é':
+                        {
+                            buffer[i] = 'e';
+                            break;
+                        }
+                    case 'ü':
+                    case 'ú':
+                        {
+                            buffer[i] = 'u';
+                            break;
+                        }
+                    case 'ï':
+                    case 'i':
+                        {
+                            buffer[i] = 'i';
+                            break;
+                        }
+                    case 'ö':
+                    case 'ó':
+                        {
+                            buffer[i] = 'o';
+                            break;
+                        }
+                }
+            }
+        }
+
+        /*private bool isValidSEnding(StringBuilder sb) {
+          return isValidSEnding(sb, sb.Length - 1);
+        }*/
+
+        private bool isValidSEnding(StringBuilder sb, int index)
+        {
+            char c = sb[index];
+            if (isVowel(c) || c == 'j')
+                return false;
+            return true;
+        }
+
+        /*private bool isValidEnEnding(StringBuilder sb) {
+          return isValidEnEnding(sb, sb.Length - 1);
+        }*/
+
+        private bool isValidEnEnding(StringBuilder sb, int index)
+        {
+            char c = sb[index];
+            if (isVowel(c))
+                return false;
+            if (c < 3)
+                return false;
+            // ends with "gem"?
+            if (c == 'm' && sb[index - 2] == 'g' && sb[index - 1] == 'e')
+                return false;
+            return true;
+        }
+
+        private void unDouble(StringBuilder sb)
+        {
+            unDouble(sb, sb.Length);
+        }
+
+        private void unDouble(StringBuilder sb, int endIndex)
+        {
+            String s = sb.ToString(0, endIndex);
+            if (s.EndsWith("kk") || s.EndsWith("tt") || s.EndsWith("dd") || s.EndsWith("nn") || s.EndsWith("mm") || s.EndsWith("ff"))
+            {
+                sb.Remove(endIndex - 1, 1);
+            }
+        }
+
+        private int getRIndex(StringBuilder sb, int start)
+        {
+            if (start == 0)
+                start = 1;
+            int i = start;
+            for (; i < sb.Length; i++)
+            {
+                //first non-vowel preceded by a vowel
+                if (!isVowel(sb[i]) && isVowel(sb[i - 1]))
+                {
+                    return i + 1;
+                }
+            }
+            return i + 1;
+        }
+
+        private void storeYandI(StringBuilder sb)
+        {
+            if (sb[0] == 'y')
+                sb[0] = 'Y';
+
+            int last = sb.Length - 1;
+
+            for (int i = 1; i < last; i++)
+            {
+                switch (sb[i])
+                {
+                    case 'i':
+                        {
+                            if (isVowel(sb[i - 1]) &&
+                                isVowel(sb[i + 1])
+                            )
+                                sb[i] = 'I';
+                            break;
+                        }
+                    case 'y':
+                        {
+                            if (isVowel(sb[i - 1]))
+                                sb[i] = 'Y';
+                            break;
+                        }
+                }
+            }
+            if (last > 0 && sb[last] == 'y' && isVowel(sb[last - 1]))
+                sb[last] = 'Y';
+        }
+
+        private void reStoreYandI(StringBuilder sb)
+        {
+            String tmp = sb.ToString();
+            sb.Clear();
+            sb.Insert(0, tmp.Replace("I", "i").Replace("Y", "y"));
+        }
+
+        private bool isVowel(char c)
+        {
+            switch (c)
+            {
+                case 'e':
+                case 'a':
+                case 'o':
+                case 'i':
+                case 'u':
+                case 'y':
+                case 'è':
+                    {
+                        return true;
+                    }
+            }
+            return false;
+        }
+
+        protected internal void SetStemDictionary(IDictionary<string, string> dict)
+        {
+            _stemDict = dict;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs Tue Feb 28 22:43:08 2012
@@ -15,6 +15,8 @@
  * limitations under the License.
  */
 
+using Lucene.Net.Support;
+
 namespace Lucene.Net.Analyzers.Payloads
 {
     /// <summary>
@@ -29,7 +31,7 @@ namespace Lucene.Net.Analyzers.Payloads
 
         public static byte[] EncodeFloat(float payload, byte[] data, int offset)
         {
-            return EncodeInt(SupportClass.Single.FloatToIntBits(payload), data, offset);
+            return EncodeInt(Single.FloatToIntBits(payload), data, offset);
         }
 
         public static byte[] EncodeInt(int payload)
@@ -66,7 +68,7 @@ namespace Lucene.Net.Analyzers.Payloads
         /// <returns>The float that was encoded</returns>
         public static float DecodeFloat(byte[] bytes, int offset)
         {
-            return SupportClass.Single.IntBitsToFloat(DecodeInt(bytes, offset));
+            return Single.IntBitsToFloat(DecodeInt(bytes, offset));
         }
 
         public static int DecodeInt(byte[] bytes, int offset)

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Properties/AssemblyInfo.cs Tue Feb 28 22:43:08 2012
@@ -53,5 +53,12 @@ using System.Runtime.InteropServices;
 // You can specify all the values or you can default the Build and Revision Numbers 
 // by using the '*' as shown below:
 // [assembly: AssemblyVersion("1.0.*")]
-[assembly: AssemblyVersion("2.9.2.1")]
-[assembly: AssemblyFileVersion("2.9.2.1")]
+[assembly: AssemblyVersion("3.0.3")]
+[assembly: AssemblyFileVersion("3.0.3")]
+
+// for testing
+[assembly: InternalsVisibleTo("Lucene.Net.Contrib.Analyzers.Test, PublicKey=002400000480000094000000060200000024000052534131000400000100010075a07ce602f88e" +
+                                                         "f263c7db8cb342c58ebd49ecdcc210fac874260b0213fb929ac3dcaf4f5b39744b800f99073eca" +
+                                                         "72aebfac5f7284e1d5f2c82012a804a140f06d7d043d83e830cdb606a04da2ad5374cc92c0a495" +
+                                                         "08437802fb4f8fb80a05e59f80afb99f4ccd0dfe44065743543c4b053b669509d29d332cd32a0c" +
+                                                         "b1e97e84")]
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -20,253 +20,153 @@
 */
 
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using System.Text;
 using System.IO;
 using System.Collections;
 using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// Analyzer for Russian language. Supports an external list of stopwords (words that
-	/// will not be indexed at all).
-	/// A default set of stopwords is used unless an alternative list is specified.
-	/// </summary>
-	public sealed class RussianAnalyzer : Analyzer
-	{
-		// letters
-		private static char A = (char)0;
-		private static char B = (char)1;
-		private static char V = (char)2;
-		private static char G = (char)3;
-		private static char D = (char)4;
-		private static char E = (char)5;
-		private static char ZH = (char)6;
-		private static char Z = (char)7;
-		private static char I = (char)8;
-		private static char I_ = (char)9;
-		private static char K = (char)10;
-		private static char L = (char)11;
-		private static char M = (char)12;
-		private static char N = (char)13;
-		private static char O = (char)14;
-		private static char P = (char)15;
-		private static char R = (char)16;
-		private static char S = (char)17;
-		private static char T = (char)18;
-		private static char U = (char)19;
-		//private static char F = (char)20;
-		private static char X = (char)21;
-		//private static char TS = (char)22;
-		private static char CH = (char)23;
-		private static char SH = (char)24;
-		private static char SHCH = (char)25;
-		//private static char HARD = (char)26;
-		private static char Y = (char)27;
-		private static char SOFT = (char)28;
-		private static char AE = (char)29;
-		private static char IU = (char)30;
-		private static char IA = (char)31;
-
-		/// <summary>
-		/// List of typical Russian stopwords.
-		/// </summary>
-		private static char[][] RUSSIAN_STOP_WORDS = {
-		new char[] {A},
-		new char[] {B, E, Z},
-		new char[] {B, O, L, E, E},
-		new char[] {B, Y},
-		new char[] {B, Y, L},
-		new char[] {B, Y, L, A},
-		new char[] {B, Y, L, I},
-		new char[] {B, Y, L, O},
-		new char[] {B, Y, T, SOFT},
-		new char[] {V},
-		new char[] {V, A, M},
-		new char[] {V, A, S},
-		new char[] {V, E, S, SOFT},
-		new char[] {V, O},
-		new char[] {V, O, T},
-		new char[] {V, S, E},
-		new char[] {V, S, E, G, O},
-		new char[] {V, S, E, X},
-		new char[] {V, Y},
-		new char[] {G, D, E},
-		new char[] {D, A},
-		new char[] {D, A, ZH, E},
-		new char[] {D, L, IA},
-		new char[] {D, O},
-		new char[] {E, G, O},
-		new char[] {E, E},
-		new char[] {E, I_,},
-		new char[] {E, IU},
-		new char[] {E, S, L, I},
-		new char[] {E, S, T, SOFT},
-		new char[] {E, SHCH, E},
-		new char[] {ZH, E},
-		new char[] {Z, A},
-		new char[] {Z, D, E, S, SOFT},
-		new char[] {I},
-		new char[] {I, Z},
-		new char[] {I, L, I},
-		new char[] {I, M},
-		new char[] {I, X},
-		new char[] {K},
-		new char[] {K, A, K},
-		new char[] {K, O},
-		new char[] {K, O, G, D, A},
-		new char[] {K, T, O},
-		new char[] {L, I},
-		new char[] {L, I, B, O},
-		new char[] {M, N, E},
-		new char[] {M, O, ZH, E, T},
-		new char[] {M, Y},
-		new char[] {N, A},
-		new char[] {N, A, D, O},
-		new char[] {N, A, SH},
-		new char[] {N, E},
-		new char[] {N, E, G, O},
-		new char[] {N, E, E},
-		new char[] {N, E, T},
-		new char[] {N, I},
-		new char[] {N, I, X},
-		new char[] {N, O},
-		new char[] {N, U},
-		new char[] {O},
-		new char[] {O, B},
-		new char[] {O, D, N, A, K, O},
-		new char[] {O, N},
-		new char[] {O, N, A},
-		new char[] {O, N, I},
-		new char[] {O, N, O},
-		new char[] {O, T},
-		new char[] {O, CH, E, N, SOFT},
-		new char[] {P, O},
-		new char[] {P, O, D},
-		new char[] {P, R, I},
-		new char[] {S},
-		new char[] {S, O},
-		new char[] {T, A, K},
-		new char[] {T, A, K, ZH, E},
-		new char[] {T, A, K, O, I_},
-		new char[] {T, A, M},
-		new char[] {T, E},
-		new char[] {T, E, M},
-		new char[] {T, O},
-		new char[] {T, O, G, O},
-		new char[] {T, O, ZH, E},
-		new char[] {T, O, I_},
-		new char[] {T, O, L, SOFT, K, O},
-		new char[] {T, O, M},
-		new char[] {T, Y},
-		new char[] {U},
-		new char[] {U, ZH, E},
-		new char[] {X, O, T, IA},
-		new char[] {CH, E, G, O},
-		new char[] {CH, E, I_},
-		new char[] {CH, E, M},
-		new char[] {CH, T, O},
-		new char[] {CH, T, O, B, Y},
-		new char[] {CH, SOFT, E},
-		new char[] {CH, SOFT, IA},
-		new char[] {AE, T, A},
-		new char[] {AE, T, I},
-		new char[] {AE, T, O},
-		new char[] {IA}
-													 };
-
-		/// <summary>
-		/// Contains the stopwords used with the StopFilter.
-		/// </summary>
-		private Hashtable stoptable = new Hashtable();
-
-		/// <summary>
-		/// Charset for Russian letters.
-	    /// Represents encoding for 32 lowercase Russian letters.
-		/// Predefined charsets can be taken from RussianCharSets class
-		/// </summary>
-		private char[] charset;
-
-		/// <summary>
-		/// Builds an analyzer.
-		/// </summary>
-		public RussianAnalyzer()
-		{
-			this.charset = RussianCharsets.UnicodeRussian;
-			stoptable = StopFilter.MakeStopSet(MakeStopWords(RussianCharsets.UnicodeRussian));
-		}
-
-		/// <summary>
-		/// Builds an analyzer.
-		/// </summary>
-		/// <param name="charset"></param>
-		public RussianAnalyzer(char[] charset)
-		{
-			this.charset = charset;
-			stoptable = StopFilter.MakeStopSet(MakeStopWords(charset));
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <param name="stopwords"></param>
-		public RussianAnalyzer(char[] charset, String[] stopwords)
-		{
-			this.charset = charset;
-			stoptable = StopFilter.MakeStopSet(stopwords);
-		}
-
-		/// <summary>
-		/// Takes russian stop words and translates them to a String array, using
-		/// the given charset 
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <returns></returns>
-		private static String[] MakeStopWords(char[] charset)
-		{
-			String[] res = new String[RUSSIAN_STOP_WORDS.Length];
-			for (int i = 0; i < res.Length; i++)
-			{
-				char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-				// translate the word,using the charset
-				StringBuilder theWord = new StringBuilder();
-				for (int j = 0; j < theStopWord.Length; j++)
-				{
-					theWord.Append(charset[theStopWord[j]]);
-				}
-				res[i] = theWord.ToString();
-			}
-			return res;
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <param name="stopwords"></param>
-		public RussianAnalyzer(char[] charset, Hashtable stopwords)
-		{
-			this.charset = charset;
-			stoptable = stopwords;
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
-		/// </summary>
-		/// <param name="fieldName"></param>
-		/// <param name="reader"></param>
-		/// <returns>
-		///		A TokenStream build from a RussianLetterTokenizer filtered with
-		///     RussianLowerCaseFilter, StopFilter, and RussianStemFilter
-		///  </returns>
-		public override TokenStream TokenStream(String fieldName, TextReader reader)
-		{
-			TokenStream result = new RussianLetterTokenizer(reader, charset);
-			result = new RussianLowerCaseFilter(result, charset);
-			result = new StopFilter(result, stoptable);
-			result = new RussianStemFilter(result, charset);
-			return result;
-		}
-	}
+    /// <summary>
+    /// Analyzer for Russian language. Supports an external list of stopwords (words that
+    /// will not be indexed at all).
+    /// A default set of stopwords is used unless an alternative list is specified.
+    /// </summary>
+    public sealed class RussianAnalyzer : Analyzer
+    {
+        /// <summary>
+        /// List of typical Russian stopwords.
+        /// </summary>
+        private static readonly String[] RUSSIAN_STOP_WORDS = {
+                                                                  "а", "без", "более", "бы", "был", "была", "были",
+                                                                  "было", "быть", "в",
+                                                                  "вам", "вас", "весь", "во", "вот", "все", "всего",
+                                                                  "всех", "вы", "где",
+                                                                  "да", "даже", "для", "до", "его", "ее", "ей", "ею",
+                                                                  "если", "есть",
+                                                                  "еще", "же", "за", "здесь", "и", "из", "или", "им",
+                                                                  "их", "к", "как",
+                                                                  "ко", "когда", "кто", "ли", "либо", "мне", "может",
+                                                                  "мы", "на", "надо",
+                                                                  "наш", "не", "него", "нее", "нет", "ни", "них", "но",
+                                                                  "ну", "о", "об",
+                                                                  "однако", "он", "она", "они", "оно", "от", "очень",
+                                                                  "по", "под", "при",
+                                                                  "с", "со", "так", "также", "такой", "там", "те", "тем"
+                                                                  , "то", "того",
+                                                                  "тоже", "той", "только", "том", "ты", "у", "уже",
+                                                                  "хотя", "чего", "чей",
+                                                                  "чем", "что", "чтобы", "чье", "чья", "эта", "эти",
+                                                                  "это", "я"
+                                                              };
+
+        private static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new CharArraySet(RUSSIAN_STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter.
+        /// </summary>
+        private readonly ISet<string> stopSet;
+
+        private readonly Version matchVersion;
+
+
+        public RussianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
+         */
+        public RussianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         */
+        public RussianAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * TODO: create a Set version of this ctor
+         * @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
+         */
+        public RussianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+        }
+
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the 
+         * provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a 
+         *   {@link RussianLetterTokenizer} filtered with 
+         *   {@link LowerCaseFilter}, {@link StopFilter}, 
+         *   and {@link RussianStemFilter}
+         */
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new RussianLetterTokenizer(reader);
+            result = new LowerCaseFilter(result);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stopSet);
+            result = new RussianStemFilter(result);
+            return result;
+        }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a 
+         *   {@link RussianLetterTokenizer} filtered with 
+         *   {@link LowerCaseFilter}, {@link StopFilter}, 
+         *   and {@link RussianStemFilter}
+         */
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new RussianLetterTokenizer(reader);
+                streams.result = new LowerCaseFilter(streams.source);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stopSet);
+                streams.result = new RussianStemFilter(streams.result);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -22,42 +22,41 @@
 using System;
 using System.IO;
 using Lucene.Net.Analysis;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
-	/// in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
-	/// which doesn't know how to detect letters in encodings like CP1252 and KOI8
-	/// (well-known problems with 0xD7 and 0xF7 chars)
-	/// </summary>
-	public class RussianLetterTokenizer : CharTokenizer
-	{
-		/// <summary>
-		/// Construct a new LetterTokenizer.
-		/// </summary>
-		private char[] charset;
+    ///<summary>
+    /// A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
+    /// by also allowing the basic latin digits 0-9. 
+    ///</summary>
+    public class RussianLetterTokenizer : CharTokenizer
+    {
+        public RussianLetterTokenizer(TextReader _in)
+            : base(_in)
+        {
+        }
 
-		public RussianLetterTokenizer(TextReader _in, char[] charset) : base(_in)
-		{
-			this.charset = charset;
-		}
+        public RussianLetterTokenizer(AttributeSource source, TextReader _in)
+            : base(source, _in)
+        {
+        }
 
-		/// <summary>
-		/// Collects only characters which satisfy Char.IsLetter(char).
-		/// </summary>
-		/// <param name="c"></param>
-		/// <returns></returns>
-		protected override bool IsTokenChar(char c)
-		{
-			if (Char.IsLetter(c))
-				return true;
-			for (int i = 0; i < charset.Length; i++)
-			{
-				if (c == charset[i])
-					return true;
-			}
-			return false;
-		}
-	}
+        public RussianLetterTokenizer(AttributeSource.AttributeFactory factory, TextReader __in)
+            : base(factory, __in)
+        {
+        }
+
+        /**
+         * Accepts characters which satisfy char.IsLetter(char),
+         * as well as the basic Latin digits '0' through '9'.
+         */
+        protected override bool IsTokenChar(char c)
+        {
+            if (char.IsLetter(c) || (c >= '0' && c <= '9'))
+                return true;
+            else
+                return false;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs Tue Feb 28 22:43:08 2012
@@ -21,41 +21,40 @@
 
 using System;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// Normalizes token text to lower case, analyzing given ("russian") charset.
-	/// </summary>
-	public sealed class RussianLowerCaseFilter : TokenFilter
-	{
-		char[] charset;
-
-		public RussianLowerCaseFilter(TokenStream _in, char[] charset) : base(_in)
-		{
-			this.charset = charset;
-		}
-
-		public override Token Next() 
-		{
-			Token t = input.Next();
-
-			if (t == null)
-				return null;
-
-			String txt = t.TermText();
-
-			char[] chArray = txt.ToCharArray();
-			for (int i = 0; i < chArray.Length; i++)
-			{
-				chArray[i] = RussianCharsets.ToLowerCase(chArray[i], charset);
-			}
-
-			String newTxt = new String(chArray);
-			// create new token
-			Token newToken = new Token(newTxt, t.StartOffset(), t.EndOffset());
-
-			return newToken;
-		}
-	}
+    /// <summary>
+    /// Normalizes token text to lower case.
+    /// </summary>
+    [Obsolete("Use LowerCaseFilter instead, which has the same functionality. This filter will be removed in Lucene 4.0")]
+    public sealed class RussianLowerCaseFilter : TokenFilter
+    {
+        private TermAttribute termAtt;
+
+        public RussianLowerCaseFilter(TokenStream _in)
+            : base(_in)
+        {
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        public sealed override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                char[] chArray = termAtt.TermBuffer();
+                int chLen = termAtt.TermLength();
+                for (int i = 0; i < chLen; i++)
+                {
+                    chArray[i] = char.ToLower(chArray[i]);
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -21,59 +21,65 @@
 
 using System;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
-	/// The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
-	/// because RussianStemFilter only works  with lowercase part of any "russian" charset.
-	/// </summary>
-	public sealed class RussianStemFilter : TokenFilter
-	{
-		/// <summary>
-		/// The actual token in the input stream.
-		/// </summary>
-		private Token token = null;
-		private RussianStemmer stemmer = null;
-
-		public RussianStemFilter(TokenStream _in, char[] charset) : base(_in)
-		{
-			stemmer = new RussianStemmer(charset);
-		}
-
-		/// <summary>
-		/// 
-		/// </summary>
-		/// <returns>Returns the next token in the stream, or null at EOS</returns>
-		public override Token Next() 
-		{
-			if ((token = input.Next()) == null)
-			{
-				return null;
-			}
-			else
-			{
-				String s = stemmer.Stem(token.TermText());
-				if (!s.Equals(token.TermText()))
-				{
-					return new Token(s, token.StartOffset(), token.EndOffset(),
-						token.Type());
-				}
-				return token;
-			}
-		}
-
-		/// <summary>
-		/// Set a alternative/custom RussianStemmer for this filter.
-		/// </summary>
-		/// <param name="stemmer"></param>
-		public void SetStemmer(RussianStemmer stemmer)
-		{
-			if (stemmer != null)
-			{
-				this.stemmer = stemmer;
-			}
-		}
-	}
+    /**
+    * A {@link TokenFilter} that stems Russian words. 
+    * <p>
+    * The implementation was inspired by GermanStemFilter.
+    * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter,
+    * because RussianStemFilter only works with lowercase characters.
+    * </p>
+    */
+    public sealed class RussianStemFilter : TokenFilter
+    {
+        /**
+         * The stemmer applied to the term text of each token.
+         */
+        private RussianStemmer stemmer = null;
+
+        private TermAttribute termAtt;
+
+        public RussianStemFilter(TokenStream _in)
+            : base(_in)
+        {
+            stemmer = new RussianStemmer();
+            termAtt = AddAttribute<TermAttribute>();
+        }
+        /**
+         * Advances the input to its next token and stems its term text; returns false at EOS
+         */
+        public sealed override bool IncrementToken()
+        {
+            if (input.IncrementToken())
+            {
+                String term = termAtt.Term();
+                String s = stemmer.Stem(term);
+                if (s != null && !s.Equals(term))
+                    termAtt.SetTermBuffer(s);
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+
+        // I don't get the point of this.  All methods in java are private, so they can't be
+        // overridden...You can't really subclass any of its behavior.  I've commented it out,
+        // as it doesn't compile as is. - cc
+        ////**
+        // * Set an alternative/custom {@link RussianStemmer} for this filter.
+        // */
+        //public void SetStemmer(RussianStemmer stemmer)
+        //{
+        //    if (stemmer != null)
+        //    {
+        //        this.stemmer = stemmer;
+        //    }
+        //}
+    }
 }
\ No newline at end of file



Mime
View raw message