lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pnas...@apache.org
Subject [Lucene.Net] svn commit: r1083372 [1/3] - in /incubator/lucene.net/trunk/C#/contrib/Analyzers: ./ Lucene.Net.Analyzers/ Lucene.Net.Analyzers/BR/ Lucene.Net.Analyzers/CJK/ Lucene.Net.Analyzers/Cn/ Lucene.Net.Analyzers/Cz/ Lucene.Net.Analyzers/De/ Lucene.Net.Analyzer...
Date Sun, 20 Mar 2011 07:30:38 GMT
Author: pnasser
Date: Sun Mar 20 07:30:37 2011
New Revision: 1083372

URL: http://svn.apache.org/viewvc?rev=1083372&view=rev
Log:
LUCENENET 372 - BR, CJK, CN, CZ, DE, FR, NL, RU Analyzers - Tests Missing

Added:
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemmer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchStemFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/DutchStemmer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Nl/WordlistLoader.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianCharsets.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianLetterTokenizer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianLowerCaseFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianStemFilter.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Ru/RussianStemmer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/WordlistLoader.cs
Modified:
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
    incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Lucene.Net.Analyzers.csproj

Modified: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers.sln?rev=1083372&r1=1083371&r2=1083372&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln (original)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers.sln Sun Mar 20 07:30:37 2011
@@ -1,9 +1,15 @@
 
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual C# Express 2008
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analyzers", "Lucene.Net.Analyzers\Lucene.Net.Analyzers.csproj", "{4286E961-9143-4821-B46D-3D39D3736386}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.TestAnalyzers", "Test\Lucene.Net.TestAnalyzers.csproj", "{67D27628-F1D5-4499-9818-B669731925C8}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\src\Lucene.Net\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Test", "..\..\src\Test\Lucene.Net.Test.csproj", "{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DemoLib", "..\..\src\Demo\DemoLib\DemoLib.csproj", "{F04CA2F4-E182-46A8-B914-F46AF5319E83}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -19,6 +25,18 @@ Global
 		{67D27628-F1D5-4499-9818-B669731925C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{67D27628-F1D5-4499-9818-B669731925C8}.Release|Any CPU.Build.0 = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.Build.0 = Release|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

Modified: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs?rev=1083372&r1=1083371&r2=1083372&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/BR/BrazilianAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -96,7 +96,7 @@ namespace Lucene.Net.Analysis.BR
          */
         public BrazilianAnalyzer(FileInfo stopwords)
         {
-            stoptable = WordlistLoader.GetWordSet(stopwords);
+            stoptable = WordlistLoader.GetWordtable(stopwords);
         }
 
         /**
@@ -118,7 +118,7 @@ namespace Lucene.Net.Analysis.BR
          */
         public void SetStemExclusionTable(FileInfo exclusionlist)
         {
-            excltable = WordlistLoader.GetWordSet(exclusionlist);
+            excltable = WordlistLoader.GetWordtable(exclusionlist);
         }
 
         /**

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,128 @@
+using System;
+using System.IO;
+using System.Collections;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 *
+	 * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
+	 */
+
+	/// <summary>
+	/// Filters CJKTokenizer with StopFilter.
+	/// 
+	/// <author>Che, Dong</author>
+	/// </summary>
+	public class CJKAnalyzer : Analyzer 
+	{
+		//~ Static fields/initializers ---------------------------------------------
+
+		/// <summary>
+		/// An array containing some common English words that are not usually
+		/// useful for searching, and some double-byte punctuation marks.
+		/// </summary>
+		public static String[] stopWords = 
+		{
+			"a", "and", "are", "as", "at", "be",
+			"but", "by", "for", "if", "in",
+			"into", "is", "it", "no", "not",
+			"of", "on", "or", "s", "such", "t",
+			"that", "the", "their", "then",
+			"there", "these", "they", "this",
+			"to", "was", "will", "with", "",
+			"www"
+		};
+
+		//~ Instance fields --------------------------------------------------------
+
+		/// <summary>
+		/// stop word list
+		/// </summary>
+		private Hashtable stopTable;
+
+		//~ Constructors -----------------------------------------------------------
+
+		/// <summary>
+		/// Builds an analyzer which removes the words in the static stopWords array.
+		/// </summary>
+		public CJKAnalyzer() 
+		{
+			stopTable = StopFilter.MakeStopSet(stopWords);
+		}
+
+		/// <summary>
+		/// Builds an analyzer which removes words in the provided array.
+		/// </summary>
+		/// <param name="stopWords">stop word array</param>
+		public CJKAnalyzer(String[] stopWords) 
+		{
+			stopTable = StopFilter.MakeStopSet(stopWords);
+		}
+
+		//~ Methods ----------------------------------------------------------------
+
+		/// <summary>
+		/// get token stream from input
+		/// </summary>
+		/// <param name="fieldName">lucene field name</param>
+		/// <param name="reader">input reader</param>
+		/// <returns>Token Stream</returns>
+		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
+		{
+			return new StopFilter(new CJKTokenizer(reader), stopTable);
+		}
+	}
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/CJK/CJKTokenizer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,329 @@
+using System;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// <p>
+	/// CJKTokenizer was modified from StopTokenizer, which does a decent job for
+	/// most European languages. It uses a different tokenization method for double-byte
+	/// characters: a token is returned for every two characters, with overlapping matches.<br/>
+	/// Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". It
+	/// also needs a filter to remove zero-length tokens ""<br/>
+	/// For digits: digits, '_', '+' and '#' are tokenized as letters<br/>
+	/// for more info on Asia language(Chinese Japanese Korean) text segmentation:
+	/// please search  <a
+	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+	/// </p>
+	/// 
+	/// @author Che, Dong
+	/// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+	/// </summary>
+	public sealed class CJKTokenizer : Tokenizer 
+	{
+		//~ Static fields/initializers ---------------------------------------------
+
+		/// <summary>
+		/// Max word length
+		/// </summary>
+		private static int MAX_WORD_LEN = 255;
+
+		/// <summary>
+		/// buffer size
+		/// </summary>
+		private static int IO_BUFFER_SIZE = 256;
+
+		//~ Instance fields --------------------------------------------------------
+
+		/// <summary>
+		/// word offset, used to indicate which character (in the input) is being parsed
+		/// </summary>
+		private int offset = 0;
+
+		/// <summary>
+		/// the index used only for ioBuffer
+		/// </summary>
+		private int bufferIndex = 0;
+
+		/// <summary>
+		/// data length
+		/// </summary>
+		private int dataLen = 0;
+
+		/// <summary>
+		/// character buffer, store the characters which are used to compose <br/>
+		/// the returned Token
+		/// </summary>
+		private char[] buffer = new char[MAX_WORD_LEN];
+
+		/// <summary>
+		/// I/O buffer, used to store the content of the input (one of the <br/>
+		/// members of Tokenizer)
+		/// </summary>
+		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+		/// <summary>
+		/// word type: single=>ASCII  double=>non-ASCII word=>default 
+		/// </summary>
+		private String tokenType = "word";
+
+		/// <summary>
+		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
+		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+		/// </summary>
+		private bool preIsTokened = false;
+
+		//~ Constructors -----------------------------------------------------------
+
+		/// <summary>
+		/// Construct a token stream processing the given input.
+		/// </summary>
+		/// <param name="_in">I/O reader</param>
+		public CJKTokenizer(TextReader _in) 
+		{
+			input = _in;
+		}
+
+		//~ Methods ----------------------------------------------------------------
+
+		/// <summary>
+		///  Returns the next token in the stream, or null at EOS.
+		/// </summary>
+		/// <returns>Token</returns>
+		public override Token Next()
+		{
+			/** how many character(s) has been stored in buffer */
+			int length = 0;
+
+			/** the position used to create Token */
+			int start = offset;
+
+			while (true) 
+			{
+				/** current character */
+				char c;
+
+				/** unicode block of the current character, for detail */
+				//Character.UnicodeBlock ub;
+
+				offset++;
+
+				if (bufferIndex >= dataLen) 
+				{
+					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+					bufferIndex = 0;
+				}
+
+				if (dataLen == 0) 
+				{
+					if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							length = 0;
+							preIsTokened = false;
+						}
+
+						break;
+					} 
+					else 
+					{
+						return null;
+					}
+				} 
+				else 
+				{
+					//get current character
+					c = ioBuffer[bufferIndex++];
+
+					//get the UnicodeBlock of the current character
+					//ub = Character.UnicodeBlock.of(c);
+				}
+
+				// if the current character is ASCII or a halfwidth/fullwidth form (U+FF00..U+FFEF)
+				if (('\u0000' <= c && c <= '\u007F') || 
+					('\uFF00' <= c && c <= '\uFFEF')) 
+				{
+					if ('\uFF00' <= c && c <= '\uFFEF')
+					{
+						/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+						int i = (int) c;
+						i = i - 65248;
+						c = (char) i;
+					}
+
+					// if the current character is a letter, a digit, or one of "_" "+" "#"
+					if (Char.IsLetterOrDigit(c)
+						|| ((c == '_') || (c == '+') || (c == '#'))
+						) 
+					{
+						if (length == 0) 
+						{
+							// "javaC1C2C3C4linux" <br>
+							//      ^--: the current character begin to token the ASCII
+							// letter
+							start = offset - 1;
+						} 
+						else if (tokenType == "double") 
+						{
+							// "javaC1C2C3C4linux" <br>
+							//              ^--: the previous non-ASCII
+							// : the current character
+							offset--;
+							bufferIndex--;
+							tokenType = "single";
+
+							if (preIsTokened == true) 
+							{
+								// there is only one non-ASCII has been stored
+								length = 0;
+								preIsTokened = false;
+
+								break;
+							} 
+							else 
+							{
+								break;
+							}
+						}
+
+						// store the LowerCase(c) in the buffer
+						buffer[length++] = Char.ToLower(c);
+						tokenType = "single";
+
+						// break the procedure if buffer overflowed!
+						if (length == MAX_WORD_LEN) 
+						{
+							break;
+						}
+					} 
+					else if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							length = 0;
+							preIsTokened = false;
+						} 
+						else 
+						{
+							break;
+						}
+					}
+				} 
+				else 
+				{
+					// non-ASCII letter, eg."C1C2C3C4"
+					if (Char.IsLetter(c)) 
+					{
+						if (length == 0) 
+						{
+							start = offset - 1;
+							buffer[length++] = c;
+							tokenType = "double";
+						} 
+						else 
+						{
+							if (tokenType == "single") 
+							{
+								offset--;
+								bufferIndex--;
+
+								//return the previous ASCII characters
+								break;
+							} 
+							else 
+							{
+								buffer[length++] = c;
+								tokenType = "double";
+
+								if (length == 2) 
+								{
+									offset--;
+									bufferIndex--;
+									preIsTokened = true;
+
+									break;
+								}
+							}
+						}
+					} 
+					else if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							// empty the buffer
+							length = 0;
+							preIsTokened = false;
+						} 
+						else 
+						{
+							break;
+						}
+					}
+				}
+			}
+
+			return new Token(new String(buffer, 0, length), start, start + length,
+				tokenType
+				);
+		}
+	}
+
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,92 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseAnalyzer
+	/// Description:
+	///   Subclass of Lucene.Net.Analysis.Analyzer
+	///   built from a ChineseTokenizer, filtered with ChineseFilter.
+	/// Copyright:   Copyright (c) 2001
+	/// Company:
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
+	/// </summary>
+	public class ChineseAnalyzer : Analyzer 
+	{
+
+		public ChineseAnalyzer() 
+		{
+		}
+
+		/// <summary>
+		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
+		/// </summary>
+		/// <returns>A TokenStream built from a ChineseTokenizer filtered with ChineseFilter.</returns>
+		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
+		{
+			TokenStream result = new ChineseTokenizer(reader);
+			result = new ChineseFilter(result);
+			return result;
+		}
+	}
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,138 @@
+using System;
+using System.IO;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseFilter
+	/// Description: Filter with a stop word table
+	///              Rule: No digits are allowed.
+	///                    English word/token should be longer than 1 character.
+	///                    One Chinese character as one Chinese word.
+	/// TO DO:
+	///   1. Add Chinese stop words, such as \ue400
+	///   2. Dictionary based Chinese word extraction
+	///   3. Intelligent Chinese word extraction
+	/// 
+	/// Copyright:    Copyright (c) 2001
+	/// Company:
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
+	/// </summary>
+	public sealed class ChineseFilter : TokenFilter 
+	{
+		// Only English now, Chinese to be added later.
+		public static String[] STOP_WORDS = 
+				 {
+					 "and", "are", "as", "at", "be", "but", "by",
+					 "for", "if", "in", "into", "is", "it",
+					 "no", "not", "of", "on", "or", "such",
+					 "that", "the", "their", "then", "there", "these",
+					 "they", "this", "to", "was", "will", "with"
+				 };
+
+		private Hashtable stopTable;
+
+		public ChineseFilter(TokenStream _in) : base (_in)
+		{
+			stopTable = new Hashtable(STOP_WORDS.Length);
+
+			for (int i = 0; i < STOP_WORDS.Length; i++)
+				stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
+		}
+
+		public override Token Next()
+		{
+
+			for (Token token = input.Next(); token != null; token = input.Next()) 
+			{
+				String text = token.TermText();
+
+				// why not key off token type here assuming ChineseTokenizer comes first?
+				if (stopTable[text] == null) 
+				{
+					switch (Char.GetUnicodeCategory(text[0])) 
+					{
+
+						case UnicodeCategory.LowercaseLetter:
+						case UnicodeCategory.UppercaseLetter:
+
+							// English word/token should be longer than 1 character.
+							if (text.Length > 1) 
+							{
+								return token;
+							}
+							break;
+						case UnicodeCategory.OtherLetter:
+
+							// One Chinese character as one Chinese word.
+							// Chinese word extraction to be added later here.
+
+							return token;
+					}
+
+				}
+
+			}
+			return null;
+		}
+	}
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cn/ChineseTokenizer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,179 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseTokenizer
+	/// Description: Extract tokens from the Stream using Character.getType()
+	///              Rule: A Chinese character as a single token
+	/// Copyright:   Copyright (c) 2001
+	/// Company:
+	/// 
+	/// The difference between the ChineseTokenizer and the
+	/// CJKTokenizer (id=23545) is that they have different
+	/// token parsing logic.
+	/// 
+	/// Let me use an example. If having a Chinese text
+	/// "C1C2C3C4" to be indexed, the tokens returned from the
+	/// ChineseTokenizer are C1, C2, C3, C4. And the tokens
+	/// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+	/// 
+	/// Therefore the index the CJKTokenizer created is much
+	/// larger.
+	/// 
+	/// The problem is that when searching for C1, C1C2, C1C3,
+	/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+	/// CJKTokenizer will not work.
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
+	/// </summary>
+	public sealed class ChineseTokenizer : Tokenizer 
+	{
+		/// <summary>Maximum number of characters collected into one token.</summary>
+		private const int MAX_WORD_LEN = 255;
+
+		/// <summary>Number of characters requested from the reader per refill.</summary>
+		private const int IO_BUFFER_SIZE = 1024;
+
+		/// <summary>
+		/// Creates a tokenizer that reads its text from the given reader.
+		/// </summary>
+		/// <param name="_in">Source of the characters to tokenize.</param>
+		public ChineseTokenizer(TextReader _in) 
+		{
+			input = _in;
+		}
+
+		// offset:      number of characters consumed from the reader so far
+		// bufferIndex: next unread position in ioBuffer
+		// dataLen:     number of valid characters currently in ioBuffer
+		private int offset = 0, bufferIndex = 0, dataLen = 0;
+		private readonly char[] buffer = new char[MAX_WORD_LEN];
+		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+		// Length and start offset of the token currently being collected in buffer.
+		private int length;
+		private int start;
+
+		/// <summary>
+		/// Appends the lower-cased character to the current token, recording
+		/// the token start offset when it is the first character.
+		/// </summary>
+		private void Push(char c) 
+		{
+			if (length == 0)
+			{
+				// offset was already advanced past c by Next()
+				start = offset - 1;
+			}
+			buffer[length++] = Char.ToLower(c);
+		}
+
+		/// <summary>
+		/// Builds a token from the collected characters.
+		/// </summary>
+		/// <returns>The token, or null when nothing has been collected.</returns>
+		private Token Flush() 
+		{
+			if (length > 0) 
+			{
+				return new Token(new String(buffer, 0, length), start, start + length);
+			}
+			return null;
+		}
+
+		/// <summary>
+		/// Returns the next token: a run of letters/digits (at most
+		/// MAX_WORD_LEN characters), or a single character of category
+		/// OtherLetter. All other characters act as separators.
+		/// </summary>
+		/// <returns>The next token, or null at end of input.</returns>
+		public override Token Next()
+		{
+			length = 0;
+			start = offset;
+
+			while (true) 
+			{
+				offset++;
+
+				if (bufferIndex >= dataLen) 
+				{
+					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+					bufferIndex = 0;
+				}
+
+				if (dataLen <= 0)
+				{
+					// End of input: no character was consumed, so undo the increment.
+					offset--;
+					return Flush();
+				}
+
+				char c = ioBuffer[bufferIndex++];
+
+				switch (Char.GetUnicodeCategory(c)) 
+				{
+					case UnicodeCategory.DecimalDigitNumber:
+					case UnicodeCategory.LowercaseLetter:
+					case UnicodeCategory.UppercaseLetter:
+						Push(c);
+						if (length == MAX_WORD_LEN) return Flush();
+						break;
+
+					case UnicodeCategory.OtherLetter:
+						if (length > 0) 
+						{
+							// The character belongs to the next token: put it back.
+							// Both the buffer position and the offset must be rewound,
+							// otherwise all following token offsets are shifted by one.
+							bufferIndex--;
+							offset--;
+							return Flush();
+						}
+						Push(c);
+						return Flush();
+
+					default:
+						if (length > 0) return Flush();
+						break;
+				}
+			}
+		}
+	}
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Cz/CzechAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,190 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.De;
+using Lucene.Net.Analysis.Standard;
+
+namespace Lucene.Net.Analysis.Cz
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Analyzer for Czech language. Supports an external list of stopwords (words that
+	/// will not be indexed at all).
+	/// A default set of stopwords is used unless an alternative list is specified, the
+	/// exclusion list is empty by default.
+	/// 
+	/// <author>Lukas Zapletal [lzap@root.cz]</author>
+	/// <version>$Id: CzechAnalyzer.java,v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
+	/// </summary>
+	public sealed class CzechAnalyzer : Analyzer 
+	{
+		/// <summary>
+		/// List of typical stopwords.
+		/// </summary>
+		public static String[] STOP_WORDS = 
+				 {
+					 "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
+					 "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
+					 "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
+					 "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
+					 "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
+					 "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
+					 "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
+					 "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
+					 "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
+					 "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
+					 "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
+					 "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
+					 "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
+					 "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
+					 "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
+					 "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
+					 "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
+					 "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
+		};
+
+		/// <summary>
+		/// Contains the stopwords used with the StopFilter.
+		/// </summary>
+		private Hashtable stoptable = new Hashtable();
+
+		/// <summary>
+		/// Builds an analyzer that uses the default STOP_WORDS list.
+		/// </summary>
+		public CzechAnalyzer() 
+		{
+			stoptable = StopFilter.MakeStopSet( STOP_WORDS );
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the given stop words.
+		/// </summary>
+		/// <param name="stopwords">Words to remove from the token stream.</param>
+		public CzechAnalyzer( String[] stopwords ) 
+		{
+			stoptable = StopFilter.MakeStopSet( stopwords );
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the given stop words.
+		/// </summary>
+		/// <param name="stopwords">Table of stop words; it is used as-is, not copied.</param>
+		public CzechAnalyzer( Hashtable stopwords ) 
+		{
+			stoptable = stopwords;
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the stop words read from the given file.
+		/// </summary>
+		/// <param name="stopwords">File handed to WordlistLoader.GetWordtable.</param>
+		public CzechAnalyzer( FileInfo stopwords ) 
+		{
+			stoptable = WordlistLoader.GetWordtable( stopwords );
+		}
+
+		/// <summary>
+		/// Loads stopwords hash from resource stream (file, database...).
+		/// Any previously configured stop words are discarded. Each line of the
+		/// stream becomes one stop word. If reading fails with an IOException,
+		/// the analyzer is left with an empty stop word table.
+		/// </summary>
+		/// <param name="wordfile">Stream containing the wordlist; null clears the stop words</param>
+		/// <param name="encoding">Encoding used (win-1250, iso-8859-2, ...), null for default system encoding</param>
+		public void LoadStopWords( Stream wordfile, String encoding ) 
+		{
+			// clear any previous table (if present)
+			stoptable = new Hashtable();
+
+			if ( wordfile == null ) 
+			{
+				return;
+			}
+
+			try 
+			{
+				// The reader is deliberately not disposed: disposing a StreamReader
+				// also closes the underlying stream, which belongs to the caller.
+				StreamReader isr;
+				if (encoding == null)
+					isr = new StreamReader(wordfile);
+				else
+					isr = new StreamReader(wordfile, Encoding.GetEncoding(encoding));
+
+				String word;
+				while ( ( word = isr.ReadLine() ) != null ) 
+				{
+					stoptable[word] = word;
+				}
+			} 
+			catch ( IOException ) 
+			{
+				// Drop any partially read list. The table must never be null,
+				// because TokenStream() passes it straight to StopFilter.
+				stoptable = new Hashtable();
+			}
+		}
+
+		/// <summary>
+		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
+		/// </summary>
+		/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
+		/// <param name="reader">Source of the text to tokenize.</param>
+		/// <returns>
+		/// A TokenStream built from a StandardTokenizer filtered with
+		/// StandardFilter, LowerCaseFilter and StopFilter
+		/// </returns>
+		public override TokenStream TokenStream( String fieldName, TextReader reader ) 
+		{
+			TokenStream result = new StandardTokenizer( reader );
+			result = new StandardFilter( result );
+			result = new LowerCaseFilter( result );
+			result = new StopFilter( result, stoptable );
+			return result;
+		}
+	}
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,125 @@
+using System;
+using System.IO;
+using System.Collections;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.De
+{
+	/// <summary>
+	/// Analyzer for German language. Supports an external list of stopwords (words that
+	/// will not be indexed at all) and an external list of exclusions (words that will
+	/// not be stemmed, but indexed).
+	/// A default set of stopwords is used unless an alternative list is specified, the
+	/// exclusion list is empty by default.
+	/// </summary>
+	public class GermanAnalyzer : Analyzer
+	{
+		/// <summary>
+		/// Default German stop words, used by the parameterless constructor.
+		/// </summary>
+		private String[] GERMAN_STOP_WORDS = 
+		{
+			"einer", "eine", "eines", "einem", "einen",
+			"der", "die", "das", "dass", "daß",
+			"du", "er", "sie", "es",
+			"was", "wer", "wie", "wir",
+			"und", "oder", "ohne", "mit",
+			"am", "im", "in", "aus", "auf",
+			"ist", "sein", "war", "wird",
+			"ihr", "ihre", "ihres",
+			"als", "für", "von",
+			"dich", "dir", "mich", "mir",
+			"mein", "kein",
+			"durch", "wegen"
+		};
+
+		/// <summary>
+		/// Stop words handed to the StopFilter.
+		/// </summary>
+		private Hashtable stoptable = new Hashtable();
+
+		/// <summary>
+		/// Words handed to the GermanStemFilter as its exclusion table
+		/// (indexed, but not stemmed). Empty unless set by the caller.
+		/// </summary>
+		private Hashtable excltable = new Hashtable();
+
+		/// <summary>
+		/// Creates an analyzer that uses the built-in German stop word list.
+		/// </summary>
+		public GermanAnalyzer()
+		{
+			this.stoptable = StopFilter.MakeStopSet( this.GERMAN_STOP_WORDS );
+		}
+
+		/// <summary>
+		/// Creates an analyzer whose stop words come from a string array.
+		/// </summary>
+		/// <param name="stopwords">The stop words to use.</param>
+		public GermanAnalyzer( String[] stopwords )
+		{
+			this.stoptable = StopFilter.MakeStopSet( stopwords );
+		}
+
+		/// <summary>
+		/// Creates an analyzer that uses the given table as its stop words.
+		/// </summary>
+		/// <param name="stopwords">The stop word table; stored by reference.</param>
+		public GermanAnalyzer( Hashtable stopwords )
+		{
+			this.stoptable = stopwords;
+		}
+
+		/// <summary>
+		/// Creates an analyzer whose stop words are loaded from a file.
+		/// </summary>
+		/// <param name="stopwords">File passed to WordlistLoader.GetWordtable.</param>
+		public GermanAnalyzer( FileInfo stopwords )
+		{
+			this.stoptable = WordlistLoader.GetWordtable( stopwords );
+		}
+
+		/// <summary>
+		/// Replaces the stem exclusion table with one built from a string array.
+		/// </summary>
+		/// <param name="exclusionlist">Words that must not be stemmed.</param>
+		public void SetStemExclusionTable( String[] exclusionlist )
+		{
+			this.excltable = StopFilter.MakeStopSet( exclusionlist );
+		}
+
+		/// <summary>
+		/// Replaces the stem exclusion table with the given table.
+		/// </summary>
+		/// <param name="exclusionlist">Table of words that must not be stemmed; stored by reference.</param>
+		public void SetStemExclusionTable( Hashtable exclusionlist )
+		{
+			this.excltable = exclusionlist;
+		}
+
+		/// <summary>
+		/// Replaces the stem exclusion table with the words read from a file.
+		/// </summary>
+		/// <param name="exclusionlist">File passed to WordlistLoader.GetWordtable.</param>
+		public void SetStemExclusionTable(FileInfo exclusionlist)
+		{
+			this.excltable = WordlistLoader.GetWordtable( exclusionlist );
+		}
+
+		/// <summary>
+		/// Builds the analysis chain for the text supplied by the reader.
+		/// </summary>
+		/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
+		/// <param name="reader">Source of the text to tokenize.</param>
+		/// <returns>A StandardTokenizer wrapped, in order, by StandardFilter, LowerCaseFilter, StopFilter and GermanStemFilter</returns>
+		public override TokenStream TokenStream(String fieldName, TextReader reader)
+		{
+			TokenStream tokenized = new StandardTokenizer( reader );
+			TokenStream normalized = new LowerCaseFilter( new StandardFilter( tokenized ) );
+			TokenStream withoutStopWords = new StopFilter( normalized, stoptable );
+			return new GermanStemFilter( withoutStopWords, excltable );
+		}
+	}
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,85 @@
+using System;
+using System.IO;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+	/// <summary>
+	/// A filter that stems German words. It supports a table of words that should
+	/// not be stemmed at all. The stemmer used can be changed at runtime after the
+	/// filter object is created (as long as it is a GermanStemmer).
+	/// </summary>
+	public sealed class GermanStemFilter : TokenFilter
+	{
+		/// <summary>
+		/// The token most recently read from the input stream.
+		/// </summary>
+		private Token token = null;
+
+		/// <summary>
+		/// Stemmer applied to every token that is not excluded.
+		/// </summary>
+		private GermanStemmer stemmer = null;
+
+		/// <summary>
+		/// Terms that are passed through unstemmed; null means no exclusions.
+		/// </summary>
+		private Hashtable exclusions = null;
+    
+		/// <summary>
+		/// Builds a GermanStemFilter with a default GermanStemmer and no exclusions.
+		/// </summary>
+		/// <param name="_in">The token stream to stem.</param>
+		public GermanStemFilter( TokenStream _in ) : base(_in)
+		{
+			this.stemmer = new GermanStemmer();
+		}
+    
+		/// <summary>
+		/// Builds a GermanStemFilter with a default GermanStemmer and the given exclusions.
+		/// </summary>
+		/// <param name="_in">The token stream to stem.</param>
+		/// <param name="exclusiontable">Terms that must not be stemmed.</param>
+		public GermanStemFilter( TokenStream _in, Hashtable exclusiontable ): this(_in)
+		{
+			this.exclusions = exclusiontable;
+		}
+    
+		/// <summary>
+		/// Reads one token from the input and stems it unless it is excluded.
+		/// The input token itself is returned when it is excluded or when
+		/// stemming leaves its text unchanged; otherwise a new token with the
+		/// stemmed text and the same offsets and type is returned.
+		/// </summary>
+		/// <returns>Returns the next token in the stream, or null at EOS</returns>
+		public override Token Next()
+		{
+			token = input.Next();
+			if ( token == null )
+			{
+				return null;
+			}
+
+			String termText = token.TermText();
+
+			// Excluded terms bypass the stemmer entirely.
+			bool excluded = exclusions != null && exclusions.Contains( termText );
+			if ( excluded )
+			{
+				return token;
+			}
+
+			String stemmed = stemmer.Stem( termText );
+
+			// Only allocate a new token when stemming actually changed the text.
+			if ( stemmed.Equals( termText ) )
+			{
+				return token;
+			}
+			return new Token( stemmed, token.StartOffset(), token.EndOffset(), token.Type() );
+		}
+
+		/// <summary>
+		/// Installs a different GermanStemmer; a null argument is ignored.
+		/// </summary>
+		/// <param name="stemmer">The stemmer to use from now on.</param>
+		public void SetStemmer( GermanStemmer stemmer )
+		{
+			if ( stemmer == null )
+			{
+				return;
+			}
+			this.stemmer = stemmer;
+		}
+
+		/// <summary>
+		/// Installs a different exclusion table.
+		/// </summary>
+		/// <param name="exclusiontable">Terms that must not be stemmed; null disables exclusions.</param>
+		public void SetExclusionTable( Hashtable exclusiontable )
+		{
+			this.exclusions = exclusiontable;
+		}
+	}
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/GermanStemmer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,287 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+	/// <summary>
+	/// A stemmer for German words. The algorithm is based on the report
+	/// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
+	/// Caumanns (joerg.caumanns@isst.fhg.de).
+	/// </summary>
+	public class GermanStemmer
+	{
+		/// <summary>
+		/// Buffer for the terms while stemming them. 
+		/// </summary>
+		private StringBuilder sb = new StringBuilder();
+
+		/// <summary>
+		/// Counter maintained by <c>Substitute()</c>; <c>Strip()</c> adds it to the
+		/// buffer length when applying its minimum-length checks.
+		/// </summary>
+		private int substCount = 0;
+
+		/// <summary>
+		/// Stems the given term to a unique <c>discriminator</c>.
+		/// </summary>
+		/// <param name="term">The term that should be stemmed.</param>
+		/// <returns>
+		/// Discriminator for <c>term</c>. A term containing any non-letter is
+		/// returned lower-cased but otherwise unchanged; an empty term yields
+		/// an empty string.
+		/// </returns>
+		internal String Stem( String term )
+		{
+			// Use lowercase for medium stemming.
+			term = term.ToLower();
+			if ( !IsStemmable( term ) )
+				return term;
+			// Reset the StringBuilder.
+			sb.Length = 0;
+			sb.Append( term );
+			// Stemming starts here...
+			Substitute( sb );
+			Strip( sb );
+			Optimize( sb );
+			Resubstitute( sb );
+			RemoveParticleDenotion( sb );
+			return sb.ToString();
+		}
+
+		/// <summary>
+		/// Checks if a term could be stemmed.
+		/// </summary>
+		/// <param name="term">The (lower-cased) term to inspect.</param>
+		/// <returns>true if, and only if, every character of the term is a letter.</returns>
+		private bool IsStemmable( String term )
+		{
+			for ( int c = 0; c < term.Length; c++ ) 
+			{
+				if ( !Char.IsLetter(term[c])) return false;
+			}
+			return true;
+		}
+
+		/// <summary>
+		/// Tells whether the buffer ends with the given suffix, comparing
+		/// character by character so that no temporary string is created.
+		/// </summary>
+		private static bool EndsWith( StringBuilder buffer, String suffix )
+		{
+			int shift = buffer.Length - suffix.Length;
+			if ( shift < 0 )
+				return false;
+			for ( int i = 0; i < suffix.Length; i++ ) 
+			{
+				if ( buffer[shift + i] != suffix[i] ) return false;
+			}
+			return true;
+		}
+
+		/// <summary>
+		/// Suffix stripping (stemming) on the current term. The stripping is reduced
+		/// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
+		/// from which all regular suffixes are built. The simplification causes
+		/// some overstemming, and way more irregular stems, but still provides unique
+		/// discriminators in most of those cases.
+		/// The algorithm is context free, except for the length restrictions.
+		/// </summary>
+		/// <param name="buffer">The term being stemmed; modified in place.</param>
+		private void Strip( StringBuilder buffer )
+		{
+			bool doMore = true;
+			while ( doMore && buffer.Length > 3 ) 
+			{
+				int weightedLength = buffer.Length + substCount;
+
+				if ( weightedLength > 5 && EndsWith( buffer, "nd" ) )
+				{
+					buffer.Remove( buffer.Length - 2, 2 );
+				}
+				else if ( weightedLength > 4 &&
+					( EndsWith( buffer, "em" ) || EndsWith( buffer, "er" ) ) ) 
+				{
+					buffer.Remove( buffer.Length - 2, 2 );
+				}
+				else 
+				{
+					switch ( buffer[buffer.Length - 1] ) 
+					{
+						case 'e':
+						case 's':
+						case 'n':
+						case 't':	// "t" occurs only as suffix of verbs.
+							buffer.Remove( buffer.Length - 1, 1 );
+							break;
+						default:
+							doMore = false;
+							break;
+					}
+				}
+			}
+		}
+
+		/// <summary>
+		/// Does some optimizations on the term. These optimisations are contextual.
+		/// </summary>
+		/// <param name="buffer">The term being stemmed; modified in place.</param>
+		private void Optimize( StringBuilder buffer )
+		{
+			// Additional step for female plurals of professions and inhabitants.
+			if ( buffer.Length > 5 && EndsWith( buffer, "erin*" ) ) 
+			{
+				buffer.Remove(buffer.Length - 1, 1);
+				Strip(buffer);
+			}
+			// Additional step for irregular plural nouns like "Matrizen -> Matrix".
+			// The length check keeps an empty term from indexing position -1.
+			if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' ) 
+			{
+				buffer[buffer.Length - 1] = 'x';
+			}
+		}
+
+		/// <summary>
+		/// Removes a particle denotion ("ge") from a term: the first
+		/// occurrence of "gege" is shortened to "ge".
+		/// </summary>
+		/// <param name="buffer">The term being stemmed; modified in place.</param>
+		private void RemoveParticleDenotion( StringBuilder buffer )
+		{
+			if ( buffer.Length > 4 ) 
+			{
+				for ( int c = 0; c < buffer.Length - 3; c++ ) 
+				{
+					if ( buffer[c] == 'g' && buffer[c + 1] == 'e' &&
+						buffer[c + 2] == 'g' && buffer[c + 3] == 'e' ) 
+					{
+						buffer.Remove(c, 2);
+						return;
+					}
+				}
+			}
+		}
+
+		/// <summary>
+		/// Do some substitutions for the term to reduce overstemming:
+		///
+		/// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+		///   "ß" is substituted by "ss"
+		/// - Substitute a second char of a pair of equal characters with
+		///   an asterisk: ?? -> ?*
+		/// - Substitute some common character combinations with a token:
+		///   sch -> $, ch -> §, ei -> %, ie -> &amp;, ig -> #, st -> !
+		/// </summary>
+		/// <param name="buffer">The term being stemmed; modified in place.</param>
+		private void Substitute( StringBuilder buffer )
+		{
+			substCount = 0;
+			for ( int c = 0; c < buffer.Length; c++ ) 
+			{
+				// Replace the second char of a pair of the equal characters with an asterisk
+				if ( c > 0 && buffer[c] == buffer[c - 1]) 
+				{
+					buffer[c] = '*';
+				}
+					// Substitute Umlauts.
+				else if ( buffer[c] == 'ä' ) 
+				{
+					buffer[c] = 'a';
+				}
+				else if ( buffer[c] == 'ö' ) 
+				{
+					buffer[c] = 'o';
+				}
+				else if ( buffer[c] == 'ü' ) 
+				{
+					buffer[c] = 'u';
+				}
+				// Fix bug so that 'ß' at the end of a word is replaced.
+				else if ( buffer[c] == 'ß' ) 
+				{
+					buffer[c] = 's';
+					buffer.Insert(c + 1, 's');
+					substCount++;
+				}
+				// Take care that at least one character follows the current one
+				if ( c < buffer.Length - 1 ) 
+				{
+					// Masking several common character combinations with a token
+					if ( ( c < buffer.Length - 2 ) && buffer[c] == 's' &&
+						buffer[c + 1] == 'c' && buffer[c + 2] == 'h' )
+					{
+						buffer[c] = '$';
+						buffer.Remove(c + 1, 2);
+						// Was "=+ 2", which assigned 2 and discarded earlier counts.
+						substCount += 2;
+					}
+					else if ( buffer[c] == 'c' && buffer[c + 1] == 'h' ) 
+					{
+						buffer[c] = '§';
+						buffer.Remove(c + 1, 1);
+						substCount++;
+					}
+					else if ( buffer[c] == 'e' && buffer[c + 1] == 'i' ) 
+					{
+						buffer[c] = '%';
+						buffer.Remove(c + 1, 1);
+						substCount++;
+					}
+					else if ( buffer[c] == 'i' && buffer[c + 1] == 'e' ) 
+					{
+						buffer[c] = '&';
+						buffer.Remove(c + 1, 1);
+						substCount++;
+					}
+					else if ( buffer[c] == 'i' && buffer[c + 1] == 'g' ) 
+					{
+						buffer[c] = '#';
+						buffer.Remove(c + 1, 1);
+						substCount++;
+					}
+					else if ( buffer[c] == 's' && buffer[c + 1] == 't' ) 
+					{
+						buffer[c] = '!';
+						buffer.Remove(c + 1, 1);
+						substCount++;
+					}
+				}
+			}
+		}
+
+		/// <summary>
+		/// Undoes the changes made by Substitute(). That are character pairs and
+		/// character combinations. Umlauts will remain as their corresponding vowel,
+		/// as "ß" remains as "ss".
+		/// </summary>
+		/// <param name="buffer">The term being stemmed; modified in place.</param>
+		private void Resubstitute( StringBuilder buffer )
+		{
+			for ( int c = 0; c < buffer.Length; c++ ) 
+			{
+				switch ( buffer[c] ) 
+				{
+					case '*':
+						// Restore the doubled character from its predecessor.
+						buffer[c] = buffer[c - 1];
+						break;
+					case '$':
+						buffer[c] = 's';
+						buffer.Insert( c + 1, "ch" );
+						break;
+					case '§':
+						buffer[c] = 'c';
+						buffer.Insert( c + 1, 'h' );
+						break;
+					case '%':
+						buffer[c] = 'e';
+						buffer.Insert( c + 1, 'i' );
+						break;
+					case '&':
+						buffer[c] = 'i';
+						buffer.Insert( c + 1, 'e' );
+						break;
+					case '#':
+						buffer[c] = 'i';
+						buffer.Insert( c + 1, 'g' );
+						break;
+					case '!':
+						buffer[c] = 's';
+						buffer.Insert( c + 1, 't' );
+						break;
+				}
+			}
+		}
+	}
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/De/WordlistLoader.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,96 @@
+using System;
+using System.IO;
+using System.Collections;
+
+namespace Lucene.Net.Analysis.De
+{
+	/// <summary>
+	/// Loads a text file and adds every line as an entry to a Hashtable. Every line
+	/// should contain only one word. If the file is not found or on any error, an
+	/// empty table is returned.
+	/// </summary>
+	public class WordlistLoader
+	{
+		/// <summary>
+		/// Loads a wordlist given its directory and file name.
+		/// </summary>
+		/// <param name="path">Path to the wordlist</param>
+		/// <param name="wordfile">Name of the wordlist</param>
+		/// <returns>
+		/// A table with one entry per distinct line (key and value are both the
+		/// word); an empty table if either argument is null or the file cannot be read.
+		/// </returns>
+		public static Hashtable GetWordtable( String path, String wordfile ) 
+		{
+			if ( path == null || wordfile == null ) 
+			{
+				return new Hashtable();
+			}
+			// Path.Combine picks the platform's directory separator instead of a hard-coded "\".
+			return GetWordtable( new FileInfo( Path.Combine( path, wordfile ) ) );
+		}
+
+		/// <summary>
+		/// Loads a wordlist given its complete path.
+		/// </summary>
+		/// <param name="wordfile">Complete path to the wordlist</param>
+		/// <returns>
+		/// A table with one entry per distinct line; an empty table if the
+		/// argument is null or the file cannot be read.
+		/// </returns>
+		public static Hashtable GetWordtable( String wordfile ) 
+		{
+			if ( wordfile == null ) 
+			{
+				return new Hashtable();
+			}
+			return GetWordtable( new FileInfo( wordfile ) );
+		}
+
+		/// <summary>
+		/// Loads a wordlist from the given file, one word per line.
+		/// </summary>
+		/// <param name="wordfile">File containing the wordlist</param>
+		/// <returns>
+		/// A table with one entry per distinct line (key and value are both the
+		/// word); an empty table if the argument is null or an IOException
+		/// occurs while opening or reading the file.
+		/// </returns>
+		public static Hashtable GetWordtable( FileInfo wordfile ) 
+		{
+			if ( wordfile == null ) 
+			{
+				return new Hashtable();
+			}
+
+			Hashtable result = new Hashtable();
+			try 
+			{
+				// "using" releases the file handle even when reading fails.
+				using ( StreamReader lnr = new StreamReader( wordfile.FullName ) ) 
+				{
+					String word;
+					while ( ( word = lnr.ReadLine() ) != null ) 
+					{
+						// The indexer overwrites an existing key, so a word that
+						// appears on several lines does not raise an exception.
+						result[word] = word;
+					}
+				}
+			}
+			catch ( IOException ) 
+			{
+				// On error, use an empty table (drop anything read so far)
+				result = new Hashtable();
+			}
+			return result;
+		}
+	}
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchAnalyzer.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,197 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.De;
+using Lucene.Net.Analysis.Standard;
+
+namespace Lucene.Net.Analysis.Fr
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Analyzer for french language. Supports an external list of stopwords (words that
+	/// will not be indexed at all) and an external list of exclusions (words that will
+	/// not be stemmed, but indexed).
+	/// A default set of stopwords is used unless another list is specified, the
+	/// exclusionlist is empty by default.
+	/// 
+	/// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
+	/// <version>$Id: FrenchAnalyzer.java,v 1.9 2004/10/17 11:41:40 dnaber Exp $</version>
+	/// </summary>
+	public sealed class FrenchAnalyzer : Analyzer 
+	{
+
+		/// <summary>
+		/// Extended list of typical french stopwords.
+		/// </summary>
+		public static String[] FRENCH_STOP_WORDS = 
+				 {
+					 "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
+					 "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
+					 "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
+					 "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
+					 "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
+					 "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
+					 "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
+					 "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
+					 "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
+					 "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
+					 "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
+					 "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
+					 "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
+					 "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
+					 "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
+					 "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
+					 "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
+					 "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
+					 "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
+					 "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
+					 "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
+					 "été", "être", "ô"
+				 };
+
+		/// <summary>
+		/// Contains the stopwords used with the StopFilter.
+		/// </summary>
+		private Hashtable stoptable = new Hashtable();
+
+		/// <summary>
+		/// Contains words that should be indexed but not stemmed.
+		/// </summary>
+		private Hashtable excltable = new Hashtable();
+
+		/// <summary>
+		/// Builds an analyzer that uses the default <see cref="FRENCH_STOP_WORDS"/>.
+		/// </summary>
+		public FrenchAnalyzer() 
+		{
+			stoptable = StopFilter.MakeStopSet( FRENCH_STOP_WORDS );
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the given stop words.
+		/// </summary>
+		/// <param name="stopwords">Array of stop words, converted with StopFilter.MakeStopSet</param>
+		public FrenchAnalyzer( String[] stopwords ) 
+		{
+			stoptable = StopFilter.MakeStopSet( stopwords );
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the given stop words.
+		/// </summary>
+		/// <param name="stopwords">Table of stop words; stored as-is, not copied</param>
+		public FrenchAnalyzer( Hashtable stopwords ) 
+		{
+			stoptable = stopwords;
+		}
+
+		/// <summary>
+		/// Builds an analyzer with the given stop words.
+		/// </summary>
+		/// <param name="stopwords">File handed to WordlistLoader.GetWordtable to obtain the stop words</param>
+		public FrenchAnalyzer( FileInfo stopwords ) 
+		{
+			stoptable = WordlistLoader.GetWordtable( stopwords );
+		}
+
+		/// <summary>
+		/// Builds an exclusionlist from an array of Strings.
+		/// </summary>
+		/// <param name="exclusionlist">Words that must not be stemmed</param>
+		public void SetStemExclusionTable( String[] exclusionlist ) 
+		{
+			excltable = StopFilter.MakeStopSet( exclusionlist );
+		}
+
+		/// <summary>
+		/// Builds an exclusionlist from a Hashtable.
+		/// </summary>
+		/// <param name="exclusionlist">Table of words that must not be stemmed; stored as-is, not copied</param>
+		public void SetStemExclusionTable( Hashtable exclusionlist ) 
+		{
+			excltable = exclusionlist;
+		}
+
+		/// <summary>
+		/// Builds an exclusionlist from the words contained in the given file.
+		/// </summary>
+		/// <param name="exclusionlist">File handed to WordlistLoader.GetWordtable to obtain the exclusions</param>
+		public void SetStemExclusionTable( FileInfo exclusionlist ) 
+		{
+			excltable = WordlistLoader.GetWordtable( exclusionlist );
+		}
+
+		/// <summary>
+		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
+		/// </summary>
+		/// <param name="fieldName">Name of the field being analyzed; must not be null</param>
+		/// <param name="reader">Source of the text to tokenize; must not be null</param>
+		/// <returns>
+		/// A TokenStream build from a StandardTokenizer filtered with
+		/// 	StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
+		/// </returns>
+		/// <exception cref="ArgumentException">If fieldName or reader is null</exception>
+		public override TokenStream TokenStream( String fieldName, TextReader reader ) 
+		{
+			if ( fieldName == null ) 
+			{
+				throw new ArgumentException( "fieldName must not be null" );
+			}
+			if ( reader == null ) 
+			{
+				throw new ArgumentException( "reader must not be null" );
+			}
+
+			// NOTE(review): stop words are removed and stemming is applied before
+			// lower-casing, so capitalised input may be treated differently from its
+			// lower-case form depending on how StopFilter and FrenchStemmer compare
+			// terms - confirm this ordering is intended before changing it, since
+			// reordering would alter what existing indexes contain.
+			TokenStream result = new StandardTokenizer( reader );
+			result = new StandardFilter( result );
+			result = new StopFilter( result, stoptable );
+			result = new FrenchStemFilter( result, excltable );
+			// Convert to lowercase after stemming!
+			result = new LowerCaseFilter( result );
+			return result;
+		}
+	}
+
+}

Added: incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs?rev=1083372&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Analyzers/Lucene.Net.Analyzers/Fr/FrenchStemFilter.cs Sun Mar 20 07:30:37 2011
@@ -0,0 +1,143 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Fr
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// A filter that stems french words. It supports a table of words that should
+	/// not be stemmed at all. The used stemmer can be changed at runtime after the
+	/// filter object is created (as long as it is a FrenchStemmer).
+	/// 
+	/// <author>Patrick Talbot (based on Gerhard Schwarz work for German)</author>
+	/// <version>$Id: FrenchAnalyzer.java,v 1.2 2004/01/23 20:54:47 ehatcher Exp $</version>
+	/// </summary>
+	public sealed class FrenchStemFilter : TokenFilter 
+	{
+
+		/// <summary>
+		/// The actual token in the input stream.
+		/// </summary>
+		private Token token = null;
+
+		/// <summary>
+		/// Stemmer applied to every token that is not excluded.
+		/// </summary>
+		private FrenchStemmer stemmer = null;
+
+		/// <summary>
+		/// Words that are passed through unstemmed; null means no exclusions.
+		/// </summary>
+		private Hashtable exclusions = null;
+
+		/// <summary>
+		/// Builds a FrenchStemFilter with a default FrenchStemmer and no exclusions.
+		/// </summary>
+		/// <param name="_in">Token stream to read from</param>
+		public FrenchStemFilter( TokenStream _in ) : base(_in)
+		{
+			stemmer = new FrenchStemmer();
+		}
+
+		/// <summary>
+		/// Builds a FrenchStemFilter that uses an exclusiontable.
+		/// </summary>
+		/// <param name="_in">Token stream to read from</param>
+		/// <param name="exclusiontable">Words that must not be stemmed</param>
+		public FrenchStemFilter( TokenStream _in, Hashtable exclusiontable ) : 	this( _in )
+		{
+			exclusions = exclusiontable;
+		}
+
+		/// <summary>
+		/// Returns the next token in the stream, or null at EOS
+		/// </summary>
+		/// <returns>
+		/// The input token itself when it is excluded or unchanged by stemming,
+		/// a new token carrying the stemmed text otherwise, or null at EOS
+		/// </returns>
+		public override Token Next()
+		{
+			if ( ( token = input.Next() ) == null ) 
+			{
+				return null;
+			}
+
+			String term = token.TermText();
+
+			// Check the exclusiontable
+			if ( exclusions != null && exclusions.Contains( term ) ) 
+			{
+				return token;
+			}
+
+			String s = stemmer.Stem( term );
+			// If not stemmed, dont waste the time creating a new token
+			if ( !s.Equals( term ) ) 
+			{
+				// Keep the offsets of the original token so the stemmed token still
+				// points at the text it came from instead of at 0..s.Length.
+				return new Token( s, token.StartOffset(), token.EndOffset(), token.Type() );
+			}
+			return token;
+		}
+
+		/// <summary>
+		/// Set an alternative/custom FrenchStemmer for this filter.
+		/// A null argument is ignored and the current stemmer is kept.
+		/// </summary>
+		/// <param name="stemmer">Stemmer to use from now on</param>
+		public void SetStemmer( FrenchStemmer stemmer ) 
+		{
+			if ( stemmer != null ) 
+			{
+				this.stemmer = stemmer;
+			}
+		}
+
+		/// <summary>
+		/// Set an alternative exclusion list for this filter.
+		/// </summary>
+		/// <param name="exclusiontable">Words that must not be stemmed; null disables the exclusion check</param>
+		public void SetExclusionTable( Hashtable exclusiontable ) 
+		{
+			exclusions = exclusiontable;
+		}
+	}
+}



Mime
View raw message