lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pnas...@apache.org
Subject [Lucene.Net] svn commit: r1230919 [4/22] - in /incubator/lucene.net/branches/Lucene.Net_2_9_4g: ./ build/scripts/ build/vs2010/contrib/ build/vs2010/core/ build/vs2010/demo/ build/vs2010/test/ src/contrib/Analyzers/ src/contrib/Analyzers/CJK/ src/contrib/Analyzers/...
Date Fri, 13 Jan 2012 08:42:38 GMT
Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Contrib.SpellChecker.Test.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Contrib.SpellChecker.Test.sln?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Contrib.SpellChecker.Test.sln (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Contrib.SpellChecker.Test.sln Fri Jan 13 08:42:34 2012
@@ -1,32 +1,52 @@
-
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual C# Express 2010
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\..\src\core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
-EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Contrib.SpellChecker", "..\..\..\src\contrib\SpellChecker\Contrib.SpellChecker.csproj", "{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}"
-EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Contrib.SpellChecker.Test", "..\..\..\test\contrib\SpellChecker\Contrib.SpellChecker.Test.csproj", "{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
-		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.Build.0 = Release|Any CPU
-		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual C# Express 2010
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\..\src\core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Contrib.SpellChecker", "..\..\..\src\contrib\SpellChecker\Contrib.SpellChecker.csproj", "{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Contrib.SpellChecker.Test", "..\..\..\test\contrib\SpellChecker\Contrib.SpellChecker.Test.csproj", "{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
+		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.Build.0 = Release|Any CPU
+		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Lucene.Net.Test.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Lucene.Net.Test.sln?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Lucene.Net.Test.sln (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/build/vs2010/test/Lucene.Net.Test.sln Fri Jan 13 08:42:34 2012
@@ -1,32 +1,52 @@
-
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual C# Express 2010
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\..\src\core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
-EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Test", "..\..\..\test\core\Lucene.Net.Test.csproj", "{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}"
-EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo.Common", "..\..\..\src\demo\Demo.Common\Demo.Common.csproj", "{F04CA2F4-E182-46A8-B914-F46AF5319E83}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
-		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.Build.0 = Release|Any CPU
-		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual C# Express 2010
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net", "..\..\..\src\core\Lucene.Net.csproj", "{5D4AD9BE-1FFB-41AB-9943-25737971BF57}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Test", "..\..\..\test\core\Lucene.Net.Test.csproj", "{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo.Common", "..\..\..\src\demo\Demo.Common\Demo.Common.csproj", "{F04CA2F4-E182-46A8-B914-F46AF5319E83}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{5D4AD9BE-1FFB-41AB-9943-25737971BF57}.Release|Any CPU.Build.0 = Release|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{AAF68BCF-F781-45FC-98B3-2B9CEE411E01}.Release|Any CPU.Build.0 = Release|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F04CA2F4-E182-46A8-B914-F46AF5319E83}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKAnalyzer.cs Fri Jan 13 08:42:34 2012
@@ -1,129 +1,150 @@
-using System;
-using System.IO;
-using System.Collections;
-using System.Collections.Generic;
-using Lucene.Net.Analysis;
-
-namespace Lucene.Net.Analysis.CJK
-{
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 *
-	 * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
-	 */
-
-	/// <summary>
-	/// Filters CJKTokenizer with StopFilter.
-	/// 
-	/// <author>Che, Dong</author>
-	/// </summary>
-	public class CJKAnalyzer : Analyzer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// An array containing some common English words that are not usually
-		/// useful for searching. and some double-byte interpunctions.....
-		/// </summary>
-		public static String[] stopWords = 
-		{
-			"a", "and", "are", "as", "at", "be",
-			"but", "by", "for", "if", "in",
-			"into", "is", "it", "no", "not",
-			"of", "on", "or", "s", "such", "t",
-			"that", "the", "their", "then",
-			"there", "these", "they", "this",
-			"to", "was", "will", "with", "",
-			"www"
-		};
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// stop word list
-		/// </summary>
-        private ICollection<string> stopTable;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Builds an analyzer which removes words in STOP_WORDS.
-		/// </summary>
-		public CJKAnalyzer() 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		/// <summary>
-		/// Builds an analyzer which removes words in the provided array.
-		/// </summary>
-		/// <param name="stopWords">stop word array</param>
-		public CJKAnalyzer(String[] stopWords) 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		/// get token stream from input
-		/// </summary>
-		/// <param name="fieldName">lucene field name</param>
-		/// <param name="reader">input reader</param>
-		/// <returns>Token Stream</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			return new StopFilter(new CJKTokenizer(reader), stopTable);
-		}
-	}
-}
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.IO;
+using System.Collections;
+using System.Collections.Generic;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 *
+	 * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
+	 */
+
+	/// <summary>
+	/// Filters CJKTokenizer with StopFilter.
+	/// 
+	/// <author>Che, Dong</author>
+	/// </summary>
+	public class CJKAnalyzer : Analyzer 
+	{
+		//~ Static fields/initializers ---------------------------------------------
+
+		/// <summary>
+		/// An array containing some common English words that are not usually
+		/// useful for searching. and some double-byte interpunctions.....
+		/// </summary>
+		public static String[] stopWords = 
+		{
+			"a", "and", "are", "as", "at", "be",
+			"but", "by", "for", "if", "in",
+			"into", "is", "it", "no", "not",
+			"of", "on", "or", "s", "such", "t",
+			"that", "the", "their", "then",
+			"there", "these", "they", "this",
+			"to", "was", "will", "with", "",
+			"www"
+		};
+
+		//~ Instance fields --------------------------------------------------------
+
+		/// <summary>
+		/// stop word list
+		/// </summary>
+        private ICollection<string> stopTable;
+
+		//~ Constructors -----------------------------------------------------------
+
+		/// <summary>
+		/// Builds an analyzer which removes words in STOP_WORDS.
+		/// </summary>
+		public CJKAnalyzer() 
+		{
+			stopTable = StopFilter.MakeStopSet(stopWords);
+		}
+
+		/// <summary>
+		/// Builds an analyzer which removes words in the provided array.
+		/// </summary>
+		/// <param name="stopWords">stop word array</param>
+		public CJKAnalyzer(String[] stopWords) 
+		{
+			stopTable = StopFilter.MakeStopSet(stopWords);
+		}
+
+		//~ Methods ----------------------------------------------------------------
+
+		/// <summary>
+		/// get token stream from input
+		/// </summary>
+		/// <param name="fieldName">lucene field name</param>
+		/// <param name="reader">input reader</param>
+		/// <returns>Token Stream</returns>
+		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
+		{
+			return new StopFilter(new CJKTokenizer(reader), stopTable);
+		}
+	}
+}

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKTokenizer.cs?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/CJK/CJKTokenizer.cs Fri Jan 13 08:42:34 2012
@@ -1,329 +1,350 @@
-using System;
-using System.IO;
-using System.Text;
-using Lucene.Net.Analysis;
-
-namespace Lucene.Net.Analysis.CJK
-{
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// <p>
-	/// CJKTokenizer was modified from StopTokenizer which does a decent job for
-	/// most European languages. and it perferm other token method for double-byte
-	/// Characters: the token will return at each two charactors with overlap match.<br>
-	/// Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
-	/// also need filter filter zero length token ""<br>
-	/// for Digit: digit, '+', '#' will token as letter<br>
-	/// for more info on Asia language(Chinese Japanese Korean) text segmentation:
-	/// please search  <a
-	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
-	/// </p>
-	/// 
-	/// @author Che, Dong
-	/// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
-	/// </summary>
-	public sealed class CJKTokenizer : Tokenizer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// Max word length
-		/// </summary>
-		private static int MAX_WORD_LEN = 255;
-
-		/// <summary>
-		/// buffer size
-		/// </summary>
-		private static int IO_BUFFER_SIZE = 256;
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// word offset, used to imply which character(in ) is parsed
-		/// </summary>
-		private int offset = 0;
-
-		/// <summary>
-		/// the index used only for ioBuffer
-		/// </summary>
-		private int bufferIndex = 0;
-
-		/// <summary>
-		/// data length
-		/// </summary>
-		private int dataLen = 0;
-
-		/// <summary>
-		/// character buffer, store the characters which are used to compose <br>
-		/// the returned Token
-		/// </summary>
-		private char[] buffer = new char[MAX_WORD_LEN];
-
-		/// <summary>
-		/// I/O buffer, used to store the content of the input(one of the <br>
-		/// members of Tokenizer)
-		/// </summary>
-		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-		/// <summary>
-		/// word type: single=>ASCII  double=>non-ASCII word=>default 
-		/// </summary>
-		private String tokenType = "word";
-
-		/// <summary>
-		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
-		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
-		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
-		/// </summary>
-		private bool preIsTokened = false;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Construct a token stream processing the given input.
-		/// </summary>
-		/// <param name="_in">I/O reader</param>
-		public CJKTokenizer(TextReader _in) 
-		{
-			input = _in;
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		///  Returns the next token in the stream, or null at EOS.
-		/// </summary>
-		/// <returns>Token</returns>
-		public override Token Next()
-		{
-			/** how many character(s) has been stored in buffer */
-			int length = 0;
-
-			/** the position used to create Token */
-			int start = offset;
-
-			while (true) 
-			{
-				/** current charactor */
-				char c;
-
-				/** unicode block of current charactor for detail */
-				//Character.UnicodeBlock ub;
-
-				offset++;
-
-				if (bufferIndex >= dataLen) 
-				{
-					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
-					bufferIndex = 0;
-				}
-
-				if (dataLen == 0) 
-				{
-					if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						}
-
-						break;
-					} 
-					else 
-					{
-						return null;
-					}
-				} 
-				else 
-				{
-					//get current character
-					c = ioBuffer[bufferIndex++];
-
-					//get the UnicodeBlock of the current character
-					//ub = Character.UnicodeBlock.of(c);
-				}
-
-				//if the current character is ASCII or Extend ASCII
-				if (('\u0000' <= c && c <= '\u007F') || 
-					('\uFF00' <= c && c <= '\uFFEF')) 
-				{
-					if ('\uFF00' <= c && c <= '\uFFEF')
-					{
-						/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
-						int i = (int) c;
-						i = i - 65248;
-						c = (char) i;
-					}
-
-					// if the current character is a letter or "_" "+" "#"
-					if (Char.IsLetterOrDigit(c)
-						|| ((c == '_') || (c == '+') || (c == '#'))
-						) 
-					{
-						if (length == 0) 
-						{
-							// "javaC1C2C3C4linux" <br>
-							//      ^--: the current character begin to token the ASCII
-							// letter
-							start = offset - 1;
-						} 
-						else if (tokenType == "double") 
-						{
-							// "javaC1C2C3C4linux" <br>
-							//              ^--: the previous non-ASCII
-							// : the current character
-							offset--;
-							bufferIndex--;
-							tokenType = "single";
-
-							if (preIsTokened == true) 
-							{
-								// there is only one non-ASCII has been stored
-								length = 0;
-								preIsTokened = false;
-
-								break;
-							} 
-							else 
-							{
-								break;
-							}
-						}
-
-						// store the LowerCase(c) in the buffer
-						buffer[length++] = Char.ToLower(c);
-						tokenType = "single";
-
-						// break the procedure if buffer overflowed!
-						if (length == MAX_WORD_LEN) 
-						{
-							break;
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				} 
-				else 
-				{
-					// non-ASCII letter, eg."C1C2C3C4"
-					if (Char.IsLetter(c)) 
-					{
-						if (length == 0) 
-						{
-							start = offset - 1;
-							buffer[length++] = c;
-							tokenType = "double";
-						} 
-						else 
-						{
-							if (tokenType == "single") 
-							{
-								offset--;
-								bufferIndex--;
-
-								//return the previous ASCII characters
-								break;
-							} 
-							else 
-							{
-								buffer[length++] = c;
-								tokenType = "double";
-
-								if (length == 2) 
-								{
-									offset--;
-									bufferIndex--;
-									preIsTokened = true;
-
-									break;
-								}
-							}
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							// empty the buffer
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				}
-			}
-
-			return new Token(new String(buffer, 0, length), start, start + length,
-				tokenType
-				);
-		}
-	}
-
-}
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.CJK
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// <p>
+	/// CJKTokenizer was modified from StopTokenizer, which does a decent job for
+	/// most European languages. It uses a different tokenization method for double-byte
+	/// characters: a token is returned for every two characters, with overlapping matches.<br>
+	/// Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". Callers
+	/// also need to filter out zero-length tokens ("").<br>
+	/// Digits, '+' and '#' are tokenized as letters.<br>
+	/// for more info on Asia language(Chinese Japanese Korean) text segmentation:
+	/// please search  <a
+	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+	/// </p>
+	/// 
+	/// @author Che, Dong
+	/// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+	/// </summary>
+	public sealed class CJKTokenizer : Tokenizer 
+	{
+		//~ Static fields/initializers ---------------------------------------------
+
+		/// <summary>
+		/// Max word length
+		/// </summary>
+		private static int MAX_WORD_LEN = 255;
+
+		/// <summary>
+		/// buffer size
+		/// </summary>
+		private static int IO_BUFFER_SIZE = 256;
+
+		//~ Instance fields --------------------------------------------------------
+
+		/// <summary>
+		/// word offset, used to indicate which character of the input is being parsed
+		/// </summary>
+		private int offset = 0;
+
+		/// <summary>
+		/// the index used only for ioBuffer
+		/// </summary>
+		private int bufferIndex = 0;
+
+		/// <summary>
+		/// data length
+		/// </summary>
+		private int dataLen = 0;
+
+		/// <summary>
+		/// character buffer, store the characters which are used to compose <br>
+		/// the returned Token
+		/// </summary>
+		private char[] buffer = new char[MAX_WORD_LEN];
+
+		/// <summary>
+		/// I/O buffer, used to store the content of the input(one of the <br>
+		/// members of Tokenizer)
+		/// </summary>
+		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+		/// <summary>
+		/// word type: single=>ASCII  double=>non-ASCII word=>default 
+		/// </summary>
+		private String tokenType = "word";
+
+		/// <summary>
+		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
+		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+		/// </summary>
+		private bool preIsTokened = false;
+
+		//~ Constructors -----------------------------------------------------------
+
+		/// <summary>
+		/// Construct a token stream processing the given input.
+		/// </summary>
+		/// <param name="_in">I/O reader</param>
+		public CJKTokenizer(TextReader _in) 
+		{
+			input = _in;
+		}
+
+		//~ Methods ----------------------------------------------------------------
+
+		/// <summary>
+		///  Returns the next token in the stream, or null at EOS.
+		/// </summary>
+		/// <returns>Token</returns>
+		public override Token Next()
+		{
+			/** how many character(s) has been stored in buffer */
+			int length = 0;
+
+			/** the position used to create Token */
+			int start = offset;
+
+			while (true) 
+			{
+				/** current character */
+				char c;
+
+				/** Unicode block of the current character (lookup is currently commented out) */
+				//Character.UnicodeBlock ub;
+
+				offset++;
+
+				if (bufferIndex >= dataLen) 
+				{
+					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+					bufferIndex = 0;
+				}
+
+				if (dataLen == 0) 
+				{
+					if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							length = 0;
+							preIsTokened = false;
+						}
+
+						break;
+					} 
+					else 
+					{
+						return null;
+					}
+				} 
+				else 
+				{
+					//get current character
+					c = ioBuffer[bufferIndex++];
+
+					//get the UnicodeBlock of the current character
+					//ub = Character.UnicodeBlock.of(c);
+				}
+
+				//if the current character is ASCII or Extend ASCII
+				if (('\u0000' <= c && c <= '\u007F') || 
+					('\uFF00' <= c && c <= '\uFFEF')) 
+				{
+					if ('\uFF00' <= c && c <= '\uFFEF')
+					{
+						/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+						int i = (int) c;
+						i = i - 65248;
+						c = (char) i;
+					}
+
+					// if the current character is a letter or "_" "+" "#"
+					if (Char.IsLetterOrDigit(c)
+						|| ((c == '_') || (c == '+') || (c == '#'))
+						) 
+					{
+						if (length == 0) 
+						{
+							// "javaC1C2C3C4linux" <br>
+							//      ^--: the current character begin to token the ASCII
+							// letter
+							start = offset - 1;
+						} 
+						else if (tokenType == "double") 
+						{
+							// "javaC1C2C3C4linux" <br>
+							//              ^--: the previous non-ASCII
+							// : the current character
+							offset--;
+							bufferIndex--;
+							tokenType = "single";
+
+							if (preIsTokened == true) 
+							{
+								// only one non-ASCII character has been stored
+								length = 0;
+								preIsTokened = false;
+
+								break;
+							} 
+							else 
+							{
+								break;
+							}
+						}
+
+						// store the LowerCase(c) in the buffer
+						buffer[length++] = Char.ToLower(c);
+						tokenType = "single";
+
+						// break the procedure if buffer overflowed!
+						if (length == MAX_WORD_LEN) 
+						{
+							break;
+						}
+					} 
+					else if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							length = 0;
+							preIsTokened = false;
+						} 
+						else 
+						{
+							break;
+						}
+					}
+				} 
+				else 
+				{
+					// non-ASCII letter, eg."C1C2C3C4"
+					if (Char.IsLetter(c)) 
+					{
+						if (length == 0) 
+						{
+							start = offset - 1;
+							buffer[length++] = c;
+							tokenType = "double";
+						} 
+						else 
+						{
+							if (tokenType == "single") 
+							{
+								offset--;
+								bufferIndex--;
+
+								//return the previous ASCII characters
+								break;
+							} 
+							else 
+							{
+								buffer[length++] = c;
+								tokenType = "double";
+
+								if (length == 2) 
+								{
+									offset--;
+									bufferIndex--;
+									preIsTokened = true;
+
+									break;
+								}
+							}
+						}
+					} 
+					else if (length > 0) 
+					{
+						if (preIsTokened == true) 
+						{
+							// empty the buffer
+							length = 0;
+							preIsTokened = false;
+						} 
+						else 
+						{
+							break;
+						}
+					}
+				}
+			}
+
+			return new Token(new String(buffer, 0, length), start, start + length,
+				tokenType
+				);
+		}
+	}
+
+}

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs Fri Jan 13 08:42:34 2012
@@ -1,92 +1,113 @@
-using System;
-using System.IO;
-using System.Text;
-using System.Collections;
-
-using Lucene.Net.Analysis;
-
-namespace Lucene.Net.Analysis.Cn
-{
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Title: ChineseAnalyzer
-	/// Description:
-	///   Subclass of org.apache.lucene.analysis.Analyzer
-	///   build from a ChineseTokenizer, filtered with ChineseFilter.
-	/// Copyright:   Copyright (c) 2001
-	/// Company:
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
-	/// </summary>
-	public class ChineseAnalyzer : Analyzer 
-	{
-
-		public ChineseAnalyzer() 
-		{
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
-		/// </summary>
-		/// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			TokenStream result = new ChineseTokenizer(reader);
-			result = new ChineseFilter(result);
-			return result;
-		}
-	}
-}
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseAnalyzer
+	/// Description:
+	///   Subclass of org.apache.lucene.analysis.Analyzer,
+	///   built from a ChineseTokenizer and filtered with ChineseFilter.
+	/// Copyright:   Copyright (c) 2001
+	/// Company:
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
+	/// </summary>
+	public class ChineseAnalyzer : Analyzer 
+	{
+
+		public ChineseAnalyzer() 
+		{
+		}
+
+		/// <summary>
+		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
+		/// </summary>
+		/// <returns>A TokenStream built from a ChineseTokenizer filtered with ChineseFilter.</returns>
+		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
+		{
+			TokenStream result = new ChineseTokenizer(reader);
+			result = new ChineseFilter(result);
+			return result;
+		}
+	}
+}

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseFilter.cs?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseFilter.cs Fri Jan 13 08:42:34 2012
@@ -1,138 +1,159 @@
-using System;
-using System.IO;
-using System.Collections;
-using System.Globalization;
-
-using Lucene.Net.Analysis;
-
-namespace Lucene.Net.Analysis.Cn
-{
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Title: ChineseFilter
-	/// Description: Filter with a stop word table
-	///              Rule: No digital is allowed.
-	///                    English word/token should larger than 1 character.
-	///                    One Chinese character as one Chinese word.
-	/// TO DO:
-	///   1. Add Chinese stop words, such as \ue400
-	///   2. Dictionary based Chinese word extraction
-	///   3. Intelligent Chinese word extraction
-	/// 
-	/// Copyright:    Copyright (c) 2001
-	/// Company:
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
-	/// </summary>
-	public sealed class ChineseFilter : TokenFilter 
-	{
-		// Only English now, Chinese to be added later.
-		public static String[] STOP_WORDS = 
-				 {
-					 "and", "are", "as", "at", "be", "but", "by",
-					 "for", "if", "in", "into", "is", "it",
-					 "no", "not", "of", "on", "or", "such",
-					 "that", "the", "their", "then", "there", "these",
-					 "they", "this", "to", "was", "will", "with"
-				 };
-
-		private Hashtable stopTable;
-
-		public ChineseFilter(TokenStream _in) : base (_in)
-		{
-			stopTable = new Hashtable(STOP_WORDS.Length);
-
-			for (int i = 0; i < STOP_WORDS.Length; i++)
-				stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
-		}
-
-		public override Token Next()
-		{
-
-			for (Token token = input.Next(); token != null; token = input.Next()) 
-			{
-				String text = token.TermText();
-
-				// why not key off token type here assuming ChineseTokenizer comes first?
-				if (stopTable[text] == null) 
-				{
-					switch (Char.GetUnicodeCategory(text[0])) 
-					{
-
-						case UnicodeCategory.LowercaseLetter:
-						case UnicodeCategory.UppercaseLetter:
-
-							// English word/token should larger than 1 character.
-							if (text.Length > 1) 
-							{
-								return token;
-							}
-							break;
-						case UnicodeCategory.OtherLetter:
-
-							// One Chinese character as one Chinese word.
-							// Chinese word extraction to be added later here.
-
-							return token;
-					}
-
-				}
-
-			}
-			return null;
-		}
-	}
-}
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.IO;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseFilter
+	/// Description: Filter with a stop word table
+	///              Rule: No digits are allowed.
+	///                    An English word/token should be longer than 1 character.
+	///                    Each Chinese character is treated as one Chinese word.
+	/// TO DO:
+	///   1. Add Chinese stop words, such as \ue400
+	///   2. Dictionary based Chinese word extraction
+	///   3. Intelligent Chinese word extraction
+	/// 
+	/// Copyright:    Copyright (c) 2001
+	/// Company:
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
+	/// </summary>
+	public sealed class ChineseFilter : TokenFilter 
+	{
+		// Only English now, Chinese to be added later.
+		public static String[] STOP_WORDS = 
+				 {
+					 "and", "are", "as", "at", "be", "but", "by",
+					 "for", "if", "in", "into", "is", "it",
+					 "no", "not", "of", "on", "or", "such",
+					 "that", "the", "their", "then", "there", "these",
+					 "they", "this", "to", "was", "will", "with"
+				 };
+
+		private Hashtable stopTable;
+
+		public ChineseFilter(TokenStream _in) : base (_in)
+		{
+			stopTable = new Hashtable(STOP_WORDS.Length);
+
+			for (int i = 0; i < STOP_WORDS.Length; i++)
+				stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
+		}
+
+		public override Token Next()
+		{
+
+			for (Token token = input.Next(); token != null; token = input.Next()) 
+			{
+				String text = token.TermText();
+
+				// why not key off token type here assuming ChineseTokenizer comes first?
+				if (stopTable[text] == null) 
+				{
+					switch (Char.GetUnicodeCategory(text[0])) 
+					{
+
+						case UnicodeCategory.LowercaseLetter:
+						case UnicodeCategory.UppercaseLetter:
+
+							// An English word/token should be longer than 1 character.
+							if (text.Length > 1) 
+							{
+								return token;
+							}
+							break;
+						case UnicodeCategory.OtherLetter:
+
+							// One Chinese character as one Chinese word.
+							// Chinese word extraction to be added later here.
+
+							return token;
+					}
+
+				}
+
+			}
+			return null;
+		}
+	}
+}

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseTokenizer.cs?rev=1230919&r1=1230918&r2=1230919&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/src/contrib/Analyzers/Cn/ChineseTokenizer.cs Fri Jan 13 08:42:34 2012
@@ -1,179 +1,200 @@
-using System;
-using System.IO;
-using System.Text;
-using System.Collections;
-using System.Globalization;
-
-using Lucene.Net.Analysis;
-
-namespace Lucene.Net.Analysis.Cn
-{
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Title: ChineseTokenizer
-	/// Description: Extract tokens from the Stream using Character.getType()
-	///              Rule: A Chinese character as a single token
-	/// Copyright:   Copyright (c) 2001
-	/// Company:
-	/// 
-	/// The difference between thr ChineseTokenizer and the
-	/// CJKTokenizer (id=23545) is that they have different
-	/// token parsing logic.
-	/// 
-	/// Let me use an example. If having a Chinese text
-	/// "C1C2C3C4" to be indexed, the tokens returned from the
-	/// ChineseTokenizer are C1, C2, C3, C4. And the tokens
-	/// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
-	/// 
-	/// Therefore the index the CJKTokenizer created is much
-	/// larger.
-	/// 
-	/// The problem is that when searching for C1, C1C2, C1C3,
-	/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
-	/// CJKTokenizer will not work.
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
-	/// </summary>
-	public sealed class ChineseTokenizer : Tokenizer 
-	{
-
-
-		public ChineseTokenizer(TextReader _in) 
-		{
-			input = _in;
-		}
-
-		private int offset = 0, bufferIndex=0, dataLen=0;
-		private static int MAX_WORD_LEN = 255;
-		private static int IO_BUFFER_SIZE = 1024;
-		private char[] buffer = new char[MAX_WORD_LEN];
-		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-		private int length;
-		private int start;
-
-		private void Push(char c) 
-		{
-
-			if (length == 0) start = offset-1;            // start of token
-			buffer[length++] = Char.ToLower(c);  // buffer it
-
-		}
-
-		private Token Flush() 
-		{
-
-			if (length > 0) 
-			{
-				//System.out.println(new String(buffer, 0, length));
-				return new Token(new String(buffer, 0, length), start, start+length);
-			}
-			else
-				return null;
-		}
-
-		public override Token Next()
-		{
-
-			length = 0;
-			start = offset;
-
-
-			while (true) 
-			{
-
-				char c;
-				offset++;
-
-				if (bufferIndex >= dataLen) 
-				{
-					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
-					bufferIndex = 0;
-				};
-
-				if (dataLen == 0) return Flush();
-				else
-					c = ioBuffer[bufferIndex++];
-
-
-				switch(Char.GetUnicodeCategory(c)) 
-				{
-
-					case UnicodeCategory.DecimalDigitNumber:
-					case UnicodeCategory.LowercaseLetter:
-					case UnicodeCategory.UppercaseLetter:
-						Push(c);
-						if (length == MAX_WORD_LEN) return Flush();
-						break;
-
-					case UnicodeCategory.OtherLetter:
-						if (length>0) 
-						{
-							bufferIndex--;
-							return Flush();
-						}
-						Push(c);
-						return Flush();
-
-					default:
-						if (length>0) return Flush();
-						break;
-				}
-			}
-
-		}
-	}
-}
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.IO;
+using System.Text;
+using System.Collections;
+using System.Globalization;
+
+using Lucene.Net.Analysis;
+
+namespace Lucene.Net.Analysis.Cn
+{
+	/* ====================================================================
+	 * The Apache Software License, Version 1.1
+	 *
+	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
+	 * reserved.
+	 *
+	 * Redistribution and use in source and binary forms, with or without
+	 * modification, are permitted provided that the following conditions
+	 * are met:
+	 *
+	 * 1. Redistributions of source code must retain the above copyright
+	 *    notice, this list of conditions and the following disclaimer.
+	 *
+	 * 2. Redistributions in binary form must reproduce the above copyright
+	 *    notice, this list of conditions and the following disclaimer in
+	 *    the documentation and/or other materials provided with the
+	 *    distribution.
+	 *
+	 * 3. The end-user documentation included with the redistribution,
+	 *    if any, must include the following acknowledgment:
+	 *       "This product includes software developed by the
+	 *        Apache Software Foundation (http://www.apache.org/)."
+	 *    Alternately, this acknowledgment may appear in the software itself,
+	 *    if and wherever such third-party acknowledgments normally appear.
+	 *
+	 * 4. The names "Apache" and "Apache Software Foundation" and
+	 *    "Apache Lucene" must not be used to endorse or promote products
+	 *    derived from this software without prior written permission. For
+	 *    written permission, please contact apache@apache.org.
+	 *
+	 * 5. Products derived from this software may not be called "Apache",
+	 *    "Apache Lucene", nor may "Apache" appear in their name, without
+	 *    prior written permission of the Apache Software Foundation.
+	 *
+	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	 * SUCH DAMAGE.
+	 * ====================================================================
+	 *
+	 * This software consists of voluntary contributions made by many
+	 * individuals on behalf of the Apache Software Foundation.  For more
+	 * information on the Apache Software Foundation, please see
+	 * <http://www.apache.org/>.
+	 */
+
+	/// <summary>
+	/// Title: ChineseTokenizer
+	/// Description: Extract tokens from the Stream using Char.GetUnicodeCategory()
+	///              Rule: A Chinese character as a single token
+	/// Copyright:   Copyright (c) 2001
+	/// Company:
+	/// 
+	/// The difference between the ChineseTokenizer and the
+	/// CJKTokenizer (id=23545) is that they have different
+	/// token parsing logic.
+	/// 
+	/// Let me use an example. If having a Chinese text
+	/// "C1C2C3C4" to be indexed, the tokens returned from the
+	/// ChineseTokenizer are C1, C2, C3, C4. And the tokens
+	/// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
+	/// 
+	/// Therefore the index the CJKTokenizer created is much
+	/// larger.
+	/// 
+	/// The problem is that when searching for C1, C1C2, C1C3,
+	/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+	/// CJKTokenizer will not work.
+	/// <author>Yiyi Sun</author>
+	/// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
+	/// </summary>
+	public sealed class ChineseTokenizer : Tokenizer
+	{
+		/// <summary>Creates a tokenizer that reads its characters from <paramref name="_in"/>.</summary>
+		/// <param name="_in">Text source to tokenize.</param>
+		public ChineseTokenizer(TextReader _in)
+		{
+			input = _in;
+		}
+
+		// offset: running character position in the input; bufferIndex/dataLen: read position and fill level of ioBuffer.
+		private int offset = 0, bufferIndex = 0, dataLen = 0;
+		private const int MAX_WORD_LEN = 255;
+		private const int IO_BUFFER_SIZE = 1024;
+		private readonly char[] buffer = new char[MAX_WORD_LEN];     // characters of the token being built
+		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE]; // chunk most recently read from input
+
+		private int length; // number of characters currently held in buffer
+		private int start;  // offset of the first character of the current token
+
+		/// <summary>Appends the lower-cased character to the token being built.</summary>
+		private void Push(char c)
+		{
+			if (length == 0) start = offset - 1; // offset has already been advanced past c
+			// Lower-case invariantly: Char.ToLower(char) follows the current thread culture.
+			buffer[length++] = Char.ToLowerInvariant(c);
+		}
+
+		/// <summary>Builds a Token from the buffered characters; returns null when none are buffered.</summary>
+		private Token Flush()
+		{
+			if (length > 0)
+			{
+				return new Token(new String(buffer, 0, length), start, start + length);
+			}
+			return null;
+		}
+
+		/// <summary>
+		/// Returns the next token: a lower-cased run of digits and upper/lower-case letters
+		/// (at most MAX_WORD_LEN characters), or a single OtherLetter character.
+		/// </summary>
+		/// <returns>The next token, or null when the reader has no more characters.</returns>
+		public override Token Next()
+		{
+			length = 0;
+			start = offset;
+
+			while (true)
+			{
+				offset++; // advanced before the read; Push() compensates with offset - 1
+				if (bufferIndex >= dataLen)
+				{
+					// ioBuffer is used up: refill it from the reader.
+					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+					bufferIndex = 0;
+				}
+
+				// A zero-length read means end of input: emit whatever is pending.
+				if (dataLen == 0) return Flush();
+
+				char c = ioBuffer[bufferIndex++];
+				switch (Char.GetUnicodeCategory(c))
+				{
+					case UnicodeCategory.DecimalDigitNumber:
+					case UnicodeCategory.LowercaseLetter:
+					case UnicodeCategory.UppercaseLetter:
+						Push(c);
+						if (length == MAX_WORD_LEN) return Flush();
+						break;
+
+					case UnicodeCategory.OtherLetter:
+						if (length > 0)
+						{
+							// Un-read c so the next call emits it as its own token. offset is
+							// rewound with bufferIndex; otherwise later tokens start one too far.
+							bufferIndex--;
+							offset--;
+							return Flush();
+						}
+						Push(c);
+						return Flush();
+
+					default: // any other character ends a pending token and is itself skipped
+						if (length > 0) return Flush();
+						break;
+				}
+			}
+		}
+	}
+}



Mime
View raw message