lucenenet-commits mailing list archives

From ccurr...@apache.org
Subject [Lucene.Net] svn commit: r1204353 [1/9] - in /incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src: contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/ contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/ contrib/Analyzers/Compoun...
Date Mon, 21 Nov 2011 04:44:59 GMT
Author: ccurrens
Date: Mon Nov 21 04:44:55 2011
New Revision: 1204353

URL: http://svn.apache.org/viewvc?rev=1204353&view=rev
Log:
Ported contrib/Analyzers/*; existing tests pass, but tests are still missing for most classes

Added:
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/CompoundWordTokenFilterBase.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/DictionaryCompoundWordTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/ByteVector.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/CharVector.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/Hyphen.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/Hyphenation.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/HyphenationException.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/HyphenationTree.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/PatternConsumer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/PatternParser.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/TernaryTree.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/Hyphenation/hyphenation.dtd
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Compound/HyphenationCompoundWordTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj.user
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizationFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fa/PersianNormalizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/FileDiffs.txt
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/ElisionFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/AbstractEncoder.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/DelimitedPayloadTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/FloatEncoder.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IdentityEncoder.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/IntegerEncoder.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/NumericPayloadTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadEncoder.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TokenOffsetPayloadTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Sinks/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Sinks/DateRecognizerSinkFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Sinks/TokenRangeSinkFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Sinks/TokenTypeSinkFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Th/
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Th/ThaiAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Th/ThaiWordFilter.cs
Removed:
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/WordlistLoader.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/WordlistLoader.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianCharsets.cs
Modified:
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizationFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemmer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cz/CzechAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Fr/FrenchStemmer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/EmptyTokenStream.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/InjectablePrefixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAndSuffixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Nl/DutchStemmer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/PayloadHelper.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemmer.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Codec/OneDimensionalNonWeightedTokenSettingsCodec.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Codec/SimpleThreeDimensionalTokenSettingsCodec.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Codec/TokenSettingsCodec.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Codec/TwoDimensionalNonWeightedSynonymTokenSettingsCodec.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Matrix/Column.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Matrix/Matrix.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Matrix/MatrixPermutationIterator.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/Matrix/Row.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/ShingleAnalyzerWrapper.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/ShingleFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/ShingleMatrixFilter.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Shingle/TokenPositioner.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/WordlistLoader.cs
    incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/core/Analysis/TeeSinkTokenFilter.cs

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,11 +15,13 @@
  * limitations under the License.
  */
 
+using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
-
+using System.Linq;
 using Lucene.Net.Analysis;
-using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.AR
 {
@@ -54,96 +56,96 @@ namespace Lucene.Net.Analysis.AR
         /**
          * Contains the stopwords used with the StopFilter.
          */
-        private Hashtable stoptable = new Hashtable();
-        /**
+        private readonly ISet<string> stoptable;
+        /**<summary>
          * The comment character in the stopwords file.  All lines prefixed with this will be ignored  
+         * </summary>
          */
+        [Obsolete("Use WordListLoader.GetWordSet(FileInfo, string) directly")]
         public static string STOPWORDS_COMMENT = "#";
 
-        private Version matchVersion;
-
-        /**
-         * Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
-         *
-         * @deprecated Use <see cref="ArabicAnalyzer(Version)"/> instead
-         */
-        public ArabicAnalyzer() : this(Version.LUCENE_24)
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop-words set
+        /// </summary>
+        /// <returns>Returns an unmodifiable instance of the default stop-words set</returns>
+        public static ISet<string> GetDefaultStopSet()
         {
-            
+            return DefaultSetHolder.DEFAULT_STOP_SET;
         }
 
-        /**
-         * Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
-         */
-        public ArabicAnalyzer(Version matchVersion)
+        private static class DefaultSetHolder
         {
-            this.matchVersion = matchVersion;
+            internal static ISet<string> DEFAULT_STOP_SET;
 
-            using (StreamReader reader = new StreamReader(System.Reflection.Assembly.GetAssembly(this.GetType()).GetManifestResourceStream("Lucene.Net.Analyzers.AR." + DEFAULT_STOPWORD_FILE)))
+            static DefaultSetHolder()
             {
-                while (!reader.EndOfStream)
+                try
                 {
-                    string word = reader.ReadLine();
-                    stoptable.Add(word, word);
+                    DEFAULT_STOP_SET = LoadDefaultStopWordSet();
+                }
+                catch (System.IO.IOException)
+                {
+                    // default set should always be present as it is part of the
+                    // distribution (JAR)
+                    throw new Exception("Unable to load default stopword set");
+                }
+            }
+
+            internal static ISet<string> LoadDefaultStopWordSet()
+            {
+                using (StreamReader reader = new StreamReader(System.Reflection.Assembly.GetAssembly(typeof(ArabicAnalyzer)).GetManifestResourceStream("Lucene.Net.Analysis.AR." + DEFAULT_STOPWORD_FILE)))
+                {
+                    return CharArraySet.UnmodifiableSet(CharArraySet.Copy(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT)));
                 }
             }
         }
 
+        private Version matchVersion;
+
         /**
-         * Builds an analyzer with the given stop words.
-         *
-         * @deprecated Use <see cref="ArabicAnalyzer(Lucene.Net.Util.Version, string[])"/> instead
+         * Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
          */
-        public ArabicAnalyzer(string[] stopwords): this(Version.LUCENE_24, stopwords)
+        public ArabicAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
         {
         }
 
-        /**
-         * Builds an analyzer with the given stop words.
-         */
-        public ArabicAnalyzer(Version matchVersion, string[] stopwords)
+        /// <summary>
+        /// Builds an analyzer with the given stop words.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        public ArabicAnalyzer(Version matchVersion, ISet<string> stopwords)
         {
-            stoptable = StopFilter.MakeStopSet(stopwords);
+            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
             this.matchVersion = matchVersion;
         }
 
         /**
          * Builds an analyzer with the given stop words.
-         *
-         * @deprecated Use <see cref="ArabicAnalyzer(Version, Hashtable)"/> instead
          */
-        public ArabicAnalyzer(Hashtable stopwords) : this(Version.LUCENE_24, stopwords)
+        [Obsolete("Use ArabicAnalyzer(Version, Set) instead")]
+        public ArabicAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
         {
         }
 
         /**
          * Builds an analyzer with the given stop words.
          */
-        public ArabicAnalyzer(Version matchVersion, Hashtable stopwords)
+        [Obsolete("Use ArabicAnalyzer(Version, Set) instead")]
+        public ArabicAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
         {
-            stoptable = new Hashtable(stopwords);
-            this.matchVersion = matchVersion;
         }
 
-        //DIGY
-        ///**
-        // * Builds an analyzer with the given stop words.  Lines can be commented out using <see cref="STOPWORDS_COMMENT"/>
-        // *
-        // * @deprecated Use <see cref="ArabicAnalyzer(Version, File)"/> instead
-        // */
-        //public ArabicAnalyzer(File stopwords)
-        //{
-        //    this(Version.LUCENE_24, stopwords);
-        //}
-
-        ///**
-        // * Builds an analyzer with the given stop words.  Lines can be commented out using <see cref="STOPWORDS_COMMENT"/>
-        // */
-        //public ArabicAnalyzer(Version matchVersion, File stopwords)
-        //{
-        //    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
-        //    this.matchVersion = matchVersion;
-        //}
+        /**
+         * Builds an analyzer with the given stop words.  Lines can be commented out using <see cref="STOPWORDS_COMMENT"/>
+         */
+        public ArabicAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
+        {
+        }
 
 
         /**
@@ -157,6 +159,7 @@ namespace Lucene.Net.Analysis.AR
         {
             TokenStream result = new ArabicLetterTokenizer(reader);
             result = new LowerCaseFilter(result);
+            // the order here is important: the stopword list is not normalized!
             result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
             result = new ArabicNormalizationFilter(result);
             result = new ArabicStemFilter(result);
@@ -186,6 +189,7 @@ namespace Lucene.Net.Analysis.AR
                 streams = new SavedStreams();
                 streams.Source = new ArabicLetterTokenizer(reader);
                 streams.Result = new LowerCaseFilter(streams.Source);
+                // the order here is important: the stopword list is not normalized!
                 streams.Result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                 streams.Result, stoptable);
                 streams.Result = new ArabicNormalizationFilter(streams.Result);

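For illustration only (not part of this commit): the ported ArabicAnalyzer is now driven by a Version plus an ISet<string> stop set rather than a Hashtable. A minimal usage sketch, assuming a Version value such as LUCENE_29 and the CharArraySet copy semantics shown in the diff above:

    using System.Collections.Generic;
    using Lucene.Net.Analysis.AR;
    using Version = Lucene.Net.Util.Version;

    class ArabicAnalyzerUsage
    {
        static void Main()
        {
            // Default stop words, loaded once from the embedded resource
            // by the nested DefaultSetHolder class.
            var withDefaults = new ArabicAnalyzer(Version.LUCENE_29);

            // Caller-supplied stop words: the constructor copies them into
            // an unmodifiable CharArraySet, so mutating this set afterwards
            // has no effect on the analyzer.
            ISet<string> stops = new HashSet<string> { "www" };
            var withCustom = new ArabicAnalyzer(Version.LUCENE_29, stops);
        }
    }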
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizationFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizationFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizationFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizationFilter.cs Mon Nov 21 04:44:55 2011
@@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.AR
         {
             
             normalizer = new ArabicNormalizer();
-            termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            termAtt = AddAttribute<TermAttribute>();
         }
 
         public override bool IncrementToken()
@@ -52,10 +52,7 @@ namespace Lucene.Net.Analysis.AR
                 termAtt.SetTermLength(newlen);
                 return true;
             }
-            else
-            {
-                return false;
-            }
+            return false;
         }
     }
-}
\ No newline at end of file
+}

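The cast-based attribute lookup above is replaced by the generic overload throughout this commit. A minimal sketch of the new pattern (hypothetical filter name; TokenFilter and TermAttribute are the Lucene.Net 3.x types used in the diff):

    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Tokenattributes;

    public sealed class PassThroughFilter : TokenFilter
    {
        private readonly TermAttribute termAtt;

        public PassThroughFilter(TokenStream input) : base(input)
        {
            // Before: termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            // After: statically typed, no cast required.
            termAtt = AddAttribute<TermAttribute>();
        }

        public override bool IncrementToken()
        {
            // Forward tokens unchanged; a real filter would edit termAtt here.
            return input.IncrementToken();
        }
    }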
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicNormalizer.cs Mon Nov 21 04:44:55 2011
@@ -43,27 +43,27 @@ namespace Lucene.Net.Analysis.AR
      */
     public class ArabicNormalizer
     {
-        public static char ALEF = '\u0627';
-        public static char ALEF_MADDA = '\u0622';
-        public static char ALEF_HAMZA_ABOVE = '\u0623';
-        public static char ALEF_HAMZA_BELOW = '\u0625';
-
-        public static char YEH = '\u064A';
-        public static char DOTLESS_YEH = '\u0649';
-
-        public static char TEH_MARBUTA = '\u0629';
-        public static char HEH = '\u0647';
-
-        public static char TATWEEL = '\u0640';
-
-        public static char FATHATAN = '\u064B';
-        public static char DAMMATAN = '\u064C';
-        public static char KASRATAN = '\u064D';
-        public static char FATHA = '\u064E';
-        public static char DAMMA = '\u064F';
-        public static char KASRA = '\u0650';
-        public static char SHADDA = '\u0651';
-        public static char SUKUN = '\u0652';
+        public const char ALEF = '\u0627';
+        public const char ALEF_MADDA = '\u0622';
+        public const char ALEF_HAMZA_ABOVE = '\u0623';
+        public const char ALEF_HAMZA_BELOW = '\u0625';
+
+        public const char YEH = '\u064A';
+        public const char DOTLESS_YEH = '\u0649';
+
+        public const char TEH_MARBUTA = '\u0629';
+        public const char HEH = '\u0647';
+
+        public const char TATWEEL = '\u0640';
+
+        public const char FATHATAN = '\u064B';
+        public const char DAMMATAN = '\u064C';
+        public const char KASRATAN = '\u064D';
+        public const char FATHA = '\u064E';
+        public const char DAMMA = '\u064F';
+        public const char KASRA = '\u0650';
+        public const char SHADDA = '\u0651';
+        public const char SUKUN = '\u0652';
 
         /**
          * Normalize an input buffer of Arabic text
@@ -77,20 +77,33 @@ namespace Lucene.Net.Analysis.AR
 
             for (int i = 0; i < len; i++)
             {
-                if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
-                    s[i] = ALEF;
-
-                if (s[i] == DOTLESS_YEH)
-                    s[i] = YEH;
-
-                if (s[i] == TEH_MARBUTA)
-                    s[i] = HEH;
-
-                if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
-                    s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN)
+                switch (s[i])
                 {
-                    len = Delete(s, i, len);
-                    i--;
+                    case ALEF_MADDA:
+                    case ALEF_HAMZA_ABOVE:
+                    case ALEF_HAMZA_BELOW:
+                        s[i] = ALEF;
+                        break;
+                    case DOTLESS_YEH:
+                        s[i] = YEH;
+                        break;
+                    case TEH_MARBUTA:
+                        s[i] = HEH;
+                        break;
+                    case TATWEEL:
+                    case KASRATAN:
+                    case DAMMATAN:
+                    case FATHATAN:
+                    case FATHA:
+                    case DAMMA:
+                    case KASRA:
+                    case SHADDA:
+                    case SUKUN:
+                        len = Delete(s, i, len);
+                        i--;
+                        break;
+                    default:
+                        break;
                 }
             }
 

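A hedged usage sketch of the rewritten normalizer (not from this commit, and assuming Normalize keeps the usual (char[], int) shape, returning the new valid length after in-place deletions):

    using Lucene.Net.Analysis.AR;

    class NormalizeUsage
    {
        static void Main()
        {
            var normalizer = new ArabicNormalizer();
            // ALEF_HAMZA_ABOVE, FATHA, BEH
            char[] buf = "\u0623\u064E\u0628".ToCharArray();
            int len = normalizer.Normalize(buf, buf.Length);
            // Per the switch above, ALEF_HAMZA_ABOVE is rewritten to ALEF and
            // the FATHA diacritic is deleted, so len is 2 and the normalized
            // token text is new string(buf, 0, len).
        }
    }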
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -36,13 +36,13 @@ namespace Lucene.Net.Analysis.AR
     public class ArabicStemFilter : TokenFilter
     {
 
-        protected ArabicStemmer stemmer = null;
-        private TermAttribute termAtt;
+        private readonly ArabicStemmer stemmer;
+        private readonly TermAttribute termAtt;
 
         public ArabicStemFilter(TokenStream input) : base(input)
         {
             stemmer = new ArabicStemmer();
-            termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            termAtt = AddAttribute<TermAttribute>();
         }
 
         public override bool IncrementToken()

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemmer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemmer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/AR/ArabicStemmer.cs Mon Nov 21 04:44:55 2011
@@ -42,19 +42,19 @@ namespace Lucene.Net.Analysis.AR
      */
     public class ArabicStemmer
     {
-        public static char ALEF = '\u0627';
-        public static char BEH = '\u0628';
-        public static char TEH_MARBUTA = '\u0629';
-        public static char TEH = '\u062A';
-        public static char FEH = '\u0641';
-        public static char KAF = '\u0643';
-        public static char LAM = '\u0644';
-        public static char NOON = '\u0646';
-        public static char HEH = '\u0647';
-        public static char WAW = '\u0648';
-        public static char YEH = '\u064A';
+        public const char ALEF = '\u0627';
+        public const char BEH = '\u0628';
+        public const char TEH_MARBUTA = '\u0629';
+        public const char TEH = '\u062A';
+        public const char FEH = '\u0641';
+        public const char KAF = '\u0643';
+        public const char LAM = '\u0644';
+        public const char NOON = '\u0646';
+        public const char HEH = '\u0647';
+        public const char WAW = '\u0648';
+        public const char YEH = '\u064A';
 
-        public static char[][] prefixes = {
+        public static readonly char[][] prefixes = {
             ("" + ALEF + LAM).ToCharArray(), 
             ("" + WAW + ALEF + LAM).ToCharArray(), 
             ("" + BEH + ALEF + LAM).ToCharArray(),
@@ -64,7 +64,7 @@ namespace Lucene.Net.Analysis.AR
             ("" + WAW).ToCharArray(),
         };
 
-        public static char[][] suffixes = {
+        public static readonly char[][] suffixes = {
             ("" + HEH + ALEF).ToCharArray(), 
             ("" + ALEF + NOON).ToCharArray(), 
             ("" + ALEF + TEH).ToCharArray(), 

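The move from static char to const char in both classes is what makes the switch in ArabicNormalizer above legal: C# case labels require compile-time constants. Arrays cannot be const, so prefixes and suffixes become static readonly instead, which fixes the reference but leaves the element values writable. A small illustration with hypothetical names:

    static class ConstVersusReadonly
    {
        public const char ALEF = '\u0627';               // compile-time constant
        public static readonly char[] Prefix = { ALEF }; // reference fixed at init

        public static bool IsAlef(char c)
        {
            switch (c)
            {
                case ALEF:   // legal only because ALEF is const
                    return true;
                default:
                    return false;
            }
        }
    }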
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -15,11 +15,14 @@
  * limitations under the License.
  */
 
+using System;
 using System.Collections;
-
+using System.Collections.Generic;
+using System.Linq;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using System.IO;
+using Version = Lucene.Net.Util.Version;
 
 /**
  * Analyzer for Brazilian language. Supports an external list of stopwords (words that
@@ -31,110 +34,216 @@ namespace Lucene.Net.Analysis.BR
 {
     public sealed class BrazilianAnalyzer : Analyzer
     {
-
         /**
          * List of typical Brazilian stopwords.
          */
+        //TODO: Make this private in 3.1
         public static string[] BRAZILIAN_STOP_WORDS = {
-      "a","ainda","alem","ambas","ambos","antes",
-      "ao","aonde","aos","apos","aquele","aqueles",
-      "as","assim","com","como","contra","contudo",
-      "cuja","cujas","cujo","cujos","da","das","de",
-      "dela","dele","deles","demais","depois","desde",
-      "desta","deste","dispoe","dispoem","diversa",
-      "diversas","diversos","do","dos","durante","e",
-      "ela","elas","ele","eles","em","entao","entre",
-      "essa","essas","esse","esses","esta","estas",
-      "este","estes","ha","isso","isto","logo","mais",
-      "mas","mediante","menos","mesma","mesmas","mesmo",
-      "mesmos","na","nas","nao","nas","nem","nesse","neste",
-      "nos","o","os","ou","outra","outras","outro","outros",
-      "pelas","pelas","pelo","pelos","perante","pois","por",
-      "porque","portanto","proprio","propios","quais","qual",
-      "qualquer","quando","quanto","que","quem","quer","se",
-      "seja","sem","sendo","seu","seus","sob","sobre","sua",
-      "suas","tal","tambem","teu","teus","toda","todas","todo",
-      "todos","tua","tuas","tudo","um","uma","umas","uns"};
+                                                          "a", "ainda", "alem", "ambas", "ambos", "antes",
+                                                          "ao", "aonde", "aos", "apos", "aquele", "aqueles",
+                                                          "as", "assim", "com", "como", "contra", "contudo",
+                                                          "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
+                                                          "dela", "dele", "deles", "demais", "depois", "desde",
+                                                          "desta", "deste", "dispoe", "dispoem", "diversa",
+                                                          "diversas", "diversos", "do", "dos", "durante", "e",
+                                                          "ela", "elas", "ele", "eles", "em", "entao", "entre",
+                                                          "essa", "essas", "esse", "esses", "esta", "estas",
+                                                          "este", "estes", "ha", "isso", "isto", "logo", "mais",
+                                                          "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
+                                                          "mesmos", "na", "nas", "nao", "nas", "nem", "nesse", "neste",
+                                                          "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
+                                                          "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por",
+                                                          "porque", "portanto", "proprio", "propios", "quais", "qual",
+                                                          "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
+                                                          "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
+                                                          "suas", "tal", "tambem", "teu", "teus", "toda", "todas",
+                                                          "todo",
+                                                          "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
+                                                      };
 
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop-words set.
+        /// </summary>
+        /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
 
-        /**
-         * Contains the stopwords used with the StopFilter.
-         */
-        private Hashtable stoptable = new Hashtable();
+        private static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_STOP_SET =
+                CharArraySet.UnmodifiableSet(new CharArraySet(BRAZILIAN_STOP_WORDS, false));
+        }
 
-        /**
-         * Contains words that should be indexed but not stemmed.
-         */
-        private Hashtable excltable = new Hashtable();
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter.
+        /// </summary>
+        private ISet<string> stoptable = new HashSet<string>();
 
-        /**
-         * Builds an analyzer with the default stop words (<see cref="BRAZILIAN_STOP_WORDS"/>).
-         */
-        public BrazilianAnalyzer()
+        private readonly Version matchVersion;
+
+        /// <summary>
+        /// Contains words that should be indexed but not stemmed.
+        /// TODO: make this private in 3.1
+        /// </summary>
+        private ISet<string> excltable = new HashSet<string>();
+
+        public BrazilianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
         {
-            stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
         }
 
-        /**
-         * Builds an analyzer with the given stop words.
-         */
-        public BrazilianAnalyzer(string[] stopwords)
+        /**
+         * Builds an analyzer with the given stop words
+         *
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
+         */
+
+        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
         {
-            stoptable = StopFilter.MakeStopSet(stopwords);
+            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
         }
 
-        /**
-         * Builds an analyzer with the given stop words.
+        /**
+         * Builds an analyzer with the given stop words and stemming exclusion words
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords
+         *          a stopword set
          */
-        public BrazilianAnalyzer(Hashtable stopwords)
+
+        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords,
+                                 ISet<string> stemExclusionSet)
+            : this(matchVersion, stopwords)
         {
-            stoptable = stopwords;
+
+            excltable = CharArraySet.UnmodifiableSet(CharArraySet
+                                                         .Copy(stemExclusionSet));
         }
 
-        /**
-         * Builds an analyzer with the given stop words.
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
          */
-        public BrazilianAnalyzer(FileInfo stopwords)
+
+        public BrazilianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
         {
-            stoptable = WordlistLoader.GetWordtable(stopwords);
+
         }
 
-        /**
-         * Builds an exclusionlist from an array of Strings.
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+         */
+
+        public BrazilianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+
+        }
+
+        /**
+         * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+         */
+
+        public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
+        {
+        }
+
+        /**
+         * Builds an exclusionlist from an array of Strings.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
-        public void SetStemExclusionTable(string[] exclusionlist)
+
+        public void SetStemExclusionTable(params string[] exclusionlist)
         {
             excltable = StopFilter.MakeStopSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
-        /**
-         * Builds an exclusionlist from a Hashtable.
+
+        /**
+         * Builds an exclusionlist from a {@link Map}.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
-        public void SetStemExclusionTable(Hashtable exclusionlist)
+
+        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
         {
-            excltable = exclusionlist;
+            excltable = new HashSet<string>(exclusionlist.Keys);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
-        /**
-         * Builds an exclusionlist from the words contained in the given file.
+
+        /**
+         * Builds an exclusionlist from the words contained in the given file.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
-        public void SetStemExclusionTable(FileInfo exclusionlist)
+
+        public void SetStemExclusionTable(FileInfo exclusionlist)
         {
-            excltable = WordlistLoader.GetWordtable(exclusionlist);
+            excltable = WordlistLoader.GetWordSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
 
-        /**
-         * Creates a TokenStream which tokenizes all the text in the provided Reader.
-         *
-         * <returns>A TokenStream build from a StandardTokenizer filtered with
-         * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.</returns>
+        /**
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         * 			{@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
+         *          {@link BrazilianStemFilter}.
          */
-        public override TokenStream TokenStream(string fieldName, TextReader reader)
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
         {
-            TokenStream result = new StandardTokenizer(reader);
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
             result = new LowerCaseFilter(result);
             result = new StandardFilter(result);
-            result = new StopFilter(result, stoptable);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stoptable);
             result = new BrazilianStemFilter(result, excltable);
             return result;
         }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         *          {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
+         *          {@link BrazilianStemFilter}.
+         */
+
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new StandardTokenizer(matchVersion, reader);
+                streams.result = new LowerCaseFilter(streams.source);
+                streams.result = new StandardFilter(streams.result);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stoptable);
+                streams.result = new BrazilianStemFilter(streams.result, excltable);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
     }
 }

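BrazilianAnalyzer now follows the SavedStreams/ReusableTokenStream pattern used throughout this commit: the tokenizer chain is built once per thread, cached via SetPreviousTokenStream, and later calls merely Reset the source on a new reader. This is also why the SetStemExclusionTable overloads pass null to SetPreviousTokenStream: a cached chain would otherwise keep stemming with the stale exclusion set. A hedged usage sketch (assuming a Version value such as LUCENE_29):

    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.BR;
    using Version = Lucene.Net.Util.Version;

    class ReusableStreams
    {
        static void Main()
        {
            var analyzer = new BrazilianAnalyzer(Version.LUCENE_29);

            // First call builds the chain and caches it for this thread.
            TokenStream first = analyzer.ReusableTokenStream("f", new StringReader("casas"));

            // Second call reuses the cached chain, resetting its tokenizer,
            // so first and second should be the same instance.
            TokenStream second = analyzer.ReusableTokenStream("f", new StringReader("portas"));
        }
    }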
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs Mon Nov 21 04:44:55 2011
@@ -15,8 +15,11 @@
  * limitations under the License.
  */
 
+using System.Collections.Generic;
 using Lucene.Net.Analysis;
 using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Version = Lucene.Net.Util.Version;
 
 
 /**
@@ -33,15 +36,17 @@ namespace Lucene.Net.Analysis.BR
          * The actual token in the input stream.
          */
         private BrazilianStemmer stemmer = null;
-        private Hashtable exclusions = null;
+        private ISet<string> exclusions = null;
+        private TermAttribute termAtt;
 
         public BrazilianStemFilter(TokenStream input)
             : base(input)
         {
-            stemmer = new BrazilianStemmer();
+            stemmer = new BrazilianStemmer();
+            termAtt = AddAttribute<TermAttribute>();
         }
 
-        public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+        public BrazilianStemFilter(TokenStream input, ISet<string> exclusiontable)
             : this(input)
         {
             this.exclusions = exclusiontable;
@@ -50,25 +55,25 @@ namespace Lucene.Net.Analysis.BR
         /**
          * <returns>Returns the next token in the stream, or null at EOS.</returns>
          */
-        public override Token Next(Token reusableToken)
+        public override bool IncrementToken()
         {
-            System.Diagnostics.Trace.Assert(reusableToken != null);
-
-            Token nextToken = input.Next(reusableToken);
-            if (nextToken == null)
-                return null;
-
-            string term = nextToken.TermText();
-
-            // Check the exclusion table.
-            if (exclusions == null || !exclusions.Contains(term))
+            if (input.IncrementToken())
+            {
+                string term = termAtt.Term();
+                // Check the exclusion table.
+                if (exclusions == null || !exclusions.Contains(term))
+                {
+                    string s = stemmer.Stem(term);
+                    // If not stemmed, don't waste the time adjusting the token.
+                    if ((s != null) && !s.Equals(term))
+                        termAtt.SetTermBuffer(s);
+                }
+                return true;
+            }
+            else
             {
-                string s = stemmer.Stem(term);
-                // If not stemmed, don't waste the time adjusting the token.
-                if ((s != null) && !s.Equals(term))
-                    nextToken.SetTermBuffer(s.ToCharArray(), 0, s.Length);//was  SetTermBuffer(s)
+                return false;
             }
-            return nextToken;
         }
     }
-}
\ No newline at end of file
+}

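With BrazilianStemFilter moved from Next(Token) to IncrementToken(), consumers change too: the caller advances the stream and reads token state through attributes instead of receiving Token objects. A minimal sketch under the same Lucene.Net 3.x API assumptions as the diff:

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Tokenattributes;

    static class ConsumeStream
    {
        internal static void PrintTerms(Analyzer analyzer, string text)
        {
            TokenStream ts = analyzer.TokenStream("f", new StringReader(text));
            TermAttribute termAtt = ts.AddAttribute<TermAttribute>();
            while (ts.IncrementToken())            // advance to the next token
            {
                Console.WriteLine(termAtt.Term()); // current token's text
            }
            ts.Close();
        }
    }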
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -20,130 +20,135 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
 using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.CJK
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 *
-	 * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
-	 */
-
-	/// <summary>
-	/// Filters CJKTokenizer with StopFilter.
-	/// 
-	/// <author>Che, Dong</author>
-	/// </summary>
-	public class CJKAnalyzer : Analyzer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// An array containing some common English words that are not usually
-		/// useful for searching. and some double-byte interpunctions.....
-		/// </summary>
-		public static String[] stopWords = 
-		{
-			"a", "and", "are", "as", "at", "be",
-			"but", "by", "for", "if", "in",
-			"into", "is", "it", "no", "not",
-			"of", "on", "or", "s", "such", "t",
-			"that", "the", "their", "then",
-			"there", "these", "they", "this",
-			"to", "was", "will", "with", "",
-			"www"
-		};
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// stop word list
-		/// </summary>
-		private Hashtable stopTable;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Builds an analyzer which removes words in STOP_WORDS.
-		/// </summary>
-		public CJKAnalyzer() 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		/// <summary>
-		/// Builds an analyzer which removes words in the provided array.
-		/// </summary>
-		/// <param name="stopWords">stop word array</param>
-		public CJKAnalyzer(String[] stopWords) 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		/// get token stream from input
-		/// </summary>
-		/// <param name="fieldName">lucene field name</param>
-		/// <param name="reader">input reader</param>
-		/// <returns>Token Stream</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			return new StopFilter(new CJKTokenizer(reader), stopTable);
-		}
-	}
+    /// <summary>
+    /// Filters CJKTokenizer with StopFilter.
+    /// 
+    /// <author>Che, Dong</author>
+    /// </summary>
+    public class CJKAnalyzer : Analyzer
+    {
+        //~ Static fields/initializers ---------------------------------------------
+
+        /// <summary>
+        /// An array containing some common English words that are not usually
+        /// useful for searching. and some double-byte interpunctions.....
+        /// </summary>
+        // TODO make this final in 3.1 -
+        // this might be revised and merged with StopFilter stop words too
+        [Obsolete("use GetDefaultStopSet() instead")] public static String[] STOP_WORDS =
+            {
+                "a", "and", "are", "as", "at", "be",
+                "but", "by", "for", "if", "in",
+                "into", "is", "it", "no", "not",
+                "of", "on", "or", "s", "such", "t",
+                "that", "the", "their", "then",
+                "there", "these", "they", "this",
+                "to", "was", "will", "with", "",
+                "www"
+            };
+
+        //~ Instance fields --------------------------------------------------------
+
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop-words set.
+        /// </summary>
+        /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        private static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_STOP_SET =
+                CharArraySet.UnmodifiableSet(new CharArraySet(STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// stop word list
+        /// </summary>
+        private ISet<string> stopTable;
+
+        private readonly Version matchVersion;
+
+        //~ Constructors -----------------------------------------------------------
+
+        public CJKAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+
+        }
+
+        public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
+        {
+            stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Builds an analyzer which removes words in the provided array.
+        /// </summary>
+        /// <param name="stopWords">stop word array</param>
+        public CJKAnalyzer(Version matchVersion, params string[] stopWords)
+        {
+            stopTable = StopFilter.MakeStopSet(stopWords);
+            this.matchVersion = matchVersion;
+        }
+
+        //~ Methods ----------------------------------------------------------------
+
+        /// <summary>
+        /// get token stream from input
+        /// </summary>
+        /// <param name="fieldName">lucene field name</param>
+        /// <param name="reader">input reader</param>
+        /// <returns>Token Stream</returns>
+        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                  new CJKTokenizer(reader), stopTable);
+        }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @param fieldName lucene field name
+         * @param reader    Input {@link Reader}
+         * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
+         *    {@link StopFilter}
+         */
+        public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            /* tokenStream() is final, no back compat issue */
+            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new CJKTokenizer(reader);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.source, stopTable);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
 }

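The nested DefaultSetHolder used here (and in ArabicAnalyzer and BrazilianAnalyzer above) defers building the default stop set until it is first needed: the CLR runs a static class's field initializers at most once, no later than first access, and the CharArraySet.UnmodifiableSet wrapper keeps callers from mutating the shared default. A stripped-down sketch of the idiom with a hypothetical analyzer name:

    using System.Collections.Generic;
    using Lucene.Net.Analysis;

    public class SomeAnalyzer
    {
        public static ISet<string> GetDefaultStopSet()
        {
            // First call triggers the holder's one-time initialization.
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }

        private static class DefaultSetHolder
        {
            // Built once and shared; the wrapper makes it read-only.
            internal static readonly ISet<string> DEFAULT_STOP_SET =
                CharArraySet.UnmodifiableSet(
                    new CharArraySet(new[] { "a", "and", "the" }, false));
        }
    }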
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs Mon Nov 21 04:44:55 2011
@@ -20,331 +20,380 @@
 */
 
 using System;
+using System.Globalization;
 using System.IO;
 using System.Text;
+using System.Text.RegularExpressions;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis.CJK
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// <p>
-	/// CJKTokenizer was modified from StopTokenizer which does a decent job for
-	/// most European languages. and it perferm other token method for double-byte
-	/// Characters: the token will return at each two charactors with overlap match.<br/>
-	/// Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
-	/// also need filter filter zero length token ""<br/>
-	/// for Digit: digit, '+', '#' will token as letter<br/>
-	/// for more info on Asia language(Chinese Japanese Korean) text segmentation:
-	/// please search  <a
-	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
-	/// </p>
-	/// 
-	/// @author Che, Dong
-	/// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
-	/// </summary>
-	public sealed class CJKTokenizer : Tokenizer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// Max word length
-		/// </summary>
-		private static int MAX_WORD_LEN = 255;
-
-		/// <summary>
-		/// buffer size
-		/// </summary>
-		private static int IO_BUFFER_SIZE = 256;
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// word offset, used to imply which character(in ) is parsed
-		/// </summary>
-		private int offset = 0;
-
-		/// <summary>
-		/// the index used only for ioBuffer
-		/// </summary>
-		private int bufferIndex = 0;
-
-		/// <summary>
-		/// data length
-		/// </summary>
-		private int dataLen = 0;
-
-		/// <summary>
-		/// character buffer, store the characters which are used to compose <br/>
-		/// the returned Token
-		/// </summary>
-		private char[] buffer = new char[MAX_WORD_LEN];
-
-		/// <summary>
-		/// I/O buffer, used to store the content of the input(one of the <br/>
-		/// members of Tokenizer)
-		/// </summary>
-		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-		/// <summary>
-		/// word type: single=>ASCII  double=>non-ASCII word=>default 
-		/// </summary>
-		private String tokenType = "word";
-
-		/// <summary>
-		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
-		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
-		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
-		/// </summary>
-		private bool preIsTokened = false;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Construct a token stream processing the given input.
-		/// </summary>
-		/// <param name="_in">I/O reader</param>
-		public CJKTokenizer(TextReader _in) 
-		{
-			input = _in;
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		///  Returns the next token in the stream, or null at EOS.
-		/// </summary>
-		/// <returns>Token</returns>
-		public override Token Next()
-		{
-			/** how many character(s) has been stored in buffer */
-			int length = 0;
-
-			/** the position used to create Token */
-			int start = offset;
-
-			while (true) 
-			{
-				/** current charactor */
-				char c;
-
-				/** unicode block of current charactor for detail */
-				//Character.UnicodeBlock ub;
-
-				offset++;
-
-				if (bufferIndex >= dataLen) 
-				{
-					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
-					bufferIndex = 0;
-				}
-
-				if (dataLen == 0) 
-				{
-					if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						}
-
-						break;
-					} 
-					else 
-					{
-						return null;
-					}
-				} 
-				else 
-				{
-					//get current character
-					c = ioBuffer[bufferIndex++];
-
-					//get the UnicodeBlock of the current character
-					//ub = Character.UnicodeBlock.of(c);
-				}
-
-				//if the current character is ASCII or Extend ASCII
-				if (('\u0000' <= c && c <= '\u007F') || 
-					('\uFF00' <= c && c <= '\uFFEF')) 
-				{
-					if ('\uFF00' <= c && c <= '\uFFEF')
-					{
-						/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
-						int i = (int) c;
-						i = i - 65248;
-						c = (char) i;
-					}
-
-					// if the current character is a letter or "_" "+" "#"
-					if (Char.IsLetterOrDigit(c)
-						|| ((c == '_') || (c == '+') || (c == '#'))
-						) 
-					{
-						if (length == 0) 
-						{
-							// "javaC1C2C3C4linux" <br/>
-							//      ^--: the current character begin to token the ASCII
-							// letter
-							start = offset - 1;
-						} 
-						else if (tokenType == "double") 
-						{
-							// "javaC1C2C3C4linux" <br/>
-							//              ^--: the previous non-ASCII
-							// : the current character
-							offset--;
-							bufferIndex--;
-							tokenType = "single";
-
-							if (preIsTokened == true) 
-							{
-								// there is only one non-ASCII has been stored
-								length = 0;
-								preIsTokened = false;
-
-								break;
-							} 
-							else 
-							{
-								break;
-							}
-						}
-
-						// store the LowerCase(c) in the buffer
-						buffer[length++] = Char.ToLower(c);
-						tokenType = "single";
-
-						// break the procedure if buffer overflowed!
-						if (length == MAX_WORD_LEN) 
-						{
-							break;
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				} 
-				else 
-				{
-					// non-ASCII letter, eg."C1C2C3C4"
-					if (Char.IsLetter(c)) 
-					{
-						if (length == 0) 
-						{
-							start = offset - 1;
-							buffer[length++] = c;
-							tokenType = "double";
-						} 
-						else 
-						{
-							if (tokenType == "single") 
-							{
-								offset--;
-								bufferIndex--;
-
-								//return the previous ASCII characters
-								break;
-							} 
-							else 
-							{
-								buffer[length++] = c;
-								tokenType = "double";
-
-								if (length == 2) 
-								{
-									offset--;
-									bufferIndex--;
-									preIsTokened = true;
-
-									break;
-								}
-							}
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							// empty the buffer
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				}
-			}
-
-			return new Token(new String(buffer, 0, length), start, start + length,
-				tokenType
-				);
-		}
-	}
-
+    /// <summary>
+    /// <p>
+    /// CJKTokenizer was modified from StopTokenizer, which does a decent job for
+    /// most European languages, but performs a different tokenization for double-byte
+    /// characters: a token is returned for each two characters, with overlapping matches.<br/>
+    /// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4"; the
+    /// zero-length token "" still needs to be filtered out.<br/>
+    /// Digits, '+' and '#' are tokenized as letters.<br/>
+    /// For more info on Asian-language (Chinese, Japanese, Korean) text segmentation,
+    /// please search <a
+    /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+    /// </p>
+    /// 
+    /// @author Che, Dong
+    /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+    /// </summary>
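+    /// <example>
+    /// An illustrative usage sketch (not part of this commit); the overlapping
+    /// bigram output would be:
+    /// <code>
+    /// TokenStream ts = new CJKTokenizer(new StringReader("java C1C2C3C4"));
+    /// // successive IncrementToken() calls yield: "java", "C1C2", "C2C3", "C3C4"
+    /// </code>
+    /// </example>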
+    public sealed class CJKTokenizer : Tokenizer
+    {
+        //~ Static fields/initializers ---------------------------------------------
+        /// <summary>
+        /// Word token type
+        /// </summary>
+        private static int WORD_TYPE = 0;
+
+        /// <summary>
+        /// Single byte token type
+        /// </summary>
+        private static int SINGLE_TOKEN_TYPE = 1;
+
+        /// <summary>
+        /// Double byte token type
+        /// </summary>
+        private static int DOUBLE_TOKEN_TYPE = 2;
+
+        /// <summary>
+        /// Names for token types
+        /// </summary>
+        private static String[] TOKEN_TYPE_NAMES = {"word", "single", "double"};
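+        // the type constants above double as indexes into TOKEN_TYPE_NAMES
+        // (see typeAtt.SetType(TOKEN_TYPE_NAMES[tokenType]) in IncrementToken)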
+
+        /// <summary>
+        /// Max word length
+        /// </summary>
+        private static int MAX_WORD_LEN = 255;
+
+        /// <summary>
+        /// buffer size
+        /// </summary>
+        private static int IO_BUFFER_SIZE = 256;
+
+        //~ Instance fields --------------------------------------------------------
+
+        /// <summary>
+        /// word offset, used to indicate which character (in the input) is being parsed
+        /// </summary>
+        private int offset = 0;
+
+        /// <summary>
+        /// the index used only for ioBuffer
+        /// </summary>
+        private int bufferIndex = 0;
+
+        /// <summary>
+        /// data length
+        /// </summary>
+        private int dataLen = 0;
+
+        /// <summary>
+        /// character buffer, stores the characters which are used to compose
+        /// the returned Token
+        /// </summary>
+        private char[] buffer = new char[MAX_WORD_LEN];
+
+        /// <summary>
+        /// I/O buffer, used to store the content of the input (one of the
+        /// members of Tokenizer)
+        /// </summary>
+        private char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+        /// <summary>
+        /// word type: single=>ASCII, double=>non-ASCII, word=>default
+        /// </summary>
+        private int tokenType = WORD_TYPE;
+
+        /// <summary>
+        /// tag: previous character is a cached double-byte character  "C1C2C3C4"
+        /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+        /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
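+        /// (e.g. for "C1C2C3": "C1C2" is emitted with preIsTokened set and the input
+        /// stepped back one character, so "C2C3" is emitted on the next call)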
+        /// </summary>
+        private bool preIsTokened = false;
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+        private TypeAttribute typeAtt;
+
+        //~ Constructors -----------------------------------------------------------
+
+        /// <summary>
+        /// Construct a token stream processing the given input.
+        /// </summary>
+        /// <param name="_in">I/O reader</param>
+        public CJKTokenizer(TextReader _in)
+            : base(_in)
+        {
+            Init();
+        }
+
+        public CJKTokenizer(AttributeSource source, TextReader _in)
+            : base(source, _in)
+        {
+            Init();
+        }
+
+        public CJKTokenizer(AttributeFactory factory, TextReader _in)
+            : base(factory, _in)
+        {
+            Init();
+        }
+
+        private void Init()
+        {
+            termAtt = AddAttribute<TermAttribute>();
+            offsetAtt = AddAttribute<OffsetAttribute>();
+            typeAtt = AddAttribute<TypeAttribute>();
+        }
+
+        //~ Methods ----------------------------------------------------------------
+
+        private Regex isBasicLatin = new Regex(@"\p{IsBasicLatin}", RegexOptions.Compiled);
+        private Regex isHalfWidthAndFullWidthForms = new Regex(@"\p{IsHalfwidthandFullwidthForms}", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Returns true for the next token in the stream, or false at EOS.
+        /// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
+        /// for detail.
+        /// </summary>
+        /// <returns>false for end of stream, true otherwise</returns>
+        /// <exception cref="IOException">when a read error occurs on the input</exception>
+
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+
+            while (true)
+            {
+                // loop until we find a non-empty token
+
+                // how many characters have been stored in the buffer
+                int length = 0;
+
+                // the position used to create the Token
+                int start = offset;
+
+                while (true)
+                {
+                    // loop until we've found a full token
+                    // current character
+                    char c;
+
+                    offset++;
+
+                    if (bufferIndex >= dataLen)
+                    {
+                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+                        bufferIndex = 0;
+                    }
+
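+                    // NOTE: .NET's TextReader.Read returns 0 at end of stream, not -1 as Java's read() does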
+                    if (dataLen <= 0)
+                    {
+                        if (length > 0)
+                        {
+                            if (preIsTokened == true)
+                            {
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                offset--;
+                            }
+
+                            break;
+                        }
+                        else
+                        {
+                            offset--;
+                            return false;
+                        }
+                    }
+                    else
+                    {
+                        //get current character
+                        c = ioBuffer[bufferIndex++];
+                    }
+
+                    //TODO: Using a Regex to determine the Unicode block is probably slower than
+                    //      a small lookup class would be; writing one would likely be trivial,
+                    //      if time-consuming.  A Regex can't be fast here, considering we have
+                    //      to pull a char from the buffer and convert it to a string before we
+                    //      can run a regex on it. - cc
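+                    //      A hypothetical direct range check on the Unicode block boundaries
+                    //      (a sketch, not part of this commit) could replace both regexes:
+                    //        bool isBasicLatinChar = c <= '\u007F';
+                    //        bool isHalfFullChar   = c >= '\uFF00' && c <= '\uFFEF';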
+                    bool isHalfFullForm = isHalfWidthAndFullWidthForms.Match(c.ToString()).Success;
+                    //if the current character is ASCII or Extend ASCII
+                    if ((isBasicLatin.Match(c.ToString()).Success) || (isHalfFullForm))
+                    {
+                        if (isHalfFullForm)
+                        {
+                            int i = (int) c;
+                            if (i >= 65281 && i <= 65374)
+                            {
+                                // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
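+                                // e.g. fullwidth 'Ａ' (U+FF21, 65313) becomes 'A' (U+0041, 65)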
+                                i = i - 65248;
+                                c = (char) i;
+                            }
+                        }
+
+                        // if the current character is a letter or "_" "+" "#"
+                        if (char.IsLetterOrDigit(c)
+                            || ((c == '_') || (c == '+') || (c == '#'))
+                            )
+                        {
+                            if (length == 0)
+                            {
+                                // "javaC1C2C3C4linux" <br>
+                                //      ^--: the current character begin to token the ASCII
+                                // letter
+                                start = offset - 1;
+                            }
+                            else if (tokenType == DOUBLE_TOKEN_TYPE)
+                            {
+                                // "javaC1C2C3C4linux" <br>
+                                //              ^--: the previous non-ASCII
+                                // : the current character
+                                offset--;
+                                bufferIndex--;
+
+                                if (preIsTokened == true)
+                                {
+                                    // there is only one non-ASCII has been stored
+                                    length = 0;
+                                    preIsTokened = false;
+                                    break;
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+
+                            // store the LowerCase(c) in the buffer
+                            buffer[length++] = char.ToLowerInvariant(c); // Java's Character.toLowerCase is locale-independent, so ToLowerInvariant is the matching behavior
+                            tokenType = SINGLE_TOKEN_TYPE;
+
+                            // break the procedure if buffer overflowed!
+                            if (length == MAX_WORD_LEN)
+                            {
+                                break;
+                            }
+                        }
+                        else if (length > 0)
+                        {
+                            if (preIsTokened == true)
+                            {
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // non-ASCII letter, e.g. "C1C2C3C4"
+                        if (char.IsLetter(c))
+                        {
+                            if (length == 0)
+                            {
+                                start = offset - 1;
+                                buffer[length++] = c;
+                                tokenType = DOUBLE_TOKEN_TYPE;
+                            }
+                            else
+                            {
+                                if (tokenType == SINGLE_TOKEN_TYPE)
+                                {
+                                    offset--;
+                                    bufferIndex--;
+
+                                    //return the previous ASCII characters
+                                    break;
+                                }
+                                else
+                                {
+                                    buffer[length++] = c;
+                                    tokenType = DOUBLE_TOKEN_TYPE;
+
+                                    if (length == 2)
+                                    {
+                                        offset--;
+                                        bufferIndex--;
+                                        preIsTokened = true;
+
+                                        break;
+                                    }
+                                }
+                            }
+                        }
+                        else if (length > 0)
+                        {
+                            if (preIsTokened == true)
+                            {
+                                // empty the buffer
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (length > 0)
+                {
+                    termAtt.SetTermBuffer(buffer, 0, length);
+                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+                    typeAtt.SetType(TOKEN_TYPE_NAMES[tokenType]);
+                    return true;
+                }
+                else if (dataLen <= 0)
+                {
+                    offset--;
+                    return false;
+                }
+
+                // Cycle back and try for the next token (don't
+                // return an empty string)
+            }
+        }
+
+        public override void End()
+        {
+            // set final offset
+            int finalOffset = CorrectOffset(offset);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        public override void Reset()
+        {
+            base.Reset();
+            offset = bufferIndex = dataLen = 0;
+            preIsTokened = false;
+            tokenType = WORD_TYPE;
+        }
+
+        public override void Reset(TextReader reader)
+        {
+            base.Reset(reader);
+            Reset();
+        }
+    }
 }

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs Mon Nov 21 04:44:55 2011
@@ -28,86 +28,58 @@ using Lucene.Net.Analysis;
 
 namespace Lucene.Net.Analysis.Cn
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
+    /// <summary>
+    /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
+    /// filters with <see cref="ChineseFilter"/>
+    /// </summary>
+    public class ChineseAnalyzer : Analyzer
+    {
 
-	/// <summary>
-	/// Title: ChineseAnalyzer
-	/// Description:
-	///   Subclass of org.apache.lucene.analysis.Analyzer
-	///   build from a ChineseTokenizer, filtered with ChineseFilter.
-	/// Copyright:   Copyright (c) 2001
-	/// Company:
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
-	/// </summary>
-	public class ChineseAnalyzer : Analyzer 
-	{
+        public ChineseAnalyzer()
+        {
+        }
 
-		public ChineseAnalyzer() 
-		{
-		}
+        /// <summary>
+        /// Creates a TokenStream which tokenizes all the text in the provided Reader.
+        /// </summary>
+        /// <returns>A TokenStream built from a ChineseTokenizer, filtered with ChineseFilter.</returns>
+        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream result = new ChineseTokenizer(reader);
+            result = new ChineseFilter(result);
+            return result;
+        }
 
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
-		/// </summary>
-		/// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			TokenStream result = new ChineseTokenizer(reader);
-			result = new ChineseFilter(result);
-			return result;
-		}
-	}
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /// <summary>
+        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the
+        /// provided <see cref="TextReader"/>.
+        /// </summary>
+        /// <returns>
+        ///   A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/> 
+        ///   filtered with <see cref="ChineseFilter"/>.
+        /// </returns>
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            /* tokenStream() is final, no back compat issue */
+            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new ChineseTokenizer(reader);
+                streams.result = new ChineseFilter(streams.source);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
 }


