lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ccurr...@apache.org
Subject [Lucene.Net] svn commit: r1294875 [2/45] - in /incubator/lucene.net/trunk: ./ build/ build/vs2010/contrib/ build/vs2010/test/ doc/ src/ src/contrib/Analyzers/ src/contrib/Analyzers/AR/ src/contrib/Analyzers/BR/ src/contrib/Analyzers/CJK/ src/contrib/Analyzers/Cn/ s...
Date Tue, 28 Feb 2012 22:43:28 GMT
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -1,4 +1,4 @@
-/*
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,11 +15,14 @@
  * limitations under the License.
  */
 
+using System;
 using System.Collections;
-
+using System.Collections.Generic;
+using System.Linq;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Standard;
 using System.IO;
+using Version = Lucene.Net.Util.Version;
 
 /**
  * Analyzer for Brazilian language. Supports an external list of stopwords (words that
@@ -31,110 +34,216 @@ namespace Lucene.Net.Analysis.BR
 {
     public sealed class BrazilianAnalyzer : Analyzer
     {
-
         /**
          * List of typical Brazilian stopwords.
          */
+        //TODO: Make this private in 3.1
         public static string[] BRAZILIAN_STOP_WORDS = {
-      "a","ainda","alem","ambas","ambos","antes",
-      "ao","aonde","aos","apos","aquele","aqueles",
-      "as","assim","com","como","contra","contudo",
-      "cuja","cujas","cujo","cujos","da","das","de",
-      "dela","dele","deles","demais","depois","desde",
-      "desta","deste","dispoe","dispoem","diversa",
-      "diversas","diversos","do","dos","durante","e",
-      "ela","elas","ele","eles","em","entao","entre",
-      "essa","essas","esse","esses","esta","estas",
-      "este","estes","ha","isso","isto","logo","mais",
-      "mas","mediante","menos","mesma","mesmas","mesmo",
-      "mesmos","na","nas","nao","nas","nem","nesse","neste",
-      "nos","o","os","ou","outra","outras","outro","outros",
-      "pelas","pelas","pelo","pelos","perante","pois","por",
-      "porque","portanto","proprio","propios","quais","qual",
-      "qualquer","quando","quanto","que","quem","quer","se",
-      "seja","sem","sendo","seu","seus","sob","sobre","sua",
-      "suas","tal","tambem","teu","teus","toda","todas","todo",
-      "todos","tua","tuas","tudo","um","uma","umas","uns"};
+                                                          "a", "ainda", "alem", "ambas", "ambos", "antes",
+                                                          "ao", "aonde", "aos", "apos", "aquele", "aqueles",
+                                                          "as", "assim", "com", "como", "contra", "contudo",
+                                                          "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
+                                                          "dela", "dele", "deles", "demais", "depois", "desde",
+                                                          "desta", "deste", "dispoe", "dispoem", "diversa",
+                                                          "diversas", "diversos", "do", "dos", "durante", "e",
+                                                          "ela", "elas", "ele", "eles", "em", "entao", "entre",
+                                                          "essa", "essas", "esse", "esses", "esta", "estas",
+                                                          "este", "estes", "ha", "isso", "isto", "logo", "mais",
+                                                          "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
+                                                          "mesmos", "na", "nas", "nao", "nas", "nem", "nesse", "neste",
+                                                          "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
+                                                          "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por",
+                                                          "porque", "portanto", "proprio", "propios", "quais", "qual",
+                                                          "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
+                                                          "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
+                                                          "suas", "tal", "tambem", "teu", "teus", "toda", "todas",
+                                                          "todo",
+                                                          "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
+                                                      };
 
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop-words set.
+        /// </summary>
+        /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
 
-        /**
-         * Contains the stopwords used with the StopFilter.
-         */
-        private Hashtable stoptable = new Hashtable();
+        private static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_STOP_SET =
+                CharArraySet.UnmodifiableSet(new CharArraySet(BRAZILIAN_STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter.
+        /// </summary>
+        private ISet<string> stoptable = new HashSet<string>();
+
+        private readonly Version matchVersion;
+
+        // TODO: make this private in 3.1
+        /// <summary>
+        /// Contains words that should be indexed but not stemmed.
+        /// </summary>
+        private ISet<string> excltable = new HashSet<string>();
+
+        public BrazilianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+        }
 
         /**
-         * Contains words that should be indexed but not stemmed.
-         */
-        private Hashtable excltable = new Hashtable();
+           * Builds an analyzer with the given stop words
+           * 
+           * @param matchVersion
+           *          lucene compatibility version
+           * @param stopwords
+           *          a stopword set
+           */
+
+        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
 
         /**
-         * Builds an analyzer with the default stop words (<see cref="BRAZILIAN_STOP_WORDS"/>).
+         * Builds an analyzer with the given stop words and stemming exclusion words
+         * 
+         * @param matchVersion
+         *          lucene compatibility version
+         * @param stopwords a stopword set
+         * @param stemExclusionSet a set of words that should be indexed but not stemmed
          */
-        public BrazilianAnalyzer()
+
+        public BrazilianAnalyzer(Version matchVersion, ISet<string> stopwords,
+                                 ISet<string> stemExclusionSet)
+            : this(matchVersion, stopwords)
         {
-            stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
+
+            excltable = CharArraySet.UnmodifiableSet(CharArraySet
+                                                         .Copy(stemExclusionSet));
         }
 
         /**
          * Builds an analyzer with the given stop words.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
          */
-        public BrazilianAnalyzer(string[] stopwords)
+
+        public BrazilianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
         {
-            stoptable = StopFilter.MakeStopSet(stopwords);
+
         }
 
         /**
-         * Builds an analyzer with the given stop words.
-         */
-        public BrazilianAnalyzer(Hashtable stopwords)
+   * Builds an analyzer with the given stop words. 
+   * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+   */
+
+        public BrazilianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
         {
-            stoptable = stopwords;
+
         }
 
         /**
-         * Builds an analyzer with the given stop words.
-         */
-        public BrazilianAnalyzer(FileInfo stopwords)
+   * Builds an analyzer with the given stop words.
+   * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
+   */
+
+        public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
+            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
         {
-            stoptable = WordlistLoader.GetWordtable(stopwords);
         }
 
         /**
          * Builds an exclusionlist from an array of Strings.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
-        public void SetStemExclusionTable(string[] exclusionlist)
+
+        public void SetStemExclusionTable(params string[] exclusionlist)
         {
             excltable = StopFilter.MakeStopSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
+
         /**
-         * Builds an exclusionlist from a Hashtable.
+         * Builds an exclusionlist from the keys of an IDictionary.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
-        public void SetStemExclusionTable(Hashtable exclusionlist)
+
+        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
         {
-            excltable = exclusionlist;
+            excltable = new HashSet<string>(exclusionlist.Keys);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
+
         /**
          * Builds an exclusionlist from the words contained in the given file.
+         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
          */
+
         public void SetStemExclusionTable(FileInfo exclusionlist)
         {
-            excltable = WordlistLoader.GetWordtable(exclusionlist);
+            excltable = WordlistLoader.GetWordSet(exclusionlist);
+            SetPreviousTokenStream(null); // force a new stemmer to be created
         }
 
         /**
-         * Creates a TokenStream which tokenizes all the text in the provided Reader.
+         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
          *
-         * <returns>A TokenStream build from a StandardTokenizer filtered with
-         * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.</returns>
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         * 			{@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
+         *          {@link BrazilianStemFilter}.
          */
-        public override TokenStream TokenStream(string fieldName, TextReader reader)
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
         {
-            TokenStream result = new StandardTokenizer(reader);
+            TokenStream result = new StandardTokenizer(matchVersion, reader);
             result = new LowerCaseFilter(result);
             result = new StandardFilter(result);
-            result = new StopFilter(result, stoptable);
+            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                    result, stoptable);
             result = new BrazilianStemFilter(result, excltable);
             return result;
         }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
+         *          {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and 
+         *          {@link BrazilianStemFilter}.
+         */
+
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new StandardTokenizer(matchVersion, reader);
+                streams.result = new LowerCaseFilter(streams.source);
+                streams.result = new StandardFilter(streams.result);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.result, stoptable);
+                streams.result = new BrazilianStemFilter(streams.result, excltable);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
     }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/BR/BrazilianStemFilter.cs Tue Feb 28 22:43:08 2012
@@ -15,8 +15,11 @@
  * limitations under the License.
  */
 
+using System.Collections.Generic;
 using Lucene.Net.Analysis;
 using System.Collections;
+using Lucene.Net.Analysis.Tokenattributes;
+using Version = Lucene.Net.Util.Version;
 
 
 /**
@@ -33,15 +36,17 @@ namespace Lucene.Net.Analysis.BR
          * The actual token in the input stream.
          */
         private BrazilianStemmer stemmer = null;
-        private Hashtable exclusions = null;
+        private ISet<string> exclusions = null;
+        private TermAttribute termAtt;
 
         public BrazilianStemFilter(TokenStream input)
             : base(input)
         {
-            stemmer = new BrazilianStemmer();
+            stemmer = new BrazilianStemmer();
+            termAtt = AddAttribute<TermAttribute>();
         }
 
-        public BrazilianStemFilter(TokenStream input, Hashtable exclusiontable)
+        public BrazilianStemFilter(TokenStream input, ISet<string> exclusiontable)
             : this(input)
         {
             this.exclusions = exclusiontable;
@@ -50,25 +55,25 @@ namespace Lucene.Net.Analysis.BR
         /**
          * <returns>Returns the next token in the stream, or null at EOS.</returns>
          */
-        public override Token Next(Token reusableToken)
+        public override bool IncrementToken()
         {
-            System.Diagnostics.Trace.Assert(reusableToken != null);
-
-            Token nextToken = input.Next(reusableToken);
-            if (nextToken == null)
-                return null;
-
-            string term = nextToken.TermText();
-
-            // Check the exclusion table.
-            if (exclusions == null || !exclusions.Contains(term))
+            if (input.IncrementToken())
+            {
+                string term = termAtt.Term();
+                // Check the exclusion table.
+                if (exclusions == null || !exclusions.Contains(term))
+                {
+                    string s = stemmer.Stem(term);
+                    // If not stemmed, don't waste the time adjusting the token.
+                    if ((s != null) && !s.Equals(term))
+                        termAtt.SetTermBuffer(s);
+                }
+                return true;
+            }
+            else
             {
-                string s = stemmer.Stem(term);
-                // If not stemmed, don't waste the time adjusting the token.
-                if ((s != null) && !s.Equals(term))
-                    nextToken.SetTermBuffer(s.ToCharArray(), 0, s.Length);//was  SetTermBuffer(s)
+                return false;
             }
-            return nextToken;
         }
     }
-}
\ No newline at end of file
+}

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -20,130 +20,135 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Collections;
 using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.CJK
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 *
-	 * $Id: CJKAnalyzer.java,v 1.5 2004/10/17 11:41:41 dnaber Exp $
-	 */
-
-	/// <summary>
-	/// Filters CJKTokenizer with StopFilter.
-	/// 
-	/// <author>Che, Dong</author>
-	/// </summary>
-	public class CJKAnalyzer : Analyzer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// An array containing some common English words that are not usually
-		/// useful for searching. and some double-byte interpunctions.....
-		/// </summary>
-		public static String[] stopWords = 
-		{
-			"a", "and", "are", "as", "at", "be",
-			"but", "by", "for", "if", "in",
-			"into", "is", "it", "no", "not",
-			"of", "on", "or", "s", "such", "t",
-			"that", "the", "their", "then",
-			"there", "these", "they", "this",
-			"to", "was", "will", "with", "",
-			"www"
-		};
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// stop word list
-		/// </summary>
-		private Hashtable stopTable;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Builds an analyzer which removes words in STOP_WORDS.
-		/// </summary>
-		public CJKAnalyzer() 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		/// <summary>
-		/// Builds an analyzer which removes words in the provided array.
-		/// </summary>
-		/// <param name="stopWords">stop word array</param>
-		public CJKAnalyzer(String[] stopWords) 
-		{
-			stopTable = StopFilter.MakeStopSet(stopWords);
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		/// get token stream from input
-		/// </summary>
-		/// <param name="fieldName">lucene field name</param>
-		/// <param name="reader">input reader</param>
-		/// <returns>Token Stream</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			return new StopFilter(new CJKTokenizer(reader), stopTable);
-		}
-	}
+    /// <summary>
+    /// Filters CJKTokenizer with StopFilter.
+    /// 
+    /// <author>Che, Dong</author>
+    /// </summary>
+    public class CJKAnalyzer : Analyzer
+    {
+        //~ Static fields/initializers ---------------------------------------------
+
+        /// <summary>
+        /// An array containing some common English words that are not usually
+        /// useful for searching, and some double-byte punctuation marks.
+        /// </summary>
+        // TODO make this final in 3.1 -
+        // this might be revised and merged with StopFilter stop words too
+        [Obsolete("use GetDefaultStopSet() instead")] public static String[] STOP_WORDS =
+            {
+                "a", "and", "are", "as", "at", "be",
+                "but", "by", "for", "if", "in",
+                "into", "is", "it", "no", "not",
+                "of", "on", "or", "s", "such", "t",
+                "that", "the", "their", "then",
+                "there", "these", "they", "this",
+                "to", "was", "will", "with", "",
+                "www"
+            };
+
+        //~ Instance fields --------------------------------------------------------
+
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop-words set.
+        /// </summary>
+        /// <returns>Returns an unmodifiable instance of the default stop-words set.</returns>
+        public static ISet<string> GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        private static class DefaultSetHolder
+        {
+            internal static ISet<string> DEFAULT_STOP_SET =
+                CharArraySet.UnmodifiableSet(new CharArraySet(STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// stop word list
+        /// </summary>
+        private ISet<string> stopTable;
+
+        private readonly Version matchVersion;
+
+        //~ Constructors -----------------------------------------------------------
+
+        public CJKAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+
+        }
+
+        public CJKAnalyzer(Version matchVersion, ISet<string> stopWords)
+        {
+            stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Builds an analyzer which removes words in the provided array.
+        /// </summary>
+        /// <param name="stopWords">stop word array</param>
+        public CJKAnalyzer(Version matchVersion, params string[] stopWords)
+        {
+            stopTable = StopFilter.MakeStopSet(stopWords);
+            this.matchVersion = matchVersion;
+        }
+
+        //~ Methods ----------------------------------------------------------------
+
+        /// <summary>
+        /// get token stream from input
+        /// </summary>
+        /// <param name="fieldName">lucene field name</param>
+        /// <param name="reader">input reader</param>
+        /// <returns>Token Stream</returns>
+        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                  new CJKTokenizer(reader), stopTable);
+        }
+
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /**
+         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text 
+         * in the provided {@link Reader}.
+         *
+         * @param fieldName lucene field name
+         * @param reader    Input {@link Reader}
+         * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
+         *    {@link StopFilter}
+         */
+        public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            /* tokenStream() is final, no back compat issue */
+            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+            if (streams == null)
+            {
+                streams = new SavedStreams();
+                streams.source = new CJKTokenizer(reader);
+                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                                streams.source, stopTable);
+                SetPreviousTokenStream(streams);
+            }
+            else
+            {
+                streams.source.Reset(reader);
+            }
+            return streams.result;
+        }
+    }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/CJK/CJKTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -20,331 +20,380 @@
 */
 
 using System;
+using System.Globalization;
 using System.IO;
 using System.Text;
+using System.Text.RegularExpressions;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis.CJK
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// <p>
-	/// CJKTokenizer was modified from StopTokenizer which does a decent job for
-	/// most European languages. and it perferm other token method for double-byte
-	/// Characters: the token will return at each two charactors with overlap match.<br/>
-	/// Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
-	/// also need filter filter zero length token ""<br/>
-	/// for Digit: digit, '+', '#' will token as letter<br/>
-	/// for more info on Asia language(Chinese Japanese Korean) text segmentation:
-	/// please search  <a
-	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
-	/// </p>
-	/// 
-	/// @author Che, Dong
-	/// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
-	/// </summary>
-	public sealed class CJKTokenizer : Tokenizer 
-	{
-		//~ Static fields/initializers ---------------------------------------------
-
-		/// <summary>
-		/// Max word length
-		/// </summary>
-		private static int MAX_WORD_LEN = 255;
-
-		/// <summary>
-		/// buffer size
-		/// </summary>
-		private static int IO_BUFFER_SIZE = 256;
-
-		//~ Instance fields --------------------------------------------------------
-
-		/// <summary>
-		/// word offset, used to imply which character(in ) is parsed
-		/// </summary>
-		private int offset = 0;
-
-		/// <summary>
-		/// the index used only for ioBuffer
-		/// </summary>
-		private int bufferIndex = 0;
-
-		/// <summary>
-		/// data length
-		/// </summary>
-		private int dataLen = 0;
-
-		/// <summary>
-		/// character buffer, store the characters which are used to compose <br/>
-		/// the returned Token
-		/// </summary>
-		private char[] buffer = new char[MAX_WORD_LEN];
-
-		/// <summary>
-		/// I/O buffer, used to store the content of the input(one of the <br/>
-		/// members of Tokenizer)
-		/// </summary>
-		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-		/// <summary>
-		/// word type: single=>ASCII  double=>non-ASCII word=>default 
-		/// </summary>
-		private String tokenType = "word";
-
-		/// <summary>
-		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
-		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
-		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
-		/// </summary>
-		private bool preIsTokened = false;
-
-		//~ Constructors -----------------------------------------------------------
-
-		/// <summary>
-		/// Construct a token stream processing the given input.
-		/// </summary>
-		/// <param name="_in">I/O reader</param>
-		public CJKTokenizer(TextReader _in) 
-		{
-			input = _in;
-		}
-
-		//~ Methods ----------------------------------------------------------------
-
-		/// <summary>
-		///  Returns the next token in the stream, or null at EOS.
-		/// </summary>
-		/// <returns>Token</returns>
-		public override Token Next()
-		{
-			/** how many character(s) has been stored in buffer */
-			int length = 0;
-
-			/** the position used to create Token */
-			int start = offset;
-
-			while (true) 
-			{
-				/** current charactor */
-				char c;
-
-				/** unicode block of current charactor for detail */
-				//Character.UnicodeBlock ub;
-
-				offset++;
-
-				if (bufferIndex >= dataLen) 
-				{
-					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
-					bufferIndex = 0;
-				}
-
-				if (dataLen == 0) 
-				{
-					if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						}
-
-						break;
-					} 
-					else 
-					{
-						return null;
-					}
-				} 
-				else 
-				{
-					//get current character
-					c = ioBuffer[bufferIndex++];
-
-					//get the UnicodeBlock of the current character
-					//ub = Character.UnicodeBlock.of(c);
-				}
-
-				//if the current character is ASCII or Extend ASCII
-				if (('\u0000' <= c && c <= '\u007F') || 
-					('\uFF00' <= c && c <= '\uFFEF')) 
-				{
-					if ('\uFF00' <= c && c <= '\uFFEF')
-					{
-						/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
-						int i = (int) c;
-						i = i - 65248;
-						c = (char) i;
-					}
-
-					// if the current character is a letter or "_" "+" "#"
-					if (Char.IsLetterOrDigit(c)
-						|| ((c == '_') || (c == '+') || (c == '#'))
-						) 
-					{
-						if (length == 0) 
-						{
-							// "javaC1C2C3C4linux" <br/>
-							//      ^--: the current character begin to token the ASCII
-							// letter
-							start = offset - 1;
-						} 
-						else if (tokenType == "double") 
-						{
-							// "javaC1C2C3C4linux" <br/>
-							//              ^--: the previous non-ASCII
-							// : the current character
-							offset--;
-							bufferIndex--;
-							tokenType = "single";
-
-							if (preIsTokened == true) 
-							{
-								// there is only one non-ASCII has been stored
-								length = 0;
-								preIsTokened = false;
-
-								break;
-							} 
-							else 
-							{
-								break;
-							}
-						}
-
-						// store the LowerCase(c) in the buffer
-						buffer[length++] = Char.ToLower(c);
-						tokenType = "single";
-
-						// break the procedure if buffer overflowed!
-						if (length == MAX_WORD_LEN) 
-						{
-							break;
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				} 
-				else 
-				{
-					// non-ASCII letter, eg."C1C2C3C4"
-					if (Char.IsLetter(c)) 
-					{
-						if (length == 0) 
-						{
-							start = offset - 1;
-							buffer[length++] = c;
-							tokenType = "double";
-						} 
-						else 
-						{
-							if (tokenType == "single") 
-							{
-								offset--;
-								bufferIndex--;
-
-								//return the previous ASCII characters
-								break;
-							} 
-							else 
-							{
-								buffer[length++] = c;
-								tokenType = "double";
-
-								if (length == 2) 
-								{
-									offset--;
-									bufferIndex--;
-									preIsTokened = true;
-
-									break;
-								}
-							}
-						}
-					} 
-					else if (length > 0) 
-					{
-						if (preIsTokened == true) 
-						{
-							// empty the buffer
-							length = 0;
-							preIsTokened = false;
-						} 
-						else 
-						{
-							break;
-						}
-					}
-				}
-			}
-
-			return new Token(new String(buffer, 0, length), start, start + length,
-				tokenType
-				);
-		}
-	}
-
+    /// <summary>
+    /// <p>
+    /// CJKTokenizer was modified from StopTokenizer, which does a decent job for
+    /// most European languages. It uses a different tokenization method for double-byte
+    /// chars: a token is returned for every two characters, with overlap.<br/>
+    /// Example: "java C1C2C3C4" is segmented into: "java" "C1C2" "C2C3" "C3C4".
+    /// Zero-length tokens ("") are filtered out and never returned.<br/>
+    /// Digits, '_', '+' and '#' are tokenized as letters.<br/>
+    /// For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
+    /// please search  <a
+    /// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+    /// </p>
+    /// 
+    /// @author Che, Dong
+    /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $
+    /// </summary>
+    public sealed class CJKTokenizer : Tokenizer
+    {
+        //~ Static fields/initializers ---------------------------------------------
+        /// <summary>
+        /// Word token type (index into <see cref="TOKEN_TYPE_NAMES"/>)
+        /// </summary>
+        internal static readonly int WORD_TYPE = 0;
+
+        /// <summary>
+        /// Single byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>)
+        /// </summary>
+        internal static readonly int SINGLE_TOKEN_TYPE = 1;
+
+        /// <summary>
+        /// Double byte token type (index into <see cref="TOKEN_TYPE_NAMES"/>)
+        /// </summary>
+        internal static readonly int DOUBLE_TOKEN_TYPE = 2;
+
+        /// <summary>
+        /// Names for token types, indexed by the token type constants above
+        /// </summary>
+        internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
+        /// <summary>
+        /// Max word length
+        /// </summary>
+        internal static readonly int MAX_WORD_LEN = 255;
+
+        /// <summary>
+        /// buffer size
+        /// </summary>
+        internal static readonly int IO_BUFFER_SIZE = 256;
+
+        //~ Instance fields --------------------------------------------------------
+
+        /// <summary>
+        /// running character offset into the input; used for token start offsets and the final offset
+        /// </summary>
+        private int offset = 0;
+
+        /// <summary>
+        /// the index used only for ioBuffer
+        /// </summary>
+        private int bufferIndex = 0;
+
+        /// <summary>
+        /// number of characters placed in ioBuffer by the last read
+        /// </summary>
+        private int dataLen = 0;
+
+        /// <summary>
+        /// character buffer, stores the characters which are used to compose <br/>
+        /// the returned Token
+        /// </summary>
+        private readonly char[] buffer = new char[MAX_WORD_LEN];
+
+        /// <summary>
+        /// I/O buffer, used to store the content of the input (one of the <br/>
+        /// members of Tokenizer)
+        /// </summary>
+        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+        /// <summary>
+        /// word type: single=>ASCII  double=>non-ASCII word=>default
+        /// </summary>
+        private int tokenType = WORD_TYPE;
+
+        /// <summary>
+        /// tag: previous character is a cached double-byte character  "C1C2C3C4"
+        /// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+        /// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+        /// </summary>
+        private bool preIsTokened = false;
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+        private TypeAttribute typeAtt;
+
+        //~ Constructors -----------------------------------------------------------
+
+        /// <summary>
+        /// Construct a token stream processing the given input.
+        /// </summary>
+        /// <param name="_in">I/O reader</param>
+        public CJKTokenizer(TextReader _in)
+            : base(_in)
+        {
+            Init();
+        }
+
+        public CJKTokenizer(AttributeSource source, TextReader _in)
+            : base(source, _in)
+        {
+            Init();
+        }
+
+        public CJKTokenizer(AttributeFactory factory, TextReader _in)
+            : base(factory, _in)
+        {
+            Init();
+        }
+
+        private void Init()
+        {
+            termAtt = AddAttribute<TermAttribute>();
+            offsetAtt = AddAttribute<OffsetAttribute>();
+            typeAtt = AddAttribute<TypeAttribute>();
+        }
+
+        //~ Methods ----------------------------------------------------------------
+
+        /// <summary>True if <paramref name="c"/> is in the Basic Latin block (U+0000-U+007F).</summary>
+        private static bool IsBasicLatin(char c) { return c <= '\u007F'; }
+
+        /// <summary>True if <paramref name="c"/> is in the Halfwidth and Fullwidth Forms block (U+FF00-U+FFEF).</summary>
+        private static bool IsHalfWidthOrFullWidthForm(char c) { return c >= '\uFF00' && c <= '\uFFEF'; }
+
+        /// <summary>
+        /// Advances to the next token in the stream and stores its term text, offsets and
+        /// type in the term, offset and type attributes.
+        /// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
+        /// for detail on the Unicode blocks used to classify characters.
+        /// </summary>
+        /// <returns>false for end of stream, true otherwise</returns>
+        /// <exception cref="IOException">thrown when a read error happens while
+        /// reading from the input</exception>
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            // Outer loop: keep scanning until a non-empty token is found or the input ends.
+
+            while (true)
+            {
+                // number of characters stored in buffer for the current token
+                int length = 0;
+
+                // offset of the token's first character; re-assigned below when the first
+                // character of a token is actually stored
+                int start = offset;
+
+                while (true)
+                {
+                    // loop until we've found a full token
+                    // current character
+                    char c;
+
+                    offset++;
+
+                    if (bufferIndex >= dataLen)
+                    {
+                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+                        bufferIndex = 0;
+                    }
+
+                    if (dataLen == 0) // input.Read returns 0 at end of input, not -1 as in Java
+                    {
+                        if (length > 0)
+                        {
+                            if (preIsTokened)
+                            {
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                offset--;
+                            }
+
+                            break;
+                        }
+                        else
+                        {
+                            offset--;
+                            return false;
+                        }
+                    }
+                    else
+                    {
+                        //get current character
+                        c = ioBuffer[bufferIndex++];
+                    }
+
+                    // Classify the character by Unicode block with plain range checks.
+                    // These cover the same ranges as the \p{IsBasicLatin} and
+                    // \p{IsHalfwidthandFullwidthForms} regex blocks, but avoid allocating a
+                    // string and running a Regex match for every single input character,
+                    // and avoid compiling two Regex instances per tokenizer.
+                    bool isHalfFullForm = IsHalfWidthOrFullWidthForm(c);
+                    //if the current character is ASCII or Extend ASCII
+                    if (IsBasicLatin(c) || isHalfFullForm)
+                    {
+                        if (isHalfFullForm)
+                        {
+                            int i = (int) c;
+                            if (i >= 65281 && i <= 65374)
+                            {
+                                // convert U+FF01-U+FF5E (HALFWIDTH_AND_FULLWIDTH_FORMS) to BASIC_LATIN by subtracting 0xFEE0
+                                i = i - 65248;
+                                c = (char) i;
+                            }
+                        }
+
+                        // if the current character is a letter or "_" "+" "#"
+                        if (char.IsLetterOrDigit(c)
+                            || ((c == '_') || (c == '+') || (c == '#'))
+                            )
+                        {
+                            if (length == 0)
+                            {
+                                // "javaC1C2C3C4linux" <br>
+                                //      ^--: the current character begin to token the ASCII
+                                // letter
+                                start = offset - 1;
+                            }
+                            else if (tokenType == DOUBLE_TOKEN_TYPE)
+                            {
+                                // "javaC1C2C3C4linux" <br>
+                                //              ^--: the previous non-ASCII
+                                // : the current character
+                                offset--;
+                                bufferIndex--;
+
+                                if (preIsTokened)
+                                {
+                                    // only one non-ASCII character has been stored
+                                    length = 0;
+                                    preIsTokened = false;
+                                }
+
+                                break;
+                            }
+
+                            // store the LowerCase(c) in the buffer
+                            buffer[length++] = char.ToLowerInvariant(c); // culture-independent, like Java's Character.toLowerCase
+                            tokenType = SINGLE_TOKEN_TYPE;
+
+                            // break the procedure if buffer overflowed!
+                            if (length == MAX_WORD_LEN)
+                            {
+                                break;
+                            }
+                        }
+                        else if (length > 0)
+                        {
+                            if (preIsTokened)
+                            {
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // non-ASCII letter, e.g."C1C2C3C4"
+                        if (char.IsLetter(c))
+                        {
+                            if (length == 0)
+                            {
+                                start = offset - 1;
+                                buffer[length++] = c;
+                                tokenType = DOUBLE_TOKEN_TYPE;
+                            }
+                            else
+                            {
+                                if (tokenType == SINGLE_TOKEN_TYPE)
+                                {
+                                    offset--;
+                                    bufferIndex--;
+
+                                    //return the previous ASCII characters
+                                    break;
+                                }
+                                else
+                                {
+                                    buffer[length++] = c;
+                                    tokenType = DOUBLE_TOKEN_TYPE;
+
+                                    if (length == 2)
+                                    {
+                                        offset--;
+                                        bufferIndex--;
+                                        preIsTokened = true;
+
+                                        break;
+                                    }
+                                }
+                            }
+                        }
+                        else if (length > 0)
+                        {
+                            if (preIsTokened)
+                            {
+                                // empty the buffer
+                                length = 0;
+                                preIsTokened = false;
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (length > 0)
+                {
+                    termAtt.SetTermBuffer(buffer, 0, length);
+                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+                    typeAtt.SetType(TOKEN_TYPE_NAMES[tokenType]);
+                    return true;
+                }
+                else if (dataLen == 0)
+                {
+                    offset--;
+                    return false;
+                }
+
+                // Cycle back and try for the next token (don't
+                // return an empty string)
+            }
+        }
+
+        /// <summary>Sets the start and end of the offset attribute to the corrected final offset.</summary>
+        public override void End()
+        {
+            // set final offset
+            int finalOffset = CorrectOffset(offset);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        /// <summary>Resets the base class, then clears the read position, lookahead flag and token type.</summary>
+        public override void Reset()
+        {
+            base.Reset();
+            offset = bufferIndex = dataLen = 0;
+            preIsTokened = false;
+            tokenType = WORD_TYPE;
+        }
+
+        /// <summary>Resets the base class with <paramref name="reader"/>, then clears this tokenizer's own state.</summary>
+        public override void Reset(TextReader reader)
+        {
+            base.Reset(reader);
+            Reset();
+        }
+    }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseAnalyzer.cs Tue Feb 28 22:43:08 2012
@@ -28,86 +28,58 @@ using Lucene.Net.Analysis;
 
 namespace Lucene.Net.Analysis.Cn
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
+    /// <summary>
+    /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
+    /// filters with <see cref="ChineseFilter"/>
+    /// </summary>
+    public class ChineseAnalyzer : Analyzer
+    {
 
-	/// <summary>
-	/// Title: ChineseAnalyzer
-	/// Description:
-	///   Subclass of org.apache.lucene.analysis.Analyzer
-	///   build from a ChineseTokenizer, filtered with ChineseFilter.
-	/// Copyright:   Copyright (c) 2001
-	/// Company:
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseAnalyzer.java, v 1.2 2003/01/22 20:54:47 ehatcher Exp $</version>
-	/// </summary>
-	public class ChineseAnalyzer : Analyzer 
-	{
+        public ChineseAnalyzer()
+        { // intentionally empty: this constructor takes no arguments and sets no state
+        }
 
-		public ChineseAnalyzer() 
-		{
-		}
+        /// <summary>Creates a new tokenizer/filter chain over all the text in the provided reader.</summary>
+        /// <param name="fieldName">name of the field being analyzed (not read by this method)</param>
+        /// <param name="reader">source of the text to tokenize</param>
+        /// <returns>A <see cref="ChineseFilter"/> reading from a
+        /// <see cref="ChineseTokenizer"/>.</returns>
+        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream tokenizer = new ChineseTokenizer(reader);
+            return new ChineseFilter(tokenizer);
+        }
 
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided Reader.
-		/// </summary>
-		/// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
-		public override sealed TokenStream TokenStream(String fieldName, TextReader reader) 
-		{
-			TokenStream result = new ChineseTokenizer(reader);
-			result = new ChineseFilter(result);
-			return result;
-		}
-	}
+        private class SavedStreams // holder for the tokenizer/filter pair that ReusableTokenStream stores and reuses
+        {
+            protected internal Tokenizer source; // head of the chain; Reset(reader) is called on it when the pair is reused
+            protected internal TokenStream result; // end of the chain; this is what ReusableTokenStream returns
+        };
+
+        /// <summary>
+        /// Returns a <see cref="TokenStream"/> over the text of <paramref name="reader"/>, reusing the
+        /// streams returned by GetPreviousTokenStream() when there are any.
+        /// </summary>
+        /// <param name="fieldName">name of the field being analyzed (not read by this method)</param>
+        /// <param name="reader">source of the text to tokenize</param>
+        /// <returns>A <see cref="ChineseFilter"/> reading from a
+        /// <see cref="ChineseTokenizer"/>.</returns>
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            /* tokenStream() is final, no back compat issue */
+            SavedStreams cached = (SavedStreams) GetPreviousTokenStream();
+            if (cached != null)
+            {
+                // Reuse path: only the tokenizer is pointed at the new reader; the filter is kept.
+                cached.source.Reset(reader);
+                return cached.result;
+            }
+
+            cached = new SavedStreams();
+            cached.source = new ChineseTokenizer(reader);
+            cached.result = new ChineseFilter(cached.source);
+            SetPreviousTokenStream(cached);
+            return cached.result;
+        }
+    }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseFilter.cs Tue Feb 28 22:43:08 2012
@@ -25,135 +25,75 @@ using System.Collections;
 using System.Globalization;
 
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Cn
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Title: ChineseFilter
-	/// Description: Filter with a stop word table
-	///              Rule: No digital is allowed.
-	///                    English word/token should larger than 1 character.
-	///                    One Chinese character as one Chinese word.
-	/// TO DO:
-	///   1. Add Chinese stop words, such as \ue400
-	///   2. Dictionary based Chinese word extraction
-	///   3. Intelligent Chinese word extraction
-	/// 
-	/// Copyright:    Copyright (c) 2001
-	/// Company:
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $</version>
-	/// </summary>
-	public sealed class ChineseFilter : TokenFilter 
-	{
-		// Only English now, Chinese to be added later.
-		public static String[] STOP_WORDS = 
-				 {
-					 "and", "are", "as", "at", "be", "but", "by",
-					 "for", "if", "in", "into", "is", "it",
-					 "no", "not", "of", "on", "or", "such",
-					 "that", "the", "their", "then", "there", "these",
-					 "they", "this", "to", "was", "will", "with"
-				 };
-
-		private Hashtable stopTable;
-
-		public ChineseFilter(TokenStream _in) : base (_in)
-		{
-			stopTable = new Hashtable(STOP_WORDS.Length);
-
-			for (int i = 0; i < STOP_WORDS.Length; i++)
-				stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
-		}
-
-		public override Token Next()
-		{
-
-			for (Token token = input.Next(); token != null; token = input.Next()) 
-			{
-				String text = token.TermText();
-
-				// why not key off token type here assuming ChineseTokenizer comes first?
-				if (stopTable[text] == null) 
-				{
-					switch (Char.GetUnicodeCategory(text[0])) 
-					{
-
-						case UnicodeCategory.LowercaseLetter:
-						case UnicodeCategory.UppercaseLetter:
-
-							// English word/token should larger than 1 character.
-							if (text.Length > 1) 
-							{
-								return token;
-							}
-							break;
-						case UnicodeCategory.OtherLetter:
-
-							// One Chinese character as one Chinese word.
-							// Chinese word extraction to be added later here.
-
-							return token;
-					}
-
-				}
-
-			}
-			return null;
-		}
-	}
+    // TODO: convert this XML code to valid .NET
+    /// <summary>
+    /// A <see cref="TokenFilter"/> with a stop word table.
+    /// <ul>
+    /// <li>Numeric tokens are removed.</li>
+    /// <li>English tokens must be larger than 1 char.</li>
+    /// <li>One Chinese char as one Chinese word.</li>
+    /// </ul>
+    /// TO DO:
+    /// <ol>
+    /// <li>Add Chinese stop words, such as \ue400</li>
+    /// <li>Dictionary based Chinese word extraction</li>
+    /// <li>Intelligent Chinese word extraction</li>
+    /// </ol>
+    /// </summary>
+    public sealed class ChineseFilter : TokenFilter
+    {
+        // Only English now, Chinese to be added later.
+        public static String[] STOP_WORDS =
+            {
+                "and", "are", "as", "at", "be", "but", "by",
+                "for", "if", "in", "into", "is", "it",
+                "no", "not", "of", "on", "or", "such",
+                "that", "the", "their", "then", "there", "these",
+                "they", "this", "to", "was", "will", "with"
+            };
+
+        private CharArraySet stopTable;
+        private TermAttribute termAtt;
+
+        public ChineseFilter(TokenStream _in)
+            : base(_in)
+        {
+            stopTable = new CharArraySet(STOP_WORDS, false);
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        public override bool IncrementToken()
+        {
+            while (input.IncrementToken())
+            {
+                char[] text = termAtt.TermBuffer();
+                int termLength = termAtt.TermLength();
+
+                // why not key off token type here assuming ChineseTokenizer comes first?
+                if (!stopTable.Contains(text, 0, termLength))
+                {
+                    switch (char.GetUnicodeCategory(text[0]))
+                    {
+                        case UnicodeCategory.LowercaseLetter:
+                        case UnicodeCategory.UppercaseLetter:
+                            // English word/token should be longer than 1 char.
+                            if (termLength > 1)
+                            {
+                                return true;
+                            }
+                            break;
+                        case UnicodeCategory.OtherLetter:
+                            // One Chinese char as one Chinese word.
+                            // Chinese word extraction to be added later here.
+                            return true;
+                    }
+                }
+            }
+            return false;
+        }
+    }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Cn/ChineseTokenizer.cs Tue Feb 28 22:43:08 2012
@@ -26,175 +26,166 @@ using System.Collections;
 using System.Globalization;
 
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis.Cn
 {
-	/* ====================================================================
-	 * The Apache Software License, Version 1.1
-	 *
-	 * Copyright (c) 2004 The Apache Software Foundation.  All rights
-	 * reserved.
-	 *
-	 * Redistribution and use in source and binary forms, with or without
-	 * modification, are permitted provided that the following conditions
-	 * are met:
-	 *
-	 * 1. Redistributions of source code must retain the above copyright
-	 *    notice, this list of conditions and the following disclaimer.
-	 *
-	 * 2. Redistributions in binary form must reproduce the above copyright
-	 *    notice, this list of conditions and the following disclaimer in
-	 *    the documentation and/or other materials provided with the
-	 *    distribution.
-	 *
-	 * 3. The end-user documentation included with the redistribution,
-	 *    if any, must include the following acknowledgment:
-	 *       "This product includes software developed by the
-	 *        Apache Software Foundation (http://www.apache.org/)."
-	 *    Alternately, this acknowledgment may appear in the software itself,
-	 *    if and wherever such third-party acknowledgments normally appear.
-	 *
-	 * 4. The names "Apache" and "Apache Software Foundation" and
-	 *    "Apache Lucene" must not be used to endorse or promote products
-	 *    derived from this software without prior written permission. For
-	 *    written permission, please contact apache@apache.org.
-	 *
-	 * 5. Products derived from this software may not be called "Apache",
-	 *    "Apache Lucene", nor may "Apache" appear in their name, without
-	 *    prior written permission of the Apache Software Foundation.
-	 *
-	 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
-	 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-	 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-	 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
-	 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-	 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-	 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-	 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-	 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-	 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-	 * SUCH DAMAGE.
-	 * ====================================================================
-	 *
-	 * This software consists of voluntary contributions made by many
-	 * individuals on behalf of the Apache Software Foundation.  For more
-	 * information on the Apache Software Foundation, please see
-	 * <http://www.apache.org/>.
-	 */
-
-	/// <summary>
-	/// Title: ChineseTokenizer
-	/// Description: Extract tokens from the Stream using Character.getType()
-	///              Rule: A Chinese character as a single token
-	/// Copyright:   Copyright (c) 2001
-	/// Company:
-	/// 
-	/// The difference between thr ChineseTokenizer and the
-	/// CJKTokenizer (id=23545) is that they have different
-	/// token parsing logic.
-	/// 
-	/// Let me use an example. If having a Chinese text
-	/// "C1C2C3C4" to be indexed, the tokens returned from the
-	/// ChineseTokenizer are C1, C2, C3, C4. And the tokens
-	/// returned from the CJKTokenizer are C1C2, C2C3, C3C4.
-	/// 
-	/// Therefore the index the CJKTokenizer created is much
-	/// larger.
-	/// 
-	/// The problem is that when searching for C1, C1C2, C1C3,
-	/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
-	/// CJKTokenizer will not work.
-	/// <author>Yiyi Sun</author>
-	/// <version>$Id: ChineseTokenizer.java, v 1.4 2003/03/02 13:56:03 otis Exp $</version>
-	/// </summary>
-	public sealed class ChineseTokenizer : Tokenizer 
-	{
-
-
-		public ChineseTokenizer(TextReader _in) 
-		{
-			input = _in;
-		}
-
-		private int offset = 0, bufferIndex=0, dataLen=0;
-		private static int MAX_WORD_LEN = 255;
-		private static int IO_BUFFER_SIZE = 1024;
-		private char[] buffer = new char[MAX_WORD_LEN];
-		private char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-		private int length;
-		private int start;
-
-		private void Push(char c) 
-		{
-
-			if (length == 0) start = offset-1;            // start of token
-			buffer[length++] = Char.ToLower(c);  // buffer it
-
-		}
-
-		private Token Flush() 
-		{
-
-			if (length > 0) 
-			{
-				//System.out.println(new String(buffer, 0, length));
-				return new Token(new String(buffer, 0, length), start, start+length);
-			}
-			else
-				return null;
-		}
-
-		public override Token Next()
-		{
-
-			length = 0;
-			start = offset;
-
-
-			while (true) 
-			{
-
-				char c;
-				offset++;
-
-				if (bufferIndex >= dataLen) 
-				{
-					dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
-					bufferIndex = 0;
-				};
-
-				if (dataLen == 0) return Flush();
-				else
-					c = ioBuffer[bufferIndex++];
-
-
-				switch(Char.GetUnicodeCategory(c)) 
-				{
-
-					case UnicodeCategory.DecimalDigitNumber:
-					case UnicodeCategory.LowercaseLetter:
-					case UnicodeCategory.UppercaseLetter:
-						Push(c);
-						if (length == MAX_WORD_LEN) return Flush();
-						break;
-
-					case UnicodeCategory.OtherLetter:
-						if (length>0) 
-						{
-							bufferIndex--;
-							return Flush();
-						}
-						Push(c);
-						return Flush();
-
-					default:
-						if (length>0) return Flush();
-						break;
-				}
-			}
-
-		}
-	}
+    /// <summary>
+    /// Tokenizes Chinese text as individual Chinese characters.
+    /// <p>
+    /// The difference between ChineseTokenizer and
+    /// CJKTokenizer is that they have different
+    /// token parsing logic.
+    /// </p>
+    /// <p>
+    /// For example, if the Chinese text
+    /// "C1C2C3C4" is to be indexed:
+    /// <ul>
+    /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li>
+    /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>
+    /// </ul>
+    /// </p>
+    /// <p>
+    /// Therefore the index created by CJKTokenizer is much larger.
+    /// </p>
+    /// <p>
+    /// The problem is that when searching for C1, C1C2, C1C3,
+    /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+    /// CJKTokenizer will not work.
+    /// </p>
+    /// </summary> 
+    public sealed class ChineseTokenizer : Tokenizer
+    {
+        public ChineseTokenizer(TextReader _in)
+            : base(_in)
+        {
+            Init();
+        }
+
+        public ChineseTokenizer(AttributeSource source, TextReader _in)
+            : base(source, _in)
+        {
+            Init();
+        }
+
+        public ChineseTokenizer(AttributeFactory factory, TextReader _in)
+            : base(factory, _in)
+        {
+            Init();
+        }
+
+        private void Init()
+        {
+            termAtt = AddAttribute<TermAttribute>();
+            offsetAtt = AddAttribute<OffsetAttribute>();
+        }
+
+        private int offset = 0, bufferIndex = 0, dataLen = 0;
+        private static readonly int MAX_WORD_LEN = 255;
+        private static readonly int IO_BUFFER_SIZE = 1024;
+        private readonly char[] buffer = new char[MAX_WORD_LEN];
+        private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+        private int length;
+        private int start;
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+
+        private void Push(char c)
+        {
+            if (length == 0) start = offset - 1; // start of token
+            buffer[length++] = Char.ToLower(c); // buffer it
+        }
+
+        private bool Flush()
+        {
+
+            if (length > 0)
+            {
+                termAtt.SetTermBuffer(buffer, 0, length);
+                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
+                return true;
+            }
+            else
+                return false;
+        }
+
+
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+
+            length = 0;
+            start = offset;
+
+
+            while (true)
+            {
+
+                char c;
+                offset++;
+
+                if (bufferIndex >= dataLen)
+                {
+                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
+                    bufferIndex = 0;
+                }
+
+                if (dataLen == 0)
+                {
+                    offset--;
+                    return Flush();
+                }
+                else
+                    c = ioBuffer[bufferIndex++];
+
+
+                switch (char.GetUnicodeCategory(c))
+                {
+
+                    case UnicodeCategory.DecimalDigitNumber:
+                    case UnicodeCategory.LowercaseLetter:
+                    case UnicodeCategory.UppercaseLetter:
+                        Push(c);
+                        if (length == MAX_WORD_LEN) return Flush();
+                        break;
+
+                    case UnicodeCategory.OtherLetter:
+                        if (length > 0)
+                        {
+                            bufferIndex--;
+                            offset--;
+                            return Flush();
+                        }
+                        Push(c);
+                        return Flush();
+
+                    default:
+                        if (length > 0) return Flush();
+                        break;
+                }
+            }
+        }
+
+        public override sealed void End()
+        {
+            // set final offset
+            int finalOffset = CorrectOffset(offset);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        public override void Reset()
+        {
+            base.Reset();
+            offset = bufferIndex = dataLen = 0;
+        }
+
+        public override void Reset(TextReader input)
+        {
+            base.Reset(input);
+            Reset();
+        }
+    }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1294875&r1=1294874&r2=1294875&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Tue Feb 28 22:43:08 2012
@@ -19,7 +19,6 @@
  under the License.
 
 -->
-
 <Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <PropertyGroup>
     <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
@@ -29,7 +28,7 @@
     <ProjectGuid>{4286E961-9143-4821-B46D-3D39D3736386}</ProjectGuid>
     <OutputType>Library</OutputType>
     <AppDesignerFolder>Properties</AppDesignerFolder>
-    <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+    <RootNamespace>Lucene.Net.Analysis</RootNamespace>
     <AssemblyName>Lucene.Net.Contrib.Analyzers</AssemblyName>
     <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
     <FileAlignment>512</FileAlignment>
@@ -84,16 +83,34 @@
     <Compile Include="Cn\ChineseAnalyzer.cs" />
     <Compile Include="Cn\ChineseFilter.cs" />
     <Compile Include="Cn\ChineseTokenizer.cs" />
+    <Compile Include="Compound\CompoundWordTokenFilterBase.cs" />
+    <Compile Include="Compound\DictionaryCompoundWordTokenFilter.cs" />
+    <Compile Include="Compound\HyphenationCompoundWordTokenFilter.cs" />
+    <Compile Include="Compound\Hyphenation\ByteVector.cs" />
+    <Compile Include="Compound\Hyphenation\CharVector.cs" />
+    <Compile Include="Compound\Hyphenation\Hyphen.cs" />
+    <Compile Include="Compound\Hyphenation\Hyphenation.cs" />
+    <Compile Include="Compound\Hyphenation\HyphenationException.cs" />
+    <Compile Include="Compound\Hyphenation\HyphenationTree.cs" />
+    <Compile Include="Compound\Hyphenation\PatternConsumer.cs" />
+    <Compile Include="Compound\Hyphenation\PatternParser.cs" />
+    <Compile Include="Compound\Hyphenation\TernaryTree.cs" />
     <Compile Include="Cz\CzechAnalyzer.cs" />
     <Compile Include="De\GermanAnalyzer.cs" />
     <Compile Include="De\GermanStemFilter.cs" />
     <Compile Include="De\GermanStemmer.cs" />
-    <Compile Include="De\WordlistLoader.cs" />
+    <Compile Include="El\GreekAnalyzer.cs" />
+    <Compile Include="El\GreekLowerCaseFilter.cs" />
+    <Compile Include="Fa\PersianAnalyzer.cs" />
+    <Compile Include="Fa\PersianNormalizationFilter.cs" />
+    <Compile Include="Fa\PersianNormalizer.cs" />
+    <Compile Include="Fr\ElisionFilter.cs" />
     <Compile Include="Fr\FrenchAnalyzer.cs" />
     <Compile Include="Fr\FrenchStemFilter.cs" />
     <Compile Include="Fr\FrenchStemmer.cs" />
     <Compile Include="Miscellaneous\EmptyTokenStream.cs" />
     <Compile Include="Miscellaneous\InjectablePrefixAwareTokenFilter.cs" />
+    <Compile Include="Miscellaneous\PatternAnalyzer.cs" />
     <Compile Include="Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />
     <Compile Include="Miscellaneous\PrefixAwareTokenStream.cs" />
     <Compile Include="Miscellaneous\SingleTokenTokenStream.cs" />
@@ -104,10 +121,20 @@
     <Compile Include="Nl\DutchAnalyzer.cs" />
     <Compile Include="Nl\DutchStemFilter.cs" />
     <Compile Include="Nl\DutchStemmer.cs" />
-    <Compile Include="Nl\WordlistLoader.cs" />
+    <Compile Include="Payloads\AbstractEncoder.cs" />
+    <Compile Include="Payloads\DelimitedPayloadTokenFilter.cs" />
+    <Compile Include="Payloads\FloatEncoder.cs" />
+    <Compile Include="Payloads\IdentityEncoder.cs" />
+    <Compile Include="Payloads\IntegerEncoder.cs" />
+    <Compile Include="Payloads\NumericPayloadTokenFilter.cs" />
+    <Compile Include="Payloads\PayloadEncoder.cs" />
     <Compile Include="Payloads\PayloadHelper.cs" />
+    <Compile Include="Payloads\TokenOffsetPayloadTokenFilter.cs" />
+    <Compile Include="Payloads\TypeAsPayloadTokenFilter.cs" />
+    <Compile Include="Position\PositionFilter.cs" />
+    <Compile Include="Query\QueryAutoStopWordAnalyzer.cs" />
+    <Compile Include="Reverse\ReverseStringFilter.cs" />
     <Compile Include="Ru\RussianAnalyzer.cs" />
-    <Compile Include="Ru\RussianCharsets.cs" />
     <Compile Include="Ru\RussianLetterTokenizer.cs" />
     <Compile Include="Ru\RussianLowerCaseFilter.cs" />
     <Compile Include="Ru\RussianStemFilter.cs" />
@@ -125,6 +152,11 @@
     <Compile Include="Shingle\Codec\SimpleThreeDimensionalTokenSettingsCodec.cs" />
     <Compile Include="Shingle\Codec\TokenSettingsCodec.cs" />
     <Compile Include="Shingle\Codec\TwoDimensionalNonWeightedSynonymTokenSettingsCodec.cs" />
+    <Compile Include="Sinks\DateRecognizerSinkFilter.cs" />
+    <Compile Include="Sinks\TokenRangeSinkFilter.cs" />
+    <Compile Include="Sinks\TokenTypeSinkFilter.cs" />
+    <Compile Include="Th\ThaiAnalyzer.cs" />
+    <Compile Include="Th\ThaiWordFilter.cs" />
     <Compile Include="WordlistLoader.cs" />
   </ItemGroup>
   <ItemGroup>
@@ -137,8 +169,12 @@
     </ProjectReference>
   </ItemGroup>
   <ItemGroup>
+    <None Include="Compound\Hyphenation\hyphenation.dtd" />
     <None Include="Lucene.Net.snk" />
   </ItemGroup>
+  <ItemGroup>
+    <Content Include="FileDiffs.txt" />
+  </ItemGroup>
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
        Other similar extension points exist, see Microsoft.Common.targets.



Mime
View raw message