lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aro...@apache.org
Subject svn commit: r671406 [1/3] - in /incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis: ./ Standard/
Date Wed, 25 Jun 2008 02:53:12 GMT
Author: aroush
Date: Tue Jun 24 19:53:11 2008
New Revision: 671406

URL: http://svn.apache.org/viewvc?rev=671406&view=rev
Log:
Release: Apache Lucene.Net.2.3.1 build 001 "Alpha"

Added:
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TeeTokenFilter.cs
Modified:
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizer.jj
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerConstants.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardTokenizerTokenManager.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Token.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/TokenMgrError.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopAnalyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/StopFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs
    incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Analyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Analyzer.cs Tue Jun 24 19:53:11 2008
@@ -39,9 +39,41 @@
 		/// field name for backward compatibility. 
 		/// </summary>
 		public abstract TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader);
-		
-		
-		/// <summary> Invoked before indexing a Fieldable instance if
+
+        /// <summary>Creates a TokenStream that is allowed to be re-used
+        /// from the previous time that the same thread called
+        /// this method.  Callers that do not need to use more
+        /// than one TokenStream at the same time from this
+        /// analyzer should use this method for better
+        /// performance.
+        /// </summary>
+        public virtual TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+        {
+            return TokenStream(fieldName, reader);
+        }
+
+        private System.LocalDataStoreSlot tokenStreams = System.Threading.Thread.AllocateDataSlot();
+
+        /// <summary>Used by Analyzers that implement reusableTokenStream
+        /// to retrieve previously saved TokenStreams for re-use
+        /// by the same thread. 
+        /// </summary>
+        protected internal virtual System.Object GetPreviousTokenStream()
+        {
+            return System.Threading.Thread.GetData(tokenStreams);
+        }
+
+        /// <summary>Used by Analyzers that implement reusableTokenStream
+        /// to save a TokenStream for later re-use by the same
+        /// thread. 
+        /// </summary>
+        protected internal virtual void SetPreviousTokenStream(System.Object obj)
+        {
+            System.Threading.Thread.SetData(tokenStreams, obj);
+        }
+
+
+        /// <summary> Invoked before indexing a Fieldable instance if
 		/// terms have already been added to that field.  This allows custom
 		/// analyzers to place an automatic position increment gap between
 		/// Fieldable instances using the same field name.  The default value

Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CachingTokenFilter.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CachingTokenFilter.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+	
+	/// <summary> This class can be used if the Tokens of a TokenStream
+	/// are intended to be consumed more than once. It caches
+	/// all Tokens locally in a List.
+	/// 
+	/// CachingTokenFilter implements the optional method
+	/// {@link TokenStream#Reset()}, which repositions the
+	/// stream to the first Token. 
+	/// 
+	/// </summary>
+	public class CachingTokenFilter : TokenFilter
+	{
+		private System.Collections.IList cache;
+		private System.Collections.IEnumerator iterator;
+		
+		public CachingTokenFilter(TokenStream input) : base(input)
+		{
+		}
+		
+		public override Token Next()
+		{
+			if (cache == null)
+			{
+				// fill cache lazily
+				cache = new System.Collections.ArrayList();
+				FillCache();
+				iterator = cache.GetEnumerator();
+			}
+			
+			if (!iterator.MoveNext())
+			{
+				// the cache is exhausted, return null
+				return null;
+			}
+			
+			return (Token) iterator.Current;
+		}
+		
+		public override void  Reset()
+		{
+			if (cache != null)
+			{
+				iterator = cache.GetEnumerator();
+			}
+		}
+		
+		private void  FillCache()
+		{
+			Token token;
+			while ((token = input.Next()) != null)
+			{
+				cache.Add(token);
+			}
+		}
+	}
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CharArraySet.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharArraySet.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,396 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+	
+	
+	/// <summary> A simple class that stores Strings as char[]'s in a
+	/// hash table.  Note that this is not a general purpose
+	/// class.  For example, it cannot remove items from the
+	/// set, nor does it resize its hash table to be smaller,
+	/// etc.  It is designed to be quick to test if a char[]
+	/// is in the set without the necessity of converting it
+	/// to a String first.
+	/// </summary>
+
+    public class CharArraySet : System.Collections.Hashtable
+	{
+
+		private const int INIT_SIZE = 8;
+		private char[][] entries;
+		private int count;
+		private bool ignoreCase;
+		
+		/// <summary>Create set with enough capacity to hold startSize
+		/// terms 
+		/// </summary>
+		public CharArraySet(int startSize, bool ignoreCase)
+		{
+			this.ignoreCase = ignoreCase;
+			int size = INIT_SIZE;
+			while (startSize + (startSize >> 2) > size)
+				size <<= 1;
+			entries = new char[size][];
+		}
+		
+		/// <summary>Create set from a Collection of char[] or String </summary>
+		public CharArraySet(System.Collections.ICollection c, bool ignoreCase) : this(c.Count, ignoreCase)
+		{
+			System.Collections.IEnumerator e = c.GetEnumerator();
+			while (e.MoveNext())
+			{
+				Add(e.Current);
+			}
+		}
+		
+		/// <summary>true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+		/// are in the set 
+		/// </summary>
+		public virtual bool Contains(char[] text, int off, int len)
+		{
+			return entries[GetSlot(text, off, len)] != null;
+		}
+		
+		// {{Doug-2.3.1}}: commented to determine if used internally to library
+		// /// <summary>true if the <code>CharSequence</code> is in the set </summary>
+		// public virtual bool Contains(CharSequence cs)
+		// {
+		// 	return entries[GetSlot(cs)] != null;
+		// }
+		
+		private int GetSlot(char[] text, int off, int len)
+		{
+			int code = GetHashCode(text, off, len);
+			int pos = code & (entries.Length - 1);
+			char[] text2 = entries[pos];
+			if (text2 != null && !Equals(text, off, len, text2))
+			{
+				int inc = ((code >> 8) + code) | 1;
+				do 
+				{
+					code += inc;
+					pos = code & (entries.Length - 1);
+					text2 = entries[pos];
+				}
+				while (text2 != null && !Equals(text, off, len, text2));
+			}
+			return pos;
+		}
+		
+		// {{Doug-2.3.1}}: commented to determine if used internally to library
+		// /// <summary>Returns true if the String is in the set </summary>
+		// private int GetSlot(CharSequence text)
+		// {
+		// 	int code = GetHashCode(text);
+		// 	int pos = code & (entries.Length - 1);
+		// 	char[] text2 = entries[pos];
+		// 	if (text2 != null && !Equals(text, text2))
+		// 	{
+		// 		int inc = ((code >> 8) + code) | 1;
+		// 		do 
+		// 		{
+		// 			code += inc;
+		// 			pos = code & (entries.Length - 1);
+		// 			text2 = entries[pos];
+		// 		}
+		// 		while (text2 != null && !Equals(text, text2));
+		// 	}
+		// 	return pos;
+		// }
+		
+		// {{Doug-2.3.1}}: commented to determine if used internally to library
+		// /// <summary>Add this CharSequence into the set </summary>
+		// public virtual bool Add(CharSequence text)
+		// {
+		// 	return Add(text.toString()); // could be more efficient
+		// }
+		
+		/// <summary>Add this String into the set </summary>
+		public virtual bool Add(System.String text)
+		{
+			return Add(text.ToCharArray());
+		}
+		
+		/// <summary>Add this char[] directly to the set.
+		/// If ignoreCase is true for this Set, the text array will be directly modified.
+		/// The user should never modify this text array after calling this method.
+		/// </summary>
+		public virtual bool Add(char[] text)
+		{
+			if (ignoreCase)
+				for (int i = 0; i < text.Length; i++)
+					text[i] = System.Char.ToLower(text[i]);
+			int slot = GetSlot(text, 0, text.Length);
+			if (entries[slot] != null)
+				return false;
+			entries[slot] = text;
+			count++;
+			
+			if (count + (count >> 2) > entries.Length)
+			{
+				Rehash();
+			}
+			
+			return true;
+		}
+		
+		private bool Equals(char[] text1, int off, int len, char[] text2)
+		{
+			if (len != text2.Length)
+				return false;
+			if (ignoreCase)
+			{
+				for (int i = 0; i < len; i++)
+				{
+					if (System.Char.ToLower(text1[off + i]) != text2[i])
+						return false;
+				}
+			}
+			else
+			{
+				for (int i = 0; i < len; i++)
+				{
+					if (text1[off + i] != text2[i])
+						return false;
+				}
+			}
+			return true;
+		}
+		
+		// {{Doug-2.3.1}}: commented to determine if used internally to library
+		// private bool Equals(CharSequence text1, char[] text2)
+		// {
+		// 	int len = text1.length();
+		// 	if (len != text2.Length)
+		// 		return false;
+		// 	if (ignoreCase)
+		// 	{
+		// 		for (int i = 0; i < len; i++)
+		// 		{
+		// 			if (Character.toLowerCase(text1.charAt(i)) != text2[i])
+		// 				return false;
+		// 		}
+		// 	}
+		// 	else
+		// 	{
+		// 		for (int i = 0; i < len; i++)
+		// 		{
+		// 			if (text1.charAt(i) != text2[i])
+		// 				return false;
+		// 		}
+		// 	}
+		// 	return true;
+		// }
+		
+		private void  Rehash()
+		{
+			int newSize = 2 * entries.Length;
+			char[][] oldEntries = entries;
+			entries = new char[newSize][];
+			
+			for (int i = 0; i < oldEntries.Length; i++)
+			{
+				char[] text = oldEntries[i];
+				if (text != null)
+				{
+					// todo: could be faster... no need to compare strings on collision
+					entries[GetSlot(text, 0, text.Length)] = text;
+				}
+			}
+		}
+		
+		private int GetHashCode(char[] text, int offset, int len)
+		{
+			int code = 0;
+			int stop = offset + len;
+			if (ignoreCase)
+			{
+				for (int i = offset; i < stop; i++)
+				{
+					code = code * 31 + System.Char.ToLower(text[i]);
+				}
+			}
+			else
+			{
+				for (int i = offset; i < stop; i++)
+				{
+					code = code * 31 + text[i];
+				}
+			}
+			return code;
+		}
+		
+		// {{Doug-2.3.1}}: commented to determine if used internally to library
+		// private int GetHashCode(CharSequence text)
+		// {
+		// 	int code;
+		// 	if (ignoreCase)
+		// 	{
+		// 		code = 0;
+		// 		int len = text.length();
+		// 		for (int i = 0; i < len; i++)
+		// 		{
+		// 			code = code * 31 + Character.toLowerCase(text.charAt(i));
+		// 		}
+		// 	}
+		// 	else
+		// 	{
+		// 		if (false && text is System.String)
+		// 		{
+		// 			code = text.hashCode();
+		// 		}
+		// 		else
+		// 		{
+		// 			code = 0;
+		// 			int len = text.length();
+		// 			for (int i = 0; i < len; i++)
+		// 			{
+		// 				code = code * 31 + text.charAt(i);
+		// 			}
+		// 		}
+		// 	}
+		// 	return code;
+		// }
+		
+		public virtual int Size()
+		{
+			return count;
+		}
+		
+		public virtual bool IsEmpty()
+		{
+			return count == 0;
+		}
+		
+		public override bool Contains(System.Object o)
+		{
+			if (o is char[])
+			{
+				char[] text = (char[]) o;
+				return Contains(text, 0, text.Length);
+			}
+            else if (o is String)
+            {
+                return Contains((String) o);
+            }
+			// {{Doug-2.3.1}}: commented to determine if used internally to library
+			// else if (o is CharSequence)
+			// {
+			// 	return Contains((CharSequence) o);
+			// }
+			return false;
+		}
+		
+		public virtual bool Add(System.Object o)
+		{
+			if (o is char[])
+			{
+				return Add((char[]) o);
+			}
+			else if (o is System.String)
+			{
+				return Add((System.String) o);
+			}
+			// {{Doug-2.3.1}}: commented to determine if used internally to library
+			// else if (o is CharSequence)
+			// {
+			// 	return Add((CharSequence) o);
+			// }
+			else
+			{
+				return Add(o.ToString());
+			}
+		}
+		
+		/// <summary>The Iterator<String> for this set.  Strings are constructed on the fly, so
+		/// use <code>nextCharArray</code> for more efficient access. 
+		/// </summary>
+		public class CharArraySetIterator : System.Collections.IEnumerator
+		{
+			private void  InitBlock(CharArraySet enclosingInstance)
+			{
+				this.enclosingInstance = enclosingInstance;
+			}
+			private CharArraySet enclosingInstance;
+			/// <summary>Returns the next String, as a Set<String> would...
+			/// use nextCharArray() for better efficiency. 
+			/// </summary>
+			public virtual System.Object Current
+			{
+				get
+				{
+					return new System.String(NextCharArray());
+				}
+				
+			}
+			public CharArraySet Enclosing_Instance
+			{
+				get
+				{
+					return enclosingInstance;
+				}
+				
+			}
+			internal int pos = - 1;
+			internal char[] next_Renamed_Field;
+			internal CharArraySetIterator(CharArraySet enclosingInstance)
+			{
+				InitBlock(enclosingInstance);
+				GoNext();
+			}
+			
+			private void  GoNext()
+			{
+				next_Renamed_Field = null;
+				pos++;
+				while (pos < Enclosing_Instance.entries.Length && (next_Renamed_Field = Enclosing_Instance.entries[pos]) == null)
+					pos++;
+			}
+			
+			public virtual bool MoveNext()
+			{
+				return next_Renamed_Field != null;
+			}
+			
+			/// <summary>do not modify the returned char[] </summary>
+			public virtual char[] NextCharArray()
+			{
+				char[] ret = next_Renamed_Field;
+				GoNext();
+				return ret;
+			}
+			
+			public virtual void  Remove()
+			{
+				throw new System.NotSupportedException();
+			}
+
+			virtual public void  Reset()
+			{
+			}
+		}
+		
+		
+		public new System.Collections.IEnumerator GetEnumerator()
+		{
+			return new CharArraySetIterator(this);
+		}
+	}
+}
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/CharTokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/CharTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -30,7 +30,6 @@
         private int offset = 0, bufferIndex = 0, dataLen = 0;
         private const int MAX_WORD_LEN = 255;
         private const int IO_BUFFER_SIZE = 1024;
-        private char[] buffer = new char[MAX_WORD_LEN];
         private char[] ioBuffer = new char[IO_BUFFER_SIZE];
 		
         /// <summary>Returns true iff a character should be included in a token.  This
@@ -48,43 +47,44 @@
         {
             return c;
         }
-		
-        /// <summary>Returns the next token in the stream, or null at EOS. </summary>
-        public override Token Next()
+
+        public override Token Next(Token token)
         {
+            token.Clear();
             int length = 0;
-            int start = offset;
+            int start = bufferIndex;
+            char[] buffer = token.TermBuffer();
             while (true)
             {
-                char c;
-				
-                offset++;
+
                 if (bufferIndex >= dataLen)
                 {
-                    dataLen = input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
+                    offset += dataLen;
+                    dataLen = input is Lucene.Net.Index.DocumentsWriter.ReusableStringReader ? ((Lucene.Net.Index.DocumentsWriter.ReusableStringReader) input).Read(ioBuffer) : input.Read((System.Char[]) ioBuffer, 0, ioBuffer.Length);
+                    if (dataLen == -1)
+                    {
+                        if (length > 0)
+                            break;
+                        else
+                            return null;
+                    }
                     bufferIndex = 0;
                 }
-                ;
-                if (dataLen <= 0)
-                {
-                    if (length > 0)
-                        break;
-                    else
-                        return null;
-                }
-                else
-                    c = ioBuffer[bufferIndex++];
-				
+
+                char c = ioBuffer[bufferIndex++];
+
                 if (IsTokenChar(c))
                 {
                     // if it's a token char
-					
+
                     if (length == 0)
                         // start of token
-                        start = offset - 1;
-					
+                        start = offset + bufferIndex - 1;
+                    else if (length == buffer.Length)
+                        buffer = token.ResizeTermBuffer(1 + length);
+
                     buffer[length++] = Normalize(c); // buffer it, normalized
-					
+
                     if (length == MAX_WORD_LEN)
                         // buffer overflow!
                         break;
@@ -93,8 +93,19 @@
                     // at non-Letter w/ chars
                     break; // return 'em
             }
-			
-            return new Token(new System.String(buffer, 0, length), start, start + length);
+
+            token.termLength = length;
+            token.startOffset = start;
+            token.endOffset = start + length;
+            return token;
+        }
+
+        public override void Reset(System.IO.TextReader input)
+        {
+            base.Reset(input);
+            bufferIndex = 0;
+            offset = 0;
+            dataLen = 0;
         }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/ISOLatin1AccentFilter.cs Tue Jun 24 19:53:11 2008
@@ -31,173 +31,260 @@
         public ISOLatin1AccentFilter(TokenStream input) : base(input)
         {
         }
-		
-		public override Token Next()
+
+        private char[] output = new char[256];
+        private int outputPos;
+
+        public override Token Next(Token result)
 		{
-			Token t = input.Next();
-			if (t != null)
-				t.SetTermText(RemoveAccents(t.TermText()));
-			return t;
-		}
+            result = input.Next(result);
+            if (result != null)
+            {
+                char[] buffer = result.TermBuffer();
+                int length = result.TermLength();
+                // If no characters actually require rewriting then we
+                // just return token as-is:
+                for (int i = 0; i < length; i++)
+                {
+                    char c = buffer[i];
+                    if (c >= '\u00c0' && c <= '\u0178')
+                    {
+                        RemoveAccents(buffer, length);
+                        result.SetTermBuffer(output, 0, outputPos);
+                        break;
+                    }
+                }
+                return result;
+            }
+            else
+                return null;
+        }
 		
         /// <summary> To replace accented characters in a String by unaccented equivalents.</summary>
-        public static System.String RemoveAccents(System.String input)
+        public void RemoveAccents(char[] input, int length)
         {
-            System.Text.StringBuilder output = new System.Text.StringBuilder();
-            for (int i = 0; i < input.Length; i++)
+
+            // Worst-case length required:
+            int maxSizeNeeded = 2 * length;
+
+            int size = output.Length;
+            while (size < maxSizeNeeded)
+                size *= 2;
+
+            if (size != output.Length)
+                output = new char[size];
+
+            outputPos = 0;
+
+            int pos = 0;
+
+            for (int i = 0; i < length; i++, pos++)
             {
-                long val = input[i];
+                char c = input[pos];
 
-                switch (input[i])
+                // Quick test: if it's not in range then just keep
+                // current character
+                if (c < '\u00c0')
+                    output[outputPos++] = c;
+                else
                 {
-					
-                    case '\u00C0':  // À
-                    case '\u00C1':  // Á
-                    case '\u00C2':  // Â
-                    case '\u00C3':  // Ã
-                    case '\u00C4':  // Ä
-                    case '\u00C5':  // Å
-                        output.Append("A");
-                        break;
-					
-                    case '\u00C6':  // Æ
-                        output.Append("AE");
-                        break;
-					
-                    case '\u00C7':  // Ç
-                        output.Append("C");
-                        break;
-					
-                    case '\u00C8':  // È
-                    case '\u00C9':  // É
-                    case '\u00CA':  // Ê
-                    case '\u00CB':  // Ë
-                        output.Append("E");
-                        break;
-					
-                    case '\u00CC':  // Ì
-                    case '\u00CD':  // Í
-                    case '\u00CE':  // Î
-                    case '\u00CF':  // Ï
-                        output.Append("I");
-                        break;
-					
-                    case '\u00D0':  // Ð
-                        output.Append("D");
-                        break;
-					
-                    case '\u00D1':  // Ñ
-                        output.Append("N");
-                        break;
-					
-                    case '\u00D2':  // Ò
-                    case '\u00D3':  // Ó
-                    case '\u00D4':  // Ô
-                    case '\u00D5':  // Õ
-                    case '\u00D6':  // Ö
-                    case '\u00D8':  // Ø
-                        output.Append("O");
-                        break;
-					
-                    case '\u0152':  // Œ
-                        output.Append("OE");
-                        break;
-					
-                    case '\u00DE':  // Þ
-                        output.Append("TH");
-                        break;
-					
-                    case '\u00D9':  // Ù
-                    case '\u00DA':  // Ú
-                    case '\u00DB':  // Û
-                    case '\u00DC':  // Ü
-                        output.Append("U");
-                        break;
-					
-                    case '\u00DD':  // Ý
-                    case '\u0178':  // Ÿ
-                        output.Append("Y");
-                        break;
-					
-                    case '\u00E0':  // à
-                    case '\u00E1':  // á
-                    case '\u00E2':  // â
-                    case '\u00E3':  // ã
-                    case '\u00E4':  // ä
-                    case '\u00E5':  // å
-                        output.Append("a");
-                        break;
-					
-                    case '\u00E6':  // æ
-                        output.Append("ae");
-                        break;
-					
-                    case '\u00E7':  // ç
-                        output.Append("c");
-                        break;
-					
-                    case '\u00E8':  // è
-                    case '\u00E9':  // é
-                    case '\u00EA':  // ê
-                    case '\u00EB':  // ë
-                        output.Append("e");
-                        break;
-					
-                    case '\u00EC':  // ì
-                    case '\u00ED':  // í
-                    case '\u00EE':  // î
-                    case '\u00EF':  // ï
-                        output.Append("i");
-                        break;
-					
-                    case '\u00F0':  // ð
-                        output.Append("d");
-                        break;
-					
-                    case '\u00F1':  // ñ
-                        output.Append("n");
-                        break;
-					
-                    case '\u00F2':  // ò
-                    case '\u00F3':  // ó
-                    case '\u00F4':  // ô
-                    case '\u00F5':  // õ
-                    case '\u00F6':  // ö
-                    case '\u00F8':  // ø
-                        output.Append("o");
-                        break;
-					
-                    case '\u0153':  // œ
-                        output.Append("oe");
-                        break;
-					
-                    case '\u00DF':  // ß
-                        output.Append("ss");
-                        break;
-					
-                    case '\u00FE':  // þ
-                        output.Append("th");
-                        break;
-					
-                    case '\u00F9':  // ù
-                    case '\u00FA':  // ú
-                    case '\u00FB':  // û
-                    case '\u00FC':  // ü
-                        output.Append("u");
-                        break;
-					
-                    case '\u00FD':  // ý
-                    case '\u00FF':  // ÿ
-                        output.Append("y");
-                        break;
-					
-                    default: 
-                        output.Append(input[i]);
-                        break;
-					
+                    switch (c)
+                    {
+
+                        case '\u00C0':
+                        // À
+                        case '\u00C1':
+                        // Á
+                        case '\u00C2':
+                        // Â
+                        case '\u00C3':
+                        // Ã
+                        case '\u00C4':
+                        // Ä
+                        case '\u00C5':  // Å
+                            output[outputPos++] = 'A';
+                            break;
+
+                        case '\u00C6':  // Æ
+                            output[outputPos++] = 'A';
+                            output[outputPos++] = 'E';
+                            break;
+
+                        case '\u00C7':  // Ç
+                            output[outputPos++] = 'C';
+                            break;
+
+                        case '\u00C8':
+                        // È
+                        case '\u00C9':
+                        // É
+                        case '\u00CA':
+                        // Ê
+                        case '\u00CB':  // Ë
+                            output[outputPos++] = 'E';
+                            break;
+
+                        case '\u00CC':
+                        // Ì
+                        case '\u00CD':
+                        // Í
+                        case '\u00CE':
+                        // Î
+                        case '\u00CF':  // Ï
+                            output[outputPos++] = 'I';
+                            break;
+
+                        case '\u00D0':  // Ð
+                            output[outputPos++] = 'D';
+                            break;
+
+                        case '\u00D1':  // Ñ
+                            output[outputPos++] = 'N';
+                            break;
+
+                        case '\u00D2':
+                        // Ò
+                        case '\u00D3':
+                        // Ó
+                        case '\u00D4':
+                        // Ô
+                        case '\u00D5':
+                        // Õ
+                        case '\u00D6':
+                        // Ö
+                        case '\u00D8':  // Ø
+                            output[outputPos++] = 'O';
+                            break;
+
+                        case '\u0152':  // Œ
+                            output[outputPos++] = 'O';
+                            output[outputPos++] = 'E';
+                            break;
+
+                        case '\u00DE':  // Þ
+                            output[outputPos++] = 'T';
+                            output[outputPos++] = 'H';
+                            break;
+
+                        case '\u00D9':
+                        // Ù
+                        case '\u00DA':
+                        // Ú
+                        case '\u00DB':
+                        // Û
+                        case '\u00DC':  // Ü
+                            output[outputPos++] = 'U';
+                            break;
+
+                        case '\u00DD':
+                        // Ý
+                        case '\u0178':  // Ÿ
+                            output[outputPos++] = 'Y';
+                            break;
+
+                        case '\u00E0':
+                        // à
+                        case '\u00E1':
+                        // á
+                        case '\u00E2':
+                        // â
+                        case '\u00E3':
+                        // ã
+                        case '\u00E4':
+                        // ä
+                        case '\u00E5':  // å
+                            output[outputPos++] = 'a';
+                            break;
+
+                        case '\u00E6':  // æ
+                            output[outputPos++] = 'a';
+                            output[outputPos++] = 'e';
+                            break;
+
+                        case '\u00E7':  // ç
+                            output[outputPos++] = 'c';
+                            break;
+
+                        case '\u00E8':
+                        // è
+                        case '\u00E9':
+                        // é
+                        case '\u00EA':
+                        // ê
+                        case '\u00EB':  // ë
+                            output[outputPos++] = 'e';
+                            break;
+
+                        case '\u00EC':
+                        // ì
+                        case '\u00ED':
+                        // í
+                        case '\u00EE':
+                        // î
+                        case '\u00EF':  // ï
+                            output[outputPos++] = 'i';
+                            break;
+
+                        case '\u00F0':  // ð
+                            output[outputPos++] = 'd';
+                            break;
+
+                        case '\u00F1':  // ñ
+                            output[outputPos++] = 'n';
+                            break;
+
+                        case '\u00F2':
+                        // ò
+                        case '\u00F3':
+                        // ó
+                        case '\u00F4':
+                        // ô
+                        case '\u00F5':
+                        // õ
+                        case '\u00F6':
+                        // ö
+                        case '\u00F8':  // ø
+                            output[outputPos++] = 'o';
+                            break;
+
+                        case '\u0153':  // œ
+                            output[outputPos++] = 'o';
+                            output[outputPos++] = 'e';
+                            break;
+
+                        case '\u00DF':  // ß
+                            output[outputPos++] = 's';
+                            output[outputPos++] = 's';
+                            break;
+
+                        case '\u00FE':  // þ
+                            output[outputPos++] = 't';
+                            output[outputPos++] = 'h';
+                            break;
+
+                        case '\u00F9':
+                        // ù
+                        case '\u00FA':
+                        // ú
+                        case '\u00FB':
+                        // û
+                        case '\u00FC':  // ü
+                            output[outputPos++] = 'u';
+                            break;
+
+                        case '\u00FD':
+                        // ý
+                        case '\u00FF':  // ÿ
+                            output[outputPos++] = 'y';
+                            break;
+
+                        default:
+                            output[outputPos++] = c;
+                            break;
+
+                    }
                 }
             }
-            return output.ToString();
         }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/KeywordAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -29,5 +29,18 @@
         {
             return new KeywordTokenizer(reader);
         }
+
+        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+        {
+            Tokenizer tokenizer = (Tokenizer)GetPreviousTokenStream();
+            if (tokenizer == null)
+            {
+                tokenizer = new KeywordTokenizer(reader);
+                SetPreviousTokenStream(tokenizer);
+            }
+            else
+                tokenizer.Reset(reader);
+            return tokenizer;
+        }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/KeywordTokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/KeywordTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -27,7 +27,6 @@
         private const int DEFAULT_BUFFER_SIZE = 256;
 		
         private bool done;
-        private char[] buffer;
 		
         public KeywordTokenizer(System.IO.TextReader input) : this(input, DEFAULT_BUFFER_SIZE)
         {
@@ -35,29 +34,36 @@
 		
         public KeywordTokenizer(System.IO.TextReader input, int bufferSize) : base(input)
         {
-            this.buffer = new char[bufferSize];
             this.done = false;
         }
-		
-        public override Token Next()
+
+        public override Token Next(Token result)
         {
             if (!done)
             {
                 done = true;
-                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
-                int length;
+                int upto = 0;
+                result.Clear();
+                char[] buffer = result.TermBuffer();
                 while (true)
                 {
-                    length = input.Read((System.Char[]) this.buffer, 0, this.buffer.Length);
-                    if (length <= 0)
+                    int length = input.Read(buffer, upto, buffer.Length - upto);
+                    if (length == -1)
                         break;
-					
-                    buffer.Append(this.buffer, 0, length);
+                    upto += length;
+                    if (upto == buffer.Length)
+                        buffer = result.ResizeTermBuffer(1 + buffer.Length);
                 }
-                System.String text = buffer.ToString();
-                return new Token(text, 0, text.Length);
+                result.termLength = upto;
+                return result;
             }
             return null;
         }
+
+        public override void Reset(System.IO.TextReader input)
+        {
+            base.Reset(input);
+            this.done = false;
+        }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/LengthFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LengthFilter.cs Tue Jun 24 19:53:11 2008
@@ -22,10 +22,9 @@
 	
     /// <summary> Removes words that are too long and too short from the stream.
     /// 
+    /// 
     /// </summary>
-    /// <author>  David Spencer
-    /// </author>
-    /// <version>  $Id: LengthFilter.java 347992 2005-11-21 21:41:43Z dnaber $
+    /// <version>  $Id: LengthFilter.java 564715 2007-08-10 18:34:33Z mikemccand $
     /// </version>
     public sealed class LengthFilter : TokenFilter
     {
@@ -43,10 +42,10 @@
         }
 		
         /// <summary> Returns the next input Token whose termText() is the right len</summary>
-        public override Token Next()
+        public override Token Next(Token result)
         {
             // return the first non-stop word found
-            for (Token token = input.Next(); token != null; token = input.Next())
+            for (Token token = input.Next(result); token != null; token = input.Next(result))
             {
                 int len = token.TermText().Length;
                 if (len >= min && len <= max)

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/LowerCaseFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/LowerCaseFilter.cs Tue Jun 24 19:53:11 2008
@@ -23,7 +23,7 @@
     /// <summary> Normalizes token text to lower case.
     /// 
     /// </summary>
-    /// <version>  $Id: LowerCaseFilter.java 150259 2004-03-29 22:48:07Z cutting $
+    /// <version>  $Id: LowerCaseFilter.java 564715 2007-08-10 18:34:33Z mikemccand $
     /// </version>
     public sealed class LowerCaseFilter : TokenFilter
     {
@@ -31,16 +31,21 @@
         {
         }
 		
-        public override Token Next()
+        public override Token Next(Token result)
         {
-            Token t = input.Next();
-			
-            if (t == null)
+            result = input.Next(result);
+            if (result != null)
+            {
+
+                char[] buffer = result.TermBuffer();
+                int length = result.termLength;
+                for (int i = 0; i < length; i++)
+                    buffer[i] = System.Char.ToLower(buffer[i]);
+
+                return result;
+            }
+            else
                 return null;
-			
-            t.termText = t.termText.ToLower();
-			
-            return t;
         }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Package.html?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Package.html Tue Jun 24 19:53:11 2008
@@ -1,10 +1,256 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-<head>
-   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
-   <meta name="Author" content="Doug Cutting">
-</head>
-<body>
-API and code to convert text into indexable tokens.
-</body>
-</html>
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+   <meta name="Author" content="Doug Cutting">
+</head>
+<body>
+<p>API and code to convert text into indexable/searchable tokens.  Covers {@link org.apache.lucene.analysis.Analyzer} and related classes.</p>
+<h2>Parsing? Tokenization? Analysis!</h2>
+<p>
+Lucene, indexing and search library, accepts only plain text input.
+<p>
+<h2>Parsing</h2>
+<p>
+Applications that build their search capabilities upon Lucene may support documents in various formats &ndash; HTML, XML, PDF, Word &ndash; just to name a few.
+Lucene does not care about the <i>Parsing</i> of these and other document formats, and it is the responsibility of the 
+application using Lucene to use an appropriate <i>Parser</i> to convert the original format into plain text before passing that plain text to Lucene.
+<p>
+<h2>Tokenization</h2>
+<p>
+Plain text passed to Lucene for indexing goes through a process generally called tokenization &ndash; namely breaking of the 
+input text into small indexing elements &ndash; 
+{@link org.apache.lucene.analysis.Token Tokens}.
+The way input text is broken into tokens very 
+much dictates further capabilities of search upon that text. 
+For instance, sentences beginnings and endings can be identified to provide for more accurate phrase 
+and proximity searches (though sentence identification is not provided by Lucene).
+<p>
+In some cases simply breaking the input text into tokens is not enough &ndash; a deeper <i>Analysis</i> is needed,
+providing for several functions, including (but not limited to):
+<ul>
+  <li><a href = "http://en.wikipedia.org/wiki/Stemming">Stemming</a> &ndash; 
+      Replacing of words by their stems. 
+      For instance with English stemming "bikes" is replaced by "bike"; 
+      now query "bike" can find both documents containing "bike" and those containing "bikes".
+  </li>
+  <li><a href = "http://en.wikipedia.org/wiki/Stop_words">Stop Words Filtering</a> &ndash; 
+      Common words like "the", "and" and "a" rarely add any value to a search.
+      Removing them shrinks the index size and increases performance.
+      It may also reduce some "noise" and actually improve search quality.
+  </li>
+  <li><a href = "http://en.wikipedia.org/wiki/Text_normalization">Text Normalization</a> &ndash; 
+      Stripping accents and other character markings can make for better searching.
+  </li>
+  <li><a href = "http://en.wikipedia.org/wiki/Synonym">Synonym Expansion</a> &ndash; 
+      Adding in synonyms at the same token position as the current word can mean better 
+      matching when users search with words in the synonym set.
+  </li>
+</ul> 
+<p>
+<h2>Core Analysis</h2>
+<p>
+  The analysis package provides the mechanism to convert Strings and Readers into tokens that can be indexed by Lucene.  There
+  are three main classes in the package from which all analysis processes are derived.  These are:
+  <ul>
+    <li>{@link org.apache.lucene.analysis.Analyzer} &ndash; An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
+    by the indexing and searching processes.  See below for more information on implementing your own Analyzer.</li>
+    <li>{@link org.apache.lucene.analysis.Tokenizer} &ndash; A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
+    up incoming text into {@link org.apache.lucene.analysis.Token}s.  In most cases, an Analyzer will use a Tokenizer as the first step in
+    the analysis process.</li>
+    <li>{@link org.apache.lucene.analysis.TokenFilter} &ndash; A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
+    for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer.  Common modifications performed by a
+    TokenFilter are: deletion, stemming, synonym injection, and down casing.  Not all Analyzers require TokenFilters</li>
+  </ul>
+</p>
+<h2>Hints, Tips and Traps</h2>
+<p>
+   The synergy between {@link org.apache.lucene.analysis.Analyzer} and {@link org.apache.lucene.analysis.Tokenizer}
+   is sometimes confusing. To ease on this confusion, some clarifications:
+   <ul>
+      <li>The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of 
+          <u>creating</u> tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer}
+          is only responsible for <u>breaking</u> the input text into tokens. Very likely, tokens created 
+          by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted 
+          by the {@link org.apache.lucene.analysis.Analyzer} (via one or more
+          {@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
+       </li>
+       <li>{@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream}, 
+           but {@link org.apache.lucene.analysis.Analyzer} is not.
+       </li>
+       <li>{@link org.apache.lucene.analysis.Analyzer} is "field aware", but 
+           {@link org.apache.lucene.analysis.Tokenizer} is not.
+       </li>
+   </ul>
+</p>
+<p>
+  Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link
+  org.apache.lucene.analysis.standard.StandardAnalyzer}.  Many applications will have a long and industrious life with nothing more
+  than the StandardAnalyzer.  However, there are a few other classes/packages that are worth mentioning:
+  <ol>
+    <li>{@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper} &ndash; Most Analyzers perform the same operation on all
+      {@link org.apache.lucene.document.Field}s.  The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different
+      {@link org.apache.lucene.document.Field}s.</li>
+    <li>The contrib/analyzers library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety
+    of different problems related to searching.  Many of the Analyzers are designed to analyze non-English languages.</li>
+    <li>The {@link org.apache.lucene.analysis.snowball contrib/snowball library} 
+        located at the root of the Lucene distribution has Analyzer and TokenFilter 
+        implementations for a variety of Snowball stemmers.  
+        See <a href = "http://snowball.tartarus.org">http://snowball.tartarus.org</a> 
+        for more information on Snowball stemmers.</li>
+    <li>There are a variety of Tokenizer and TokenFilter implementations in this package.  Take a look around, chances are someone has implemented what you need.</li>
+  </ol>
+</p>
+<p>
+  Analysis is one of the main causes of performance degradation during indexing.  Simply put, the more you analyze the slower the indexing (in most cases).
+  Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a
+  {@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.
+</p>
+<h2>Invoking the Analyzer</h2>
+<p>
+  Applications usually do not invoke analysis &ndash; Lucene does it for them:
+  <ul>
+    <li>At indexing, as a consequence of 
+        {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document) addDocument(doc)},
+        the Analyzer in effect for indexing is invoked for each indexed field of the added document.
+    </li>
+    <li>At search, as a consequence of
+        {@link org.apache.lucene.queryParser.QueryParser#parse(java.lang.String) QueryParser.parse(queryText)},
+        the QueryParser may invoke the Analyzer in effect.
+        Note that for some queries analysis does not take place, e.g. wildcard queries.
+    </li>
+  </ul>
+  However an application might invoke Analysis of any text for testing or for any other purpose, something like:
+  <PRE>
+      Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
+      TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
+      Token t = ts.next();
+      while (t!=null) {
+        System.out.println("token: "+t);
+        t = ts.next();
+      }
+  </PRE>
+</p>
+<h2>Indexing Analysis vs. Search Analysis</h2>
+<p>
+  Selecting the "correct" analyzer is crucial
+  for search quality, and can also affect indexing and search performance.
+  The "correct" analyzer differs between applications.
+  Lucene java's wiki page 
+  <a href = "http://wiki.apache.org/lucene-java/AnalysisParalysis">AnalysisParalysis</a> 
+  provides some data on "analyzing your analyzer".
+  Here are some rules of thumb:
+  <ol>
+    <li>Test test test... (did we say test?)</li>
+    <li>Beware of over analysis &ndash; might hurt indexing performance.</li>
+    <li>Start with same analyzer for indexing and search, otherwise searches would not find what they are supposed to...</li>
+    <li>In some cases a different analyzer is required for indexing and search, for instance:
+        <ul>
+           <li>Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)</li>
+           <li>Query expansion by synonyms, acronyms, auto spell correction, etc.</li>
+        </ul>
+        This might sometimes require a modified analyzer &ndash; see the next section on how to do that.
+    </li>
+  </ol>
+</p>
+<h2>Implementing your own Analyzer</h2>
+<p>Creating your own Analyzer is straightforward. It usually involves either wrapping an existing Tokenizer and  set of TokenFilters to create a new Analyzer
+or creating both the Analyzer and a Tokenizer or TokenFilter.  Before pursuing this approach, you may find it worthwhile
+to explore the contrib/analyzers library and/or ask on the java-user@lucene.apache.org mailing list first to see if what you need already exists.
+If you are still committed to creating your own Analyzer or TokenStream derivation (Tokenizer or TokenFilter) have a look at
+the source code of any one of the many samples located in this package.
+</p>
+<p>
+  The following sections discuss some aspects of implementing your own analyzer.
+</p>
+<h3>Field Section Boundaries</h3>
+<p>
+  When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
+  is called multiple times for the same field name, we could say that each such call creates a new 
+  section for that field in that document. 
+  In fact, a separate call to 
+  {@link org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) tokenStream(field,reader)}
+  would take place for each of these so called "sections".
+  However, the default Analyzer behavior is to treat all these sections as one large section. 
+  This allows phrase search and proximity search to seamlessly cross 
+  boundaries between these "sections".
+  In other words, if a certain field "f" is added like this:
+  <PRE>
+      document.add(new Field("f","first ends",...);
+      document.add(new Field("f","starts two",...);
+      indexWriter.addDocument(document);
+  </PRE>
+  Then, a phrase search for "ends starts" would find that document.
+  Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections", 
+  simply by overriding 
+  {@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
+  <PRE>
+      Analyzer myAnalyzer = new StandardAnalyzer() {
+         public int getPositionIncrementGap(String fieldName) {
+           return 10;
+         }
+      };
+  </PRE>
+</p>
+<h3>Token Position Increments</h3>
+<p>
+   By default, all tokens created by Analyzers and Tokenizers have a 
+   {@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
+   This means that the position stored for that token in the index would be one more than
+   that of the previous token.
+   Recall that phrase and proximity searches rely on position info.
+</p>
+<p>
+   If the selected analyzer filters the stop words "is" and "the", then for a document 
+   containing the string "blue is the sky", only the tokens "blue", "sky" are indexed, 
+   with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
+   would find that document, because the same analyzer filters the same stop words from
+   that query. But also the phrase query "blue sky" would find that document.
+</p>
+<p>   
+   If this behavior does not fit the application needs,
+   a modified analyzer can be used, that would increment further the positions of
+   tokens following a removed stop word, using
+   {@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
+   This can be done with something like:
+   <PRE>
+      public TokenStream tokenStream(final String fieldName, Reader reader) {
+        final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
+        TokenStream res = new TokenStream() {
+          public Token next() throws IOException {
+            int extraIncrement = 0;
+            while (true) {
+              Token t = ts.next();
+              if (t!=null) {
+                if (stopWords.contains(t.termText())) {
+                  extraIncrement++; // filter this word
+                  continue;
+                } 
+                if (extraIncrement>0) {
+                  t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
+                }
+              }
+              return t;
+            }
+          }
+        };
+        return res;
+      }
+   </PRE>
+   Now, with this modified analyzer, the phrase query "blue sky" would find that document.
+   But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
+   where both w1 and w2 are stop words would match that document.
+</p>
+<p>
+   Few more use cases for modifying position increments are:
+   <ol>
+     <li>Inhibiting phrase and proximity matches in sentence boundaries &ndash; for this, a tokenizer that 
+         identifies a new sentence can add 1 to the position increment of the first token of the new sentence.</li>
+     <li>Injecting synonyms &ndash; here, synonyms of a token should be added after that token, 
+         and their position increment should be set to 0.
+         As result, all synonyms of a token would be considered to appear in exactly the 
+         same position as that token, and so would they be seen by phrase and proximity searches.</li>
+   </ol>
+</p>
+</body>
+</html>

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PerFieldAnalyzerWrapper.cs Tue Jun 24 19:53:11 2008
@@ -78,7 +78,16 @@
 			
             return analyzer.TokenStream(fieldName, reader);
         }
-		
+
+        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+        {
+            Analyzer analyzer = (Analyzer)analyzerMap[fieldName];
+            if (analyzer == null)
+                analyzer = defaultAnalyzer;
+
+            return analyzer.ReusableTokenStream(fieldName, reader);
+        }
+
         /// <summary>Return the positionIncrementGap from the analyzer assigned to fieldName </summary>
         public override int GetPositionIncrementGap(System.String fieldName)
         {

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PorterStemFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemFilter.cs Tue Jun 24 19:53:11 2008
@@ -46,21 +46,18 @@
         {
             stemmer = new PorterStemmer();
         }
-		
-        /// <summary>Returns the next input Token, after being stemmed </summary>
-        public override Token Next()
+
+        public override Token Next(Token result)
         {
-            Token token = input.Next();
-            if (token == null)
-                return null;
-            else
+            result = input.Next(result);
+            if (result != null)
             {
-                System.String s = stemmer.Stem(token.termText);
-                if ((System.Object) s != (System.Object) token.termText)
-                    // Yes, I mean object reference comparison here
-                    token.termText = s;
-                return token;
+                if (stemmer.Stem(result.TermBuffer(), 0, result.termLength))
+                    result.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
+                return result;
             }
+            else
+                return null;
         }
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/PorterStemmer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/PorterStemmer.cs Tue Jun 24 19:53:11 2008
@@ -84,8 +84,7 @@
             if (b.Length <= i + EXTRA)
             {
                 char[] new_b = new char[b.Length + INC];
-                for (int c = 0; c < b.Length; c++)
-                    new_b[c] = b[c];
+                Array.Copy(b, 0, new_b, 0, b.Length);
                 b = new_b;
             }
             b[i++] = ch;
@@ -643,8 +642,7 @@
                 char[] new_b = new char[wordLen + EXTRA];
                 b = new_b;
             }
-            for (int j = 0; j < wordLen; j++)
-                b[j] = wordBuffer[offset + j];
+            Array.Copy(wordBuffer, offset, b, 0, wordLen);
             i = wordLen;
             return Stem(0);
         }

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/SimpleAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SimpleAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -28,5 +28,18 @@
         {
             return new LowerCaseTokenizer(reader);
         }
+
+        /// <summary> Returns a reused {@link LowerCaseTokenizer} for the calling thread:
+        /// on first use a tokenizer is created and cached via SetPreviousTokenStream;
+        /// afterwards the cached tokenizer is simply re-pointed at the new reader.
+        /// </summary>
+        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+		{
+			Tokenizer tokenizer = (Tokenizer) GetPreviousTokenStream();
+			if (tokenizer == null)
+			{
+				tokenizer = new LowerCaseTokenizer(reader);
+				SetPreviousTokenStream(tokenizer);
+			}
+			else
+				tokenizer.Reset(reader);
+			return tokenizer;
+		}
     }
 }
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/SinkTokenizer.cs?rev=671406&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/SinkTokenizer.cs Tue Jun 24 19:53:11 2008
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Analysis
+{
+	
+	
+	/// <summary> A SinkTokenizer can be used to cache Tokens for use in an Analyzer.
+	/// Tokens are placed in the internal list via {@link #Add(Token)} (typically by a
+	/// {@link TeeTokenFilter}) and replayed, in order, by {@link #Next()}.
+	/// </summary>
+	/// <seealso cref="TeeTokenFilter">
+	/// the filter that feeds tokens into a sink as they pass through it
+	/// </seealso>
+	public class SinkTokenizer : Tokenizer
+	{
+		// The cached tokens replayed by Next(); may be a caller-supplied list.
+		protected internal System.Collections.IList lst = new System.Collections.ArrayList();
+		// Enumerator over lst; created lazily by Next() and re-created by Reset().
+		protected internal System.Collections.IEnumerator iter;
+		
+		/// <summary> Creates a sink that replays the given list of tokens; a fresh empty
+		/// list is used when the supplied list is null.
+		/// </summary>
+		public SinkTokenizer(System.Collections.IList input)
+		{
+			this.lst = input;
+			if (this.lst == null)
+				this.lst = new System.Collections.ArrayList();
+		}
+		
+		/// <summary> Creates an empty sink.</summary>
+		public SinkTokenizer()
+		{
+			this.lst = new System.Collections.ArrayList();
+		}
+		
+		/// <summary> Creates an empty sink whose backing list has the given initial capacity.</summary>
+		public SinkTokenizer(int initCap)
+		{
+			this.lst = new System.Collections.ArrayList(initCap);
+		}
+		
+		/// <summary> Get the tokens in the internal List.
+		/// <p/>
+		/// WARNING: Adding tokens to this list requires the {@link #Reset()} method to be called in order for them
+		/// to be made available.  Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s
+		/// in the case of adds happening while {@link #Next(Lucene.Net.Analysis.Token)} is being called.
+		/// 
+		/// </summary>
+		/// <returns> A List of {@link Lucene.Net.Analysis.Token}s
+		/// </returns>
+		public virtual System.Collections.IList GetTokens()
+		{
+			return lst;
+		}
+		
+		/// <summary> Returns the next token out of the list of cached tokens.</summary>
+		/// <returns> The next {@link Lucene.Net.Analysis.Token} in the Sink, or null once
+		/// all cached tokens have been returned.
+		/// </returns>
+		/// <throws>  IOException </throws>
+		public override Token Next()
+		{
+			// Lazily start iterating; tokens added after iteration begins are not
+			// visible until Reset() re-creates the enumerator.
+			if (iter == null)
+				iter = lst.GetEnumerator();
+			return iter.MoveNext() ? (Token) iter.Current : null;
+		}
+		
+		
+		
+		/// <summary> Override this method to cache only certain tokens, or new tokens based
+		/// on the old tokens.
+		/// 
+		/// </summary>
+		/// <param name="t">The {@link Lucene.Net.Analysis.Token} to add to the sink; null is
+		/// ignored.  A clone is stored, so later mutation of the caller's token does not
+		/// affect the cached copy.
+		/// </param>
+		public virtual void  Add(Token t)
+		{
+			if (t == null)
+				return ;
+			lst.Add((Token) t.Clone());
+		}
+		
+		public override void  Close()
+		{
+			//nothing to close
+			// NOTE(review): nulling lst means GetTokens()/Reset()/Next() will throw a
+			// NullReferenceException after Close() — confirm no caller reuses a closed sink.
+			input = null;
+			lst = null;
+		}
+		
+		/// <summary> Reset the internal data structures to the start at the front of the list of tokens.  Should be called
+		/// if tokens were added to the list after an invocation of {@link #Next(Token)}
+		/// </summary>
+		/// <throws>  IOException </throws>
+		public override void  Reset()
+		{
+			iter = lst.GetEnumerator();
+		}
+	}
+}
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/CharStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/CharStream.cs Tue Jun 24 19:53:11 2008
@@ -14,8 +14,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+// {{Aroush-2.3.1}} remove this file from SVN
 /* Generated By:JavaCC: Do not edit this line. CharStream.java Version 3.0 */
+/*
 using System;
 
 namespace Lucene.Net.Analysis.Standard
@@ -117,4 +118,5 @@
         /// </summary>
         void  Done();
     }
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/FastCharStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/FastCharStream.cs Tue Jun 24 19:53:11 2008
@@ -14,7 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+// {{Aroush-2.3.1}} remove this file from SVN
+/*
 using System;
 
 namespace Lucene.Net.Analysis.Standard
@@ -148,4 +149,5 @@
             return 1;
         }
     }
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/Package.html?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/Package.html Tue Jun 24 19:53:11 2008
@@ -1,15 +1,10 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-<head>
-   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
-   <meta name="Author" content="Doug Cutting">
-</head>
-<body>
-A grammar-based tokenizer constructed with JavaCC.
-<p>Note that JavaCC defines lots of public classes, methods and fields
-that do not need to be public.&nbsp; These clutter the documentation.&nbsp;
-Sorry.
-<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>Lucene.Net.Analysis.Token</tt>
-must always be fully qualified in source code in this package.
-</body>
-</html>
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+   <meta name="Author" content="Stanislaw Osinski">
+</head>
+<body>
+A fast grammar-based tokenizer constructed with JFlex.
+</body>
+</html>

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/ParseException.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/ParseException.cs Tue Jun 24 19:53:11 2008
@@ -14,9 +14,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+// {{Aroush-2.3.1}} remove this file from SVN
 /* Generated By:JavaCC: Do not edit this line. ParseException.java Version 0.7pre6 */
-
+/*
 using System;
 
 namespace Lucene.Net.Analysis.Standard
@@ -227,4 +227,5 @@
 			return retval.ToString();
 		}
 	}
-}
\ No newline at end of file
+}
+*/
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -26,12 +26,22 @@
     /// LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
     /// 
     /// </summary>
-    /// <version>  $Id: StandardAnalyzer.java 219090 2005-07-14 20:36:28Z dnaber $
+    /// <version>  $Id: StandardAnalyzer.java 613280 2008-01-18 21:27:10Z gsingers $
     /// </version>
     public class StandardAnalyzer : Analyzer
     {
         private System.Collections.Hashtable stopSet;
 		
+		/// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
+		/// This is false by default to support backward compatibility.
+		/// 
+		/// </summary>
+		/// <deprecated> this should be removed in the next release (3.0).
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// </deprecated>
+		private bool replaceInvalidAcronym = false;
+		
         /// <summary>An array containing some common English words that are usually not
         /// useful for searching. 
         /// </summary>
@@ -70,20 +80,162 @@
             stopSet = WordlistLoader.GetWordSet(stopwords);
         }
 		
-        /// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
-        /// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
-        /// </summary>
-        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
-        {
-            TokenStream result = new StandardTokenizer(reader);
-            result = new StandardFilter(result);
-            result = new LowerCaseFilter(result);
-            result = new StopFilter(result, stopSet);
-            return result;
-        }
-        static StandardAnalyzer()
-        {
-            STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
-        }
-    }
+		/// <summary> </summary>
+		/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// 
+		/// </param>
+		/// <deprecated> Remove in 3.X and make true the only valid value
+		/// </deprecated>
+		public StandardAnalyzer(bool replaceInvalidAcronym):this(STOP_WORDS)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		
+		/// <param name="stopwords">The stopwords to use
+		/// </param>
+		/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// 
+		/// </param>
+		/// <deprecated> Remove in 3.X and make true the only valid value
+		/// </deprecated>
+		public StandardAnalyzer(System.IO.TextReader stopwords, bool replaceInvalidAcronym):this(stopwords)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		
+		/// <param name="stopwords">The stopwords to use
+		/// </param>
+		/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// 
+		/// </param>
+		/// <deprecated> Remove in 3.X and make true the only valid value
+		/// </deprecated>
+		public StandardAnalyzer(System.IO.FileInfo stopwords, bool replaceInvalidAcronym):this(stopwords)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		
+		/// <summary> </summary>
+		/// <param name="stopwords">The stopwords to use
+		/// </param>
+		/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// 
+		/// </param>
+		/// <deprecated> Remove in 3.X and make true the only valid value
+		/// </deprecated>
+		public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym):this(stopwords)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		
+		/// <param name="stopwords">The stopwords to use
+		/// </param>
+		/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// 
+		/// </param>
+		/// <deprecated> Remove in 3.X and make true the only valid value
+		/// </deprecated>
+		public StandardAnalyzer(System.Collections.Hashtable stopwords, bool replaceInvalidAcronym) : this(stopwords)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		
+		/// <summary>Constructs a {@link StandardTokenizer} filtered by a {@link
+		/// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. 
+		/// </summary>
+		public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+		{
+			// The tokenizer honours both analyzer-level settings: acronym handling
+			// (replaceInvalidAcronym) and the length cap set via SetMaxTokenLength().
+			StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+			tokenStream.SetMaxTokenLength(maxTokenLength);
+			TokenStream result = new StandardFilter(tokenStream);
+			result = new LowerCaseFilter(result);
+			result = new StopFilter(result, stopSet);
+			return result;
+		}
+		
+		private sealed class SavedStreams
+		{
+			internal StandardTokenizer tokenStream;
+			internal TokenStream filteredTokenStream;
+		}
+		
+		/// <summary>Default maximum allowed token length </summary>
+		public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+		
+		private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+		
+		/// <summary> Set maximum allowed token length.  If a token is seen
+		/// that exceeds this length then it is discarded.  This
+		/// setting only takes effect the next time tokenStream or
+		/// reusableTokenStream is called.
+		/// </summary>
+		public virtual void  SetMaxTokenLength(int length)
+		{
+			maxTokenLength = length;
+		}
+		
+		/// <seealso cref="setMaxTokenLength">
+		/// </seealso>
+		public virtual int GetMaxTokenLength()
+		{
+			return maxTokenLength;
+		}
+		
+		/// <summary> Returns the cached per-thread token-stream chain (StandardTokenizer ->
+		/// StandardFilter -> LowerCaseFilter -> StopFilter), building and caching it on
+		/// first use; on reuse the cached tokenizer is reset to the new reader.
+		/// </summary>
+		public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+		{
+			SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
+			if (streams == null)
+			{
+				// First use on this thread: build the full filter chain once and cache it.
+				streams = new SavedStreams();
+				SetPreviousTokenStream(streams);
+				streams.tokenStream = new StandardTokenizer(reader);
+				streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
+				streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+				streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
+			}
+			else
+			{
+				streams.tokenStream.Reset(reader);
+			}
+			// Re-apply the current analyzer settings on every call so changes made via
+			// SetMaxTokenLength()/SetReplaceInvalidAcronym() take effect on reused streams.
+			streams.tokenStream.SetMaxTokenLength(maxTokenLength);
+			
+			streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);
+			
+			return streams.filteredTokenStream;
+		}
+		
+		/// <summary> </summary>
+		/// <returns> true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// </returns>
+		public virtual bool IsReplaceInvalidAcronym()
+		{
+			return replaceInvalidAcronym;
+		}
+		
+		/// <summary> </summary>
+		/// <param name="replaceInvalidAcronym">Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
+		/// 
+		/// See https://issues.apache.org/jira/browse/LUCENE-1068
+		/// </param>
+		public virtual void  SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
+		{
+			this.replaceInvalidAcronym = replaceInvalidAcronym;
+		}
+		static StandardAnalyzer()
+		{
+			STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
+		}
+	}
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Standard/StandardFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Standard/StandardFilter.cs Tue Jun 24 19:53:11 2008
@@ -17,59 +17,66 @@
 
 using System;
 
-using Lucene.Net.Analysis;
+using Token = Lucene.Net.Analysis.Token;
+using TokenFilter = Lucene.Net.Analysis.TokenFilter;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
 
 namespace Lucene.Net.Analysis.Standard
 {
 	
-    /// <summary>Normalizes tokens extracted with {@link StandardTokenizer}. </summary>
+	/// <summary>Normalizes tokens extracted with {@link StandardTokenizer}. </summary>
 	
-    public sealed class StandardFilter : TokenFilter
-    {
+	public sealed class StandardFilter:TokenFilter
+	{
 		
 		
-        /// <summary>Construct filtering <i>in</i>. </summary>
-        public StandardFilter(TokenStream in_Renamed) : base(in_Renamed)
-        {
-        }
+		/// <summary>Construct filtering <i>in</i>. </summary>
+		public StandardFilter(TokenStream in_Renamed):base(in_Renamed)
+		{
+		}
 		
-        private static readonly System.String APOSTROPHE_TYPE = Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.APOSTROPHE];
-        private static readonly System.String ACRONYM_TYPE = Lucene.Net.Analysis.Standard.StandardTokenizerConstants.tokenImage[Lucene.Net.Analysis.Standard.StandardTokenizerConstants.ACRONYM];
+		private static readonly System.String APOSTROPHE_TYPE;
+		private static readonly System.String ACRONYM_TYPE;
 		
-        /// <summary>Returns the next token in the stream, or null at EOS.
-        /// <p>Removes <tt>'s</tt> from the end of words.
-        /// <p>Removes dots from acronyms.
-        /// </summary>
-        public override Lucene.Net.Analysis.Token Next()
-        {
-            Lucene.Net.Analysis.Token t = input.Next();
+		/// <summary>Returns the next token in the stream, or null at EOS.
+		/// <p>Removes <tt>'s</tt> from the end of words.
+		/// <p>Removes dots from acronyms.
+		/// </summary>
+		public override Token Next(Token result)
+		{
+			Token t = input.Next(result);
 			
-            if (t == null)
-                return null;
+			if (t == null)
+				return null;
 			
-            System.String text = t.TermText();
-            System.String type = t.Type();
+			char[] buffer = t.TermBuffer();
+			int bufferLength = t.TermLength();
+			System.String type = t.Type();
 			
-            if (type == APOSTROPHE_TYPE && (text.EndsWith("'s") || text.EndsWith("'S")))
-            {
-                return new Lucene.Net.Analysis.Token(text.Substring(0, (text.Length - 2) - (0)), t.StartOffset(), t.EndOffset(), type);
-            }
-            else if (type == ACRONYM_TYPE)
-            {
-                // remove dots
-                System.Text.StringBuilder trimmed = new System.Text.StringBuilder();
-                for (int i = 0; i < text.Length; i++)
-                {
-                    char c = text[i];
-                    if (c != '.')
-                        trimmed.Append(c);
-                }
-                return new Lucene.Net.Analysis.Token(trimmed.ToString(), t.StartOffset(), t.EndOffset(), type);
-            }
-            else
-            {
-                return t;
-            }
-        }
-    }
+			if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+			{
+				// Strip last 2 characters off
+				t.SetTermLength(bufferLength - 2);
+			}
+			else if (type == ACRONYM_TYPE)
+			{
+				// remove dots
+				int upto = 0;
+				for (int i = 0; i < bufferLength; i++)
+				{
+					char c = buffer[i];
+					if (c != '.')
+						buffer[upto++] = c;
+				}
+				t.SetTermLength(upto);
+			}
+			
+			return t;
+		}
+		static StandardFilter()
+		{
+			APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
+			ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+		}
+	}
 }
\ No newline at end of file



Mime
View raw message