lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aro...@apache.org
Subject svn commit: r671406 [3/3] - in /incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis: ./ Standard/
Date Wed, 25 Jun 2008 02:53:12 GMT
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Token.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs Tue Jun 24 19:53:11 2008
@@ -17,150 +17,435 @@
 
 using System;
 
+using Payload = Lucene.Net.Index.Payload;
+
 namespace Lucene.Net.Analysis
 {
 	
-    /// <summary>A Token is an occurence of a term from the text of a field.  It consists
of
-    /// a term's text, the start and end offset of the term in the text of the field,
-    /// and a type string.
-    /// The start and end offsets permit applications to re-associate a token with
-    /// its source text, e.g., to display highlighted query terms in a document
-    /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
-    /// display, etc.
-    /// The type is an interned string, assigned by a lexical analyzer
-    /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
-    /// belongs to.  For example an end of sentence marker token might be implemented
-    /// with type "eos".  The default token type is "word".  
-    /// </summary>
-	
+	/// <summary>A Token is an occurrence of a term from the text of a field.  It consists
of
+	/// a term's text, the start and end offset of the term in the text of the field,
+	/// and a type string.
+	/// <p>
+	/// The start and end offsets permit applications to re-associate a token with
+	/// its source text, e.g., to display highlighted query terms in a document
+	/// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+	/// display, etc.
+	/// <p>
+	/// The type is an interned string, assigned by a lexical analyzer
+	/// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
+	/// belongs to.  For example an end of sentence marker token might be implemented
+	/// with type "eos".  The default token type is "word".  
+	/// <p>
+	/// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+	/// length byte array. Use {@link TermPositions#GetPayloadLength()} and 
+	/// {@link TermPositions#GetPayload(byte[], int)} to retrieve the payloads from the index.
+	/// </summary>
+	/// <summary><br><br>
+	/// <p><font color="#FF0000">
+	/// WARNING: The status of the <b>Payloads</b> feature is experimental. 
+	/// The APIs introduced here might change in the future and will not be 
+	/// supported anymore in such a case.</font>
+	/// <br><br>
+	/// <p><b>NOTE:</b> As of 2.3, Token stores the term text
+	/// internally as a malleable char[] termBuffer instead of
+	/// String termText.  The indexing code and core tokenizers
+	/// have been changed to re-use a single Token instance, changing
+	/// its buffer and other fields in-place as the Token is
+	/// processed.  This provides substantially better indexing
+	/// performance as it saves the GC cost of new'ing a Token and
+	/// String for every term.  The APIs that accept String
+	/// termText are still available but a warning about the
+	/// associated performance cost has been added (below).  The
+	/// {@link #TermText()} method has been deprecated.</p>
+	/// </summary>
+	/// <summary><p>Tokenizers and filters should try to re-use a Token
+	/// instance when possible for best performance, by
+	/// implementing the {@link TokenStream#Next(Token)} API.
+	/// Failing that, to create a new Token you should first use
+	/// one of the constructors that starts with null text.  Then
+	/// you should call either {@link #TermBuffer()} or {@link
+	/// #ResizeTermBuffer(int)} to retrieve the Token's
+	/// termBuffer.  Fill in the characters of your term into this
+	/// buffer, and finally call {@link #SetTermLength(int)} to
+	/// set the length of the term text.  See <a target="_top"
+	/// href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
+	/// for details.</p>
+	/// </summary>
+	/// <seealso cref="Lucene.Net.Index.Payload">
+	/// </seealso>
     public class Token : System.ICloneable
     {
-        internal System.String termText; // the text of the term
-        internal int startOffset; // start in source text
-        internal int endOffset; // end in source text
-        internal System.String type = "word"; // lexical type
-		
-        private int positionIncrement = 1;
-		
-        /// <summary>Constructs a Token with the given term text, and start & end
offsets.
-        /// The type defaults to "word." 
-        /// </summary>
-        public Token(System.String text, int start, int end)
-        {
-            termText = text;
-            startOffset = start;
-            endOffset = end;
-        }
-		
-        /// <summary>Constructs a Token with the given text, start and end offsets,
& type. </summary>
-        public Token(System.String text, int start, int end, System.String typ)
-        {
-            termText = text;
-            startOffset = start;
-            endOffset = end;
-            type = typ;
-        }
-		
-        /// <summary>Set the position increment.  This determines the position of this
token
-        /// relative to the previous Token in a {@link TokenStream}, used in phrase
-        /// searching.
-        /// 
-        /// <p>The default value is one.
-        /// 
-        /// <p>Some common uses for this are:<ul>
-        /// 
-        /// <li>Set it to zero to put multiple terms in the same position.  This is
-        /// useful if, e.g., a word has multiple stems.  Searches for phrases
-        /// including either stem will match.  In this case, all but the first stem's
-        /// increment should be set to zero: the increment of the first instance
-        /// should be one.  Repeating a token with an increment of zero can also be
-        /// used to boost the scores of matches on that token.
-        /// 
-        /// <li>Set it to values greater than one to inhibit exact phrase matches.
-        /// If, for example, one does not want phrases to match across removed stop
-        /// words, then one could build a stop word filter that removes stop words and
-        /// also sets the increment to the number of stop words removed before each
-        /// non-stop word.  Then exact phrase queries will only match when the terms
-        /// occur with no intervening stop words.
-        /// 
-        /// </ul>
-        /// </summary>
-        /// <seealso cref="Lucene.Net.index.TermPositions">
-        /// </seealso>
-        public void  SetPositionIncrement(int positionIncrement)
-        {
-            if (positionIncrement < 0)
-                throw new System.ArgumentException("Increment must be zero or greater: "
+ positionIncrement);
-            this.positionIncrement = positionIncrement;
-        }
-		
-        /// <summary>Returns the position increment of this Token.</summary>
-        /// <seealso cref="setPositionIncrement">
-        /// </seealso>
-        public int GetPositionIncrement()
-        {
-            return positionIncrement;
-        }
-		
-        /// <summary>Sets the Token's term text. </summary>
-        public virtual void  SetTermText(System.String text)
-        {
-            termText = text;
-        }
-		
-        /// <summary>Returns the Token's term text. </summary>
-        public System.String TermText()
-        {
-            return termText;
-        }
-		
-        /// <summary>Returns this Token's starting offset, the position of the first
character
-        /// corresponding to this token in the source text.
-        /// Note that the difference between endOffset() and startOffset() may not be
-        /// equal to termText.length(), as the term text may have been altered by a
-        /// stemmer or some other filter. 
-        /// </summary>
-        public int StartOffset()
-        {
-            return startOffset;
-        }
-		
-        /// <summary>Returns this Token's ending offset, one greater than the position
of the
-        /// last character corresponding to this token in the source text. 
-        /// </summary>
-        public int EndOffset()
-        {
-            return endOffset;
-        }
-		
-        /// <summary>Returns this Token's lexical type.  Defaults to "word". </summary>
-        public System.String Type()
-        {
-            return type;
-        }
-		
-        public override System.String ToString()
-        {
-            System.Text.StringBuilder sb = new System.Text.StringBuilder();
-            sb.Append("(" + termText + "," + startOffset + "," + endOffset);
-            if (!type.Equals("word"))
-                sb.Append(",type=" + type);
-            if (positionIncrement != 1)
-                sb.Append(",posIncr=" + positionIncrement);
-            sb.Append(")");
-            return sb.ToString();
-        }
-		
-        public virtual System.Object Clone()
-        {
-            try
-            {
-                return base.MemberwiseClone();
-            }
-            catch (System.Exception e)
-            {
-                throw new System.SystemException("", e); // shouldn't happen since we implement
Cloneable
-            }
-        }
-    }
+		
+		public const System.String DEFAULT_TYPE = "word";
+		private static int MIN_BUFFER_SIZE = 10;
+		
+		/// <deprecated>: we will remove this when we remove the
+		/// deprecated APIs 
+		/// </deprecated>
+		private System.String termText;
+		
+		internal char[] termBuffer; // characters for the term text
+		internal int termLength; // length of term text in buffer
+		
+		internal int startOffset; // start in source text
+		internal int endOffset; // end in source text
+		internal System.String type = DEFAULT_TYPE; // lexical type
+		
+		internal Payload payload;
+		
+		internal int positionIncrement = 1;
+		
+		/// <summary>Constructs a Token with null text. </summary>
+		public Token()
+		{
+		}
+		
+		/// <summary>Constructs a Token with null text and start & end
+		/// offsets.
+		/// </summary>
+		/// <param name="start">start offset
+		/// </param>
+		/// <param name="end">end offset 
+		/// </param>
+		public Token(int start, int end)
+		{
+			startOffset = start;
+			endOffset = end;
+		}
+		
+		/// <summary>Constructs a Token with null text and start & end
+		/// offsets plus the Token type.
+		/// </summary>
+		/// <param name="start">start offset
+		/// </param>
+		/// <param name="end">end offset 
+		/// </param>
+		/// <param name="typ">token type 
+		/// </param>
+		public Token(int start, int end, System.String typ)
+		{
+			startOffset = start;
+			endOffset = end;
+			type = typ;
+		}
+		
+		/// <summary>Constructs a Token with the given term text, and start
+		/// & end offsets.  The type defaults to "word."
+		/// <b>NOTE:</b> for better indexing speed you should
+		/// instead use the char[] termBuffer methods to set the
+		/// term text.
+		/// </summary>
+		/// <param name="text">term text
+		/// </param>
+		/// <param name="start">start offset
+		/// </param>
+		/// <param name="end">end offset 
+		/// </param>
+		public Token(System.String text, int start, int end)
+		{
+			termText = text;
+			startOffset = start;
+			endOffset = end;
+		}
+		
+		/// <summary>Constructs a Token with the given text, start and end
+		/// offsets, & type.  <b>NOTE:</b> for better indexing
+		/// speed you should instead use the char[] termBuffer
+		/// methods to set the term text.
+		/// </summary>
+		/// <param name="text">term text
+		/// </param>
+		/// <param name="start">start offset
+		/// </param>
+		/// <param name="end">end offset
+		/// </param>
+		/// <param name="typ">token type 
+		/// </param>
+		public Token(System.String text, int start, int end, System.String typ)
+		{
+			termText = text;
+			startOffset = start;
+			endOffset = end;
+			type = typ;
+		}
+		
+		/// <summary>Set the position increment.  This determines the position of this token
+		/// relative to the previous Token in a {@link TokenStream}, used in phrase
+		/// searching.
+		/// 
+		/// <p>The default value is one.
+		/// 
+		/// <p>Some common uses for this are:<ul>
+		/// 
+		/// <li>Set it to zero to put multiple terms in the same position.  This is
+		/// useful if, e.g., a word has multiple stems.  Searches for phrases
+		/// including either stem will match.  In this case, all but the first stem's
+		/// increment should be set to zero: the increment of the first instance
+		/// should be one.  Repeating a token with an increment of zero can also be
+		/// used to boost the scores of matches on that token.
+		/// 
+		/// <li>Set it to values greater than one to inhibit exact phrase matches.
+		/// If, for example, one does not want phrases to match across removed stop
+		/// words, then one could build a stop word filter that removes stop words and
+		/// also sets the increment to the number of stop words removed before each
+		/// non-stop word.  Then exact phrase queries will only match when the terms
+		/// occur with no intervening stop words.
+		/// 
+		/// </ul>
+		/// </summary>
+		/// <seealso cref="Lucene.Net.Index.TermPositions">
+		/// </seealso>
+		public virtual void  SetPositionIncrement(int positionIncrement)
+		{
+			if (positionIncrement < 0)
+				throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
+			this.positionIncrement = positionIncrement;
+		}
+		
+		/// <summary>Returns the position increment of this Token.</summary>
+		/// <seealso cref="SetPositionIncrement">
+		/// </seealso>
+		public virtual int GetPositionIncrement()
+		{
+			return positionIncrement;
+		}
+		
+		/// <summary>Sets the Token's term text.  <b>NOTE:</b> for better
+		/// indexing speed you should instead use the char[]
+		/// termBuffer methods to set the term text. 
+		/// </summary>
+		public virtual void  SetTermText(System.String text)
+		{
+			termText = text;
+			termBuffer = null;
+		}
+		
+		/// <summary>Returns the Token's term text.
+		/// 
+		/// </summary>
+		/// <deprecated> Use {@link #TermBuffer()} and {@link
+		/// #TermLength()} instead. 
+		/// </deprecated>
+		public System.String TermText()
+		{
+			if (termText == null && termBuffer != null)
+				termText = new System.String(termBuffer, 0, termLength);
+			return termText;
+		}
+		
+		/// <summary>Copies the contents of buffer, starting at offset for
+		/// length characters, into the termBuffer
+		/// array. <b>NOTE:</b> for better indexing speed you
+		/// should instead retrieve the termBuffer, using {@link
+		/// #TermBuffer()} or {@link #ResizeTermBuffer(int)}, and
+		/// fill it in directly to set the term text.  This saves
+		/// an extra copy. 
+		/// </summary>
+		public void  SetTermBuffer(char[] buffer, int offset, int length)
+		{
+			ResizeTermBuffer(length);
+			Array.Copy(buffer, offset, termBuffer, 0, length);
+			termLength = length;
+		}
+		
+		/// <summary>Returns the internal termBuffer character array which
+		/// you can then directly alter.  If the array is too
+		/// small for your token, use {@link
+		/// #ResizeTermBuffer(int)} to increase it.  After
+		/// altering the buffer be sure to call {@link
+		/// #setTermLength} to record the number of valid
+		/// characters that were placed into the termBuffer. 
+		/// </summary>
+		public char[] TermBuffer()
+		{
+			InitTermBuffer();
+			return termBuffer;
+		}
+		
+		/// <summary>Grows the termBuffer to at least size newSize.</summary>
+		/// <param name="newSize">minimum size of the new termBuffer
+		/// </param>
+		/// <returns> newly created termBuffer with length >= newSize
+		/// </returns>
+		public virtual char[] ResizeTermBuffer(int newSize)
+		{
+			InitTermBuffer();
+			if (newSize > termBuffer.Length)
+			{
+				int size = termBuffer.Length;
+				while (size < newSize)
+					size *= 2;
+				char[] newBuffer = new char[size];
+				Array.Copy(termBuffer, 0, newBuffer, 0, termBuffer.Length);
+				termBuffer = newBuffer;
+			}
+			return termBuffer;
+		}
+		
+		// TODO: once we remove the deprecated termText() method
+		// and switch entirely to char[] termBuffer we don't need
+		// to use this method anymore
+		private void  InitTermBuffer()
+		{
+			if (termBuffer == null)
+			{
+				if (termText == null)
+				{
+					termBuffer = new char[MIN_BUFFER_SIZE];
+					termLength = 0;
+				}
+				else
+				{
+					int length = termText.Length;
+					if (length < MIN_BUFFER_SIZE)
+						length = MIN_BUFFER_SIZE;
+					termBuffer = new char[length];
+					termLength = termText.Length;
+
+					int offset = 0;
+					while (offset < termText.Length)
+					{
+						termBuffer[offset] = (char) termText[offset];
+						offset++;
+					}
+
+					termText = null;
+				}
+			}
+			else if (termText != null)
+				termText = null;
+		}
+		
+		/// <summary>Return number of valid characters (length of the term)
+		/// in the termBuffer array. 
+		/// </summary>
+		public int TermLength()
+		{
+			InitTermBuffer();
+			return termLength;
+		}
+		
+		/// <summary>Set number of valid characters (length of the term) in
+		/// the termBuffer array. 
+		/// </summary>
+		public void  SetTermLength(int length)
+		{
+			InitTermBuffer();
+			termLength = length;
+		}
+		
+		/// <summary>Returns this Token's starting offset, the position of the first character
+		/// corresponding to this token in the source text.
+		/// Note that the difference between endOffset() and startOffset() may not be
+		/// equal to termText.length(), as the term text may have been altered by a
+		/// stemmer or some other filter. 
+		/// </summary>
+		public int StartOffset()
+		{
+			return startOffset;
+		}
+		
+		/// <summary>Set the starting offset.</summary>
+		/// <seealso cref="StartOffset()">
+		/// </seealso>
+		public virtual void  SetStartOffset(int offset)
+		{
+			this.startOffset = offset;
+		}
+		
+		/// <summary>Returns this Token's ending offset, one greater than the position of
the
+		/// last character corresponding to this token in the source text. 
+		/// </summary>
+		public int EndOffset()
+		{
+			return endOffset;
+		}
+		
+		/// <summary>Set the ending offset.</summary>
+		/// <seealso cref="EndOffset()">
+		/// </seealso>
+		public virtual void  SetEndOffset(int offset)
+		{
+			this.endOffset = offset;
+		}
+		
+		/// <summary>Returns this Token's lexical type.  Defaults to "word". </summary>
+		public System.String Type()
+		{
+			return type;
+		}
+		
+		/// <summary>Set the lexical type.</summary>
+		/// <seealso cref="Type()">
+		/// </seealso>
+		public void  SetType(System.String type)
+		{
+			this.type = type;
+		}
+		
+		/// <summary> Returns this Token's payload.</summary>
+		public virtual Payload GetPayload()
+		{
+			return this.payload;
+		}
+		
+		/// <summary> Sets this Token's payload.</summary>
+		public virtual void  SetPayload(Payload payload)
+		{
+			this.payload = payload;
+		}
+		
+		public override System.String ToString()
+		{
+			System.Text.StringBuilder sb = new System.Text.StringBuilder();
+			sb.Append('(');
+			InitTermBuffer();
+			if (termBuffer == null)
+				sb.Append("null");
+			else
+				sb.Append(termBuffer, 0, termLength);
+			sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
+			if (!type.Equals("word"))
+				sb.Append(",type=").Append(type);
+			if (positionIncrement != 1)
+				sb.Append(",posIncr=").Append(positionIncrement);
+			sb.Append(')');
+			return sb.ToString();
+		}
+		
+		/// <summary>Resets the term text, payload, and positionIncrement to default.
+		/// Other fields such as startOffset, endOffset and the token type are
+		/// not reset since they are normally overwritten by the tokenizer. 
+		/// </summary>
+		public virtual void  Clear()
+		{
+			payload = null;
+			// Leave termBuffer to allow re-use
+			termLength = 0;
+			termText = null;
+			positionIncrement = 1;
+			// startOffset = endOffset = 0;
+			// type = DEFAULT_TYPE;
+		}
+		
+		public virtual System.Object Clone()
+		{
+			try
+			{
+				Token t = (Token) base.MemberwiseClone();
+				if (termBuffer != null)
+				{
+					t.termBuffer = null;
+					t.SetTermBuffer(termBuffer, 0, termLength);
+				}
+				if (payload != null)
+				{
+					t.SetPayload((Payload) payload.Clone());
+				}
+				return t;
+			}
+			catch (System.Exception e)
+			{
+				throw new System.SystemException("", e); // shouldn't happen
+			}
+		}
+	}
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs Tue Jun 24 19:53:11
2008
@@ -23,8 +23,9 @@
     /// <summary>A TokenFilter is a TokenStream whose input is another token stream.
     /// <p>
     /// This is an abstract class.
-    /// </summary>
-	
+	/// NOTE: subclasses must override at least one of {@link
+	/// #Next()} or {@link #Next(Token)}.
+	/// </summary>
     public abstract class TokenFilter : TokenStream
     {
         /// <summary>The source of tokens for this filter. </summary>

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs Tue Jun 24 19:53:11
2008
@@ -17,29 +17,91 @@
 
 using System;
 
+using Payload = Lucene.Net.Index.Payload;
+
 namespace Lucene.Net.Analysis
 {
 	
-    /// <summary>A TokenStream enumerates the sequence of tokens, either from
-    /// fields of a document or from query text.
-    /// <p>
-    /// This is an abstract class.  Concrete subclasses are:
-    /// <ul>
-    /// <li>{@link Tokenizer}, a TokenStream
-    /// whose input is a Reader; and
-    /// <li>{@link TokenFilter}, a TokenStream
-    /// whose input is another TokenStream.
-    /// </ul>
-    /// </summary>
+	/// <summary>A TokenStream enumerates the sequence of tokens, either from
+	/// fields of a document or from query text.
+	/// <p>
+	/// This is an abstract class.  Concrete subclasses are:
+	/// <ul>
+	/// <li>{@link Tokenizer}, a TokenStream
+	/// whose input is a Reader; and
+	/// <li>{@link TokenFilter}, a TokenStream
+	/// whose input is another TokenStream.
+	/// </ul>
+	/// NOTE: subclasses must override at least one of {@link
+	/// #Next()} or {@link #Next(Token)}.
+	/// </summary>
 	
-    public abstract class TokenStream
-    {
-        /// <summary>Returns the next token in the stream, or null at EOS. </summary>
-        public abstract Token Next();
+	public abstract class TokenStream
+	{
+		
+		/// <summary>Returns the next token in the stream, or null at EOS.
+		/// The returned Token is a "full private copy" (not
+		/// re-used across calls to next()) but will be slower
+		/// than calling {@link #Next(Token)} instead. 
+		/// </summary>
+		public virtual Token Next()
+		{
+			Token result = Next(new Token());
+			
+			if (result != null)
+			{
+				Payload p = result.GetPayload();
+				if (p != null)
+				{
+					result.SetPayload((Payload) p.Clone());
+				}
+			}
+			
+			return result;
+		}
+		
+		/// <summary>Returns the next token in the stream, or null at EOS.
+		/// When possible, the input Token should be used as the
+		/// returned Token (this gives fastest tokenization
+		/// performance), but this is not required and a new Token
+		/// may be returned. Callers may re-use a single Token
+		/// instance for successive calls to this method.
+		/// <p>
+		/// This implicitly defines a "contract" between 
+		/// consumers (callers of this method) and 
+		/// producers (implementations of this method 
+		/// that are the source for tokens):
+		/// <ul>
+		/// <li>A consumer must fully consume the previously 
+		/// returned Token before calling this method again.</li>
+		/// <li>A producer must call {@link Token#Clear()}
+		/// before setting the fields in it & returning it</li>
+		/// </ul>
+		/// Note that a {@link TokenFilter} is considered a consumer.
+		/// </summary>
+		/// <param name="result">a Token that may or may not be used to return
+		/// </param>
+		/// <returns> next token in the stream or null if end-of-stream was hit
+		/// </returns>
+		public virtual Token Next(Token result)
+		{
+			return Next();
+		}
+		
+		/// <summary>Resets this stream to the beginning. This is an
+		/// optional operation, so subclasses may or may not
+		/// implement this method. Reset() is not needed for
+		/// the standard indexing process. However, if the Tokens 
+		/// of a TokenStream are intended to be consumed more than 
+		/// once, it is necessary to implement reset(). 
+		/// </summary>
+		public virtual void  Reset()
+		{
+		}
 		
-        /// <summary>Releases resources associated with this stream. </summary>
-        public virtual void  Close()
-        {
-        }
-    }
+		/// <summary>Releases resources associated with this stream. </summary>
+		public virtual void  Close()
+		{
+		}
+	}
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Tokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs Tue Jun 24 19:53:11
2008
@@ -20,10 +20,16 @@
 namespace Lucene.Net.Analysis
 {
 	
-    /// <summary>A Tokenizer is a TokenStream whose input is a Reader.
-    /// <p>
-    /// This is an abstract class.
-    /// </summary>
+	/// <summary>A Tokenizer is a TokenStream whose input is a Reader.
+	/// <p>
+	/// This is an abstract class.
+	/// <p>
+	/// NOTE: subclasses must override at least one of {@link
+	/// #Next()} or {@link #Next(Token)}.
+	/// <p>
+	/// NOTE: subclasses overriding {@link #Next(Token)} must  
+	/// call {@link Token#Clear()}.
+	/// </summary>
 	
     public abstract class Tokenizer : TokenStream
     {
@@ -49,5 +55,14 @@
                 input.Close();
             }
         }
+		
+		/// <summary>Expert: Reset the tokenizer to a new reader.  Typically, an
+		/// analyzer (in its reusableTokenStream method) will use
+		/// this to re-use a previously created tokenizer. 
+		/// </summary>
+		public virtual void  Reset(System.IO.TextReader input)
+		{
+			this.input = input;
+		}
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs Tue Jun 24
19:53:11 2008
@@ -28,5 +28,18 @@
         {
             return new WhitespaceTokenizer(reader);
         }
+
+        public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader
reader)
+		{
+			Tokenizer tokenizer = (Tokenizer) GetPreviousTokenStream();
+			if (tokenizer == null)
+			{
+				tokenizer = new WhitespaceTokenizer(reader);
+				SetPreviousTokenStream(tokenizer);
+			}
+			else
+				tokenizer.Reset(reader);
+			return tokenizer;
+		}
     }
 }
\ No newline at end of file

Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WordlistLoader.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs Tue Jun 24 19:53:11
2008
@@ -20,13 +20,12 @@
 namespace Lucene.Net.Analysis
 {
 	
-    /// <summary> Loader for text files that represent a list of stopwords.
-    /// 
-    /// </summary>
-    /// <author>  Gerhard Schwarz
-    /// </author>
-    /// <version>  $Id: WordlistLoader.java 192989 2005-06-22 19:59:03Z dnaber $
-    /// </version>
+	/// <summary> Loader for text files that represent a list of stopwords.
+	/// 
+	/// 
+	/// </summary>
+	/// <version>  $Id: WordlistLoader.java 564236 2007-08-09 15:21:19Z gsingers $
+	/// </version>
     public class WordlistLoader
     {
 		



Mime
View raw message