lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ccurr...@apache.org
Subject [Lucene.Net] svn commit: r1204353 [7/9] - in /incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src: contrib/Analyzers/ contrib/Analyzers/AR/ contrib/Analyzers/BR/ contrib/Analyzers/CJK/ contrib/Analyzers/Cn/ contrib/Analyzers/Compound/ contrib/Analyzers/Compoun...
Date Mon, 21 Nov 2011 04:44:59 GMT
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs
(added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Payloads/TypeAsPayloadTokenFilter.cs
Mon Nov 21 04:44:55 2011
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Analyzers.Payloads
+{
+    /// <summary>
+    /// Token filter that stores each token's type string as that token's payload.
+    /// The type is converted to bytes with <see cref="System.Text.Encoding.UTF8"/>.
+    /// </summary>
+    public class TypeAsPayloadTokenFilter : TokenFilter
+    {
+        private readonly PayloadAttribute payloadAtt;
+        private readonly TypeAttribute typeAtt;
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> and registers the payload and type attributes.
+        /// </summary>
+        /// <param name="input">The token stream to decorate.</param>
+        public TypeAsPayloadTokenFilter(TokenStream input)
+            : base(input)
+        {
+            payloadAtt = AddAttribute<PayloadAttribute>();
+            typeAtt = AddAttribute<TypeAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream by one token and, when that token has a
+        /// non-empty type, sets the UTF-8 bytes of the type as its payload.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            String tokenType = typeAtt.Type();
+            if (!String.IsNullOrEmpty(tokenType))
+            {
+                // A null or empty type leaves the payload attribute untouched.
+                payloadAtt.SetPayload(new Payload(Encoding.UTF8.GetBytes(tokenType)));
+            }
+            return true;
+        }
+    }
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs
(added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Position/PositionFilter.cs
Mon Nov 21 04:44:55 2011
@@ -0,0 +1,76 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis.Position
+{
+    /// <summary>
+    /// Sets the position increment of every token to the configured value, except for
+    /// the first token returned, which keeps its original position increment.
+    /// The configured value defaults to zero.
+    /// </summary>
+    public sealed class PositionFilter : TokenFilter
+    {
+        /// <summary>Position increment assigned to all but the first token; zero by default.</summary>
+        private int positionIncrement = 0;
+
+        /// <summary>
+        /// Becomes true once the first token has been returned; that token's position
+        /// increment is never overwritten. Cleared again by <see cref="Reset"/>.
+        /// </summary>
+        private bool firstTokenPositioned = false;
+
+        private PositionIncrementAttribute posIncrAtt;
+
+        /// <summary>
+        /// Constructs a PositionFilter that assigns a position increment of zero to
+        /// all but the first token from the given input stream.
+        /// </summary>
+        /// <param name="input">The input stream.</param>
+        public PositionFilter(TokenStream input)
+            : base(input)
+        {
+            posIncrAtt = AddAttribute<PositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Constructs a PositionFilter that assigns the given position increment to
+        /// all but the first token from the given input stream.
+        /// </summary>
+        /// <param name="input">The input stream.</param>
+        /// <param name="positionIncrement">Position increment to assign to all but the first
+        /// token from the input stream.</param>
+        public PositionFilter(TokenStream input, int positionIncrement)
+            : this(input)
+        {
+            this.positionIncrement = positionIncrement;
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream; every token after the first has its position
+        /// increment replaced with the configured value.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            if (!firstTokenPositioned)
+            {
+                // First token: leave its increment as delivered by the input.
+                firstTokenPositioned = true;
+                return true;
+            }
+
+            posIncrAtt.SetPositionIncrement(positionIncrement);
+            return true;
+        }
+
+        /// <summary>
+        /// Resets the wrapped stream and forgets that a first token was already seen.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            firstTokenPositioned = false;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs
(added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs
Mon Nov 21 04:44:55 2011
@@ -0,0 +1,279 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Index;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Query
+{
+/// <summary>
+/// An <see cref="Analyzer"/> used primarily at query time to wrap another analyzer and provide
+/// a layer of protection which prevents very common words from being passed into queries.
+/// <para>
+/// For very large indexes the cost of reading TermDocs for a very common word can be high.
+/// This analyzer was created after experience with a 38 million doc index which had a term
+/// in around 50% of docs and was causing TermQueries for this term to take 2 seconds.
+/// </para>
+/// <para>
+/// Use the various <c>AddStopWords</c> methods in this class to automate the identification
+/// and addition of stop words found in an already existing index.
+/// </para>
+/// </summary>
+public class QueryAutoStopWordAnalyzer : Analyzer {
+  /// <summary>The wrapped analyzer that actually produces the tokens.</summary>
+  Analyzer _delegate;
+  /// <summary>Stop words identified so far, keyed by field name.</summary>
+  HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
+  /// <summary>
+  /// The default maximum percentage (40%) of index documents which can contain a term,
+  /// after which the term is considered to be a stop word.
+  /// </summary>
+  public const float defaultMaxDocFreqPercent = 0.4f;
+  private readonly Version matchVersion;
+
+  /// <summary>
+  /// Initializes this analyzer with the Analyzer object that actually produces the tokens.
+  /// </summary>
+  /// <param name="matchVersion">Version handed to
+  /// <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> whenever a stop filter is created.</param>
+  /// <param name="_delegate">The <see cref="Analyzer"/> used to produce the token stream
+  /// which needs filtering.</param>
+  public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate) {
+    this._delegate = _delegate;
+    SetOverridesTokenStreamMethod(typeof(QueryAutoStopWordAnalyzer));
+    this.matchVersion = matchVersion;
+  }
+
+  /// <summary>
+  /// Automatically adds stop words for all indexed fields, using
+  /// <see cref="defaultMaxDocFreqPercent"/> as the threshold.
+  /// </summary>
+  /// <param name="reader">The <see cref="IndexReader"/> which will be consulted to identify
+  /// potential stop words that exceed the required document frequency.</param>
+  /// <returns>The number of stop words identified.</returns>
+  /// <exception cref="IOException">Listed by the original Java documentation (<c>@throws IOException</c>).</exception>
+  public int AddStopWords(IndexReader reader) {
+    return AddStopWords(reader, defaultMaxDocFreqPercent);
+  }
+
+  /// <summary>
+  /// Automatically adds stop words for all indexed fields: every term whose document
+  /// frequency exceeds <paramref name="maxDocFreq"/> becomes a stop word for its field.
+  /// </summary>
+  /// <param name="reader">The <see cref="IndexReader"/> which will be consulted to identify
+  /// potential stop words that exceed the required document frequency.</param>
+  /// <param name="maxDocFreq">The maximum number of index documents which can contain a term,
+  /// after which the term is considered to be a stop word.</param>
+  /// <returns>The number of stop words identified, summed over all fields.</returns>
+  /// <exception cref="IOException">Listed by the original Java documentation (<c>@throws IOException</c>).</exception>
+  public int AddStopWords(IndexReader reader, int maxDocFreq) {
+    int numStopWords = 0;
+    foreach (String fieldName in reader.GetFieldNames(IndexReader.FieldOption.INDEXED)) {
+      numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
+    }
+    return numStopWords;
+  }
+
+  /// <summary>
+  /// Automatically adds stop words for all indexed fields, with the threshold given as a
+  /// fraction of the documents in the index.
+  /// </summary>
+  /// <param name="reader">The <see cref="IndexReader"/> which will be consulted to identify
+  /// potential stop words that exceed the required document frequency.</param>
+  /// <param name="maxPercentDocs">The maximum percentage (between 0.0 and 1.0) of index
+  /// documents which contain a term, after which the word is considered to be a stop word.</param>
+  /// <returns>The number of stop words identified, summed over all fields.</returns>
+  /// <exception cref="IOException">Listed by the original Java documentation (<c>@throws IOException</c>).</exception>
+  public int AddStopWords(IndexReader reader, float maxPercentDocs) {
+    int numStopWords = 0;
+    foreach (String fieldName in reader.GetFieldNames(IndexReader.FieldOption.INDEXED)) {
+      numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
+    }
+    return numStopWords;
+  }
+
+  /// <summary>
+  /// Automatically adds stop words for the given field, with the threshold given as a
+  /// fraction of the documents in the index.
+  /// </summary>
+  /// <param name="reader">The <see cref="IndexReader"/> which will be consulted to identify
+  /// potential stop words that exceed the required document frequency.</param>
+  /// <param name="fieldName">The field for which stopwords will be added.</param>
+  /// <param name="maxPercentDocs">The maximum percentage (between 0.0 and 1.0) of index
+  /// documents which contain a term, after which the word is considered to be a stop word.</param>
+  /// <returns>The number of stop words identified.</returns>
+  /// <exception cref="IOException">Listed by the original Java documentation (<c>@throws IOException</c>).</exception>
+  public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs) {
+    // The fraction is converted to an absolute document count, truncating toward zero.
+    return AddStopWords(reader, fieldName, (int) (reader.NumDocs() * maxPercentDocs));
+  }
+
+  /// <summary>
+  /// Automatically adds stop words for the given field: every term of the field whose
+  /// document frequency exceeds <paramref name="maxDocFreq"/> becomes a stop word.
+  /// Any stop words recorded earlier for the same field are replaced.
+  /// </summary>
+  /// <param name="reader">The <see cref="IndexReader"/> which will be consulted to identify
+  /// potential stop words that exceed the required document frequency.</param>
+  /// <param name="fieldName">The field for which stopwords will be added.</param>
+  /// <param name="maxDocFreq">The maximum number of index documents which can contain a term,
+  /// after which the term is considered to be a stop word.</param>
+  /// <returns>The number of stop words identified.</returns>
+  /// <exception cref="IOException">Listed by the original Java documentation (<c>@throws IOException</c>).</exception>
+  public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq) {
+    HashSet<String> stopWords = new HashSet<String>();
+    String internedFieldName = StringHelper.Intern(fieldName);
+    // NOTE(review): the TermEnum is never released here - confirm whether it should be closed.
+    TermEnum te = reader.Terms(new Term(fieldName));
+    Term term = te.Term();
+    while (term != null) {
+      // Stop as soon as the enumeration leaves the requested field.
+      if (term.Field() != internedFieldName) {
+        break;
+      }
+      if (te.DocFreq() > maxDocFreq) {
+        stopWords.Add(term.Text());
+      }
+      if (!te.Next()) {
+        break;
+      }
+      term = te.Term();
+    }
+    // Assign through the indexer (the equivalent of Java's Map.put) so that running this
+    // method again for the same field replaces the earlier set; an IDictionary-style Add()
+    // rejects a key that is already present.
+    stopWordsPerField[fieldName] = stopWords;
+
+    /* if the stopwords for a field are changed,
+     * then saved streams for that field are erased.
+     */
+    IDictionary<String,SavedStreams> streamMap = (IDictionary<String,SavedStreams>) GetPreviousTokenStream();
+    if (streamMap != null) {
+      streamMap.Remove(fieldName);
+    }
+
+    return stopWords.Count;
+  }
+
+  /// <summary>
+  /// Returns <paramref name="stream"/> wrapped in a <see cref="StopFilter"/> when stop words
+  /// are registered for <paramref name="fieldName"/>; otherwise returns the stream unchanged.
+  /// </summary>
+  private TokenStream WrapWithStopFilter(String fieldName, TokenStream stream) {
+    HashSet<String> stopWords = stopWordsPerField[fieldName];
+    if (stopWords == null) {
+      return stream;
+    }
+    return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                          stream, stopWords);
+  }
+
+  /// <summary>
+  /// Produces the delegate's token stream for the field, filtered by the field's stop words
+  /// when any have been identified.
+  /// </summary>
+  /// <param name="fieldName">The field the text belongs to.</param>
+  /// <param name="reader">The text to analyze.</param>
+  /// <returns>The (possibly stop-filtered) token stream.</returns>
+  public override TokenStream TokenStream(String fieldName, TextReader reader) {
+    TokenStream result;
+    try {
+      result = _delegate.ReusableTokenStream(fieldName, reader);
+    } catch (IOException) {
+      // Fall back to a fresh stream when the reusable one cannot be obtained.
+      result = _delegate.TokenStream(fieldName, reader);
+    }
+    return WrapWithStopFilter(fieldName, result);
+  }
+
+  /// <summary>Per-field pair of streams kept between calls to ReusableTokenStream.</summary>
+  private class SavedStreams {
+    /* the underlying stream */
+    protected internal TokenStream Wrapped;
+
+    /*
+     * when there are no stopwords for the field, refers to wrapped.
+     * if there are stopwords, it is a StopFilter around wrapped.
+     */
+    protected internal TokenStream WithStopFilter;
+  }
+
+  /// <summary>
+  /// Returns a token stream for the field, reusing the previously saved stream (and its
+  /// stop filter) when the delegate analyzer hands back the same underlying stream.
+  /// </summary>
+  /// <param name="fieldName">The field the text belongs to.</param>
+  /// <param name="reader">The text to analyze.</param>
+  /// <returns>The (possibly stop-filtered) token stream.</returns>
+  public override TokenStream ReusableTokenStream(String fieldName, TextReader reader) {
+    if (overridesTokenStreamMethod) {
+      // LUCENE-1678: force fallback to tokenStream() if we
+      // have been subclassed and that subclass overrides
+      // tokenStream but not reusableTokenStream
+      return TokenStream(fieldName, reader);
+    }
+
+    /* map of SavedStreams for each field */
+    IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>)GetPreviousTokenStream();
+    if (streamMap == null) {
+      streamMap = new HashMap<String, SavedStreams>();
+      SetPreviousTokenStream(streamMap);
+    }
+
+    SavedStreams streams = streamMap[fieldName];
+    if (streams == null) {
+      /* an entry for this field does not exist, create one */
+      streams = new SavedStreams();
+      streamMap.Add(fieldName, streams);
+      streams.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);
+
+      /* if there are any stopwords for the field, save the stopfilter */
+      streams.WithStopFilter = WrapWithStopFilter(fieldName, streams.Wrapped);
+    } else {
+      /*
+       * an entry for this field exists, verify the wrapped stream has not
+       * changed. if it has not, reuse it, otherwise wrap the new stream.
+       */
+      TokenStream result = _delegate.ReusableTokenStream(fieldName, reader);
+      if (result == streams.Wrapped) {
+        /* the wrapped analyzer reused the stream */
+        streams.WithStopFilter.Reset();
+      } else {
+        /*
+         * the wrapped analyzer did not. if there are any stopwords for the
+         * field, create a new StopFilter around the new stream
+         */
+        streams.Wrapped = result;
+        streams.WithStopFilter = WrapWithStopFilter(fieldName, result);
+      }
+    }
+
+    return streams.WithStopFilter;
+  }
+
+  /// <summary>
+  /// Provides information on which stop words have been identified for a field.
+  /// </summary>
+  /// <param name="fieldName">The field for which stop words identified in
+  /// <c>AddStopWords</c> method calls will be returned.</param>
+  /// <returns>The stop words identified for the field; an empty array when there are none.</returns>
+  public String[] GetStopWords(String fieldName) {
+    HashSet<String> stopWords = stopWordsPerField[fieldName];
+    return stopWords != null ? stopWords.ToArray() : new String[0];
+  }
+
+  /// <summary>
+  /// Provides information on which stop words have been identified for all fields.
+  /// </summary>
+  /// <returns>The stop words (as terms).</returns>
+  public Term[] GetStopWords() {
+    List<Term> allStopWords = new List<Term>();
+    foreach (var fieldName in stopWordsPerField.Keys) {
+      foreach (var text in stopWordsPerField[fieldName]) {
+        allStopWords.Add(new Term(fieldName, text));
+      }
+    }
+    return allStopWords.ToArray();
+  }
+
+}
+}

Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs?rev=1204353&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs
(added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Reverse/ReverseStringFilter.cs
Mon Nov 21 04:44:55 2011
@@ -0,0 +1,128 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Analysis.Reverse
+{
+    /// <summary>
+    /// Reverse token string, for example "country" becomes "yrtnuoc".
+    /// <para>
+    /// If a <c>marker</c> is supplied, then tokens will be also prepended by
+    /// that character. For example, with a marker of \u0001, "country" becomes
+    /// "\u0001yrtnuoc". This is useful when implementing efficient leading
+    /// wildcards search.
+    /// </para>
+    /// </summary>
+    public sealed class ReverseStringFilter : TokenFilter
+    {
+
+        private readonly TermAttribute termAtt;
+        private readonly char marker;
+
+        /// <summary>Sentinel marker value meaning that no marker is added to the token.</summary>
+        private const char NOMARKER = '\uFFFF';
+
+        /// <summary>Example marker character: U+0001 (START OF HEADING)</summary>
+        public const char START_OF_HEADING_MARKER = '\u0001';
+
+        /// <summary>Example marker character: U+001F (INFORMATION SEPARATOR ONE)</summary>
+        public const char INFORMATION_SEPARATOR_MARKER = '\u001F';
+
+        /// <summary>Example marker character: U+EC00 (PRIVATE USE AREA: EC00)</summary>
+        public const char PUA_EC00_MARKER = '\uEC00';
+
+        /// <summary>Example marker character: U+200F (RIGHT-TO-LEFT MARK)</summary>
+        public const char RTL_DIRECTION_MARKER = '\u200F';
+
+        /// <summary>
+        /// Create a new ReverseStringFilter that reverses all tokens in the
+        /// supplied <see cref="TokenStream"/>.
+        /// <para>
+        /// The reversed tokens will not be marked.
+        /// </para>
+        /// </summary>
+        /// <param name="_in"><see cref="TokenStream"/> to filter</param>
+        public ReverseStringFilter(TokenStream _in)
+            : this(_in, NOMARKER)
+        {
+        }
+
+        /// <summary>
+        /// Create a new ReverseStringFilter that reverses and marks all tokens in the
+        /// supplied <see cref="TokenStream"/>.
+        /// <para>
+        /// The reversed tokens will be prepended (marked) by the <c>marker</c>
+        /// character.
+        /// </para>
+        /// </summary>
+        /// <param name="_in"><see cref="TokenStream"/> to filter</param>
+        /// <param name="marker">A character used to mark reversed tokens</param>
+        public ReverseStringFilter(TokenStream _in, char marker)
+            : base(_in)
+        {
+            this.marker = marker;
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and reverses the term text of the new token in place,
+        /// adding the marker character when one was configured.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
+        public override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            int length = termAtt.TermLength();
+            if (marker != NOMARKER)
+            {
+                // Append the marker after the term text; the reversal below moves it to the front.
+                length++;
+                termAtt.ResizeTermBuffer(length);
+                termAtt.TermBuffer()[length - 1] = marker;
+            }
+            Reverse(termAtt.TermBuffer(), length);
+            termAtt.SetTermLength(length);
+            return true;
+        }
+
+        /// <summary>Returns a new string holding the characters of <paramref name="input"/> in reverse order.</summary>
+        /// <param name="input">The string to reverse.</param>
+        /// <returns>The reversed string.</returns>
+        public static String Reverse(String input)
+        {
+            char[] chars = input.ToCharArray();
+            Reverse(chars);
+            return new String(chars);
+        }
+
+        /// <summary>Reverses the whole of <paramref name="buffer"/> in place.</summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        public static void Reverse(char[] buffer)
+        {
+            Reverse(buffer, buffer.Length);
+        }
+
+        /// <summary>Reverses the first <paramref name="len"/> characters of <paramref name="buffer"/> in place.</summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        /// <param name="len">Number of leading characters to reverse.</param>
+        public static void Reverse(char[] buffer, int len)
+        {
+            Reverse(buffer, 0, len);
+        }
+
+        /// <summary>
+        /// Reverses, in place, the <paramref name="len"/> characters of <paramref name="buffer"/>
+        /// beginning at index <paramref name="start"/>. Does nothing when <paramref name="len"/> is 1 or less.
+        /// </summary>
+        /// <param name="buffer">The characters to reverse.</param>
+        /// <param name="start">Index of the first character of the range.</param>
+        /// <param name="len">Number of characters in the range.</param>
+        public static void Reverse(char[] buffer, int start, int len)
+        {
+            if (len <= 1)
+            {
+                return;
+            }
+
+            // Swap the outermost pair and walk both indices toward the middle.
+            int left = start;
+            int right = start + len - 1;
+            while (left < right)
+            {
+                char held = buffer[left];
+                buffer[left] = buffer[right];
+                buffer[right] = held;
+                left++;
+                right--;
+            }
+        }
+    }
+}

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
(original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianAnalyzer.cs
Mon Nov 21 04:44:55 2011
@@ -1,4 +1,4 @@
-/*
+/*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -20,253 +20,153 @@
 */
 
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using System.Text;
 using System.IO;
 using System.Collections;
 using Lucene.Net.Analysis;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// Analyzer for Russian language. Supports an external list of stopwords (words that
-	/// will not be indexed at all).
-	/// A default set of stopwords is used unless an alternative list is specified.
-	/// </summary>
-	public sealed class RussianAnalyzer : Analyzer
-	{
-		// letters
-		private static char A = (char)0;
-		private static char B = (char)1;
-		private static char V = (char)2;
-		private static char G = (char)3;
-		private static char D = (char)4;
-		private static char E = (char)5;
-		private static char ZH = (char)6;
-		private static char Z = (char)7;
-		private static char I = (char)8;
-		private static char I_ = (char)9;
-		private static char K = (char)10;
-		private static char L = (char)11;
-		private static char M = (char)12;
-		private static char N = (char)13;
-		private static char O = (char)14;
-		private static char P = (char)15;
-		private static char R = (char)16;
-		private static char S = (char)17;
-		private static char T = (char)18;
-		private static char U = (char)19;
-		//private static char F = (char)20;
-		private static char X = (char)21;
-		//private static char TS = (char)22;
-		private static char CH = (char)23;
-		private static char SH = (char)24;
-		private static char SHCH = (char)25;
-		//private static char HARD = (char)26;
-		private static char Y = (char)27;
-		private static char SOFT = (char)28;
-		private static char AE = (char)29;
-		private static char IU = (char)30;
-		private static char IA = (char)31;
-
-		/// <summary>
-		/// List of typical Russian stopwords.
-		/// </summary>
-		private static char[][] RUSSIAN_STOP_WORDS = {
-		new char[] {A},
-		new char[] {B, E, Z},
-		new char[] {B, O, L, E, E},
-		new char[] {B, Y},
-		new char[] {B, Y, L},
-		new char[] {B, Y, L, A},
-		new char[] {B, Y, L, I},
-		new char[] {B, Y, L, O},
-		new char[] {B, Y, T, SOFT},
-		new char[] {V},
-		new char[] {V, A, M},
-		new char[] {V, A, S},
-		new char[] {V, E, S, SOFT},
-		new char[] {V, O},
-		new char[] {V, O, T},
-		new char[] {V, S, E},
-		new char[] {V, S, E, G, O},
-		new char[] {V, S, E, X},
-		new char[] {V, Y},
-		new char[] {G, D, E},
-		new char[] {D, A},
-		new char[] {D, A, ZH, E},
-		new char[] {D, L, IA},
-		new char[] {D, O},
-		new char[] {E, G, O},
-		new char[] {E, E},
-		new char[] {E, I_,},
-		new char[] {E, IU},
-		new char[] {E, S, L, I},
-		new char[] {E, S, T, SOFT},
-		new char[] {E, SHCH, E},
-		new char[] {ZH, E},
-		new char[] {Z, A},
-		new char[] {Z, D, E, S, SOFT},
-		new char[] {I},
-		new char[] {I, Z},
-		new char[] {I, L, I},
-		new char[] {I, M},
-		new char[] {I, X},
-		new char[] {K},
-		new char[] {K, A, K},
-		new char[] {K, O},
-		new char[] {K, O, G, D, A},
-		new char[] {K, T, O},
-		new char[] {L, I},
-		new char[] {L, I, B, O},
-		new char[] {M, N, E},
-		new char[] {M, O, ZH, E, T},
-		new char[] {M, Y},
-		new char[] {N, A},
-		new char[] {N, A, D, O},
-		new char[] {N, A, SH},
-		new char[] {N, E},
-		new char[] {N, E, G, O},
-		new char[] {N, E, E},
-		new char[] {N, E, T},
-		new char[] {N, I},
-		new char[] {N, I, X},
-		new char[] {N, O},
-		new char[] {N, U},
-		new char[] {O},
-		new char[] {O, B},
-		new char[] {O, D, N, A, K, O},
-		new char[] {O, N},
-		new char[] {O, N, A},
-		new char[] {O, N, I},
-		new char[] {O, N, O},
-		new char[] {O, T},
-		new char[] {O, CH, E, N, SOFT},
-		new char[] {P, O},
-		new char[] {P, O, D},
-		new char[] {P, R, I},
-		new char[] {S},
-		new char[] {S, O},
-		new char[] {T, A, K},
-		new char[] {T, A, K, ZH, E},
-		new char[] {T, A, K, O, I_},
-		new char[] {T, A, M},
-		new char[] {T, E},
-		new char[] {T, E, M},
-		new char[] {T, O},
-		new char[] {T, O, G, O},
-		new char[] {T, O, ZH, E},
-		new char[] {T, O, I_},
-		new char[] {T, O, L, SOFT, K, O},
-		new char[] {T, O, M},
-		new char[] {T, Y},
-		new char[] {U},
-		new char[] {U, ZH, E},
-		new char[] {X, O, T, IA},
-		new char[] {CH, E, G, O},
-		new char[] {CH, E, I_},
-		new char[] {CH, E, M},
-		new char[] {CH, T, O},
-		new char[] {CH, T, O, B, Y},
-		new char[] {CH, SOFT, E},
-		new char[] {CH, SOFT, IA},
-		new char[] {AE, T, A},
-		new char[] {AE, T, I},
-		new char[] {AE, T, O},
-		new char[] {IA}
-													 };
-
-		/// <summary>
-		/// Contains the stopwords used with the StopFilter.
-		/// </summary>
-		private Hashtable stoptable = new Hashtable();
-
-		/// <summary>
-		/// Charset for Russian letters.
-	    /// Represents encoding for 32 lowercase Russian letters.
-		/// Predefined charsets can be taken from RussianCharSets class
-		/// </summary>
-		private char[] charset;
-
-		/// <summary>
-		/// Builds an analyzer.
-		/// </summary>
-		public RussianAnalyzer()
-		{
-			this.charset = RussianCharsets.UnicodeRussian;
-			stoptable = StopFilter.MakeStopSet(MakeStopWords(RussianCharsets.UnicodeRussian));
-		}
-
-		/// <summary>
-		/// Builds an analyzer.
-		/// </summary>
-		/// <param name="charset"></param>
-		public RussianAnalyzer(char[] charset)
-		{
-			this.charset = charset;
-			stoptable = StopFilter.MakeStopSet(MakeStopWords(charset));
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <param name="stopwords"></param>
-		public RussianAnalyzer(char[] charset, String[] stopwords)
-		{
-			this.charset = charset;
-			stoptable = StopFilter.MakeStopSet(stopwords);
-		}
-
-		/// <summary>
-		/// Takes russian stop words and translates them to a String array, using
-		/// the given charset 
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <returns></returns>
-		private static String[] MakeStopWords(char[] charset)
-		{
-			String[] res = new String[RUSSIAN_STOP_WORDS.Length];
-			for (int i = 0; i < res.Length; i++)
-			{
-				char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-				// translate the word,using the charset
-				StringBuilder theWord = new StringBuilder();
-				for (int j = 0; j < theStopWord.Length; j++)
-				{
-					theWord.Append(charset[theStopWord[j]]);
-				}
-				res[i] = theWord.ToString();
-			}
-			return res;
-		}
-
-		/// <summary>
-		/// Builds an analyzer with the given stop words.
-		/// </summary>
-		/// <param name="charset"></param>
-		/// <param name="stopwords"></param>
-		public RussianAnalyzer(char[] charset, Hashtable stopwords)
-		{
-			this.charset = charset;
-			stoptable = stopwords;
-		}
-
-		/// <summary>
-		/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
-		/// </summary>
-		/// <param name="fieldName"></param>
-		/// <param name="reader"></param>
-		/// <returns>
-		///		A TokenStream build from a RussianLetterTokenizer filtered with
-		///     RussianLowerCaseFilter, StopFilter, and RussianStemFilter
-		///  </returns>
-		public override TokenStream TokenStream(String fieldName, TextReader reader)
-		{
-			TokenStream result = new RussianLetterTokenizer(reader, charset);
-			result = new RussianLowerCaseFilter(result, charset);
-			result = new StopFilter(result, stoptable);
-			result = new RussianStemFilter(result, charset);
-			return result;
-		}
-	}
+    /// <summary>
+    /// Analyzer for Russian language. Supports an external list of stopwords (words that
+    /// will not be indexed at all).
+    /// A default set of stopwords is used unless an alternative list is specified.
+    /// </summary>
+    public sealed class RussianAnalyzer : Analyzer
+    {
+        /// <summary>
+        /// List of typical Russian stopwords.
+        /// </summary>
+        private static readonly String[] RUSSIAN_STOP_WORDS = {
+                                                                  "а", "без",
"более", "бы", "был", "была", "были",
+                                                                  "было", "быть",
"в",
+                                                                  "вам", "вас",
"весь", "во", "вот", "все", "всего",
+                                                                  "всех", "вы",
"где",
+                                                                  "да", "даже",
"для", "до", "его", "ее", "ей", "ею",
+                                                                  "если", "есть",
+                                                                  "еще", "же",
"за", "здесь", "и", "из", "или", "им",
+                                                                  "их", "к", "как",
+                                                                  "ко", "когда",
"кто", "ли", "либо", "мне", "может",
+                                                                  "мы", "на",
"надо",
+                                                                  "наш", "не",
"него", "нее", "нет", "ни", "них", "но",
+                                                                  "ну", "о", "об",
+                                                                  "однако",
"он", "она", "они", "оно", "от", "очень",
+                                                                  "по", "под",
"при",
+                                                                  "с", "со", "так",
"также", "такой", "там", "те", "тем"
+                                                                  , "то", "того",
+                                                                  "тоже", "той",
"только", "том", "ты", "у", "уже",
+                                                                  "хотя", "чего",
"чей",
+                                                                  "чем", "что",
"чтобы", "чье", "чья", "эта", "эти",
+                                                                  "это", "я"
+                                                              };
+
+        /// <summary>
+        /// Holds the default stop word set: a CharArraySet built from RUSSIAN_STOP_WORDS
+        /// (second constructor argument false) and wrapped by CharArraySet.UnmodifiableSet.
+        /// </summary>
+        private static class DefaultSetHolder
+        {
+            internal static readonly ISet<string> DEFAULT_STOP_SET = CharArraySet.UnmodifiableSet(new
CharArraySet(RUSSIAN_STOP_WORDS, false));
+        }
+
+        /// <summary>
+        /// Contains the stopwords used with the StopFilter.
+        /// </summary>
+        private readonly ISet<string> stopSet;
+
+        private readonly Version matchVersion;
+
+
+        /// <summary>
+        /// Builds an analyzer that uses the default stop word set
+        /// (DefaultSetHolder.DEFAULT_STOP_SET).
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        public RussianAnalyzer(Version matchVersion)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. The words are turned into a set
+        /// with StopFilter.MakeStopSet and passed to the ISet&lt;string&gt; constructor.
+        /// Deprecated: use the (Version, ISet&lt;string&gt;) constructor instead.
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        /// <param name="stopwords">the stop words</param>
+        public RussianAnalyzer(Version matchVersion, params string[] stopwords)
+            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
+        {
+
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. The analyzer keeps its own
+        /// unmodifiable copy of the set (CharArraySet.Copy wrapped by
+        /// CharArraySet.UnmodifiableSet) rather than the instance passed in.
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        public RussianAnalyzer(Version matchVersion, ISet<string> stopwords)
+        {
+            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Builds an analyzer whose stop words are the keys of the given dictionary;
+        /// the dictionary values are not used. Delegates to the string[] constructor.
+        /// Deprecated: use the (Version, ISet&lt;string&gt;) constructor instead.
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        /// <param name="stopwords">dictionary whose keys are the stop words</param>
+        public RussianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
+            : this(matchVersion, stopwords.Keys.ToArray())
+        {
+        }
+
+        /// <summary>
+        /// Creates a new token stream that tokenizes all the text in the provided reader.
+        /// </summary>
+        /// <param name="fieldName">Name of the field being analyzed; not used by this method.</param>
+        /// <param name="reader">Source of the text to tokenize.</param>
+        /// <returns>
+        /// A RussianLetterTokenizer wrapped, in order, by a LowerCaseFilter, a StopFilter
+        /// over this analyzer's stop set, and a RussianStemFilter.
+        /// </returns>
+        public override TokenStream TokenStream(String fieldName, TextReader reader)
+        {
+            TokenStream letters = new RussianLetterTokenizer(reader);
+            TokenStream lowerCased = new LowerCaseFilter(letters);
+            TokenStream withoutStopWords = new StopFilter(
+                StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                lowerCased,
+                stopSet);
+            return new RussianStemFilter(withoutStopWords);
+        }
+
+        /// <summary>
+        /// Pairs the tokenizer at the head of an analysis chain with the last stream of
+        /// that chain, so ReusableTokenStream can keep both via SetPreviousTokenStream
+        /// and later reset only the tokenizer.
+        /// </summary>
+        private class SavedStreams
+        {
+            protected internal Tokenizer source;
+            protected internal TokenStream result;
+        };
+
+        /// <summary>
+        /// Returns a (possibly reused) token stream that tokenizes all the text in the
+        /// provided reader.
+        /// </summary>
+        /// <param name="fieldName">Name of the field being analyzed; not used by this method.</param>
+        /// <param name="reader">Source of the text to tokenize.</param>
+        /// <returns>
+        /// A RussianLetterTokenizer wrapped, in order, by a LowerCaseFilter, a StopFilter
+        /// over this analyzer's stop set, and a RussianStemFilter.
+        /// </returns>
+        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        {
+            SavedStreams cached = (SavedStreams)GetPreviousTokenStream();
+            if (cached != null)
+            {
+                // A chain was stored earlier: point its tokenizer at the new reader and reuse it.
+                cached.source.Reset(reader);
+                return cached.result;
+            }
+
+            // Nothing stored yet: build the chain once and remember it for later calls.
+            SavedStreams created = new SavedStreams();
+            created.source = new RussianLetterTokenizer(reader);
+            TokenStream chain = new LowerCaseFilter(created.source);
+            chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
+                                   chain, stopSet);
+            created.result = new RussianStemFilter(chain);
+            SetPreviousTokenStream(created);
+            return created.result;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
(original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLetterTokenizer.cs
Mon Nov 21 04:44:55 2011
@@ -22,42 +22,41 @@
 using System;
 using System.IO;
 using Lucene.Net.Analysis;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally
looking up letters
-	/// in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter()
method,
-	/// which doesn't know how to detect letters in encodings like CP1252 and KOI8
-	/// (well-known problems with 0xD7 and 0xF7 chars)
-	/// </summary>
-	public class RussianLetterTokenizer : CharTokenizer
-	{
-		/// <summary>
-		/// Construct a new LetterTokenizer.
-		/// </summary>
-		private char[] charset;
+    ///<summary>
+    /// A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
+    /// by also allowing the basic latin digits 0-9. 
+    ///</summary>
+    public class RussianLetterTokenizer : CharTokenizer
+    {
+        /// <summary>
+        /// Creates a tokenizer over the given reader; the reader is handed to the base constructor.
+        /// </summary>
+        /// <param name="_in">Reader supplying the text to tokenize.</param>
+        public RussianLetterTokenizer(TextReader _in)
+            : base(_in)
+        {
+        }
 
-		public RussianLetterTokenizer(TextReader _in, char[] charset) : base(_in)
-		{
-			this.charset = charset;
-		}
+        /// <summary>
+        /// Creates a tokenizer over the given reader, passing the attribute source and the
+        /// reader to the base constructor.
+        /// </summary>
+        /// <param name="source">Attribute source forwarded to the base constructor.</param>
+        /// <param name="_in">Reader supplying the text to tokenize.</param>
+        public RussianLetterTokenizer(AttributeSource source, TextReader _in)
+            : base(source, _in)
+        {
+        }
 
-		/// <summary>
-		/// Collects only characters which satisfy Char.IsLetter(char).
-		/// </summary>
-		/// <param name="c"></param>
-		/// <returns></returns>
-		protected override bool IsTokenChar(char c)
-		{
-			if (Char.IsLetter(c))
-				return true;
-			for (int i = 0; i < charset.Length; i++)
-			{
-				if (c == charset[i])
-					return true;
-			}
-			return false;
-		}
-	}
+        /// <summary>
+        /// Creates a tokenizer over the given reader, passing the attribute factory and the
+        /// reader to the base constructor.
+        /// </summary>
+        /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
+        /// <param name="__in">Reader supplying the text to tokenize.</param>
+        public RussianLetterTokenizer(AttributeSource.AttributeFactory factory, TextReader
__in)
+            : base(factory, __in)
+        {
+        }
+
+        /// <summary>
+        /// Decides whether a character belongs to a token: any character accepted by
+        /// char.IsLetter, plus the basic latin digits '0' through '9'.
+        /// </summary>
+        /// <param name="c">The character to classify.</param>
+        /// <returns>true if the character is a letter or one of the digits 0-9; otherwise false.</returns>
+        protected override bool IsTokenChar(char c)
+        {
+            bool isBasicDigit = '0' <= c && c <= '9';
+            return isBasicDigit || char.IsLetter(c);
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
(original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianLowerCaseFilter.cs
Mon Nov 21 04:44:55 2011
@@ -21,41 +21,40 @@
 
 using System;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// Normalizes token text to lower case, analyzing given ("russian") charset.
-	/// </summary>
-	public sealed class RussianLowerCaseFilter : TokenFilter
-	{
-		char[] charset;
-
-		public RussianLowerCaseFilter(TokenStream _in, char[] charset) : base(_in)
-		{
-			this.charset = charset;
-		}
-
-		public override Token Next() 
-		{
-			Token t = input.Next();
-
-			if (t == null)
-				return null;
-
-			String txt = t.TermText();
-
-			char[] chArray = txt.ToCharArray();
-			for (int i = 0; i < chArray.Length; i++)
-			{
-				chArray[i] = RussianCharsets.ToLowerCase(chArray[i], charset);
-			}
-
-			String newTxt = new String(chArray);
-			// create new token
-			Token newToken = new Token(newTxt, t.StartOffset(), t.EndOffset());
-
-			return newToken;
-		}
-	}
+    /// <summary>
+    /// Normalizes token text to lower case.
+    /// </summary>
+    [Obsolete("Use LowerCaseFilter instead, which has the same functionality. This filter
will be removed in Lucene 4.0")]
+    public sealed class RussianLowerCaseFilter : TokenFilter
+    {
+        // Term attribute of the wrapped stream; its buffer is lower-cased in place.
+        private readonly TermAttribute termAtt;
+
+        /// <summary>
+        /// Wraps the given stream and registers the term attribute this filter rewrites.
+        /// </summary>
+        /// <param name="_in">The token stream whose terms are lower-cased.</param>
+        public RussianLowerCaseFilter(TokenStream _in)
+            : base(_in)
+        {
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream by one token and lower-cases the first
+        /// TermLength() characters of that token's term buffer in place.
+        /// </summary>
+        /// <returns>true if the wrapped stream produced a token; false once it is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            char[] chArray = termAtt.TermBuffer();
+            int chLen = termAtt.TermLength();
+            for (int i = 0; i < chLen; i++)
+            {
+                // Use the invariant-culture mapping: char.ToLower follows the current thread
+                // culture, so the produced terms would otherwise vary with the machine's
+                // locale (for example Turkish casing rules for 'I').
+                chArray[i] = char.ToLowerInvariant(chArray[i]);
+            }
+            return true;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs?rev=1204353&r1=1204352&r2=1204353&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
(original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Analyzers/Ru/RussianStemFilter.cs
Mon Nov 21 04:44:55 2011
@@ -21,59 +21,65 @@
 
 using System;
 using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
 
 namespace Lucene.Net.Analysis.Ru
 {
-	/// <summary>
-	/// A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
-	/// The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
-	/// because RussianStemFilter only works  with lowercase part of any "russian" charset.
-	/// </summary>
-	public sealed class RussianStemFilter : TokenFilter
-	{
-		/// <summary>
-		/// The actual token in the input stream.
-		/// </summary>
-		private Token token = null;
-		private RussianStemmer stemmer = null;
-
-		public RussianStemFilter(TokenStream _in, char[] charset) : base(_in)
-		{
-			stemmer = new RussianStemmer(charset);
-		}
-
-		/// <summary>
-		/// 
-		/// </summary>
-		/// <returns>Returns the next token in the stream, or null at EOS</returns>
-		public override Token Next() 
-		{
-			if ((token = input.Next()) == null)
-			{
-				return null;
-			}
-			else
-			{
-				String s = stemmer.Stem(token.TermText());
-				if (!s.Equals(token.TermText()))
-				{
-					return new Token(s, token.StartOffset(), token.EndOffset(),
-						token.Type());
-				}
-				return token;
-			}
-		}
-
-		/// <summary>
-		/// Set a alternative/custom RussianStemmer for this filter.
-		/// </summary>
-		/// <param name="stemmer"></param>
-		public void SetStemmer(RussianStemmer stemmer)
-		{
-			if (stemmer != null)
-			{
-				this.stemmer = stemmer;
-			}
-		}
-	}
+    /**
+    * A {@link TokenFilter} that stems Russian words. 
+    * <p>
+    * The implementation was inspired by GermanStemFilter.
+    * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter,
+    * because RussianStemFilter only works with lowercase characters.
+    * </p>
+    */
+    public sealed class RussianStemFilter : TokenFilter
+    {
+        // Stemmer applied to every term that passes through this filter.
+        private readonly RussianStemmer stemmer;
+
+        // Term attribute of the wrapped stream; rewritten when stemming changes the term.
+        private readonly TermAttribute termAtt;
+
+        /// <summary>
+        /// Wraps the given stream, creating the stemmer and registering the term attribute.
+        /// </summary>
+        /// <param name="_in">The token stream whose terms are stemmed.</param>
+        public RussianStemFilter(TokenStream _in)
+            : base(_in)
+        {
+            stemmer = new RussianStemmer();
+            termAtt = AddAttribute<TermAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream by one token and replaces its term with the stemmed
+        /// form when the stemmer returns a non-null string that differs from the term.
+        /// </summary>
+        /// <returns>true if the wrapped stream produced a token; false once it is exhausted.</returns>
+        public sealed override bool IncrementToken()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            String original = termAtt.Term();
+            String stemmed = stemmer.Stem(original);
+            bool changed = stemmed != null && !stemmed.Equals(original);
+            if (changed)
+            {
+                termAtt.SetTermBuffer(stemmed);
+            }
+            return true;
+        }
+
+        // NOTE (cc): the Java original's SetStemmer(RussianStemmer) is intentionally not ported.
+        // It did not compile as-is, and since all of the stemmer's methods are private in Java
+        // there is no behavior a custom RussianStemmer subclass could override.
+    }
 }
\ No newline at end of file



Mime
View raw message