lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r820756 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/el/ analyzers/common/src/java/org/apache/lucene/analysis/ru/ analyzers/common/src/test/org/apache/lucene/analysis/ru/
Date Thu, 01 Oct 2009 19:20:10 GMT
Author: rmuir
Date: Thu Oct  1 19:20:09 2009
New Revision: 820756

URL: http://svn.apache.org/viewvc?rev=820756&view=rev
Log:
LUCENE-1936: Remove deprecated charset support from Greek and Russian analyzers

Removed:
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/res1251.htm
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/resKOI8.htm
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/test1251.txt
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/testKOI8.txt
Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Oct  1 19:20:09 2009
@@ -6,6 +6,10 @@
 
 API Changes
 
+ * LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
+   text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
+   instead, which has the same functionality.  (Robert Muir)
+   
 Bug fixes
 
  * LUCENE-1781: Fixed various issues with the lat/lng bounding box

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java Thu Oct  1 19:20:09 2009
@@ -39,111 +39,19 @@
  */
 public final class GreekAnalyzer extends Analyzer
 {
-    // the letters are indexes to the charset array (see GreekCharsets.java)
-    private static char A = 6;
-    private static char B = 7;
-    private static char G = 8;
-    private static char D = 9;
-    private static char E = 10;
-    private static char Z = 11;
-    private static char H = 12;
-    private static char TH = 13;
-    private static char I = 14;
-    private static char K = 15;
-    private static char L = 16;
-    private static char M = 17;
-    private static char N = 18;
-    private static char KS = 19;
-    private static char O = 20;
-    private static char P = 21;
-    private static char R = 22;
-    private static char S = 24;	// skip final sigma
-    private static char T = 25;
-    private static char Y = 26;
-    private static char F = 27;
-    private static char X = 28;
-    private static char PS = 29;
-    private static char W = 30;
-
     /**
      * List of typical Greek stopwords.
      */
-    private static char[][] GREEK_STOP_WORDS = {
-        {O},
-		{H},
-		{T, O},
-        {O, I},
-		{T, A},
-		{T, O, Y},
-		{T, H, S},
-		{T, W, N},
-		{T, O, N},
-		{T, H, N},
-		{K, A, I},
-		{K, I},
-		{K},
-		{E, I, M, A, I},
-		{E, I, S, A, I},
-		{E, I, N, A, I},
-		{E, I, M, A, S, T, E},
-		{E, I, S, T, E},
-		{S, T, O},
-		{S, T, O, N},
-		{S, T, H},
-		{S, T, H, N},
-		{M, A},
-		{A, L, L, A},
-		{A, P, O},
-		{G, I, A},
-		{P, R, O, S},
-		{M, E},
-		{S, E},
-		{W, S},
-		{P, A, R, A},
-		{A, N, T, I},
-		{K, A, T, A},
-		{M, E, T, A},
-		{TH, A},
-		{N, A},
-		{D, E},
-		{D, E, N},
-		{M, H},
-		{M, H, N},
-		{E, P, I},
-		{E, N, W},
-		{E, A, N},
-		{A, N},
-		{T, O, T, E},
-		{P, O, Y},
-		{P, W, S},
-		{P, O, I, O, S},
-		{P, O, I, A},
-		{P, O, I, O},
-		{P, O, I, O, I},
-		{P, O, I, E, S},
-		{P, O, I, W, N},
-		{P, O, I, O, Y, S},
-		{A, Y, T, O, S},
-		{A, Y, T, H},
-		{A, Y, T, O},
-		{A, Y, T, O, I},
-		{A, Y, T, W, N},
-		{A, Y, T, O, Y, S},
-		{A, Y, T, E, S},
-		{A, Y, T, A},
-		{E, K, E, I, N, O, S},
-		{E, K, E, I, N, H},
-		{E, K, E, I, N, O},
-		{E, K, E, I, N, O, I},
-		{E, K, E, I, N, E, S},
-		{E, K, E, I, N, A},
-		{E, K, E, I, N, W, N},
-		{E, K, E, I, N, O, Y, S},
-		{O, P, W, S},
-		{O, M, W, S},
-		{I, S, W, S},
-		{O, S, O},
-		{O, T, I}
+    private static final String[] GREEK_STOP_WORDS = {
+      "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και", 
+      "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
+      "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
+      "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
+      "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
+      "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
+      "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
+      "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
+      "ισωσ", "οσο", "οτι"
     };
 
     /**
@@ -151,28 +59,8 @@
      */
     private Set stopSet = new HashSet();
 
-    /**
-     * Charset for Greek letters.
-     * Represents encoding for 24 lowercase Greek letters.
-     * Predefined charsets can be taken from {@link GreekCharsets} class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public GreekAnalyzer() {
-        charset = GreekCharsets.UnicodeGreek;
-        stopSet = StopFilter.makeStopSet(
-                    makeStopWords(GreekCharsets.UnicodeGreek));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #GreekAnalyzer()} instead.
-     */
-    public GreekAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
+        this(GREEK_STOP_WORDS);
     }
     
     /**
@@ -181,58 +69,16 @@
      */
     public GreekAnalyzer(String [] stopwords)
     {
-    	charset = GreekCharsets.UnicodeGreek;
+        super();
     	stopSet = StopFilter.makeStopSet(stopwords);
     }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
-     */
-    public GreekAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
-    }
-
-    /**
-     * Takes greek stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[GREEK_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = GREEK_STOP_WORDS[i];
-            // translate the word,using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
-     */
-    public GreekAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
     
     /**
      * Builds an analyzer with the given stop words.
      */
     public GreekAnalyzer(Map stopwords)
     {
-    	charset = GreekCharsets.UnicodeGreek;
+        super();
     	stopSet = new HashSet(stopwords.keySet());
     }
 
@@ -245,7 +91,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
     	TokenStream result = new StandardTokenizer(reader);
-        result = new GreekLowerCaseFilter(result, charset);
+        result = new GreekLowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
         return result;
     }
@@ -268,7 +114,7 @@
       if (streams == null) {
         streams = new SavedStreams();
         streams.source = new StandardTokenizer(reader);
-        streams.result = new GreekLowerCaseFilter(streams.source, charset);
+        streams.result = new GreekLowerCaseFilter(streams.source);
         streams.result = new StopFilter(streams.result, stopSet);
         setPreviousTokenStream(streams);
       } else {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java Thu Oct  1 19:20:09 2009
@@ -23,44 +23,93 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
- * Normalizes token text to lower case, analyzing given ("greek") charset.
+ * Normalizes token text to lower case, removes some Greek diacritics,
+ * and standardizes final sigma to sigma. 
  *
  */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;
     
-    /**
-     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
-     */
-    public GreekLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-    
     public GreekLowerCaseFilter(TokenStream in)
     {
-    	this(in, GreekCharsets.UnicodeGreek);
+    	super(in);
+    	termAtt = addAttribute(TermAttribute.class);
     }
 
     public boolean incrementToken() throws IOException {
       if (input.incrementToken()) {
         char[] chArray = termAtt.termBuffer();
         int chLen = termAtt.termLength();
+        // TODO: iterate codepoints to support supp. characters
         for (int i = 0; i < chLen; i++)
         {
-          chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+          chArray[i] = (char) lowerCase(chArray[i]);
         }
         return true;
       } else {
         return false;
       }
     }
+    
+    private int lowerCase(int codepoint) {
+      switch(codepoint) {
+        /* There are two lowercase forms of sigma:
+         *   U+03C2: small final sigma (end of word)
+         *   U+03C3: small sigma (otherwise)
+         *   
+         * Standardize both to U+03C3
+         */
+        case '\u03C2': /* small final sigma */
+          return '\u03C3'; /* small sigma */
+        
+        /* Some greek characters contain diacritics.
+         * This filter removes these, converting to the lowercase base form.
+         */
+        
+        case '\u0386': /* capital alpha with tonos */
+        case '\u03AC': /* small alpha with tonos */
+          return '\u03B1'; /* small alpha */
+          
+        case '\u0388': /* capital epsilon with tonos */
+        case '\u03AD': /* small epsilon with tonos */
+          return '\u03B5'; /* small epsilon */
+          
+        case '\u0389': /* capital eta with tonos */
+        case '\u03AE': /* small eta with tonos */
+          return '\u03B7'; /* small eta */
+        
+        case '\u038A': /* capital iota with tonos */
+        case '\u03AA': /* capital iota with dialytika */
+        case '\u03AF': /* small iota with tonos */
+        case '\u03CA': /* small iota with dialytika */
+        case '\u0390': /* small iota with dialytika and tonos */
+          return '\u03B9'; /* small iota */
+          
+        case '\u038E': /* capital upsilon with tonos */
+        case '\u03AB': /* capital upsilon with dialytika */
+        case '\u03CD': /* small upsilon with tonos */
+        case '\u03CB': /* small upsilon with dialytika */
+        case '\u03B0': /* small upsilon with dialytika and tonos */
+          return '\u03C5'; /* small upsilon */
+          
+        case '\u038C': /* capital omicron with tonos */
+        case '\u03CC': /* small omicron with tonos */
+          return '\u03BF'; /* small omicron */
+          
+        case '\u038F': /* capital omega with tonos */
+        case '\u03CE': /* small omega with tonos */
+          return '\u03C9'; /* small omega */
+          
+        /* The previous implementation did the conversion below.
+         * Only implemented for backwards compatibility with old indexes.
+         */
+          
+        case '\u03A2': /* reserved */
+          return '\u03C2'; /* small final sigma */
+          
+        default:
+          return Character.toLowerCase(codepoint);
+      }
+    }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Thu Oct  1 19:20:09 2009
@@ -24,6 +24,7 @@
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -40,145 +41,20 @@
  */
 public final class RussianAnalyzer extends Analyzer
 {
-    // letters (currently unused letters are commented out)
-    private final static char A = 0;
-    private final static char B = 1;
-    private final static char V = 2;
-    private final static char G = 3;
-    private final static char D = 4;
-    private final static char E = 5;
-    private final static char ZH = 6;
-    private final static char Z = 7;
-    private final static char I = 8;
-    private final static char I_ = 9;
-    private final static char K = 10;
-    private final static char L = 11;
-    private final static char M = 12;
-    private final static char N = 13;
-    private final static char O = 14;
-    private final static char P = 15;
-    private final static char R = 16;
-    private final static char S = 17;
-    private final static char T = 18;
-    private final static char U = 19;
-    //private final static char F = 20;
-    private final static char X = 21;
-    //private final static char TS = 22;
-    private final static char CH = 23;
-    private final static char SH = 24;
-    private final static char SHCH = 25;
-    //private final static char HARD = 26;
-    private final static char Y = 27;
-    private final static char SOFT = 28;
-    private final static char AE = 29;
-    private final static char IU = 30;
-    private final static char IA = 31;
-
     /**
      * List of typical Russian stopwords.
      */
-    private static char[][] RUSSIAN_STOP_WORDS = {
-        {A},
-        {B, E, Z},
-        {B, O, L, E, E},
-        {B, Y},
-        {B, Y, L},
-        {B, Y, L, A},
-        {B, Y, L, I},
-        {B, Y, L, O},
-        {B, Y, T, SOFT},
-        {V},
-        {V, A, M},
-        {V, A, S},
-        {V, E, S, SOFT},
-        {V, O},
-        {V, O, T},
-        {V, S, E},
-        {V, S, E, G, O},
-        {V, S, E, X},
-        {V, Y},
-        {G, D, E},
-        {D, A},
-        {D, A, ZH, E},
-        {D, L, IA},
-        {D, O},
-        {E, G, O},
-        {E, E},
-        {E, I_,},
-        {E, IU},
-        {E, S, L, I},
-        {E, S, T, SOFT},
-        {E, SHCH, E},
-        {ZH, E},
-        {Z, A},
-        {Z, D, E, S, SOFT},
-        {I},
-        {I, Z},
-        {I, L, I},
-        {I, M},
-        {I, X},
-        {K},
-        {K, A, K},
-        {K, O},
-        {K, O, G, D, A},
-        {K, T, O},
-        {L, I},
-        {L, I, B, O},
-        {M, N, E},
-        {M, O, ZH, E, T},
-        {M, Y},
-        {N, A},
-        {N, A, D, O},
-        {N, A, SH},
-        {N, E},
-        {N, E, G, O},
-        {N, E, E},
-        {N, E, T},
-        {N, I},
-        {N, I, X},
-        {N, O},
-        {N, U},
-        {O},
-        {O, B},
-        {O, D, N, A, K, O},
-        {O, N},
-        {O, N, A},
-        {O, N, I},
-        {O, N, O},
-        {O, T},
-        {O, CH, E, N, SOFT},
-        {P, O},
-        {P, O, D},
-        {P, R, I},
-        {S},
-        {S, O},
-        {T, A, K},
-        {T, A, K, ZH, E},
-        {T, A, K, O, I_},
-        {T, A, M},
-        {T, E},
-        {T, E, M},
-        {T, O},
-        {T, O, G, O},
-        {T, O, ZH, E},
-        {T, O, I_},
-        {T, O, L, SOFT, K, O},
-        {T, O, M},
-        {T, Y},
-        {U},
-        {U, ZH, E},
-        {X, O, T, IA},
-        {CH, E, G, O},
-        {CH, E, I_},
-        {CH, E, M},
-        {CH, T, O},
-        {CH, T, O, B, Y},
-        {CH, SOFT, E},
-        {CH, SOFT, IA},
-        {AE, T, A},
-        {AE, T, I},
-        {AE, T, O},
-        {IA}
+    private static final String[] RUSSIAN_STOP_WORDS = {
+      "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
+      "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где", 
+      "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть", 
+      "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
+      "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо", 
+      "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об", 
+      "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при", 
+      "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того", 
+      "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей", 
+      "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
     };
 
     /**
@@ -186,89 +62,26 @@
      */
     private Set stopSet = new HashSet();
 
-    /**
-     * Charset for Russian letters.
-     * Represents encoding for 32 lowercase Russian letters.
-     * Predefined charsets can be taken from RussianCharSets class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
-
     public RussianAnalyzer() {
-        charset = RussianCharsets.UnicodeRussian;
-        stopSet = StopFilter.makeStopSet(
-                    makeStopWords(RussianCharsets.UnicodeRussian));
+        this(RUSSIAN_STOP_WORDS);
     }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #RussianAnalyzer()} instead.
-     */
-    public RussianAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
-     */
-    public RussianAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
-    }
-    
+  
     /**
      * Builds an analyzer with the given stop words.
      */
     public RussianAnalyzer(String[] stopwords)
     {
-    	this.charset = RussianCharsets.UnicodeRussian;
+    	super();
     	stopSet = StopFilter.makeStopSet(stopwords);
     }
-
-    /** Takes russian stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[RUSSIAN_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * TODO: create a Set version of this ctor
-     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
-     */
-    public RussianAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-    
+   
     /**
      * Builds an analyzer with the given stop words.
      * TODO: create a Set version of this ctor
      */
     public RussianAnalyzer(Map stopwords)
     {
-    	charset = RussianCharsets.UnicodeRussian;
+    	super();
     	stopSet = new HashSet(stopwords.keySet());
     }
 
@@ -283,10 +96,10 @@
      */
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
-        TokenStream result = new RussianLetterTokenizer(reader, charset);
-        result = new RussianLowerCaseFilter(result, charset);
+        TokenStream result = new RussianLetterTokenizer(reader);
+        result = new LowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
-        result = new RussianStemFilter(result, charset);
+        result = new RussianStemFilter(result);
         return result;
     }
     
@@ -309,10 +122,10 @@
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
     if (streams == null) {
       streams = new SavedStreams();
-      streams.source = new RussianLetterTokenizer(reader, charset);
-      streams.result = new RussianLowerCaseFilter(streams.source, charset);
+      streams.source = new RussianLetterTokenizer(reader);
+      streams.result = new LowerCaseFilter(streams.source);
       streams.result = new StopFilter(streams.result, stopSet);
-      streams.result = new RussianStemFilter(streams.result, charset);
+      streams.result = new RussianStemFilter(streams.result);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Thu Oct  1 19:20:09 2009
@@ -25,49 +25,26 @@
 
 /**
  * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
- * by additionally looking up letters in a given "russian charset". 
- * <p>
- * The problem with 
- * {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
- * which doesn't know how to detect letters in encodings like CP1252 and KOI8
- * (well-known problems with 0xD7 and 0xF7 chars)
- * </p>
+ * by also allowing the basic latin digits 0-9. 
  *
  * @version $Id$
  */
 
 public class RussianLetterTokenizer extends CharTokenizer
-{
-    /** 
-     * Charset this tokenizer uses.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
-    /**
-     * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead. 
-     */
-    public RussianLetterTokenizer(Reader in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-    }
-    
+{    
     public RussianLetterTokenizer(Reader in)
     {
-    	this(in, RussianCharsets.UnicodeRussian);
+    	super(in);
     }
 
     public RussianLetterTokenizer(AttributeSource source, Reader in)
     {
         super(source, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }
 
     public RussianLetterTokenizer(AttributeFactory factory, Reader in)
     {
         super(factory, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }
     
     /**
@@ -76,14 +53,9 @@
      */
     protected boolean isTokenChar(char c)
     {
-    	/* in the next release, this can be implemented as isLetter(c) or [0-9] */
-        if (Character.isLetter(c))
+        if (Character.isLetter(c) || (c >= '0' && c <= '9'))
             return true;
-        for (int i = 0; i < charset.length; i++)
-        {
-            if (c == charset[i])
-                return true;
-        }
-        return false;
+        else
+            return false;
     }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java Thu Oct  1 19:20:09 2009
@@ -19,39 +19,27 @@
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
- * Normalizes token text to lower case, analyzing given ("russian") charset.
- *
+ * Normalizes token text to lower case.
+ * @deprecated Use {@link LowerCaseFilter} instead, which has the same
+ *  functionality. This filter will be removed in Lucene 3.1
  *
  * @version $Id$
  */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;
-
-    /**
-     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
-     */
-    public RussianLowerCaseFilter(TokenStream in, char[] charset)
+   
+    public RussianLowerCaseFilter(TokenStream in)
     {
         super(in);
-        this.charset = charset;
         termAtt = addAttribute(TermAttribute.class);
     }
-    
-    public RussianLowerCaseFilter(TokenStream in)
-    {
-    	this(in, RussianCharsets.UnicodeRussian);
-    }
 
     public final boolean incrementToken() throws IOException
     {
@@ -60,7 +48,7 @@
         int chLen = termAtt.termLength();
         for (int i = 0; i < chLen; i++)
         {
-          chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+          chArray[i] = Character.toLowerCase(chArray[i]);
         }
         return true;
       } else {

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java Thu Oct  1 19:20:09 2009
@@ -17,6 +17,7 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -28,8 +29,8 @@
  * A {@link TokenFilter} that stems Russian words. 
  * <p>
  * The implementation was inspired by GermanStemFilter.
- * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter ,
- * because RussianStemFilter only works with lowercase part of any "russian" charset.
+ * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter ,
+ * because RussianStemFilter only works with lowercase characters.
  * </p>
  *
  * @version   $Id$
@@ -43,20 +44,12 @@
 
     private TermAttribute termAtt;
 
-    /**
-     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
-     */
-    public RussianStemFilter(TokenStream in, char[] charset)
+    public RussianStemFilter(TokenStream in)
     {
         super(in);
-        stemmer = new RussianStemmer(charset);
+        stemmer = new RussianStemmer();
         termAtt = addAttribute(TermAttribute.class);
     }
-
-    public RussianStemFilter(TokenStream in)
-    {
-    	this(in, RussianCharsets.UnicodeRussian);
-    }
     /**
      * Returns the next token in the stream, or null at EOS
      */

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java Thu Oct  1 19:20:09 2009
@@ -25,47 +25,42 @@
  */
 class RussianStemmer
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 
-     */
-    private char[] charset;
-
     // positions of RV, R1 and R2 respectively
     private int RV, R1, R2;
 
     // letters (currently unused letters are commented out)
-    private final static char A = 0;
-    //private final static char B = 1;
-    private final static char V = 2;
-    private final static char G = 3;
-    //private final static char D = 4;
-    private final static char E = 5;
-    //private final static char ZH = 6;
-    //private final static char Z = 7;
-    private final static char I = 8;
-    private final static char I_ = 9;
-    //private final static char K = 10;
-    private final static char L = 11;
-    private final static char M = 12;
-    private final static char N = 13;
-    private final static char O = 14;
-    //private final static char P = 15;
-    //private final static char R = 16;
-    private final static char S = 17;
-    private final static char T = 18;
-    private final static char U = 19;
-    //private final static char F = 20;
-    private final static char X = 21;
-    //private final static char TS = 22;
-    //private final static char CH = 23;
-    private final static char SH = 24;
-    private final static char SHCH = 25;
-    //private final static char HARD = 26;
-    private final static char Y = 27;
-    private final static char SOFT = 28;
-    private final static char AE = 29;
-    private final static char IU = 30;
-    private final static char IA = 31;
+    private final static char A = '\u0430';
+    //private final static char B = '\u0431';
+    private final static char V = '\u0432';
+    private final static char G = '\u0433';
+    //private final static char D = '\u0434';
+    private final static char E = '\u0435';
+    //private final static char ZH = '\u0436';
+    //private final static char Z = '\u0437';
+    private final static char I = '\u0438';
+    private final static char I_ = '\u0439';
+    //private final static char K = '\u043A';
+    private final static char L = '\u043B';
+    private final static char M = '\u043C';
+    private final static char N = '\u043D';
+    private final static char O = '\u043E';
+    //private final static char P = '\u043F';
+    //private final static char R = '\u0440';
+    private final static char S = '\u0441';
+    private final static char T = '\u0442';
+    private final static char U = '\u0443';
+    //private final static char F = '\u0444';
+    private final static char X = '\u0445';
+    //private final static char TS = '\u0446';
+    //private final static char CH = '\u0447';
+    private final static char SH = '\u0448';
+    private final static char SHCH = '\u0449';
+    //private final static char HARD = '\u044A';
+    private final static char Y = '\u044B';
+    private final static char SOFT = '\u044C';
+    private final static char AE = '\u044D';
+    private final static char IU = '\u044E';
+    private final static char IA = '\u044F';
 
     // stem definitions
     private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
@@ -257,16 +252,6 @@
     }
 
     /**
-     * RussianStemmer constructor comment.
-     * @deprecated Use {@link #RussianStemmer()} instead.
-     */
-    public RussianStemmer(char[] charset)
-    {
-        super();
-        this.charset = charset;
-    }
-
-    /**
      * Adjectival ending is an adjective ending,
      * optionally preceded by participle ending.
      * Creation date: (17/03/2002 12:14:58 AM)
@@ -333,7 +318,7 @@
             int stemmingIndex = startIndex;
             for (int j = theEnding.length - 1; j >= 0; j--)
             {
-                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
+                if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
                 {
                     match = false;
                     break;
@@ -451,7 +436,7 @@
     {
         for (int i = 0; i < vowels.length; i++)
         {
-            if (letter == charset[vowels[i]])
+            if (letter == vowels[i])
                 return true;
         }
         return false;
@@ -499,7 +484,7 @@
     private boolean removeI(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == I)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;
@@ -518,7 +503,7 @@
     private boolean removeSoft(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;
@@ -530,17 +515,6 @@
     }
 
     /**
-     * Insert the method's description here.
-     * Creation date: (16/03/2002 10:58:42 PM)
-     * @param newCharset char[]
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    public void setCharset(char[] newCharset)
-    {
-        charset = newCharset;
-    }
-
-    /**
      * Finds the stem for given Russian word.
      * Creation date: (16/03/2002 3:36:48 PM)
      * @return java.lang.String
@@ -622,25 +596,13 @@
             verb1Predessors)
             || findAndRemoveEnding(stemmingZone, verbEndings2);
     }
-
-    /**
-     * Static method for stemming with different charsets
-     * @deprecated Use {@link #stemWord(String)} instead.
-     */
-    public static String stem(String theWord, char[] charset)
-    {
-        RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(charset);
-        return stemmer.stem(theWord);
-    }
-    
+   
     /**
      * Static method for stemming.
      */
     public static String stemWord(String theWord)
     {
         RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(RussianCharsets.UnicodeRussian);
         return stemmer.stem(theWord);
     }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Thu Oct  1 19:20:09 2009
@@ -42,14 +42,6 @@
 
     private InputStreamReader sampleUnicode;
 
-    private Reader inWordsKOI8;
-
-    private Reader sampleKOI8;
-
-    private Reader inWords1251;
-
-    private Reader sample1251;
-
     private File dataDir;
 
     protected void setUp() throws Exception
@@ -97,76 +89,6 @@
         inWords.close();
         sampleUnicode.close();
     }
-
-    public void testKOI8() throws IOException
-    {
-        //System.out.println(new java.util.Date());
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
-        // KOI8
-        inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
-
-        sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
-
-        TokenStream in = ra.tokenStream("all", inWordsKOI8);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(
-                sampleKOI8,
-                RussianCharsets.KOI8);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-          if (in.incrementToken() == false)
-            break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "KOI8",
-                text.term(),
-                nextSampleToken == false
-                ? null
-                : sampleText.term());
-        }
-        inWordsKOI8.close();
-        sampleKOI8.close();
-    }
-
-    public void test1251() throws IOException
-    {
-        // 1251
-        inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
-
-        sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
-
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
-        TokenStream in = ra.tokenStream("", inWords1251);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(
-                sample1251,
-                RussianCharsets.CP1251);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-          if (in.incrementToken() == false)
-            break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "1251",
-                text.term(),
-                nextSampleToken == false
-                ? null
-                : sampleText.term());
-        }
-
-        inWords1251.close();
-        sample1251.close();
-    }
     
     public void testDigitsInRussianCharset() 
     {

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java?rev=820756&r1=820755&r2=820756&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java Thu Oct  1 19:20:09 2009
@@ -84,9 +84,8 @@
         {
             //if ( (i % 100) == 0 ) System.err.println(i);
             String realStem =
-                RussianStemmer.stem(
-                    (String) words.get(i),
-                    RussianCharsets.UnicodeRussian);
+                RussianStemmer.stemWord(
+                    (String) words.get(i));
             assertEquals("unicode", stems.get(i), realStem);
         }
     }



Mime
View raw message