lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/ru RussianAnalyzer.java RussianCharsets.java RussianLetterTokenizer.java RussianLowerCaseFilter.java RussianStemFilter.java RussianStemmer.java
Date Mon, 16 Sep 2002 02:51:58 GMT
otis        2002/09/15 19:51:58

  Added:       src/java/org/apache/lucene/analysis/ru RussianAnalyzer.java
                        RussianCharsets.java RussianLetterTokenizer.java
                        RussianLowerCaseFilter.java RussianStemFilter.java
                        RussianStemmer.java
  Log:
  - Russian Analyzer, by Boris Okner.  Initial checkin.
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
  
  Index: RussianAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.LowerCaseFilter;
  import org.apache.lucene.analysis.StopFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  
  import java.io.File;
  import java.io.Reader;
  import java.util.Hashtable;
  
  /**
   * Analyzer for Russian language. Supports an external list of stopwords (words that
   * will not be indexed at all).
   * A default set of stopwords is used unless an alternative list is specified.
   *
   * @author    Boris Okner
   * @version $Id$
   */
  public final class RussianAnalyzer extends Analyzer
  {
      // Letter codes. Each code is an index into a charset array (see
      // makeStopWords), so the stop words below are written independently
      // of any concrete encoding.
      private static final char A = 0;
      private static final char B = 1;
      private static final char V = 2;
      private static final char G = 3;
      private static final char D = 4;
      private static final char E = 5;
      private static final char ZH = 6;
      private static final char Z = 7;
      private static final char I = 8;
      private static final char I_ = 9;
      private static final char K = 10;
      private static final char L = 11;
      private static final char M = 12;
      private static final char N = 13;
      private static final char O = 14;
      private static final char P = 15;
      private static final char R = 16;
      private static final char S = 17;
      private static final char T = 18;
      private static final char U = 19;
      private static final char F = 20;
      private static final char X = 21;
      private static final char TS = 22;
      private static final char CH = 23;
      private static final char SH = 24;
      private static final char SHCH = 25;
      private static final char HARD = 26;
      private static final char Y = 27;
      private static final char SOFT = 28;
      private static final char AE = 29;
      private static final char IU = 30;
      private static final char IA = 31;
  
      /**
       * List of typical Russian stopwords, spelled with the letter codes above.
       */
      private static final char[][] RUSSIAN_STOP_WORDS = {
          {A},
          {B, E, Z},
          {B, O, L, E, E},
          {B, Y},
          {B, Y, L},
          {B, Y, L, A},
          {B, Y, L, I},
          {B, Y, L, O},
          {B, Y, T, SOFT},
          {V},
          {V, A, M},
          {V, A, S},
          {V, E, S, SOFT},
          {V, O},
          {V, O, T},
          {V, S, E},
          {V, S, E, G, O},
          {V, S, E, X},
          {V, Y},
          {G, D, E},
          {D, A},
          {D, A, ZH, E},
          {D, L, IA},
          {D, O},
          {E, G, O},
          {E, E},
          {E, I_},
          {E, IU},
          {E, S, L, I},
          {E, S, T, SOFT},
          {E, SHCH, E},
          {ZH, E},
          {Z, A},
          {Z, D, E, S, SOFT},
          {I},
          {I, Z},
          {I, L, I},
          {I, M},
          {I, X},
          {K},
          {K, A, K},
          {K, O},
          {K, O, G, D, A},
          {K, T, O},
          {L, I},
          {L, I, B, O},
          {M, N, E},
          {M, O, ZH, E, T},
          {M, Y},
          {N, A},
          {N, A, D, O},
          {N, A, SH},
          {N, E},
          {N, E, G, O},
          {N, E, E},
          {N, E, T},
          {N, I},
          {N, I, X},
          {N, O},
          {N, U},
          {O},
          {O, B},
          {O, D, N, A, K, O},
          {O, N},
          {O, N, A},
          {O, N, I},
          {O, N, O},
          {O, T},
          {O, CH, E, N, SOFT},
          {P, O},
          {P, O, D},
          {P, R, I},
          {S},
          {S, O},
          {T, A, K},
          {T, A, K, ZH, E},
          {T, A, K, O, I_},
          {T, A, M},
          {T, E},
          {T, E, M},
          {T, O},
          {T, O, G, O},
          {T, O, ZH, E},
          {T, O, I_},
          {T, O, L, SOFT, K, O},
          {T, O, M},
          {T, Y},
          {U},
          {U, ZH, E},
          {X, O, T, IA},
          {CH, E, G, O},
          {CH, E, I_},
          {CH, E, M},
          {CH, T, O},
          {CH, T, O, B, Y},
          {CH, SOFT, E},
          {CH, SOFT, IA},
          {AE, T, A},
          {AE, T, I},
          {AE, T, O},
          {IA}
      };
  
      /**
       * Contains the stopwords used with the StopFilter.
       * Assigned exactly once by whichever constructor is used.
       */
      private final Hashtable stoptable;
  
      /**
       * Charset for Russian letters.
       * Represents encoding for 32 lowercase Russian letters.
       * Predefined charsets can be taken from the RussianCharsets class.
       */
      private final char[] charset;
  
      /**
       * Builds an analyzer that uses the default stop words, translated
       * into the given charset.
       *
       * @param charset  table that maps the letter codes 0-31 to the
       *                 characters of the target encoding
       */
      public RussianAnalyzer(char[] charset)
      {
          this.charset = charset;
          stoptable = StopFilter.makeStopTable(makeStopWords(charset));
      }
  
      /**
       * Builds an analyzer with the given stop words.
       *
       * @param charset    charset handed to the tokenizer and filters
       * @param stopwords  stop words, passed to StopFilter.makeStopTable
       */
      public RussianAnalyzer(char[] charset, String[] stopwords)
      {
          this.charset = charset;
          stoptable = StopFilter.makeStopTable(stopwords);
      }
  
      /**
       * Takes the built-in Russian stop words and translates them to a
       * String array, using the given charset.
       *
       * @param charset  table indexed by letter code
       * @return one String per entry of RUSSIAN_STOP_WORDS, in the same order
       */
      private static String[] makeStopWords(char[] charset)
      {
          String[] res = new String[RUSSIAN_STOP_WORDS.length];
          for (int i = 0; i < res.length; i++)
          {
              char[] theStopWord = RUSSIAN_STOP_WORDS[i];
              // translate the word, letter code by letter code, using the charset
              StringBuffer theWord = new StringBuffer(theStopWord.length);
              for (int j = 0; j < theStopWord.length; j++)
              {
                  theWord.append(charset[theStopWord[j]]);
              }
              res[i] = theWord.toString();
          }
          return res;
      }
  
      /**
       * Builds an analyzer with the given stop words.
       * The table is used as passed in; it is not copied.
       *
       * @param charset    charset handed to the tokenizer and filters
       * @param stopwords  stop table handed to the StopFilter
       */
      public RussianAnalyzer(char[] charset, Hashtable stopwords)
      {
          this.charset = charset;
          stoptable = stopwords;
      }
  
      /**
       * Creates a TokenStream which tokenizes all the text in the provided Reader.
       *
       * @return  A TokenStream built from a RussianLetterTokenizer filtered with
       *                  RussianLowerCaseFilter, StopFilter, and RussianStemFilter
       */
      public final TokenStream tokenStream(String fieldName, Reader reader)
      {
          TokenStream result = new RussianLetterTokenizer(reader, charset);
          result = new RussianLowerCaseFilter(result, charset);
          result = new StopFilter(result, stoptable);
          result = new RussianStemFilter(result, charset);
          return result;
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
  
  Index: RussianCharsets.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /**
   * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method
   * implementation for Russian characters in Unicode, KOI8 and CP1251.
   * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
   * Other encoding schemes (like ISO-8859-5 or customized) can be added as a new charset
   * table with the same layout; toLowerCase() handles such tables through a table lookup.
   *
   * @author Boris Okner
   * @version $Id: RussianCharsets.java,v 1.1 2002/09/16 02:51:58 otis Exp $
   */
  public class RussianCharsets
  {
      /** Number of letters in each half (lowercase, uppercase) of a charset table. */
      private static final int LETTER_COUNT = 32;
  
      // Unicode Russian charset (32 lowercase letters followed by 32 uppercase letters)
      public static char[] UnicodeRussian = {
          '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
          '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
          '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
          '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
          // upper case
          '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
          '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
          '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
          '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
      };
  
      // KOI8 charset (same layout: lowercase first, then uppercase)
      public static char[] KOI8 = {
          0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
          0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
          0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
          0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
          // upper case
          0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
          0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
          0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
          0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
      };
  
      // CP1251 charset (same layout: lowercase first, then uppercase)
      public static char[] CP1251 = {
          0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
          0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
          0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
          0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
          // upper case
          0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
          0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
          0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
          0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
      };
  
      /**
       * Converts a letter to lower case according to the given charset table.
       * <p>
       * The three built-in tables are recognised by identity and handled with
       * range checks. Any other non-null table is searched directly: a character
       * found in the lowercase half (positions 0-31) is returned unchanged, and
       * a character found in the uppercase half (positions 32-63) is mapped to
       * the entry 32 positions earlier. Characters the table does not contain,
       * and all characters when charset is null, are passed to
       * {@link Character#toLowerCase(char)}.
       *
       * @param letter   the character to convert
       * @param charset  charset table the character is encoded in; may be null
       * @return the lowercase form of the character
       */
      public static char toLowerCase(char letter, char[] charset)
      {
          if (charset == UnicodeRussian)
          {
              if (letter >= '\u0430' && letter <= '\u044F')
              {
                  return letter;
              }
              if (letter >= '\u0410' && letter <= '\u042F')
              {
                  return (char) (letter + 32);
              }
          }
  
          if (charset == KOI8)
          {
              // in KOI8 the uppercase letters occupy the higher code range
              if (letter >= 0xe0 && letter <= 0xff)
              {
                  return (char) (letter - 32);
              }
              if (letter >= 0xc0 && letter <= 0xdf)
              {
                  return letter;
              }
          }
  
          if (charset == CP1251)
          {
              if (letter >= 0xC0 && letter <= 0xDF)
              {
                  return (char) (letter + 32);
              }
              if (letter >= 0xE0 && letter <= 0xFF)
              {
                  return letter;
              }
          }
  
          // Table lookup for copies of the built-in tables and for custom
          // tables. Without it an 8-bit code would be lowercased as if it were
          // a Unicode (Latin-1) character.
          if (charset != null)
          {
              int lowerEnd = Math.min(LETTER_COUNT, charset.length);
              for (int i = 0; i < lowerEnd; i++)
              {
                  if (charset[i] == letter)
                  {
                      return letter;
                  }
              }
              int upperEnd = Math.min(2 * LETTER_COUNT, charset.length);
              for (int i = LETTER_COUNT; i < upperEnd; i++)
              {
                  if (charset[i] == letter)
                  {
                      return charset[i - LETTER_COUNT];
                  }
              }
          }
  
          return Character.toLowerCase(letter);
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
  
  Index: RussianLetterTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  import org.apache.lucene.analysis.CharTokenizer;
  
  /**
   * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
   * in a given "russian charset". The problem with LetterTokenizer is that it uses the Character.isLetter()
   * method, which doesn't know how to detect letters in encodings like CP1251 and KOI8
   * (well-known problems with 0xD7 and 0xF7 chars)
   *
   * @version $Id: RussianLetterTokenizer.java,v 1.1 2002/09/16 02:51:58 otis Exp $
   */
  
  public class RussianLetterTokenizer extends CharTokenizer
  {
      /** Table of additional characters that are accepted as token characters. */
      private char[] charset;
  
      /**
       * Constructs a new RussianLetterTokenizer.
       *
       * @param in       the reader handed to the CharTokenizer superclass
       * @param charset  characters accepted in addition to Unicode letters
       */
      public RussianLetterTokenizer(Reader in, char[] charset)
      {
          super(in);
          this.charset = charset;
      }
  
      /**
       * Accepts characters which satisfy {@link Character#isLetter(char)}
       * as well as every character listed in the charset given at construction.
       */
      protected boolean isTokenChar(char c)
      {
          boolean accepted = Character.isLetter(c);
          int pos = 0;
          // the charset is consulted only for characters Java does not classify as letters
          while (!accepted && pos < charset.length)
          {
              accepted = (charset[pos] == c);
              pos++;
          }
          return accepted;
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
  
  Index: RussianLowerCaseFilter.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  
  /**
   * Normalizes token text to lower case, analyzing given ("russian") charset.
   *
   * @version $Id: RussianLowerCaseFilter.java,v 1.1 2002/09/16 02:51:58 otis Exp $
   */
  public final class RussianLowerCaseFilter extends TokenFilter
  {
      /** Charset table passed to RussianCharsets.toLowerCase for every character. */
      char[] charset;
  
      /**
       * Creates a filter that lowercases the tokens of the given stream.
       *
       * @param in       the token stream to read from
       * @param charset  charset table the token text is encoded in
       */
      public RussianLowerCaseFilter(TokenStream in, char[] charset)
      {
          input = in;
          this.charset = charset;
      }
  
      /**
       * Returns the next input token with its text converted to lower case.
       * Start offset, end offset and token type are carried over from the
       * input token.
       *
       * @return the lowercased token, or null when the input is exhausted
       */
      public final Token next() throws java.io.IOException
      {
          Token t = input.next();
  
          if (t == null)
          {
              return null;
          }
  
          char[] chArray = t.termText().toCharArray();
          for (int i = 0; i < chArray.length; i++)
          {
              chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
          }
  
          // Build the replacement with the four-argument constructor, as
          // RussianStemFilter does; the three-argument form drops the
          // original token's type.
          return new Token(new String(chArray), t.startOffset(), t.endOffset(),
              t.type());
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
  
  Index: RussianStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import java.io.IOException;
  import java.util.Hashtable;
  
  /**
   * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
   * The input should be filtered by RussianLowerCaseFilter before it is passed to RussianStemFilter,
   * because RussianStemFilter only works with the lowercase part of any "russian" charset.
   * @author    Boris Okner
   * @version   $Id: RussianStemFilter.java,v 1.1 2002/09/16 02:51:58 otis Exp $
   */
  public final class RussianStemFilter extends TokenFilter
  {
      /**
       * The token most recently read from the input stream.
       */
      private Token token = null;
      private RussianStemmer stemmer = null;
  
      /**
       * Creates a stemming filter over the given stream.
       *
       * @param in       the token stream to read from
       * @param charset  charset handed to the RussianStemmer
       */
      public RussianStemFilter(TokenStream in, char[] charset)
      {
          stemmer = new RussianStemmer(charset);
          input = in;
      }
  
      /**
       * @return  Returns the next token in the stream, or null at EOS
       */
      public final Token next() throws IOException
      {
          token = input.next();
          if (token == null)
          {
              return null;
          }
  
          String original = token.termText();
          String stemmed = stemmer.stem(original);
          if (stemmed.equals(original))
          {
              // stemming changed nothing, so the input token is passed through as is
              return token;
          }
          return new Token(stemmed, token.startOffset(), token.endOffset(),
              token.type());
      }
  
      /**
       * Sets an alternative/custom RussianStemmer for this filter.
       * A null argument is ignored and the current stemmer stays in place.
       */
      public void setStemmer(RussianStemmer stemmer)
      {
          if (stemmer == null)
          {
              return;
          }
          this.stemmer = stemmer;
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
  
  Index: RussianStemmer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /**
   * Russian stemming algorithm implementation (see http://snowball.sourceforge.net
   * for a detailed description of the algorithm).
   * <p>
   * Letters are not hard-coded as characters. Every ending is stored as a
   * sequence of letter indexes (A, B, V, ...) which are translated through the
   * <code>charset</code> table at comparison time, so the same tables work for
   * any encoding whose table lists the lowercase letters in alphabet order.
   * A charset must be supplied (constructor or {@link #setCharset}) before a
   * non-empty word is stemmed.
   * <p>
   * {@link #stem(String)} keeps the region offsets of the current word in
   * instance fields, so one instance must not be used by several threads
   * at the same time without external synchronization.
   * <p>
   * Creation date: (12/02/2002 10:34:15 PM)
   *
   * @author Boris Okner
   * @version $Id: RussianStemmer.java,v 1.1 2002/09/16 02:51:58 otis Exp $
   */
  class RussianStemmer
  {
      // translates the letter indexes below into characters of the encoding in use
      private char[] charset;

      // start offsets of the RV, R1 and R2 regions of the word being stemmed;
      // 0 means "region not found" (a found region always starts at offset >= 1)
      private int RV, R1, R2;

      // letter indexes into the charset table
      private static final char A = 0;
      private static final char B = 1;
      private static final char V = 2;
      private static final char G = 3;
      private static final char D = 4;
      private static final char E = 5;
      private static final char ZH = 6;
      private static final char Z = 7;
      private static final char I = 8;
      private static final char I_ = 9;
      private static final char K = 10;
      private static final char L = 11;
      private static final char M = 12;
      private static final char N = 13;
      private static final char O = 14;
      private static final char P = 15;
      private static final char R = 16;
      private static final char S = 17;
      private static final char T = 18;
      private static final char U = 19;
      private static final char F = 20;
      private static final char X = 21;
      private static final char TS = 22;
      private static final char CH = 23;
      private static final char SH = 24;
      private static final char SHCH = 25;
      private static final char HARD = 26;
      private static final char Y = 27;
      private static final char SOFT = 28;
      private static final char AE = 29;
      private static final char IU = 30;
      private static final char IA = 31;

      // Stem definitions.
      // findEnding() scans each table from its LAST entry to its first and stops
      // at the first hit, so an ending must be listed after every shorter ending
      // that is a suffix of it. Keep that order when editing the tables.
      private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

      private static final char[][] perfectiveGerundEndings1 = {
          { V },
          { V, SH, I },
          { V, SH, I, S, SOFT }
      };

      private static final char[][] perfectiveGerund1Predecessors = {
          { A },
          { IA }
      };

      private static final char[][] perfectiveGerundEndings2 = {
          { I, V },
          { Y, V },
          { I, V, SH, I },
          { Y, V, SH, I },
          { I, V, SH, I, S, SOFT },
          { Y, V, SH, I, S, SOFT }
      };

      private static final char[][] adjectiveEndings = {
          { E, E },
          { I, E },
          { Y, E },
          { O, E },
          { E, I_ },
          { I, I_ },
          { Y, I_ },
          { O, I_ },
          { E, M },
          { I, M },
          { Y, M },
          { O, M },
          { I, X },
          { Y, X },
          { U, IU },
          { IU, IU },
          { A, IA },
          { IA, IA },
          { O, IU },
          { E, IU },
          { I, M, I },
          { Y, M, I },
          { E, G, O },
          { O, G, O },
          { E, M, U },
          { O, M, U }
      };

      private static final char[][] participleEndings1 = {
          { SHCH },
          { E, M },
          { N, N },
          { V, SH },
          { IU, SHCH }
      };

      private static final char[][] participleEndings2 = {
          { I, V, SH },
          { Y, V, SH },
          { U, IU, SHCH }
      };

      private static final char[][] participle1Predecessors = {
          { A },
          { IA }
      };

      private static final char[][] reflexiveEndings = {
          { S, IA },
          { S, SOFT }
      };

      private static final char[][] verbEndings1 = {
          { I_ },
          { L },
          { N },
          { L, O },
          { N, O },
          { E, T },
          { IU, T },
          { L, A },
          { N, A },
          { L, I },
          { E, M },
          { N, Y },
          { E, T, E },
          { I_, T, E },
          { T, SOFT },
          { E, SH, SOFT },
          { N, N, O }
      };

      private static final char[][] verbEndings2 = {
          { IU },
          { U, IU },
          { E, N },
          { E, I_ },
          { IA, T },
          { U, I_ },
          { I, L },
          { Y, L },
          { I, M },
          { Y, M },
          { I, T },
          { Y, T },
          { I, L, A },
          { Y, L, A },
          { E, N, A },
          { I, T, E },
          { I, L, I },
          { Y, L, I },
          { I, L, O },
          { Y, L, O },
          { E, N, O },
          { U, E, T },
          { U, IU, T },
          { E, N, Y },
          { I, T, SOFT },
          { Y, T, SOFT },
          { I, SH, SOFT },
          { E, I_, T, E },
          { U, I_, T, E }
      };

      private static final char[][] verb1Predecessors = {
          { A },
          { IA }
      };

      private static final char[][] nounEndings = {
          { A },
          { IU },   // was a second { U }; the noun class contains both U and IU
          { I_ },
          { O },
          { U },
          { E },
          { Y },
          { I },
          { SOFT },
          { IA },
          { E, V },
          { O, V },
          { I, E },
          { SOFT, E },
          { IA, X },
          { I, IU },
          { E, I },
          { I, I },
          { E, I_ },
          { O, I_ },
          { E, M },
          { A, M },
          { O, M },
          { A, X },
          { SOFT, IU },
          { I, IA },
          { SOFT, IA },
          { I, I_ },
          { IA, M },
          { IA, M, I },
          { A, M, I },
          { I, E, I_ },
          { I, IA, M },
          { I, E, M },
          { I, IA, X },
          { I, IA, M, I }
      };

      private static final char[][] superlativeEndings = {
          { E, I_, SH },
          { E, I_, SH, E }
      };

      private static final char[][] derivationalEndings = {
          { O, S, T },
          { O, S, T, SOFT }
      };

      // the doubled letter N looked for by undoubleN()
      private static final char[][] doubleN = {
          { N, N }
      };

      /**
       * Creates a stemmer without a charset. {@link #setCharset} has to be
       * called before the stemmer is used.
       */
      public RussianStemmer()
      {
          super();
      }

      /**
       * Creates a stemmer that translates letter indexes through the given charset.
       *
       * @param charset table mapping letter indexes to characters
       */
      public RussianStemmer(char[] charset)
      {
          super();
          this.charset = charset;
      }

      /**
       * Adjectival ending is an adjective ending,
       * optionally preceded by participle ending.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if an adjective ending was found and removed
       */
      private boolean adjectival(StringBuffer stemmingZone)
      {
          // look for adjective ending in a stemming zone
          if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
          {
              return false;
          }
          // an adjective ending was found, additionally try for a participle
          // ending; the result does not depend on whether one is present
          if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predecessors))
          {
              findAndRemoveEnding(stemmingZone, participleEndings2);
          }
          return true;
      }

      /**
       * Removes a derivational ending, provided the whole ending lies in R2.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if an ending was removed
       */
      private boolean derivational(StringBuffer stemmingZone)
      {
          int endingLength = findEnding(stemmingZone, derivationalEndings);
          if (endingLength == 0)
          {
              // no derivational ending found
              return false;
          }
          // R2 == 0 is the "no R2 region" marker left by markPositions(), not an
          // offset; without this guard R2 - RV is negative and the test below
          // would accept every ending.
          if (R2 == 0)
          {
              return false;
          }
          // Ensure that the ending is located in R2. The stemming zone starts at
          // RV, so R2 - RV is the offset of R2 inside the zone.
          if (R2 - RV > stemmingZone.length() - endingLength)
          {
              return false;
          }
          stemmingZone.setLength(stemmingZone.length() - endingLength);
          return true;
      }

      /**
       * Looks for an ending of the given class that finishes exactly at
       * <code>startIndex</code> of the stemming zone.
       * Creation date: (17/03/2002 8:18:34 PM)
       *
       * @param stemmingZone   the text to search in
       * @param startIndex     index of the last character that may belong to the ending
       * @param theEndingClass endings as sequences of letter indexes; scanned from
       *                       the last entry to the first
       * @return length of the first matching ending, 0 if none matches
       */
      private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
      {
          for (int i = theEndingClass.length - 1; i >= 0; i--)
          {
              char[] theEnding = theEndingClass[i];
              // the ending does not fit into zone[0..startIndex]
              if (startIndex < theEnding.length - 1)
              {
                  continue;
              }
              // compare backwards, letter by letter
              boolean match = true;
              int stemmingIndex = startIndex;
              for (int j = theEnding.length - 1; j >= 0 && match; j--)
              {
                  match = stemmingZone.charAt(stemmingIndex--) == charset[theEnding[j]];
              }
              if (match)
              {
                  return theEnding.length;
              }
          }
          return 0;
      }

      /**
       * Looks for an ending of the given class at the very end of the stemming zone.
       *
       * @return length of the ending found, 0 if none matches
       */
      private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
      {
          return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
      }

      /**
       * Finds the ending among the given class of endings and removes it from stemming zone.
       * Creation date: (17/03/2002 8:18:34 PM)
       *
       * @param stemmingZone   the RV part of the word, shortened in place
       * @param theEndingClass endings to look for
       * @return true if an ending was found and cut off
       */
      private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
      {
          int endingLength = findEnding(stemmingZone, theEndingClass);
          if (endingLength == 0)
          {
              // not found
              return false;
          }
          // cut the ending found
          stemmingZone.setLength(stemmingZone.length() - endingLength);
          return true;
      }

      /**
       * Finds the ending among the given class of endings, then checks if this ending was
       * preceded by any of given predecessors, and if so, removes it from stemming zone.
       * Only the ending is removed, the predecessor stays.
       * Creation date: (17/03/2002 8:18:34 PM)
       *
       * @param stemmingZone    the RV part of the word, shortened in place
       * @param theEndingClass  endings to look for
       * @param thePredecessors letter sequences one of which must directly precede the ending
       * @return true if an ending was found and cut off
       */
      private boolean findAndRemoveEnding(StringBuffer stemmingZone,
          char[][] theEndingClass, char[][] thePredecessors)
      {
          int endingLength = findEnding(stemmingZone, theEndingClass);
          if (endingLength == 0)
          {
              // not found
              return false;
          }
          int predecessorLength =
              findEnding(stemmingZone,
                  stemmingZone.length() - endingLength - 1,
                  thePredecessors);
          if (predecessorLength == 0)
          {
              return false;
          }
          // cut the ending found
          stemmingZone.setLength(stemmingZone.length() - endingLength);
          return true;
      }

      /**
       * Marks positions of RV, R1 and R2 in a given word. A region that is
       * missing or would be empty is left at 0, as are all regions after it.
       * Creation date: (16/03/2002 3:40:11 PM)
       *
       * @param word the word to analyse
       */
      private void markPositions(String word)
      {
          RV = 0;
          R1 = 0;
          R2 = 0;
          int last = word.length() - 1;

          // RV starts right after the first vowel
          int i = skipRun(word, 0, false);
          if (last < ++i)
          {
              return; // RV zone is empty
          }
          RV = i;

          // R1 starts right after the first non-vowel that follows a vowel
          i = skipRun(word, i, true);
          if (last < ++i)
          {
              return; // R1 zone is empty
          }
          R1 = i;

          // R2 is found the same way, starting the search at R1
          i = skipRun(word, i, false);
          if (last < ++i)
          {
              return; // R2 zone is empty
          }
          i = skipRun(word, i, true);
          if (last < ++i)
          {
              return; // R2 zone is empty
          }
          R2 = i;
      }

      /**
       * Advances over a run of vowels (or of non-vowels).
       *
       * @param word     the word to scan
       * @param from     index to start at
       * @param overVowels true to skip vowels, false to skip non-vowels
       * @return index of the first character not belonging to the run,
       *         or the word length if the run reaches the end of the word
       */
      private int skipRun(String word, int from, boolean overVowels)
      {
          int i = from;
          while (word.length() > i && isVowel(word.charAt(i)) == overVowels)
          {
              i++;
          }
          return i;
      }

      /**
       * Checks if character is a vowel of the current charset.
       * Creation date: (16/03/2002 10:47:03 PM)
       *
       * @param letter the character to test
       * @return true if the character is one of the vowels
       */
      private boolean isVowel(char letter)
      {
          for (int i = 0; i < vowels.length; i++)
          {
              if (letter == charset[vowels[i]])
              {
                  return true;
              }
          }
          return false;
      }

      /**
       * Noun endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a noun ending was removed
       */
      private boolean noun(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, nounEndings);
      }

      /**
       * Perfective gerund endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a perfective gerund ending was removed
       */
      private boolean perfectiveGerund(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(
              stemmingZone,
              perfectiveGerundEndings1,
              perfectiveGerund1Predecessors)
              || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
      }

      /**
       * Reflexive endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a reflexive ending was removed
       */
      private boolean reflexive(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, reflexiveEndings);
      }

      /**
       * Removes a single trailing letter I from the stemming zone.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if the letter was removed
       */
      private boolean removeI(StringBuffer stemmingZone)
      {
          return removeLastLetter(stemmingZone, I);
      }

      /**
       * Removes a single trailing soft sign from the stemming zone.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if the soft sign was removed
       */
      private boolean removeSoft(StringBuffer stemmingZone)
      {
          return removeLastLetter(stemmingZone, SOFT);
      }

      /**
       * Cuts the last character of the stemming zone if it is the given letter.
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @param letter       letter index into the charset
       * @return true if the character was removed
       */
      private boolean removeLastLetter(StringBuffer stemmingZone, char letter)
      {
          int length = stemmingZone.length();
          if (length > 0 && stemmingZone.charAt(length - 1) == charset[letter])
          {
              stemmingZone.setLength(length - 1);
              return true;
          }
          return false;
      }

      /**
       * Replaces the charset used to translate letter indexes into characters.
       * Creation date: (16/03/2002 10:58:42 PM)
       *
       * @param newCharset table mapping letter indexes to characters
       */
      public void setCharset(char[] newCharset)
      {
          charset = newCharset;
      }

      /**
       * Finds the stem for given Russian word.
       * Creation date: (16/03/2002 3:36:48 PM)
       *
       * @param input the word to stem; expected in the lowercase letters of the charset
       * @return the stem, or the input itself if the word has no RV region
       */
      public String stem(String input)
      {
          markPositions(input);
          if (RV == 0)
          {
              return input; //RV wasn't detected, nothing to stem
          }
          // stemming goes on in RV
          StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

          // Step 1: a perfective gerund ending ends the step; otherwise drop a
          // reflexive ending and then the first of adjectival, verb, noun
          if (!perfectiveGerund(stemmingZone))
          {
              reflexive(stemmingZone);
              if (!adjectival(stemmingZone) && !verb(stemmingZone))
              {
                  noun(stemmingZone);
              }
          }
          // Step 2
          removeI(stemmingZone);
          // Step 3
          derivational(stemmingZone);
          // Step 4
          superlative(stemmingZone);
          undoubleN(stemmingZone);
          removeSoft(stemmingZone);
          // return result
          return input.substring(0, RV) + stemmingZone.toString();
      }

      /**
       * Superlative endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a superlative ending was removed
       */
      private boolean superlative(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, superlativeEndings);
      }

      /**
       * Undoubles N: a trailing double N loses its last letter.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a letter was removed
       */
      private boolean undoubleN(StringBuffer stemmingZone)
      {
          if (findEnding(stemmingZone, doubleN) == 0)
          {
              return false;
          }
          stemmingZone.setLength(stemmingZone.length() - 1);
          return true;
      }

      /**
       * Verb endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       *
       * @param stemmingZone the RV part of the word, shortened in place
       * @return true if a verb ending was removed
       */
      private boolean verb(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(
              stemmingZone,
              verbEndings1,
              verb1Predecessors)
              || findAndRemoveEnding(stemmingZone, verbEndings2);
      }

      /**
       * Static method for stemming with different charsets. A new stemmer is
       * created for every call.
       *
       * @param theWord the word to stem
       * @param charset table mapping letter indexes to characters
       * @return the stem of the word
       */
      public static String stem(String theWord, char[] charset)
      {
          RussianStemmer stemmer = new RussianStemmer();
          stemmer.setCharset(charset);
          return stemmer.stem(theWord);
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message