lucene-dev mailing list archives

From dna...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de TestGermanStemFilter.java data.txt
Date Mon, 16 Aug 2004 20:30:46 GMT
dnaber      2004/08/16 13:30:46

  Added:       contributions/analyzers/src/java/org/apache/lucene/analysis/de
                        GermanStemmer.java package.html WordlistLoader.java
                        GermanStemFilter.java GermanAnalyzer.java
               contributions/analyzers/src/test/org/apache/lucene/analysis/ru
                        testUnicode.txt testKOI8.txt resKOI8.htm
                        stemsUnicode.txt TestRussianAnalyzer.java
                        resUnicode.htm res1251.htm wordsUnicode.txt
                        test1251.txt TestRussianStem.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/ru
                        RussianStemmer.java RussianCharsets.java
                        RussianAnalyzer.java RussianStemFilter.java
                        RussianLowerCaseFilter.java
                        RussianLetterTokenizer.java package.html
               contributions/analyzers/src/test/org/apache/lucene/analysis/de
                        TestGermanStemFilter.java data.txt
  Log:
  copy the Russian and German analyzers plus their test cases to the sandbox
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  /**
   * A stemmer for German words. The algorithm is based on the report
   * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
   * Caumanns (joerg.caumanns@isst.fhg.de).
   *
   * @author    Gerhard Schwarz
   * @version   $Id: GermanStemmer.java,v 1.1 2004/08/16 20:30:44 dnaber Exp $
   */
  public class GermanStemmer
  {
      /**
       * Buffer for the terms while stemming them.
       */
      private StringBuffer sb = new StringBuffer();
  
      /**
        * Number of characters that are removed by <tt>substitute()</tt> while stemming.
       */
      private int substCount = 0;
  
      /**
        * Stems the given term to a unique <tt>discriminator</tt>.
       *
       * @param term  The term that should be stemmed.
       * @return      Discriminator for <tt>term</tt>
       */
      protected String stem( String term )
      {
        // Use lowercase for medium stemming.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
          return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
      }
  
      /**
       * Checks if a term could be stemmed.
       *
        * @return  true if, and only if, the given term consists only of letters.
       */
      private boolean isStemmable( String term )
      {
        for ( int c = 0; c < term.length(); c++ ) {
          if ( !Character.isLetter( term.charAt( c ) ) )
            return false;
        }
        return true;
      }
  
      /**
        * Suffix stripping (stemming) on the current term. The stripping is reduced
        * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
        * from which all regular suffixes are built. The simplification causes
        * some overstemming and more irregular stems, but still provides unique
        * discriminators in most of those cases.
        * The algorithm is context free, except for the length restrictions.
       */
      private void strip( StringBuffer buffer )
      {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
          if ( ( buffer.length() + substCount > 5 ) &&
            buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
          {
            buffer.delete( buffer.length() - 2, buffer.length() );
          }
          else if ( ( buffer.length() + substCount > 4 ) &&
            buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
              buffer.delete( buffer.length() - 2, buffer.length() );
          }
          else if ( ( buffer.length() + substCount > 4 ) &&
            buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
              buffer.delete( buffer.length() - 2, buffer.length() );
          }
          else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
            buffer.deleteCharAt( buffer.length() - 1 );
          }
          else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
            buffer.deleteCharAt( buffer.length() - 1 );
          }
          else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
            buffer.deleteCharAt( buffer.length() - 1 );
          }
          // "t" occurs only as suffix of verbs.
          else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
            buffer.deleteCharAt( buffer.length() - 1 );
          }
          else {
            doMore = false;
          }
        }
      }
  
      /**
        * Does some optimizations on the term. These optimizations are
       * contextual.
       */
      private void optimize( StringBuffer buffer )
      {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
          buffer.deleteCharAt( buffer.length() -1 );
          strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
          buffer.setCharAt( buffer.length() - 1, 'x' );
        }
      }
  
      /**
        * Removes a particle denotation ("ge") from a term.
       */
      private void removeParticleDenotion( StringBuffer buffer )
      {
        if ( buffer.length() > 4 ) {
          for ( int c = 0; c < buffer.length() - 3; c++ ) {
            if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
              buffer.delete( c, c + 2 );
              return;
            }
          }
        }
      }
  
      /**
       * Do some substitutions for the term to reduce overstemming:
       *
       * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
       *   "ß" is substituted by "ss"
       * - Substitute a second char of a pair of equal characters with
       *   an asterisk: ?? -> ?*
       * - Substitute some common character combinations with a token:
       *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
       */
      private void substitute( StringBuffer buffer )
      {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
          // Replace the second char of a pair of equal characters with an asterisk
          if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
            buffer.setCharAt( c, '*' );
          }
          // Substitute Umlauts.
          else if ( buffer.charAt( c ) == 'ä' ) {
            buffer.setCharAt( c, 'a' );
          }
          else if ( buffer.charAt( c ) == 'ö' ) {
            buffer.setCharAt( c, 'o' );
          }
          else if ( buffer.charAt( c ) == 'ü' ) {
            buffer.setCharAt( c, 'u' );
          }
          // Fix bug so that 'ß' at the end of a word is replaced.
          else if ( buffer.charAt( c ) == 'ß' ) {
              buffer.setCharAt( c, 's' );
              buffer.insert( c + 1, 's' );
              substCount++;
          }
          // Take care that at least one character follows the current one
          if ( c < buffer.length() - 1 ) {
            // Masking several common character combinations with a token
            if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
              buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
            {
              buffer.setCharAt( c, '$' );
              buffer.delete( c + 1, c + 3 );
              substCount += 2;
            }
            else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
              buffer.setCharAt( c, '§' );
              buffer.deleteCharAt( c + 1 );
              substCount++;
            }
            else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
              buffer.setCharAt( c, '%' );
              buffer.deleteCharAt( c + 1 );
              substCount++;
            }
            else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
              buffer.setCharAt( c, '&' );
              buffer.deleteCharAt( c + 1 );
              substCount++;
            }
            else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
              buffer.setCharAt( c, '#' );
              buffer.deleteCharAt( c + 1 );
              substCount++;
            }
            else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
              buffer.setCharAt( c, '!' );
              buffer.deleteCharAt( c + 1 );
              substCount++;
            }
          }
        }
      }
  
      /**
        * Undoes the changes made by substitute(): doubled characters and the masked
        * character combinations are restored. Umlauts remain as their corresponding
        * vowel, and "ß" remains as "ss".
       */
      private void resubstitute( StringBuffer buffer )
      {
        for ( int c = 0; c < buffer.length(); c++ ) {
          if ( buffer.charAt( c ) == '*' ) {
            char x = buffer.charAt( c - 1 );
            buffer.setCharAt( c, x );
          }
          else if ( buffer.charAt( c ) == '$' ) {
            buffer.setCharAt( c, 's' );
            buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
          }
          else if ( buffer.charAt( c ) == '§' ) {
            buffer.setCharAt( c, 'c' );
            buffer.insert( c + 1, 'h' );
          }
          else if ( buffer.charAt( c ) == '%' ) {
            buffer.setCharAt( c, 'e' );
            buffer.insert( c + 1, 'i' );
          }
          else if ( buffer.charAt( c ) == '&' ) {
            buffer.setCharAt( c, 'i' );
            buffer.insert( c + 1, 'e' );
          }
          else if ( buffer.charAt( c ) == '#' ) {
            buffer.setCharAt( c, 'i' );
            buffer.insert( c + 1, 'g' );
          }
          else if ( buffer.charAt( c ) == '!' ) {
            buffer.setCharAt( c, 's' );
            buffer.insert( c + 1, 't' );
          }
        }
      }
      
  }
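
  For reference, a minimal sketch of calling the stemmer above directly. stem() is
  protected, so the demo class below is assumed to live in the same package; the
  class and the sample words are hypothetical and not part of this commit.

   package org.apache.lucene.analysis.de;

   // Hypothetical demo class (not part of this commit).
   public class GermanStemmerDemo
   {
       public static void main( String[] args )
       {
           GermanStemmer stemmer = new GermanStemmer();
           String[] words = { "Haus", "Hause", "Häuser" };
           for ( int c = 0; c < words.length; c++ ) {
               // stem() lowercases internally and returns a discriminator,
               // not necessarily a linguistically correct base form.
               System.out.println( words[c] + " -> " + stemmer.stem( words[c] ) );
           }
       }
   }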
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/package.html
  
  Index: package.html
  ===================================================================
  <html>
  <body>
  Support for indexing and searching of German text.
  </body>
  </html>
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import java.io.File;
  import java.io.FileReader;
  import java.io.IOException;
  import java.io.LineNumberReader;
  import java.util.HashSet;
  import java.util.Hashtable;
  import java.util.Iterator;
  
  /**
   * Loader for text files that represent a list of stopwords.
   *
   * @author Gerhard Schwarz
   * @version $Id: WordlistLoader.java,v 1.1 2004/08/16 20:30:44 dnaber Exp $
   *
   * @todo this is not specific to German, it should be moved up
   */
  public class WordlistLoader {
  
    /**
     * Loads a text file and adds every line as an entry to a HashSet (omitting
     * leading and trailing whitespace). Every line of the file should contain only 
     * one word. The words need to be in lowercase if you make use of an
     * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
     * 
     * @param wordfile File containing the wordlist
     * @return A HashSet with the file's words
     */
    public static HashSet getWordSet(File wordfile) throws IOException {
      HashSet result = new HashSet();
      FileReader freader = null;
      LineNumberReader lnr = null;
      try {
        freader = new FileReader(wordfile);
        lnr = new LineNumberReader(freader);
        String word = null;
        while ((word = lnr.readLine()) != null) {
          result.add(word.trim());
        }
      }
      finally {
        if (lnr != null)
          lnr.close();
        if (freader != null)
          freader.close();
      }
      return result;
    }
  
    /**
     * @param path      Path to the wordlist
     * @param wordfile  Name of the wordlist
     * 
      * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String path, String wordfile) throws IOException {
      return getWordtable(new File(path, wordfile));
    }
  
    /**
     * @param wordfile  Complete path to the wordlist
     * 
      * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String wordfile) throws IOException {
      return getWordtable(new File(wordfile));
    }
  
    /**
     * @param wordfile  File object that points to the wordlist
     *
      * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
     */
    public static Hashtable getWordtable(File wordfile) throws IOException {
      HashSet wordSet = (HashSet)getWordSet(wordfile);
      Hashtable result = makeWordTable(wordSet);
      return result;
    }
  
    /**
     * Builds a wordlist table, using words as both keys and values
     * for backward compatibility.
     *
     * @param wordSet   stopword set
     */
    private static Hashtable makeWordTable(HashSet wordSet) {
      Hashtable table = new Hashtable();
      for (Iterator iter = wordSet.iterator(); iter.hasNext();) {
        String word = (String)iter.next();
        table.put(word, word);
      }
      return table;
    }
  }
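
  A short usage sketch for the loader above; the demo class and the stopword file
  name are hypothetical (one lowercase word per line is expected).

   import java.io.File;
   import java.util.HashSet;

   import org.apache.lucene.analysis.de.WordlistLoader;

   // Hypothetical demo class (not part of this commit).
   public class WordlistLoaderDemo
   {
       public static void main( String[] args ) throws Exception
       {
           // Path is only an example.
           HashSet stopWords = WordlistLoader.getWordSet( new File( "german_stopwords.txt" ) );
           System.out.println( stopWords.size() + " stopwords loaded" );
       }
   }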
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import java.io.IOException;
  import java.util.Hashtable;
  import java.util.Set;
  import java.util.HashSet;
  
  /**
   * A filter that stems German words. It supports a table of words that should
   * not be stemmed at all. The stemmer used can be changed at runtime after the
   * filter object is created (as long as it is a GermanStemmer).
   *
   * @author    Gerhard Schwarz
   * @version   $Id: GermanStemFilter.java,v 1.1 2004/08/16 20:30:44 dnaber Exp $
   */
  public final class GermanStemFilter extends TokenFilter
  {
      /**
        * The current token in the input stream.
       */
      private Token token = null;
      private GermanStemmer stemmer = null;
      private Set exclusionSet = null;
  
      public GermanStemFilter( TokenStream in )
      {
        super(in);
        stemmer = new GermanStemmer();
      }
  
      /**
       * Builds a GermanStemFilter that uses an exclusiontable.
       * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
       */
      public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
      {
        this( in );
        exclusionSet = new HashSet(exclusiontable.keySet());
      }
  
      /**
       * Builds a GermanStemFilter that uses an exclusiontable.
       */
      public GermanStemFilter( TokenStream in, Set exclusionSet )
      {
        this( in );
        this.exclusionSet = exclusionSet;
      }
  
      /**
       * @return  Returns the next token in the stream, or null at EOS
       */
      public final Token next()
        throws IOException
      {
        if ( ( token = input.next() ) == null ) {
          return null;
        }
        // Check the exclusiontable
        else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
          return token;
        }
        else {
          String s = stemmer.stem( token.termText() );
          // If not stemmed, don't waste the time creating a new token
          if ( !s.equals( token.termText() ) ) {
            return new Token( s, token.startOffset(),
              token.endOffset(), token.type() );
          }
          return token;
        }
      }
  
      /**
        * Set an alternative/custom GermanStemmer for this filter.
       */
      public void setStemmer( GermanStemmer stemmer )
      {
        if ( stemmer != null ) {
          this.stemmer = stemmer;
        }
      }
  
      /**
       * Set an alternative exclusion list for this filter.
       * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
       */
      public void setExclusionTable( Hashtable exclusiontable )
      {
        exclusionSet = new HashSet(exclusiontable.keySet());
      }
  
      /**
       * Set an alternative exclusion list for this filter.
       */
      public void setExclusionSet( Set exclusionSet )
      {
        this.exclusionSet = exclusionSet;
      }
  }
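
  A sketch of plugging the filter above into a hand-built token chain. The demo
  class, the LowerCaseTokenizer input and the sample text are assumptions for
  illustration, not part of this commit.

   import java.io.StringReader;

   import org.apache.lucene.analysis.LowerCaseTokenizer;
   import org.apache.lucene.analysis.Token;
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.de.GermanStemFilter;

   // Hypothetical demo class (not part of this commit).
   public class GermanStemFilterDemo
   {
       public static void main( String[] args ) throws Exception
       {
           // Lowercase, letter-based tokens feed the stem filter.
           TokenStream stream = new GermanStemFilter(
               new LowerCaseTokenizer( new StringReader( "Häuser Gärten Felder" ) ) );
           for ( Token t = stream.next(); t != null; t = stream.next() ) {
               System.out.println( t.termText() );
           }
       }
   }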
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.LowerCaseFilter;
  import org.apache.lucene.analysis.StopFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  
  import java.io.File;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.HashSet;
  import java.util.Hashtable;
  import java.util.Set;
  
  /**
    * Analyzer for the German language. Supports an external list of stopwords (words that
    * will not be indexed at all) and an external list of exclusions (words that will
    * not be stemmed, but indexed).
    * A default set of stopwords is used unless an alternative list is specified; the
    * exclusion list is empty by default.
   *
   * @author Gerhard Schwarz
   * @version $Id: GermanAnalyzer.java,v 1.1 2004/08/16 20:30:44 dnaber Exp $
   */
  public class GermanAnalyzer extends Analyzer {
    /**
      * List of typical German stopwords.
     */
    private String[] GERMAN_STOP_WORDS = {
      "einer", "eine", "eines", "einem", "einen",
      "der", "die", "das", "dass", "daß",
      "du", "er", "sie", "es",
      "was", "wer", "wie", "wir",
      "und", "oder", "ohne", "mit",
      "am", "im", "in", "aus", "auf",
      "ist", "sein", "war", "wird",
      "ihr", "ihre", "ihres",
      "als", "für", "von", "mit",
      "dich", "dir", "mich", "mir",
      "mein", "sein", "kein",
      "durch", "wegen", "wird"
    };
  
    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();
  
    /**
     * Contains words that should be indexed but not stemmed.
     */
    private Set exclusionSet = new HashSet();
  
    /**
     * Builds an analyzer.
     */
    public GermanAnalyzer() {
      stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
    }
  
    /**
     * Builds an analyzer with the given stop words.
     */
    public GermanAnalyzer(String[] stopwords) {
      stopSet = StopFilter.makeStopSet(stopwords);
    }
  
    /**
     * Builds an analyzer with the given stop words.
     */
    public GermanAnalyzer(Hashtable stopwords) {
      stopSet = new HashSet(stopwords.keySet());
    }
  
    /**
     * Builds an analyzer with the given stop words.
     */
    public GermanAnalyzer(File stopwords) throws IOException {
      stopSet = WordlistLoader.getWordSet(stopwords);
    }
  
    /**
     * Builds an exclusionlist from an array of Strings.
     */
    public void setStemExclusionTable(String[] exclusionlist) {
      exclusionSet = StopFilter.makeStopSet(exclusionlist);
    }
  
    /**
     * Builds an exclusionlist from a Hashtable.
     */
    public void setStemExclusionTable(Hashtable exclusionlist) {
      exclusionSet = new HashSet(exclusionlist.keySet());
    }
  
    /**
     * Builds an exclusionlist from the words contained in the given file.
     */
    public void setStemExclusionTable(File exclusionlist) throws IOException {
      exclusionSet = WordlistLoader.getWordSet(exclusionlist);
    }
  
    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
      * @return A TokenStream built from a StandardTokenizer filtered with
     *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new StandardTokenizer(reader);
      result = new StandardFilter(result);
      result = new LowerCaseFilter(result);
      result = new StopFilter(result, stopSet);
      result = new GermanStemFilter(result, exclusionSet);
      return result;
    }
  }
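
  A minimal usage sketch for the analyzer above; the demo class, the field name
  "contents" and the sample text are arbitrary and not part of this commit.

   import java.io.StringReader;

   import org.apache.lucene.analysis.Token;
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.de.GermanAnalyzer;

   // Hypothetical demo class (not part of this commit).
   public class GermanAnalyzerDemo
   {
       public static void main( String[] args ) throws Exception
       {
           GermanAnalyzer analyzer = new GermanAnalyzer();
           // Stopwords like "die" and "und" are dropped, the rest is stemmed.
           TokenStream stream = analyzer.tokenStream( "contents",
               new StringReader( "Die Häuser und die Gärten" ) );
           for ( Token t = stream.next(); t != null; t = stream.next() ) {
               System.out.println( t.termText() );
           }
       }
   }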
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testUnicode.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testKOI8.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resKOI8.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
  
  Index: TestRussianAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import junit.framework.TestCase;
  
  import java.io.*;
  
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Token;
  
  /**
   * Test case for RussianAnalyzer.
   *
   * @author    Boris Okner
   * @version   $Id: TestRussianAnalyzer.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  
  public class TestRussianAnalyzer extends TestCase
  {
      private InputStreamReader inWords;
  
      private InputStreamReader sampleUnicode;
  
      private Reader inWordsKOI8;
  
      private Reader sampleKOI8;
  
      private Reader inWords1251;
  
      private Reader sample1251;
  
      private File dataDir;
  
      protected void setUp() throws Exception
      {
        dataDir = new File(System.getProperty("dataDir", "./bin"));
      }
  
      public void testUnicode() throws IOException
      {
          RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
          inWords =
              new InputStreamReader(
                  new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")),
                  "Unicode");
  
          sampleUnicode =
              new InputStreamReader(
                  new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")),
                  "Unicode");
  
          TokenStream in = ra.tokenStream("all", inWords);
  
          RussianLetterTokenizer sample =
              new RussianLetterTokenizer(
                  sampleUnicode,
                  RussianCharsets.UnicodeRussian);
  
          for (;;)
          {
              Token token = in.next();
  
              if (token == null)
              {
                  break;
              }
  
              Token sampleToken = sample.next();
              assertEquals(
                  "Unicode",
                  token.termText(),
                  sampleToken == null
                  ? null
                  : sampleToken.termText());
          }
  
          inWords.close();
          sampleUnicode.close();
      }
  
      public void testKOI8() throws IOException
      {
          //System.out.println(new java.util.Date());
          RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
          // KOI8
          inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
  
          sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
  
          TokenStream in = ra.tokenStream("all", inWordsKOI8);
          RussianLetterTokenizer sample =
              new RussianLetterTokenizer(
                  sampleKOI8,
                  RussianCharsets.KOI8);
  
          for (;;)
          {
              Token token = in.next();
  
              if (token == null)
              {
                  break;
              }
  
              Token sampleToken = sample.next();
              assertEquals(
                  "KOI8",
                  token.termText(),
                  sampleToken == null
                  ? null
                  : sampleToken.termText());
  
          }
  
          inWordsKOI8.close();
          sampleKOI8.close();
      }
  
      public void test1251() throws IOException
      {
          // 1251
          inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
  
          sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
  
          RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
          TokenStream in = ra.tokenStream("", inWords1251);
          RussianLetterTokenizer sample =
              new RussianLetterTokenizer(
                  sample1251,
                  RussianCharsets.CP1251);
  
          for (;;)
          {
              Token token = in.next();
  
              if (token == null)
              {
                  break;
              }
  
              Token sampleToken = sample.next();
              assertEquals(
                  "1251",
                  token.termText(),
                  sampleToken == null
                  ? null
                  : sampleToken.termText());
  
          }
  
          inWords1251.close();
          sample1251.close();
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resUnicode.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/res1251.htm
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/test1251.txt
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
  
  Index: TestRussianStem.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import junit.framework.TestCase;
  
  import java.io.BufferedReader;
  import java.io.File;
  import java.io.InputStreamReader;
  import java.io.FileInputStream;
  import java.util.ArrayList;
  
  public class TestRussianStem extends TestCase
  {
      private ArrayList words = new ArrayList();
      private ArrayList stems = new ArrayList();
  
      public TestRussianStem(String name)
      {
          super(name);
      }
  
      /**
       * @see TestCase#setUp()
       */
      protected void setUp() throws Exception
      {
          super.setUp();
          //System.out.println(new java.util.Date());
          String str;
          
          File dataDir = new File(System.getProperty("dataDir", "./bin"));
  
          // open and read words into an array list
          BufferedReader inWords =
              new BufferedReader(
                  new InputStreamReader(
                      new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt")),
                      "Unicode"));
          while ((str = inWords.readLine()) != null)
          {
              words.add(str);
          }
          inWords.close();
  
          // open and read stems into an array list
          BufferedReader inStems =
              new BufferedReader(
                  new InputStreamReader(
                      new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt")),
                      "Unicode"));
          while ((str = inStems.readLine()) != null)
          {
              stems.add(str);
          }
          inStems.close();
      }
  
      /**
       * @see TestCase#tearDown()
       */
      protected void tearDown() throws Exception
      {
          super.tearDown();
      }
  
      public void testStem()
      {
          for (int i = 0; i < words.size(); i++)
          {
              //if ( (i % 100) == 0 ) System.err.println(i);
              String realStem =
                  RussianStemmer.stem(
                      (String) words.get(i),
                      RussianCharsets.UnicodeRussian);
              assertEquals("unicode", stems.get(i), realStem);
          }
      }
  
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
  
  Index: RussianStemmer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  /**
    * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a detailed description).
   *
   * @author  Boris Okner, b.okner@rogers.com
   * @version $Id: RussianStemmer.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  class RussianStemmer
  {
      private char[] charset;
  
      // positions of RV, R1 and R2 respectively
      private int RV, R1, R2;
  
      // letters (currently unused letters are commented out)
      private final static char A = 0;
      //private final static char B = 1;
      private final static char V = 2;
      private final static char G = 3;
      //private final static char D = 4;
      private final static char E = 5;
      //private final static char ZH = 6;
      //private final static char Z = 7;
      private final static char I = 8;
      private final static char I_ = 9;
      //private final static char K = 10;
      private final static char L = 11;
      private final static char M = 12;
      private final static char N = 13;
      private final static char O = 14;
      //private final static char P = 15;
      //private final static char R = 16;
      private final static char S = 17;
      private final static char T = 18;
      private final static char U = 19;
      //private final static char F = 20;
      private final static char X = 21;
      //private final static char TS = 22;
      //private final static char CH = 23;
      private final static char SH = 24;
      private final static char SHCH = 25;
      //private final static char HARD = 26;
      private final static char Y = 27;
      private final static char SOFT = 28;
      private final static char AE = 29;
      private final static char IU = 30;
      private final static char IA = 31;
  
      // stem definitions
      private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
  
      private static char[][] perfectiveGerundEndings1 = {
          { V },
          { V, SH, I },
          { V, SH, I, S, SOFT }
      };
  
      private static char[][] perfectiveGerund1Predessors = {
          { A },
          { IA }
      };
  
       private static char[][] perfectiveGerundEndings2 = {
           { I, V },
           { Y, V },
           { I, V, SH, I },
           { Y, V, SH, I },
           { I, V, SH, I, S, SOFT },
           { Y, V, SH, I, S, SOFT }
       };
  
      private static char[][] adjectiveEndings = {
          { E, E },
          { I, E },
          { Y, E },
          { O, E },
          { E, I_ },
          { I, I_ },
          { Y, I_ },
          { O, I_ },
          { E, M },
          { I, M },
          { Y, M },
          { O, M },
          { I, X },
          { Y, X },
          { U, IU },
          { IU, IU },
          { A, IA },
          { IA, IA },
          { O, IU },
          { E, IU },
          { I, M, I },
          { Y, M, I },
          { E, G, O },
          { O, G, O },
          { E, M, U },
          {O, M, U }
      };
  
      private static char[][] participleEndings1 = {
          { SHCH },
          { E, M },
          { N, N },
          { V, SH },
          { IU, SHCH }
      };
  
      private static char[][] participleEndings2 = {
          { I, V, SH },
          { Y, V, SH },
          { U, IU, SHCH }
      };
  
      private static char[][] participle1Predessors = {
          { A },
          { IA }
      };
  
      private static char[][] reflexiveEndings = {
          { S, IA },
          { S, SOFT }
      };
  
      private static char[][] verbEndings1 = {
          { I_ },
          { L },
          { N },
          { L, O },
          { N, O },
          { E, T },
          { IU, T },
          { L, A },
          { N, A },
          { L, I },
          { E, M },
          { N, Y },
          { E, T, E },
          { I_, T, E },
          { T, SOFT },
          { E, SH, SOFT },
          { N, N, O }
      };
  
      private static char[][] verbEndings2 = {
          { IU },
          { U, IU },
          { E, N },
          { E, I_ },
          { IA, T },
          { U, I_ },
          { I, L },
          { Y, L },
          { I, M },
          { Y, M },
          { I, T },
          { Y, T },
          { I, L, A },
          { Y, L, A },
          { E, N, A },
          { I, T, E },
          { I, L, I },
          { Y, L, I },
          { I, L, O },
          { Y, L, O },
          { E, N, O },
          { U, E, T },
          { U, IU, T },
          { E, N, Y },
          { I, T, SOFT },
          { Y, T, SOFT },
          { I, SH, SOFT },
          { E, I_, T, E },
          { U, I_, T, E }
      };
  
      private static char[][] verb1Predessors = {
          { A },
          { IA }
      };
  
      private static char[][] nounEndings = {
          { A },
          { U },
          { I_ },
          { O },
          { U },
          { E },
          { Y },
          { I },
          { SOFT },
          { IA },
          { E, V },
          { O, V },
          { I, E },
          { SOFT, E },
          { IA, X },
          { I, IU },
          { E, I },
          { I, I },
          { E, I_ },
          { O, I_ },
          { E, M },
          { A, M },
          { O, M },
          { A, X },
          { SOFT, IU },
          { I, IA },
          { SOFT, IA },
          { I, I_ },
          { IA, M },
          { IA, M, I },
          { A, M, I },
          { I, E, I_ },
          { I, IA, M },
          { I, E, M },
          { I, IA, X },
          { I, IA, M, I }
      };
  
      private static char[][] superlativeEndings = {
          { E, I_, SH },
          { E, I_, SH, E }
      };
  
      private static char[][] derivationalEndings = {
          { O, S, T },
          { O, S, T, SOFT }
      };
  
      /**
        * Default constructor; the charset must be set before stemming.
       */
      public RussianStemmer()
      {
          super();
      }
  
      /**
        * Constructs a stemmer for the given charset.
       */
      public RussianStemmer(char[] charset)
      {
          super();
          this.charset = charset;
      }
  
      /**
        * An adjectival ending is an adjective ending,
        * optionally preceded by a participle ending.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean adjectival(StringBuffer stemmingZone)
      {
          // look for adjective ending in a stemming zone
          if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
              return false;
          // if adjective ending was found, try for participle ending.
          // variable r is unused, we are just interested in the side effect of
          // findAndRemoveEnding():
          boolean r =
              findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
              ||
              findAndRemoveEnding(stemmingZone, participleEndings2);
          return true;
      }
  
      /**
       * Derivational endings
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean derivational(StringBuffer stemmingZone)
      {
          int endingLength = findEnding(stemmingZone, derivationalEndings);
          if (endingLength == 0)
               // no derivational ending found
              return false;
          else
          {
               // Ensure that the ending is located in R2
              if (R2 - RV <= stemmingZone.length() - endingLength)
              {
                  stemmingZone.setLength(stemmingZone.length() - endingLength);
                  return true;
              }
              else
              {
                  return false;
              }
          }
      }
  
      /**
        * Finds an ending among the given ending class and returns the length of the ending found (0 if not found).
       * Creation date: (17/03/2002 8:18:34 PM)
       */
      private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
      {
          boolean match = false;
          for (int i = theEndingClass.length - 1; i >= 0; i--)
          {
              char[] theEnding = theEndingClass[i];
               // check if the ending is longer than the stemming zone
              if (startIndex < theEnding.length - 1)
              {
                  match = false;
                  continue;
              }
              match = true;
              int stemmingIndex = startIndex;
              for (int j = theEnding.length - 1; j >= 0; j--)
              {
                  if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                  {
                      match = false;
                      break;
                  }
              }
              // check if ending was found
              if (match)
              {
                  return theEndingClass[i].length; // cut ending
              }
          }
          return 0;
      }
  
      private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
      {
          return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
      }
  
      /**
        * Finds the ending among the given class of endings and removes it from the stemming zone.
       * Creation date: (17/03/2002 8:18:34 PM)
       */
      private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
      {
          int endingLength = findEnding(stemmingZone, theEndingClass);
          if (endingLength == 0)
              // not found
              return false;
          else {
              stemmingZone.setLength(stemmingZone.length() - endingLength);
              // cut the ending found
              return true;
          }
      }
  
      /**
        * Finds the ending among the given class of endings, then checks if this ending was
        * preceded by any of the given predecessors, and if so, removes it from the stemming zone.
       * Creation date: (17/03/2002 8:18:34 PM)
       */
      private boolean findAndRemoveEnding(StringBuffer stemmingZone,
          char[][] theEndingClass, char[][] thePredessors)
      {
          int endingLength = findEnding(stemmingZone, theEndingClass);
          if (endingLength == 0)
              // not found
              return false;
          else
          {
              int predessorLength =
                  findEnding(stemmingZone,
                      stemmingZone.length() - endingLength - 1,
                      thePredessors);
              if (predessorLength == 0)
                  return false;
              else {
                  stemmingZone.setLength(stemmingZone.length() - endingLength);
                  // cut the ending found
                  return true;
              }
          }
  
      }
  
      /**
       * Marks positions of RV, R1 and R2 in a given word.
       * Creation date: (16/03/2002 3:40:11 PM)
       */
      private void markPositions(String word)
      {
          RV = 0;
          R1 = 0;
          R2 = 0;
          int i = 0;
          // find RV
          while (word.length() > i && !isVowel(word.charAt(i)))
          {
              i++;
          }
          if (word.length() - 1 < ++i)
              return; // RV zone is empty
          RV = i;
          // find R1
          while (word.length() > i && isVowel(word.charAt(i)))
          {
              i++;
          }
          if (word.length() - 1 < ++i)
              return; // R1 zone is empty
          R1 = i;
          // find R2
          while (word.length() > i && !isVowel(word.charAt(i)))
          {
              i++;
          }
          if (word.length() - 1 < ++i)
              return; // R2 zone is empty
          while (word.length() > i && isVowel(word.charAt(i)))
          {
              i++;
          }
          if (word.length() - 1 < ++i)
              return; // R2 zone is empty
          R2 = i;
      }
  
      /**
        * Checks if a character is a vowel.
       * Creation date: (16/03/2002 10:47:03 PM)
       * @return boolean
       * @param letter char
       */
      private boolean isVowel(char letter)
      {
          for (int i = 0; i < vowels.length; i++)
          {
              if (letter == charset[vowels[i]])
                  return true;
          }
          return false;
      }
  
      /**
       * Noun endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean noun(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, nounEndings);
      }
  
      /**
       * Perfective gerund endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean perfectiveGerund(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(
              stemmingZone,
              perfectiveGerundEndings1,
              perfectiveGerund1Predessors)
              || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
      }
  
      /**
       * Reflexive endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean reflexive(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, reflexiveEndings);
      }
  
      /**
        * Removes a trailing letter I from the stemming zone, if present.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean removeI(StringBuffer stemmingZone)
      {
          if (stemmingZone.length() > 0
              && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
          {
              stemmingZone.setLength(stemmingZone.length() - 1);
              return true;
          }
          else
          {
              return false;
          }
      }
  
      /**
        * Removes a trailing soft sign from the stemming zone, if present.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean removeSoft(StringBuffer stemmingZone)
      {
          if (stemmingZone.length() > 0
              && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
          {
              stemmingZone.setLength(stemmingZone.length() - 1);
              return true;
          }
          else
          {
              return false;
          }
      }
  
      /**
        * Sets the charset used by this stemmer.
       * Creation date: (16/03/2002 10:58:42 PM)
       * @param newCharset char[]
       */
      public void setCharset(char[] newCharset)
      {
          charset = newCharset;
      }
  
      /**
        * Finds the stem for a given Russian word.
       * Creation date: (16/03/2002 3:36:48 PM)
       * @return java.lang.String
       * @param input java.lang.String
       */
      public String stem(String input)
      {
          markPositions(input);
          if (RV == 0)
              return input; //RV wasn't detected, nothing to stem
          StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
          // stemming goes on in RV
          // Step 1
  
          if (!perfectiveGerund(stemmingZone))
          {
              reflexive(stemmingZone);
              // variable r is unused, we are just interested in the flow that gets
              // created by logical expression: apply adjectival(); if that fails,
              // apply verb() etc
              boolean r =
                  adjectival(stemmingZone)
                  || verb(stemmingZone)
                  || noun(stemmingZone);
          }
          // Step 2
          removeI(stemmingZone);
          // Step 3
          derivational(stemmingZone);
          // Step 4
          superlative(stemmingZone);
          undoubleN(stemmingZone);
          removeSoft(stemmingZone);
          // return result
          return input.substring(0, RV) + stemmingZone.toString();
      }
  
      /**
       * Superlative endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean superlative(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(stemmingZone, superlativeEndings);
      }
  
      /**
       * Undoubles N.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean undoubleN(StringBuffer stemmingZone)
      {
          char[][] doubleN = {
              { N, N }
          };
          if (findEnding(stemmingZone, doubleN) != 0)
          {
              stemmingZone.setLength(stemmingZone.length() - 1);
              return true;
          }
          else
          {
              return false;
          }
      }
  
      /**
       * Verb endings.
       * Creation date: (17/03/2002 12:14:58 AM)
       * @param stemmingZone java.lang.StringBuffer
       */
      private boolean verb(StringBuffer stemmingZone)
      {
          return findAndRemoveEnding(
              stemmingZone,
              verbEndings1,
              verb1Predessors)
              || findAndRemoveEnding(stemmingZone, verbEndings2);
      }
  
      /**
       * Static method for stemming with different charsets
       */
      public static String stem(String theWord, char[] charset)
      {
          RussianStemmer stemmer = new RussianStemmer();
          stemmer.setCharset(charset);
          return stemmer.stem(theWord);
      }
  }
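
  A sketch of calling the static stem() method above. RussianStemmer is
  package-private, so the hypothetical demo class below is assumed to live in the
  same package; the sample word is lowercase Cyrillic given as Unicode escapes.

   package org.apache.lucene.analysis.ru;

   // Hypothetical demo class (not part of this commit).
   public class RussianStemmerDemo
   {
       public static void main( String[] args )
       {
           // A lowercase Russian word; the printed value is its stem.
           String word = "\u0432\u0430\u0433\u043e\u043d\u044b";
           System.out.println( RussianStemmer.stem( word, RussianCharsets.UnicodeRussian ) );
       }
   }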
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
  
  Index: RussianCharsets.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  /**
    * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
    * for Russian characters in Unicode, KOI8 and CP1251.
    * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
    * One should be able to add other encoding schemes (like ISO-8859-5 or customized ones) by adding a new charset
    * and adding logic to the toLowerCase() method for that charset.
   *
   * @author  Boris Okner, b.okner@rogers.com
   * @version $Id: RussianCharsets.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  public class RussianCharsets
  {
       // Unicode Russian charset (lowercase letters first, then uppercase)
      public static char[] UnicodeRussian = {
          '\u0430',
          '\u0431',
          '\u0432',
          '\u0433',
          '\u0434',
          '\u0435',
          '\u0436',
          '\u0437',
          '\u0438',
          '\u0439',
          '\u043A',
          '\u043B',
          '\u043C',
          '\u043D',
          '\u043E',
          '\u043F',
          '\u0440',
          '\u0441',
          '\u0442',
          '\u0443',
          '\u0444',
          '\u0445',
          '\u0446',
          '\u0447',
          '\u0448',
          '\u0449',
          '\u044A',
          '\u044B',
          '\u044C',
          '\u044D',
          '\u044E',
          '\u044F',
          // upper case
          '\u0410',
          '\u0411',
          '\u0412',
          '\u0413',
          '\u0414',
          '\u0415',
          '\u0416',
          '\u0417',
          '\u0418',
          '\u0419',
          '\u041A',
          '\u041B',
          '\u041C',
          '\u041D',
          '\u041E',
          '\u041F',
          '\u0420',
          '\u0421',
          '\u0422',
          '\u0423',
          '\u0424',
          '\u0425',
          '\u0426',
          '\u0427',
          '\u0428',
          '\u0429',
          '\u042A',
          '\u042B',
          '\u042C',
          '\u042D',
          '\u042E',
          '\u042F'
      };
  
      // KOI8 charset
      public static char[] KOI8 = {
          0xc1,
          0xc2,
          0xd7,
          0xc7,
          0xc4,
          0xc5,
          0xd6,
          0xda,
          0xc9,
          0xca,
          0xcb,
          0xcc,
          0xcd,
          0xce,
          0xcf,
          0xd0,
          0xd2,
          0xd3,
          0xd4,
          0xd5,
          0xc6,
          0xc8,
          0xc3,
          0xde,
          0xdb,
          0xdd,
          0xdf,
          0xd9,
          0xd8,
          0xdc,
          0xc0,
          0xd1,
          // upper case
          0xe1,
          0xe2,
          0xf7,
          0xe7,
          0xe4,
          0xe5,
          0xf6,
          0xfa,
          0xe9,
          0xea,
          0xeb,
          0xec,
          0xed,
          0xee,
          0xef,
          0xf0,
          0xf2,
          0xf3,
          0xf4,
          0xf5,
          0xe6,
          0xe8,
          0xe3,
          0xfe,
          0xfb,
          0xfd,
          0xff,
          0xf9,
          0xf8,
          0xfc,
          0xe0,
          0xf1
      };
  
       // CP1251 charset
      public static char[] CP1251 = {
          0xE0,
          0xE1,
          0xE2,
          0xE3,
          0xE4,
          0xE5,
          0xE6,
          0xE7,
          0xE8,
          0xE9,
          0xEA,
          0xEB,
          0xEC,
          0xED,
          0xEE,
          0xEF,
          0xF0,
          0xF1,
          0xF2,
          0xF3,
          0xF4,
          0xF5,
          0xF6,
          0xF7,
          0xF8,
          0xF9,
          0xFA,
          0xFB,
          0xFC,
          0xFD,
          0xFE,
          0xFF,
          // upper case
          0xC0,
          0xC1,
          0xC2,
          0xC3,
          0xC4,
          0xC5,
          0xC6,
          0xC7,
          0xC8,
          0xC9,
          0xCA,
          0xCB,
          0xCC,
          0xCD,
          0xCE,
          0xCF,
          0xD0,
          0xD1,
          0xD2,
          0xD3,
          0xD4,
          0xD5,
          0xD6,
          0xD7,
          0xD8,
          0xD9,
          0xDA,
          0xDB,
          0xDC,
          0xDD,
          0xDE,
          0xDF
      };
  
      public static char toLowerCase(char letter, char[] charset)
      {
          if (charset == UnicodeRussian)
          {
              if (letter >= '\u0430' && letter <= '\u044F')
              {
                  return letter;
              }
              if (letter >= '\u0410' && letter <= '\u042F')
              {
                  return (char) (letter + 32);
              }
          }
  
          if (charset == KOI8)
          {
              if (letter >= 0xe0 && letter <= 0xff)
              {
                  return (char) (letter - 32);
              }
              if (letter >= 0xc0 && letter <= 0xdf)
              {
                  return letter;
              }
  
          }
  
          if (charset == CP1251)
          {
              if (letter >= 0xC0 && letter <= 0xDF)
              {
                  return (char) (letter + 32);
              }
              if (letter >= 0xE0 && letter <= 0xFF)
              {
                  return letter;
              }
  
          }
  
          return Character.toLowerCase(letter);
      }
  }
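
  A quick usage sketch (not from the committed sources) of the lookup above; it assumes the
  caller imports org.apache.lucene.analysis.ru.RussianCharsets. In CP1251 the uppercase
  range 0xC0-0xDF sits 32 positions below the lowercase range 0xE0-0xFF:

      // map an uppercase CP1251 letter to its lowercase counterpart
      char upper = 0xC0;   // an uppercase Cyrillic letter in CP1251
      char lower = RussianCharsets.toLowerCase(upper, RussianCharsets.CP1251);
      System.out.println(Integer.toHexString(lower));   // prints "e0"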
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
  
  Index: RussianAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.StopFilter;
  import org.apache.lucene.analysis.TokenStream;
  
  import java.io.Reader;
  import java.util.Hashtable;
  import java.util.Set;
  import java.util.HashSet;
  
  /**
   * Analyzer for the Russian language. Supports an external list of stopwords (words that
   * will not be indexed at all).
   * A default set of stopwords is used unless an alternative list is specified.
   *
   * @author  Boris Okner, b.okner@rogers.com
   * @version $Id: RussianAnalyzer.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  public final class RussianAnalyzer extends Analyzer
  {
      // letters (currently unused letters are commented out)
      private final static char A = 0;
      private final static char B = 1;
      private final static char V = 2;
      private final static char G = 3;
      private final static char D = 4;
      private final static char E = 5;
      private final static char ZH = 6;
      private final static char Z = 7;
      private final static char I = 8;
      private final static char I_ = 9;
      private final static char K = 10;
      private final static char L = 11;
      private final static char M = 12;
      private final static char N = 13;
      private final static char O = 14;
      private final static char P = 15;
      private final static char R = 16;
      private final static char S = 17;
      private final static char T = 18;
      private final static char U = 19;
      //private final static char F = 20;
      private final static char X = 21;
      //private final static char TS = 22;
      private final static char CH = 23;
      private final static char SH = 24;
      private final static char SHCH = 25;
      //private final static char HARD = 26;
      private final static char Y = 27;
      private final static char SOFT = 28;
      private final static char AE = 29;
      private final static char IU = 30;
      private final static char IA = 31;
  
      /**
       * List of typical Russian stopwords.
       */
      private static char[][] RUSSIAN_STOP_WORDS = {
          {A},
          {B, E, Z},
          {B, O, L, E, E},
          {B, Y},
          {B, Y, L},
          {B, Y, L, A},
          {B, Y, L, I},
          {B, Y, L, O},
          {B, Y, T, SOFT},
          {V},
          {V, A, M},
          {V, A, S},
          {V, E, S, SOFT},
          {V, O},
          {V, O, T},
          {V, S, E},
          {V, S, E, G, O},
          {V, S, E, X},
          {V, Y},
          {G, D, E},
          {D, A},
          {D, A, ZH, E},
          {D, L, IA},
          {D, O},
          {E, G, O},
          {E, E},
          {E, I_,},
          {E, IU},
          {E, S, L, I},
          {E, S, T, SOFT},
          {E, SHCH, E},
          {ZH, E},
          {Z, A},
          {Z, D, E, S, SOFT},
          {I},
          {I, Z},
          {I, L, I},
          {I, M},
          {I, X},
          {K},
          {K, A, K},
          {K, O},
          {K, O, G, D, A},
          {K, T, O},
          {L, I},
          {L, I, B, O},
          {M, N, E},
          {M, O, ZH, E, T},
          {M, Y},
          {N, A},
          {N, A, D, O},
          {N, A, SH},
          {N, E},
          {N, E, G, O},
          {N, E, E},
          {N, E, T},
          {N, I},
          {N, I, X},
          {N, O},
          {N, U},
          {O},
          {O, B},
          {O, D, N, A, K, O},
          {O, N},
          {O, N, A},
          {O, N, I},
          {O, N, O},
          {O, T},
          {O, CH, E, N, SOFT},
          {P, O},
          {P, O, D},
          {P, R, I},
          {S},
          {S, O},
          {T, A, K},
          {T, A, K, ZH, E},
          {T, A, K, O, I_},
          {T, A, M},
          {T, E},
          {T, E, M},
          {T, O},
          {T, O, G, O},
          {T, O, ZH, E},
          {T, O, I_},
          {T, O, L, SOFT, K, O},
          {T, O, M},
          {T, Y},
          {U},
          {U, ZH, E},
          {X, O, T, IA},
          {CH, E, G, O},
          {CH, E, I_},
          {CH, E, M},
          {CH, T, O},
          {CH, T, O, B, Y},
          {CH, SOFT, E},
          {CH, SOFT, IA},
          {AE, T, A},
          {AE, T, I},
          {AE, T, O},
          {IA}
      };
  
      /**
       * Contains the stopwords used with the StopFilter.
       */
      private Set stopSet = new HashSet();
  
      /**
       * Charset for Russian letters.
       * Represents the encoding for the 32 lowercase Russian letters, followed by
       * their uppercase counterparts.
       * Predefined charsets can be taken from the RussianCharsets class.
       */
      private char[] charset;
  
  
      /**
       * Builds an analyzer with the default (Unicode) charset and stopword list.
       */
      public RussianAnalyzer() {
          charset = RussianCharsets.UnicodeRussian;
          stopSet = StopFilter.makeStopSet(
                      makeStopWords(RussianCharsets.UnicodeRussian));
      }
  
      /**
       * Builds an analyzer with the given charset and the default stopword list.
       */
      public RussianAnalyzer(char[] charset)
      {
          this.charset = charset;
          stopSet = StopFilter.makeStopSet(makeStopWords(charset));
      }
  
      /**
       * Builds an analyzer with the given stop words.
       */
      public RussianAnalyzer(char[] charset, String[] stopwords)
      {
          this.charset = charset;
          stopSet = StopFilter.makeStopSet(stopwords);
      }
  
      // Takes the Russian stop words and translates them into a String array,
      // using the given charset.
      private static String[] makeStopWords(char[] charset)
      {
          String[] res = new String[RUSSIAN_STOP_WORDS.length];
          for (int i = 0; i < res.length; i++)
          {
              char[] theStopWord = RUSSIAN_STOP_WORDS[i];
              // translate the word, using the charset
              StringBuffer theWord = new StringBuffer();
              for (int j = 0; j < theStopWord.length; j++)
              {
                  theWord.append(charset[theStopWord[j]]);
              }
              res[i] = theWord.toString();
          }
          return res;
      }
  
      /**
       * Builds an analyzer with the given stop words.
       * @todo create a Set version of this ctor
       */
      public RussianAnalyzer(char[] charset, Hashtable stopwords)
      {
          this.charset = charset;
          stopSet = new HashSet(stopwords.keySet());
      }
  
      /**
       * Creates a TokenStream which tokenizes all the text in the provided Reader.
       *
       * @return  A TokenStream built from a RussianLetterTokenizer filtered with
       *          RussianLowerCaseFilter, StopFilter, and RussianStemFilter
       */
      public TokenStream tokenStream(String fieldName, Reader reader)
      {
          TokenStream result = new RussianLetterTokenizer(reader, charset);
          result = new RussianLowerCaseFilter(result, charset);
          result = new StopFilter(result, stopSet);
          result = new RussianStemFilter(result, charset);
          return result;
      }
  }
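
  A minimal usage sketch (not from the committed sources), assuming the Lucene 1.4-era
  TokenStream API used above; the field name "contents" and the sample input are arbitrary:

      import java.io.StringReader;
      import org.apache.lucene.analysis.Token;
      import org.apache.lucene.analysis.TokenStream;
      import org.apache.lucene.analysis.ru.RussianAnalyzer;
      import org.apache.lucene.analysis.ru.RussianCharsets;

      public class RussianAnalyzerDemo {
          public static void main(String[] args) throws Exception {
              // the no-arg constructor defaults to Unicode; pass RussianCharsets.KOI8
              // or RussianCharsets.CP1251 if the text was read in one of those encodings
              RussianAnalyzer analyzer = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
              TokenStream stream = analyzer.tokenStream("contents",
                  new StringReader("\u0412\u043E\u0442 \u0442\u0435\u043A\u0441\u0442"));
              for (Token t = stream.next(); t != null; t = stream.next()) {
                  System.out.println(t.termText());   // lowercased, stop-filtered, stemmed
              }
          }
      }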
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
  
  Index: RussianStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import java.io.IOException;
  
  /**
   * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
   * The input should be filtered by RussianLowerCaseFilter before being passed to RussianStemFilter,
   * because RussianStemFilter only works with the lowercase part of any "Russian" charset.
   *
   * @author    Boris Okner, b.okner@rogers.com
   * @version   $Id: RussianStemFilter.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  public final class RussianStemFilter extends TokenFilter
  {
      /**
       * The actual token in the input stream.
       */
      private Token token = null;
      private RussianStemmer stemmer = null;
  
      public RussianStemFilter(TokenStream in, char[] charset)
      {
          super(in);
          stemmer = new RussianStemmer(charset);
      }
  
      /**
       * @return  the next token in the stream, or null at end of stream (EOS)
       */
      public final Token next() throws IOException
      {
          if ((token = input.next()) == null)
          {
              return null;
          }
          else
          {
              String s = stemmer.stem(token.termText());
              if (!s.equals(token.termText()))
              {
                  return new Token(s, token.startOffset(), token.endOffset(),
                      token.type());
              }
              return token;
          }
      }
  
      /**
       * Set an alternative/custom RussianStemmer for this filter.
       */
      public void setStemmer(RussianStemmer stemmer)
      {
          if (stemmer != null)
          {
              this.stemmer = stemmer;
          }
      }
  }
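
  As the class comment notes, the stemmer only sees the lowercase half of the charset, so
  RussianLowerCaseFilter has to run first. A hand-wired sketch (not from the committed
  sources) mirroring RussianAnalyzer.tokenStream() minus the StopFilter; the Reader named
  'reader' is assumed to supply KOI8-encoded text:

      char[] charset = RussianCharsets.KOI8;
      TokenStream stream = new RussianLetterTokenizer(reader, charset);
      stream = new RussianLowerCaseFilter(stream, charset);   // lowercase first...
      stream = new RussianStemFilter(stream, charset);        // ...then stem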
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
  
  Index: RussianLowerCaseFilter.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  
  /**
   * Normalizes token text to lower case, using the given ("Russian") charset.
   *
   * @author  Boris Okner, b.okner@rogers.com
   * @version $Id: RussianLowerCaseFilter.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  public final class RussianLowerCaseFilter extends TokenFilter
  {
      char[] charset;
  
      public RussianLowerCaseFilter(TokenStream in, char[] charset)
      {
          super(in);
          this.charset = charset;
      }
  
      public final Token next() throws java.io.IOException
      {
          Token t = input.next();
  
          if (t == null)
              return null;
  
          String txt = t.termText();
  
          char[] chArray = txt.toCharArray();
          for (int i = 0; i < chArray.length; i++)
          {
              chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
          }
  
          String newTxt = new String(chArray);
          // create new token
          Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
  
          return newToken;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
  
  Index: RussianLetterTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis.ru;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import java.io.Reader;
  import org.apache.lucene.analysis.CharTokenizer;
  
  /**
   * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
   * in a given "Russian charset". The problem with LetterTokenizer is that it uses the Character.isLetter() method,
   * which doesn't know how to detect letters in encodings like CP1251 and KOI8
   * (well-known problems with the 0xD7 and 0xF7 chars).
   *
   * @author  Boris Okner, b.okner@rogers.com
   * @version $Id: RussianLetterTokenizer.java,v 1.1 2004/08/16 20:30:45 dnaber Exp $
   */
  
  public class RussianLetterTokenizer extends CharTokenizer
  {
      /** Charset whose entries are accepted as additional letter characters. */
      private char[] charset;
  
      public RussianLetterTokenizer(Reader in, char[] charset)
      {
          super(in);
          this.charset = charset;
      }
  
      /**
       * Collects characters which satisfy
       * {@link Character#isLetter(char)} or which appear in the given charset.
       */
      protected boolean isTokenChar(char c)
      {
          if (Character.isLetter(c))
              return true;
          for (int i = 0; i < charset.length; i++)
          {
              if (c == charset[i])
                  return true;
          }
          return false;
      }
  }
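
  To make the 0xD7/0xF7 remark above concrete: carried around as raw char values, those
  positions map to U+00D7 and U+00F7 (the multiplication and division signs), which
  Character.isLetter() rejects, so a plain LetterTokenizer would split words on them.
  A small sketch (not from the committed sources; StringReader is java.io):

      System.out.println(Character.isLetter((char) 0xD7));   // false
      System.out.println(Character.isLetter((char) 0xF7));   // false
      // RussianLetterTokenizer still accepts them, because 0xD7 and 0xF7 appear in the
      // CP1251 (and KOI8) tables that can be passed in as the charset:
      RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(
          new StringReader("..."), RussianCharsets.CP1251);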
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/package.html
  
  Index: package.html
  ===================================================================
  <html>
  <body>
  Support for indexing and searching Russian text.
  </body>
  </html>
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
  
  Index: TestGermanStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import java.io.BufferedReader;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.io.StringReader;
  
  import junit.framework.TestCase;
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  
  /**
   * Test the German stemmer. The stemming algorithm is known to work less
   * than perfectly, as it doesn't use any word lists with exceptions. We
   * also check some of the cases where the algorithm is wrong.
   * 
   * @author Daniel Naber
   */
  public class TestGermanStemFilter extends TestCase {
  
    public void testStemming() {
      try {
        // read test cases from external file:
        File dataDir = new File(System.getProperty("dataDir", "./bin"));
        File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
        FileInputStream fis = new FileInputStream(testFile);
        InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
        BufferedReader breader = new BufferedReader(isr);
        while(true) {
          String line = breader.readLine();
          if (line == null)
            break;
          line = line.trim();
          if (line.startsWith("#") || line.equals(""))
            continue;    // ignore comments and empty lines
          String[] parts = line.split(";");
          //System.out.println(parts[0] + " -- " + parts[1]);
          check(parts[0], parts[1]);
        }
        breader.close();
        isr.close();
        fis.close();
      } catch (IOException e) {
         e.printStackTrace();
         fail();
      }
    }
  
    private void check(final String input, final String expected) throws IOException {
      StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
      GermanStemFilter filter = new GermanStemFilter(tokenStream);
      Token t = filter.next();
      if (t == null)
        fail();
      assertEquals(expected, t.termText());
      filter.close();
    }
  
  }
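
  The test locates data.txt through the "dataDir" system property (defaulting to ./bin), so
  that property must point at a directory containing org/apache/lucene/analysis/de/data.txt.
  A hedged invocation sketch using the JUnit 3.x text runner; the "src/test" value is an
  assumption about the local checkout layout:

      // equivalent to passing -DdataDir=src/test on the java command line
      System.setProperty("dataDir", "src/test");
      junit.textui.TestRunner.run(
          new junit.framework.TestSuite(TestGermanStemFilter.class));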
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de/data.txt
  
  Index: data.txt
  ===================================================================
  # German special characters are replaced:
  häufig;haufig
  
  # here the stemmer works okay, it maps related words to the same stem:
  abschließen;abschliess
  abschließender;abschliess
  abschließendes;abschliess
  abschließenden;abschliess
  
  Tisch;tisch
  Tische;tisch
  Tischen;tisch
  
  Haus;hau
  Hauses;hau
  Häuser;hau
  Häusern;hau
  # here's a case where overstemming occurs, i.e. a word is 
  # mapped to the same stem as unrelated words:
  hauen;hau
  
  # here's a case where understemming occurs, i.e. two related words
  # are not mapped to the same stem. This is the case with basically
  # all irregular forms:
  Drama;drama
  Dramen;dram
  
  # replace "ß" with 'ss':
  Ausmaß;ausmass
  
  # fake words to test if suffixes are cut off:
  xxxxxe;xxxxx
  xxxxxs;xxxxx
  xxxxxn;xxxxx
  xxxxxt;xxxxx
  xxxxxem;xxxxx
  xxxxxer;xxxxx
  xxxxxnd;xxxxx
  # the suffixes are also removed when combined:
  xxxxxetende;xxxxx
  
  # words that are shorter than four characters are not changed:
  xxe;xxe
  # -em and -er are not removed from words shorter than five characters:
  xxem;xxem
  xxer;xxer
  # -nd is not removed from words shorter than six characters:
  xxxnd;xxxnd
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org

