lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ehatc...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl DutchAnalyzer.java DutchStemFilter.java DutchStemmer.java WordlistLoader.java
Date Thu, 11 Mar 2004 03:05:36 GMT
ehatcher    2004/03/10 19:05:36

  Modified:    contributions/analyzers/src/java/org/apache/lucene/analysis
                        LengthFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/br
                        BrazilianAnalyzer.java BrazilianStemFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/cjk
                        CJKAnalyzer.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/cz
                        CzechAnalyzer.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/fr
                        FrenchAnalyzer.java FrenchStemFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/nl
                        DutchAnalyzer.java DutchStemFilter.java
                        DutchStemmer.java WordlistLoader.java
  Log:
  bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable
  
  Revision  Changes    Path
  1.2       +2 -2      jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java
  
  Index: LengthFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- LengthFilter.java	2 Mar 2004 12:52:16 -0000	1.1
  +++ LengthFilter.java	11 Mar 2004 03:05:36 -0000	1.2
  @@ -35,7 +35,7 @@
      */
     public LengthFilter(TokenStream in, int min, int max)
     {
  -    input = in;
  +    super(in);
       this.min = min;
       this.max =max;
     }
  
  
  
  1.4       +10 -9     jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
  
  Index: BrazilianAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- BrazilianAnalyzer.java	22 Jan 2004 20:54:46 -0000	1.3
  +++ BrazilianAnalyzer.java	11 Mar 2004 03:05:36 -0000	1.4
  @@ -64,6 +64,7 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Analyzer for brazilian language. Supports an external list of stopwords (words that
  @@ -102,57 +103,57 @@
   	/**
   	 * Contains the stopwords used with the StopFilter.
   	 */
  -	private Hashtable stoptable = new Hashtable();
  +	private HashSet stoptable = new HashSet();
   	/**
   	 * Contains words that should be indexed but not stemmed.
   	 */
  -	private Hashtable excltable = new Hashtable();
  +	private HashSet excltable = new HashSet();
   
   	/**
   	 * Builds an analyzer.
   	 */
   	public BrazilianAnalyzer() {
  -		stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS );
  +		stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public BrazilianAnalyzer( String[] stopwords ) {
  -		stoptable = StopFilter.makeStopTable( stopwords );
  +		stoptable = StopFilter.makeStopSet( stopwords );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public BrazilianAnalyzer( Hashtable stopwords ) {
  -		stoptable = stopwords;
  +		stoptable = new HashSet(stopwords.keySet());
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public BrazilianAnalyzer( File stopwords ) {
  -		stoptable = WordlistLoader.getWordtable( stopwords );
  +		stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
   	}
   
   	/**
   	 * Builds an exclusionlist from an array of Strings.
   	 */
   	public void setStemExclusionTable( String[] exclusionlist ) {
  -		excltable = StopFilter.makeStopTable( exclusionlist );
  +		excltable = StopFilter.makeStopSet( exclusionlist );
   	}
   	/**
   	 * Builds an exclusionlist from a Hashtable.
   	 */
   	public void setStemExclusionTable( Hashtable exclusionlist ) {
  -		excltable = exclusionlist;
  +		excltable = new HashSet(exclusionlist.keySet());
   	}
   	/**
   	 * Builds an exclusionlist from the words contained in the given file.
   	 */
   	public void setStemExclusionTable( File exclusionlist ) {
  -		excltable = WordlistLoader.getWordtable( exclusionlist );
  +		excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
   	}
   
   	/**
  
  
  
  1.5       +9 -1      jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
  
  Index: BrazilianStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- BrazilianStemFilter.java	22 Jan 2004 20:54:46 -0000	1.4
  +++ BrazilianStemFilter.java	11 Mar 2004 03:05:36 -0000	1.5
  @@ -59,6 +59,7 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Based on (copied) the GermanStemFilter
  @@ -79,7 +80,7 @@
   	 */
   	private Token token = null;
   	private BrazilianStemmer stemmer = null;
  -	private Hashtable exclusions = null;
  +	private HashSet exclusions = null;
   
   	public BrazilianStemFilter( TokenStream in ) {
       super(in);
  @@ -88,8 +89,15 @@
   
   	/**
   	 * Builds a BrazilianStemFilter that uses an exclusiontable.
  +   * 
  +   * @deprecated
   	 */
   	public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) {
  +		this( in );
  +		this.exclusions = new HashSet(exclusiontable.keySet());
  +	}
  +
  +	public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) {
   		this( in );
   		this.exclusions = exclusiontable;
   	}
  
  
  
  1.3       +5 -4      jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
  
  Index: CJKAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- CJKAnalyzer.java	22 Jan 2004 20:54:47 -0000	1.2
  +++ CJKAnalyzer.java	11 Mar 2004 03:05:36 -0000	1.3
  @@ -63,6 +63,7 @@
   import java.io.Reader;
   
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   
   /**
  @@ -91,7 +92,7 @@
       //~ Instance fields --------------------------------------------------------
   
       /** stop word list */
  -    private Hashtable stopTable;
  +    private HashSet stopTable;
   
       //~ Constructors -----------------------------------------------------------
   
  @@ -99,7 +100,7 @@
        * Builds an analyzer which removes words in STOP_WORDS.
        */
       public CJKAnalyzer() {
  -        stopTable = StopFilter.makeStopTable(stopWords);
  +        stopTable = StopFilter.makeStopSet(stopWords);
       }
   
       /**
  @@ -108,7 +109,7 @@
        * @param stopWords stop word array
        */
       public CJKAnalyzer(String[] stopWords) {
  -        stopTable = StopFilter.makeStopTable(stopWords);
  +        stopTable = StopFilter.makeStopSet(stopWords);
       }
   
       //~ Methods ----------------------------------------------------------------
  
  
  
  1.3       +14 -7     jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
  
  Index: CzechAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- CzechAnalyzer.java	22 Jan 2004 20:54:47 -0000	1.2
  +++ CzechAnalyzer.java	11 Mar 2004 03:05:36 -0000	1.3
  @@ -64,6 +64,7 @@
   
   import java.io.*;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Analyzer for Czech language. Supports an external list of stopwords (words that
  @@ -102,26 +103,32 @@
   	/**
   	 * Contains the stopwords used with the StopFilter.
   	 */
  -	private Hashtable stoptable = new Hashtable();
  +	private HashSet stoptable;
   
   	/**
   	 * Builds an analyzer.
   	 */
   	public CzechAnalyzer() {
  -		stoptable = StopFilter.makeStopTable( STOP_WORDS );
  +		stoptable = StopFilter.makeStopSet( STOP_WORDS );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public CzechAnalyzer( String[] stopwords ) {
  -		stoptable = StopFilter.makeStopTable( stopwords );
  +		stoptable = StopFilter.makeStopSet( stopwords );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
  +   *
  +   * @deprecated
   	 */
   	public CzechAnalyzer( Hashtable stopwords ) {
  +		stoptable = new HashSet(stopwords.keySet());
  +	}
  +
  +	public CzechAnalyzer( HashSet stopwords ) {
   		stoptable = stopwords;
   	}
   
  @@ -129,7 +136,7 @@
   	 * Builds an analyzer with the given stop words.
   	 */
   	public CzechAnalyzer( File stopwords ) {
  -		stoptable = WordlistLoader.getWordtable( stopwords );
  +		stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
   	}
   
       /**
  @@ -139,12 +146,12 @@
        */
       public void loadStopWords( InputStream wordfile, String encoding ) {
           if ( wordfile == null ) {
  -            stoptable = new Hashtable();
  +            stoptable = new HashSet();
               return;
           }
           try {
               // clear any previous table (if present)
  -            stoptable = new Hashtable();
  +            stoptable = new HashSet();
   
               InputStreamReader isr;
               if (encoding == null)
  @@ -156,7 +163,7 @@
               LineNumberReader lnr = new LineNumberReader(isr);
               String word;
               while ( ( word = lnr.readLine() ) != null ) {
  -                stoptable.put(word, word);
  +                stoptable.add(word);
               }
   
           } catch ( IOException e ) {
  
  
  
  1.4       +14 -10    jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
  
  Index: FrenchAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FrenchAnalyzer.java	23 Jan 2004 12:49:34 -0000	1.3
  +++ FrenchAnalyzer.java	11 Mar 2004 03:05:36 -0000	1.4
  @@ -63,6 +63,8 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.HashSet;
  +
   import org.apache.lucene.analysis.de.WordlistLoader;
   
   /**
  @@ -108,57 +110,59 @@
   	/**
   	 * Contains the stopwords used with the StopFilter.
   	 */
  -	private Hashtable stoptable = new Hashtable();
  +	private HashSet stoptable = new HashSet();
   	/**
   	 * Contains words that should be indexed but not stemmed.
   	 */
  -	private Hashtable excltable = new Hashtable();
  +	private HashSet excltable = new HashSet();
   
   	/**
   	 * Builds an analyzer.
   	 */
   	public FrenchAnalyzer() {
  -		stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
  +		stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public FrenchAnalyzer( String[] stopwords ) {
  -		stoptable = StopFilter.makeStopTable( stopwords );
  +		stoptable = StopFilter.makeStopSet( stopwords );
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
  +   *
  +   * @deprecated
   	 */
   	public FrenchAnalyzer( Hashtable stopwords ) {
  -		stoptable = stopwords;
  +		stoptable = new HashSet(stopwords.keySet());
   	}
   
   	/**
   	 * Builds an analyzer with the given stop words.
   	 */
   	public FrenchAnalyzer( File stopwords ) {
  -		stoptable = WordlistLoader.getWordtable( stopwords );
  +		stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
   	}
   
   	/**
   	 * Builds an exclusionlist from an array of Strings.
   	 */
   	public void setStemExclusionTable( String[] exclusionlist ) {
  -		excltable = StopFilter.makeStopTable( exclusionlist );
  +		excltable = StopFilter.makeStopSet( exclusionlist );
   	}
   	/**
   	 * Builds an exclusionlist from a Hashtable.
   	 */
   	public void setStemExclusionTable( Hashtable exclusionlist ) {
  -		excltable = exclusionlist;
  +		excltable = new HashSet(exclusionlist.keySet());
   	}
   	/**
   	 * Builds an exclusionlist from the words contained in the given file.
   	 */
   	public void setStemExclusionTable( File exclusionlist ) {
  -		excltable = WordlistLoader.getWordtable( exclusionlist );
  +		excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
   	}
   
   	/**
  
  
  
  1.3       +10 -2     jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
  
  Index: FrenchStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FrenchStemFilter.java	22 Jan 2004 20:54:47 -0000	1.2
  +++ FrenchStemFilter.java	11 Mar 2004 03:05:36 -0000	1.3
  @@ -59,6 +59,7 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * A filter that stemms french words. It supports a table of words that should
  @@ -74,7 +75,7 @@
   	 */
   	private Token token = null;
   	private FrenchStemmer stemmer = null;
  -	private Hashtable exclusions = null;
  +	private HashSet exclusions = null;
   
   	public FrenchStemFilter( TokenStream in ) {
       super(in);
  @@ -83,9 +84,16 @@
   
   	/**
   	 * Builds a FrenchStemFilter that uses an exclusiontable.
  +   *
  +   * @deprecated
   	 */
   	public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
   		this( in );
  +		exclusions = new HashSet(exclusiontable.keySet());
  +	}
  +
  +	public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) {
  +		this( in );
   		exclusions = exclusiontable;
   	}
   
  @@ -122,7 +130,7 @@
   	 * Set an alternative exclusion list for this filter.
   	 */
   	public void setExclusionTable( Hashtable exclusiontable ) {
  -		exclusions = exclusiontable;
  +		exclusions = new HashSet(exclusiontable.keySet());
   	}
   }
   
  
  
  
  1.2       +127 -138  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
  
  Index: DutchAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchAnalyzer.java	9 Mar 2004 14:55:08 -0000	1.1
  +++ DutchAnalyzer.java	11 Mar 2004 03:05:36 -0000	1.2
  @@ -21,148 +21,137 @@
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.standard.StandardFilter;
   import org.apache.lucene.analysis.standard.StandardTokenizer;
  -import org.apache.lucene.analysis.Token;
  +
   import java.io.File;
  -import java.io.*;
   import java.io.Reader;
  -import java.util.Hashtable;
  +import java.util.HashMap;
  +import java.util.HashSet;
   
   /**
  - *
    * @author Edwin de Jonge
  - *
  - * Analyzer for Dutch language. Supports an external list of stopwords (words that
  - * will not be indexed at all), an external list of exclusions (word that will
  - * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
  - * the algorithm (dictionary stemming).
  - * A default set of stopwords is used unless an alternative list is specified, the
  - * exclusion list is empty by default.
  - * As start for the Analyzer the German Analyzer was used. The stemming algorithm
  - * implemented can be found at @link
  + *         <p/>
  + *         Analyzer for Dutch language. Supports an external list of stopwords (words that
  + *         will not be indexed at all), an external list of exclusions (word that will
  + *         not be stemmed, but indexed) and an external list of word-stem pairs that overrule
  + *         the algorithm (dictionary stemming).
  + *         A default set of stopwords is used unless an alternative list is specified, the
  + *         exclusion list is empty by default.
  + *         As start for the Analyzer the German Analyzer was used. The stemming algorithm
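  + *         implemented can be found at <a href="http://snowball.tartarus.org/dutch/stemmer.html">http://snowball.tartarus.org/dutch/stemmer.html</a>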
    */
  -public class DutchAnalyzer extends Analyzer
  -{
  -	/**
  -	 * List of typical Dutch stopwords.
  -	 */
  -	private String[] DUTCH_STOP_WORDS =
  -	{
  -	   "de","en","van","ik","te","dat","die","in","een",
  -	   "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
  -	   "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
  -	   "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
  -	   "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
  -	   "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
  -	   "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
  -	   "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
  -	   "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
  -	   "uw","iemand","geweest","andere"
  -	};
  -
  -
  -	/**
  -	 * Contains the stopwords used with the StopFilter.
  -	 */
  -	private Hashtable stoptable = new Hashtable();
  -
  -	/**
  -	 * Contains words that should be indexed but not stemmed.
  -	 */
  -	private Hashtable excltable = new Hashtable();
  -
  -	private Hashtable _stemdict = new Hashtable();
  -
  -
  -	/**
  -	 * Builds an analyzer.
  -	 */
  -	public DutchAnalyzer()
  -	{
  -		stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS );
  -		_stemdict.put("fiets","fiets"); //otherwise fiet
  -		_stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet
  -		_stemdict.put("ei","eier");
  -		_stemdict.put("kind","kinder");
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 *
  -	 * @param stopwords
  -	 */
  -	public DutchAnalyzer( String[] stopwords )
  -	{
  -		stoptable = StopFilter.makeStopTable( stopwords );
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 *
  -	 * @param stopwords
  -	 */
  -	public DutchAnalyzer( Hashtable stopwords )
  -	{
  -		stoptable = stopwords;
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 *
  -	 *  @param stopwords
  -	 */
  -	public DutchAnalyzer( File stopwords )
  -	{
  -		stoptable = WordlistLoader.getWordtable( stopwords );
  -	}
  -
  -	/**
  -	 * Builds an exclusionlist from an array of Strings.
  -	 *
  -	 * @param exclusionlist
  -	 */
  -	public void setStemExclusionTable( String[] exclusionlist )
  -	{
  -		excltable = StopFilter.makeStopTable( exclusionlist );
  -	}
  -
  -	/**
  -	 * Builds an exclusionlist from a Hashtable.
  -	 */
  -	public void setStemExclusionTable( Hashtable exclusionlist )
  -	{
  -		excltable = exclusionlist;
  -	}
  -
  -	/**
  -	 * Builds an exclusionlist from the words contained in the given file.
  -	 */
  -	public void setStemExclusionTable(File exclusionlist)
  -	{
  -		excltable = WordlistLoader.getWordtable(exclusionlist);
  -	}
  -
  -	/**
  -	 * Reads a stemdictionary file , that overrules the stemming algorithm
  -	 * This is a textfile that contains per line
  -	 * word\tstem
  -	 * i.e: tabseperated
  -	 */
  -	public void setStemDictionary(File stemdict)
  -	{
  -		_stemdict = WordlistLoader.getStemDict(stemdict);
  -	}
  -
  -	/**
  -	 * Creates a TokenStream which tokenizes all the text in the provided TextReader.
  -	 *
  -	 * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
  -	 */
  -	public TokenStream tokenStream(String fieldName, Reader reader)
  -	{
  -		TokenStream result = new StandardTokenizer( reader );
  -		result = new StandardFilter( result );
  -		result = new StopFilter( result, stoptable );
  -		result = new DutchStemFilter( result, excltable, _stemdict);
  -		return result;
  -	}
  +public class DutchAnalyzer extends Analyzer {
  +  /**
  +   * List of typical Dutch stopwords.
  +   */
  +  private String[] DUTCH_STOP_WORDS =
  +      {
  +        "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
  +        "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
  +        "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
  +        "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
  +        "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
  +        "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
  +        "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
  +        "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
  +        "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
  +        "uw", "iemand", "geweest", "andere"
  +      };
  +
  +
  +  /**
  +   * Contains the stopwords used with the StopFilter.
  +   */
  +  private HashSet stoptable = new HashSet();
  +
  +  /**
  +   * Contains words that should be indexed but not stemmed.
  +   */
  +  private HashSet excltable = new HashSet();
  +
  +  private HashMap _stemdict = new HashMap();
  +
  +
  +  /**
  +   * Builds an analyzer.
  +   */
  +  public DutchAnalyzer() {
  +    stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
  +    _stemdict.put("fiets", "fiets"); //otherwise fiet
  +    _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
  +    _stemdict.put("ei", "eier");
  +    _stemdict.put("kind", "kinder");
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(String[] stopwords) {
  +    stoptable = StopFilter.makeStopSet(stopwords);
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(HashSet stopwords) {
  +    stoptable = stopwords;
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(File stopwords) {
  +    stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from an array of Strings.
  +   *
  +   * @param exclusionlist
  +   */
  +  public void setStemExclusionTable(String[] exclusionlist) {
  +    excltable = StopFilter.makeStopSet(exclusionlist);
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from a HashSet.
  +   */
  +  public void setStemExclusionTable(HashSet exclusionlist) {
  +    excltable = exclusionlist;
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from the words contained in the given file.
  +   */
  +  public void setStemExclusionTable(File exclusionlist) {
  +    excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
  +  }
  +
  +  /**
  +   * Reads a stem dictionary file that overrules the stemming algorithm.
  +   * This is a text file that contains, per line,
  +   * word\tstem
  +   * i.e. tab-separated.
  +   */
  +  public void setStemDictionary(File stemdict) {
  +    _stemdict = WordlistLoader.getStemDict(stemdict);
  +  }
  +
  +  /**
  +   * Creates a TokenStream which tokenizes all the text in the provided Reader.
  +   *
  +   * @return A TokenStream built from a StandardTokenizer filtered with StandardFilter, StopFilter, DutchStemFilter
  +   */
  +  public TokenStream tokenStream(String fieldName, Reader reader) {
  +    TokenStream result = new StandardTokenizer(reader);
  +    result = new StandardFilter(result);
  +    result = new StopFilter(result, stoptable);
  +    result = new DutchStemFilter(result, excltable, _stemdict);
  +    return result;
  +  }
   }
  
  
  
  1.2       +82 -96    jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
  
  Index: DutchStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchStemFilter.java	9 Mar 2004 14:55:08 -0000	1.1
  +++ DutchStemFilter.java	11 Mar 2004 03:05:36 -0000	1.2
  @@ -19,105 +19,91 @@
   import org.apache.lucene.analysis.Token;
   import org.apache.lucene.analysis.TokenFilter;
   import org.apache.lucene.analysis.TokenStream;
  +
   import java.io.IOException;
  -import java.util.Hashtable;
  +import java.util.HashMap;
  +import java.util.HashSet;
   
   /**
  - *
    * @author Edwin de Jonge
  - *
  - * A filter that stems Dutch words. It supports a table of words that should
  - * not be stemmed at all. The stemmer used can be changed at runtime after the
  - * filter object is created (as long as it is a DutchStemmer).
  + *         <p/>
  + *         A filter that stems Dutch words. It supports a table of words that should
  + *         not be stemmed at all. The stemmer used can be changed at runtime after the
  + *         filter object is created (as long as it is a DutchStemmer).
    */
  -public final class DutchStemFilter extends TokenFilter
  -{
  -	/**
  -	 * The actual token in the input stream.
  -	 */
  -	private Token token = null;
  -	private DutchStemmer stemmer = null;
  -	private Hashtable exclusions = null;
  -
  -	public DutchStemFilter( TokenStream _in )
  -	{
  -		super(_in);
  -		stemmer = new DutchStemmer();
  -	}
  -
  -	/**
  -	 * Builds a DutchStemFilter that uses an exclusiontable.
  -	 */
  -	public DutchStemFilter( TokenStream _in, Hashtable exclusiontable )
  -	{
  -		this(_in);
  -		exclusions = exclusiontable;
  -	}
  -
  -	/**
  -	 * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
  -	 */
  -	public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary)
  -	{
  -		this(_in, exclusiontable);
  -		stemmer.setStemDictionary(stemdictionary);
  -	}
  -
  -	/**
  -	 * @return Returns the next token in the stream, or null at EOS
  -	 */
  -	public Token next() throws IOException
  -
  -	{
  -		if ( ( token = input.next() ) == null )
  -		{
  -			return null;
  -		}
  -
  -		// Check the exclusiontable
  -		else if ( exclusions != null && exclusions.contains( token.termText() ) )
  -		{
  -			return token;
  -		}
  -		else
  -		{
  -			String s = stemmer.stem( token.termText() );
  -			// If not stemmed, dont waste the time creating a new token
  -			if ( !s.equals( token.termText() ) )
  -			{
  -				return new Token( s, token.startOffset(),
  -					token.endOffset(), token.type() );
  -			}
  -			return token;
  -		}
  -	}
  -
  -	/**
  -	 * Set a alternative/custom DutchStemmer for this filter.
  -	 */
  -	public void setStemmer( DutchStemmer stemmer )
  -	{
  -		if ( stemmer != null )
  -		{
  -			this.stemmer = stemmer;
  -		}
  -	}
  -
  -	/**
  -	 * Set an alternative exclusion list for this filter.
  -	 */
  -	public void setExclusionTable( Hashtable exclusiontable )
  -	{
  -		exclusions = exclusiontable;
  -	}
  -
  -	/**
  -	 * Set dictionary for stemming, this dictionary overrules the algorithm,
  -	 * so you can correct for a particular unwanted word-stem pair.
  -	 */
  -	public void setStemDictionary(Hashtable dict)
  -	{
  -		if (stemmer != null)
  -			stemmer.setStemDictionary(dict);
  -	}
  +public final class DutchStemFilter extends TokenFilter {
  +  /**
  +   * The actual token in the input stream.
  +   */
  +  private Token token = null;
  +  private DutchStemmer stemmer = null;
  +  private HashSet exclusions = null;
  +
  +  public DutchStemFilter(TokenStream _in) {
  +    super(_in);
  +    stemmer = new DutchStemmer();
  +  }
  +
  +  /**
  +   * Builds a DutchStemFilter that uses an exclusiontable.
  +   */
  +  public DutchStemFilter(TokenStream _in, HashSet exclusiontable) {
  +    this(_in);
  +    exclusions = exclusiontable;
  +  }
  +
  +  /**
  +   * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
  +   */
  +  public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) {
  +    this(_in, exclusiontable);
  +    stemmer.setStemDictionary(stemdictionary);
  +  }
  +
  +  /**
  +   * @return Returns the next token in the stream, or null at EOS
  +   */
  +  public Token next() throws IOException {
  +    if ((token = input.next()) == null) {
  +      return null;
  +    }
  +
  +    // Check the exclusiontable
  +    else if (exclusions != null && exclusions.contains(token.termText())) {
  +      return token;
  +    } else {
  +      String s = stemmer.stem(token.termText());
  +      // If not stemmed, don't waste time creating a new token
  +      if (!s.equals(token.termText())) {
  +        return new Token(s, token.startOffset(),
  +            token.endOffset(), token.type());
  +      }
  +      return token;
  +    }
  +  }
  +
  +  /**
  +   * Set an alternative/custom DutchStemmer for this filter.
  +   */
  +  public void setStemmer(DutchStemmer stemmer) {
  +    if (stemmer != null) {
  +      this.stemmer = stemmer;
  +    }
  +  }
  +
  +  /**
  +   * Set an alternative exclusion list for this filter.
  +   */
  +  public void setExclusionTable(HashSet exclusiontable) {
  +    exclusions = exclusiontable;
  +  }
  +
  +  /**
  +   * Set dictionary for stemming, this dictionary overrules the algorithm,
  +   * so you can correct for a particular unwanted word-stem pair.
  +   */
  +  public void setStemDictionary(HashMap dict) {
  +    if (stemmer != null)
  +      stemmer.setStemDictionary(dict);
  +  }
   }
  
  
  
  1.2       +379 -425  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
  
  Index: DutchStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchStemmer.java	9 Mar 2004 14:55:08 -0000	1.1
  +++ DutchStemmer.java	11 Mar 2004 03:05:36 -0000	1.2
  @@ -16,9 +16,8 @@
    * limitations under the License.
    */
   
  -import java.util.Hashtable;
  -import java.util.ArrayList;
  -import java.io.*;
  +import java.util.HashMap;
  +
   /*
    * @author Edwin de Jonge (ejne@cbs.nl)
    *
  @@ -26,427 +25,382 @@
    * the <see cref="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
    *  algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
    */
  -public class DutchStemmer
  -{
  -	/**
  -	 * Buffer for the terms while stemming them.
  -	 */
  -	private StringBuffer sb = new StringBuffer();
  -	private boolean _removedE;
  -	private Hashtable _stemDict;
  -
  -	private int _R1;
  -	private int _R2;
  -
  -	//TODO convert to internal
  -	/*
  -	 * Stemms the given term to an unique <tt>discriminator</tt>.
  -	 *
  -	 * @param term The term that should be stemmed.
  -	 * @return Discriminator for <tt>term</tt>
  -	 */
  -	public String stem( String term )
  -	{
  -		term = term.toLowerCase();
  -		if ( !isStemmable( term ) )
  -			return term;
  -		if (_stemDict != null && _stemDict.contains(term))
  -			if (_stemDict.get(term) instanceof String)
  -				return (String)_stemDict.get(term);
  -			else return null;
  -
  -		// Reset the StringBuffer.
  -		sb.delete(0, sb.length());
  -		sb.insert(0, term);
  -		// Stemming starts here...
  -		substitute(sb);
  -		storeYandI(sb);
  -		_R1 = getRIndex(sb, 0);
  -		_R1 = Math.max(3,_R1);
  -		step1(sb);
  -		step2(sb);
  -		_R2 = getRIndex(sb, _R1);
  -		step3a(sb);
  -		step3b(sb);
  -		step4(sb);
  -		reStoreYandI(sb);
  -		return sb.toString();
  -	}
  -
  -	private boolean enEnding(StringBuffer sb)
  -	{
  -		String[] enend = new String[]{"ene","en"};
  -		for (int i = 0; i < enend.length; i++)
  -		{
  -			String end = enend[i];
  -			String s = sb.toString();
  -			int index = s.length() - end.length();
  -			if ( s.endsWith(end) &&
  -				  index >= _R1 &&
  -				  isValidEnEnding(sb,index-1)
  -				)
  -			{
  -				sb.delete(index, index + end.length());
  -				unDouble(sb,index);
  -				return true;
  -			}
  -		}
  -		return false;
  -	}
  -
  -
  -	private void step1(StringBuffer sb)
  -	{
  -		if (_R1 >= sb.length())
  -			return;
  -
  -		String s = sb.toString();
  -		int lengthR1 = sb.length() - _R1;
  -		int index;
  -
  -		if (s.endsWith("heden"))
  -		{
  -			sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
  -			return;
  -		}
  -
  -		if (enEnding(sb))
  -			return;
  -
  -		if (s.endsWith("se")              &&
  -			 (index = s.length() - 2) >= _R1  &&
  -			 isValidSEnding(sb, index -1)
  -			)
  -		{
  -			sb.delete(index, index + 2);
  -			return;
  -		}
  -		if (s.endsWith("s") &&
  -			(index = s.length() - 1) >= _R1  &&
  -			isValidSEnding(sb, index - 1))
  -		{
  -			sb.delete(index, index + 1);
  -		}
  -	}
  -
  -	/**
  -	 * Delete suffix e if in R1 and
  -	 * preceded by a non-vowel, and then undouble the ending
  -	 *
  -	 * @param sb String being stemmed
  -	 */
  -	private void step2(StringBuffer sb)
  -	{
  -		_removedE = false;
  -		if (_R1 >= sb.length())
  -			return;
  -		String s = sb.toString();
  -		int index = s.length() - 1;
  -		if ( index >= _R1   &&
  -			 s.endsWith("e") &&
  -			 !isVowel(sb.charAt(index-1)))
  -		{
  -			sb.delete(index, index + 1);
  -			unDouble(sb);
  -			_removedE = true;
  -		}
  -	}
  -
  -	/**
  -	 * Delete "heid"
  -	 *
  -	 * @param sb String being stemmed
  -	 */
  -	private void step3a(StringBuffer sb)
  -	{
  -		if (_R2 >= sb.length())
  -			return;
  -		String s = sb.toString();
  -		int index = s.length() - 4;
  -		if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c')
  -		{
  -			sb.delete(index, index + 4); //remove heid
  -			enEnding(sb);
  -		}
  -	}
  -
  -	/**
  -	 *  <p>A d-suffix, or derivational suffix, enables a new word,
  -	 *  often with a different grammatical category, or with a different
  -	 *  sense, to be built from another word. Whether a d-suffix can be
  -	 *  attached is discovered not from the rules of grammar, but by
  -	 *  referring to a dictionary. So in English, ness can be added to
  -	 *  certain adjectives to form corresponding nouns (littleness,
  -	 *  kindness, foolishness ...) but not to all adjectives
  - 	 *  (not for example, to big, cruel, wise ...) d-suffixes can be
  -	 *  used to change meaning, often in rather exotic ways.</p>
  -	 *  Remove "ing", "end", "ig", "lijk", "baar" and "bar"
  -	 *
  -	 * @param sb String being stemmed
  -	 */
  -	private void step3b(StringBuffer sb)
  -	{
  -		if (_R2 >= sb.length())
  -			return;
  -		String s = sb.toString();
  -		int index;
  -
  -		if ((s.endsWith("end") || s.endsWith("ing")) &&
  -  		 	(index = s.length() - 3) >= _R2)
  -		{
  -			sb.delete(index, index + 3);
  -			if (sb.charAt(index - 2) == 'i' &&
  -				sb.charAt(index - 1) == 'g')
  -			{
  -				if (sb.charAt(index - 3) != 'e' & index-2 >= _R2)
  -				{
  -					index -= 2;
  -					sb.delete(index, index + 2);
  -				}
  -			}
  -			else
  -			{
  -				unDouble(sb,index);
  -			}
  -			return;
  -		}
  -		if ( s.endsWith("ig")    &&
  -			  (index = s.length() - 2) >= _R2
  -			)
  -		{
  -			if (sb.charAt(index - 1) != 'e')
  -				sb.delete(index, index + 2);
  -			return;
  -		}
  -		if (s.endsWith("lijk") &&
  -			 (index = s.length() - 4) >= _R2
  -			)
  -		{
  -			sb.delete(index, index + 4);
  -			step2(sb);
  -			return;
  -		}
  -		if (s.endsWith("baar") &&
  -			(index = s.length() - 4) >= _R2
  -			)
  -		{
  -			sb.delete(index, index + 4);
  -			return;
  -		}
  -		if (s.endsWith("bar")  &&
  -			 (index = s.length() - 3) >= _R2
  -			)
  -		{
  -			if (_removedE)
  -				sb.delete(index, index + 3);
  -			return;
  -		}
  -	}
  -
  -	/**
  -	 * undouble vowel
  -	 * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
  -	 *
  -	 * @param sb String being stemmed
  -	 */
  -	private void step4(StringBuffer sb)
  -	{
  -		if (sb.length() < 4)
  -			return;
  -		String end = sb.substring(sb.length() - 4, sb.length());
  -		char c = end.charAt(0);
  -		char v1 = end.charAt(1);
  -		char v2 = end.charAt(2);
  -		char d = end.charAt(3);
  -		if (v1 == v2    &&
  -			 d != 'I'    &&
  -			 v1 != 'i'    &&
  -			 isVowel(v1) &&
  -			!isVowel(d)  &&
  -			!isVowel(c))
  -		{
  -			sb.delete(sb.length() - 2, sb.length() - 1);
  -		}
  -	}
  -
  -	/**
  -	 * Checks if a term could be stemmed.
  -	 *
  -	 * @return true if, and only if, the given term consists in letters.
  -	 */
  -	private boolean isStemmable( String term )
  -	{
  -		for ( int c = 0; c < term.length(); c++ )
  -		{
  -			if ( !Character.isLetter(term.charAt(c))) return false;
  -		}
  -		return true;
  -	}
  -
  -	/**
  -	 * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
  -	 */
  -	private void substitute( StringBuffer buffer )
  -	{
  -		for ( int i = 0; i < buffer.length(); i++ )
  -		{
  -			switch (buffer.charAt(i))
  -			{
  -				case 'ä':
  -				case 'á':
  -				{
  -					buffer.setCharAt(i, 'a');
  -					break;
  -				}
  -				case 'ë':
  -				case 'é':
  -				{
  -					buffer.setCharAt(i, 'e');
  -					break;
  -				}
  -				case 'ü':
  -				case 'ú':
  -				{
  -					buffer.setCharAt(i, 'u');
  -					break;
  -				}
  -				case 'ï':
  -				case 'i':
  -				{
  -					buffer.setCharAt(i, 'i');
  -					break;
  -				}
  -				case 'ö':
  -				case 'ó':
  -				{
  -					buffer.setCharAt(i, 'o');
  -					break;
  -				}
  -			}
  -		}
  -	}
  -
  -	private boolean isValidSEnding(StringBuffer sb)
  -	{
  -		return  isValidSEnding(sb,sb.length() - 1);
  -	}
  -
  -	private boolean isValidSEnding(StringBuffer sb, int index)
  -	{
  -		char c = sb.charAt(index);
  -		if (isVowel(c) || c == 'j')
  -			return false;
  -		return true;
  -	}
  -
  -	private boolean isValidEnEnding(StringBuffer sb)
  -	{
  -		return isValidEnEnding(sb,sb.length() - 1);
  -	}
  -
  -	private boolean isValidEnEnding(StringBuffer sb, int index)
  -	{
  -		char c = sb.charAt(index);
  -		if (isVowel(c))
  -			return false;
  -		if (c < 3)
  -			return false;
  -		// ends with "gem"?
  -		if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e')
  -			return false;
  -		return true;
  -	}
  -
  -	private void unDouble(StringBuffer sb)
  -	{
  -		unDouble(sb, sb.length());
  -	}
  -
  -	private void unDouble(StringBuffer sb, int endIndex)
  -	{
  -		String s = sb.substring(0, endIndex);
  -		if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn")|| s.endsWith("mm")|| s.endsWith("ff"))
  -		{
  -			sb.delete(endIndex-1, endIndex);
  -		}
  -	}
  -
  -	private int getRIndex(StringBuffer sb, int start)
  -	{
  -		if (start == 0)
  -			start = 1;
  -		int i = start;
  -		for (; i < sb.length(); i++)
  -		{
  -			//first non-vowel preceded by a vowel
  -			if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i-1)))
  -			{
  -				return i + 1;
  -			}
  -		}
  -		return i + 1;
  -	}
  -
  -	private void storeYandI(StringBuffer sb)
  -	{
  -		if (sb.charAt(0) == 'y')
  -			sb.setCharAt(0, 'Y');
  -
  -		char c;
  -		int last = sb.length() - 1;
  -
  -		for (int i = 1; i < last; i++)
  -		{
  -			switch (sb.charAt(i))
  -			{
  -				case 'i':
  -				{
  -					if (isVowel(sb.charAt(i-1)) &&
  -						isVowel(sb.charAt(i+1))
  -						)
  -						sb.setCharAt(i, 'I');
  -					break;
  -				}
  -				case 'y':
  -				{
  -					if (isVowel(sb.charAt(i-1)))
  -						sb.setCharAt(i, 'Y');
  -					break;
  -				}
  -			}
  -		}
  -		if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1)))
  -			sb.setCharAt(last, 'Y');
  -	}
  -
  -	private void reStoreYandI(StringBuffer sb)
  -	{
  -		String tmp = sb.toString();
  -		sb.delete(0, sb.length());
  -		sb.insert(0, tmp.replaceAll("I","i").replaceAll("Y","y"));
  -	}
  -
  -	private boolean isVowel(char c)
  -	{
  -		switch (c)
  -		{
  -			case 'e':
  -			case 'a':
  -			case 'o':
  -			case 'i':
  -			case 'u':
  -			case 'y':
  -			case 'è':
  -			{
  -				return true;
  -			}
  -		}
  -		return false;
  -	}
  -
  -	void setStemDictionary(Hashtable dict)
  -	{
  -		_stemDict = dict;
  -	}
  +
  +public class DutchStemmer {
  +  /**
  +   * Buffer for the terms while stemming them.
  +   */
  +  private StringBuffer sb = new StringBuffer();
  +  private boolean _removedE;
  +  private HashMap _stemDict;
  +
  +  private int _R1;
  +  private int _R2;
  +
  +  //TODO convert to internal
  +  /*
  +   * Stemms the given term to an unique <tt>discriminator</tt>.
  +   *
  +   * @param term The term that should be stemmed.
  +   * @return Discriminator for <tt>term</tt>
  +   */
  +  public String stem(String term) {
  +    term = term.toLowerCase();
  +    if (!isStemmable(term))
  +      return term;
  +    if (_stemDict != null && _stemDict.containsKey(term))
  +      if (_stemDict.get(term) instanceof String)
  +        return (String) _stemDict.get(term);
  +      else
  +        return null;
  +
  +    // Reset the StringBuffer.
  +    sb.delete(0, sb.length());
  +    sb.insert(0, term);
  +    // Stemming starts here...
  +    substitute(sb);
  +    storeYandI(sb);
  +    _R1 = getRIndex(sb, 0);
  +    _R1 = Math.max(3, _R1);
  +    step1(sb);
  +    step2(sb);
  +    _R2 = getRIndex(sb, _R1);
  +    step3a(sb);
  +    step3b(sb);
  +    step4(sb);
  +    reStoreYandI(sb);
  +    return sb.toString();
  +  }
  +
  +  private boolean enEnding(StringBuffer sb) {
  +    String[] enend = new String[]{"ene", "en"};
  +    for (int i = 0; i < enend.length; i++) {
  +      String end = enend[i];
  +      String s = sb.toString();
  +      int index = s.length() - end.length();
  +      if (s.endsWith(end) &&
  +          index >= _R1 &&
  +          isValidEnEnding(sb, index - 1)
  +      ) {
  +        sb.delete(index, index + end.length());
  +        unDouble(sb, index);
  +        return true;
  +      }
  +    }
  +    return false;
  +  }
  +
  +
  +  private void step1(StringBuffer sb) {
  +    if (_R1 >= sb.length())
  +      return;
  +
  +    String s = sb.toString();
  +    int lengthR1 = sb.length() - _R1;
  +    int index;
  +
  +    if (s.endsWith("heden")) {
  +      sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
  +      return;
  +    }
  +
  +    if (enEnding(sb))
  +      return;
  +
  +    if (s.endsWith("se") &&
  +        (index = s.length() - 2) >= _R1 &&
  +        isValidSEnding(sb, index - 1)
  +    ) {
  +      sb.delete(index, index + 2);
  +      return;
  +    }
  +    if (s.endsWith("s") &&
  +        (index = s.length() - 1) >= _R1 &&
  +        isValidSEnding(sb, index - 1)) {
  +      sb.delete(index, index + 1);
  +    }
  +  }
  +
  +  /**
  +   * Delete suffix e if in R1 and
  +   * preceded by a non-vowel, and then undouble the ending
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step2(StringBuffer sb) {
  +    _removedE = false;
  +    if (_R1 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = s.length() - 1;
  +    if (index >= _R1 &&
  +        s.endsWith("e") &&
  +        !isVowel(sb.charAt(index - 1))) {
  +      sb.delete(index, index + 1);
  +      unDouble(sb);
  +      _removedE = true;
  +    }
  +  }
  +
  +  /**
  +   * Delete "heid"
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step3a(StringBuffer sb) {
  +    if (_R2 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = s.length() - 4;
  +    if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
  +      sb.delete(index, index + 4); //remove heid
  +      enEnding(sb);
  +    }
  +  }
  +
  +  /**
  +   * <p>A d-suffix, or derivational suffix, enables a new word,
  +   * often with a different grammatical category, or with a different
  +   * sense, to be built from another word. Whether a d-suffix can be
  +   * attached is discovered not from the rules of grammar, but by
  +   * referring to a dictionary. So in English, ness can be added to
  +   * certain adjectives to form corresponding nouns (littleness,
  +   * kindness, foolishness ...) but not to all adjectives
  +   * (not for example, to big, cruel, wise ...) d-suffixes can be
  +   * used to change meaning, often in rather exotic ways.</p>
  +   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step3b(StringBuffer sb) {
  +    if (_R2 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = 0;
  +
  +    if ((s.endsWith("end") || s.endsWith("ing")) &&
  +        (index = s.length() - 3) >= _R2) {
  +      sb.delete(index, index + 3);
  +      if (sb.charAt(index - 2) == 'i' &&
  +          sb.charAt(index - 1) == 'g') {
  +        if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
  +          index -= 2;
  +          sb.delete(index, index + 2);
  +        }
  +      } else {
  +        unDouble(sb, index);
  +      }
  +      return;
  +    }
  +    if (s.endsWith("ig") &&
  +        (index = s.length() - 2) >= _R2
  +    ) {
  +      if (sb.charAt(index - 1) != 'e')
  +        sb.delete(index, index + 2);
  +      return;
  +    }
  +    if (s.endsWith("lijk") &&
  +        (index = s.length() - 4) >= _R2
  +    ) {
  +      sb.delete(index, index + 4);
  +      step2(sb);
  +      return;
  +    }
  +    if (s.endsWith("baar") &&
  +        (index = s.length() - 4) >= _R2
  +    ) {
  +      sb.delete(index, index + 4);
  +      return;
  +    }
  +    if (s.endsWith("bar") &&
  +        (index = s.length() - 3) >= _R2
  +    ) {
  +      if (_removedE)
  +        sb.delete(index, index + 3);
  +      return;
  +    }
  +  }
  +
  +  /**
  +   * undouble vowel
  +   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step4(StringBuffer sb) {
  +    if (sb.length() < 4)
  +      return;
  +    String end = sb.substring(sb.length() - 4, sb.length());
  +    char c = end.charAt(0);
  +    char v1 = end.charAt(1);
  +    char v2 = end.charAt(2);
  +    char d = end.charAt(3);
  +    if (v1 == v2 &&
  +        d != 'I' &&
  +        v1 != 'i' &&
  +        isVowel(v1) &&
  +        !isVowel(d) &&
  +        !isVowel(c)) {
  +      sb.delete(sb.length() - 2, sb.length() - 1);
  +    }
  +  }
  +
  +  /**
  +   * Checks if a term could be stemmed.
  +   *
  +   * @return true if, and only if, the given term consists in letters.
  +   */
  +  private boolean isStemmable(String term) {
  +    for (int c = 0; c < term.length(); c++) {
  +      if (!Character.isLetter(term.charAt(c))) return false;
  +    }
  +    return true;
  +  }
  +
  +  /**
  +   * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
  +   */
  +  private void substitute(StringBuffer buffer) {
  +    for (int i = 0; i < buffer.length(); i++) {
  +      switch (buffer.charAt(i)) {
  +        case 'ä':
  +        case 'á':
  +          {
  +            buffer.setCharAt(i, 'a');
  +            break;
  +          }
  +        case 'ë':
  +        case 'é':
  +          {
  +            buffer.setCharAt(i, 'e');
  +            break;
  +          }
  +        case 'ü':
  +        case 'ú':
  +          {
  +            buffer.setCharAt(i, 'u');
  +            break;
  +          }
  +        case 'ï':
  +        case 'i':
  +          {
  +            buffer.setCharAt(i, 'i');
  +            break;
  +          }
  +        case 'ö':
  +        case 'ó':
  +          {
  +            buffer.setCharAt(i, 'o');
  +            break;
  +          }
  +      }
  +    }
  +  }
  +
  +  private boolean isValidSEnding(StringBuffer sb) {
  +    return isValidSEnding(sb, sb.length() - 1);
  +  }
  +
  +  private boolean isValidSEnding(StringBuffer sb, int index) {
  +    char c = sb.charAt(index);
  +    if (isVowel(c) || c == 'j')
  +      return false;
  +    return true;
  +  }
  +
  +  private boolean isValidEnEnding(StringBuffer sb) {
  +    return isValidEnEnding(sb, sb.length() - 1);
  +  }
  +
  +  private boolean isValidEnEnding(StringBuffer sb, int index) {
  +    char c = sb.charAt(index);
  +    if (isVowel(c))
  +      return false;
  +    if (c < 3)
  +      return false;
  +    // ends with "gem"?
  +    if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
  +      return false;
  +    return true;
  +  }
  +
  +  private void unDouble(StringBuffer sb) {
  +    unDouble(sb, sb.length());
  +  }
  +
  +  private void unDouble(StringBuffer sb, int endIndex) {
  +    String s = sb.substring(0, endIndex);
  +    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
  +      sb.delete(endIndex - 1, endIndex);
  +    }
  +  }
  +
  +  private int getRIndex(StringBuffer sb, int start) {
  +    if (start == 0)
  +      start = 1;
  +    int i = start;
  +    for (; i < sb.length(); i++) {
  +      //first non-vowel preceded by a vowel
  +      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
  +        return i + 1;
  +      }
  +    }
  +    return i + 1;
  +  }
  +
  +  private void storeYandI(StringBuffer sb) {
  +    if (sb.charAt(0) == 'y')
  +      sb.setCharAt(0, 'Y');
  +
  +    char c;
  +    int last = sb.length() - 1;
  +
  +    for (int i = 1; i < last; i++) {
  +      switch (sb.charAt(i)) {
  +        case 'i':
  +          {
  +            if (isVowel(sb.charAt(i - 1)) &&
  +                isVowel(sb.charAt(i + 1))
  +            )
  +              sb.setCharAt(i, 'I');
  +            break;
  +          }
  +        case 'y':
  +          {
  +            if (isVowel(sb.charAt(i - 1)))
  +              sb.setCharAt(i, 'Y');
  +            break;
  +          }
  +      }
  +    }
  +    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
  +      sb.setCharAt(last, 'Y');
  +  }
  +
  +  private void reStoreYandI(StringBuffer sb) {
  +    String tmp = sb.toString();
  +    sb.delete(0, sb.length());
  +    sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
  +  }
  +
  +  private boolean isVowel(char c) {
  +    switch (c) {
  +      case 'e':
  +      case 'a':
  +      case 'o':
  +      case 'i':
  +      case 'u':
  +      case 'y':
  +      case 'è':
  +        {
  +          return true;
  +        }
  +    }
  +    return false;
  +  }
  +
  +  void setStemDictionary(HashMap dict) {
  +    _stemDict = dict;
  +  }
   
   }
  
  
  
  1.2       +92 -111   jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- WordlistLoader.java	9 Mar 2004 14:55:08 -0000	1.1
  +++ WordlistLoader.java	11 Mar 2004 03:05:36 -0000	1.2
  @@ -20,123 +20,104 @@
   import java.io.FileReader;
   import java.io.IOException;
   import java.io.LineNumberReader;
  -import java.util.Hashtable;
  +import java.util.HashMap;
   
   /**
  - *
    * @author Gerhard Schwarz
  - *
  - * Loads a text file and adds every line as an entry to a Hashtable. Every line
  - * should contain only one word. If the file is not found or on any error, an
  - * empty table is returned.
  + *         <p/>
  + *         Loads a text file and adds every line as an entry to a Hashtable. Every line
  + *         should contain only one word. If the file is not found or on any error, an
  + *         empty table is returned.
    */
  -public class WordlistLoader
  -{
  -	/**
  -	 * @param path Path to the wordlist
  -	 * @param wordfile Name of the wordlist
  -	 */
  -	public static Hashtable getWordtable( String path, String wordfile )
  -	{
  -		if ( path == null || wordfile == null )
  -		{
  -			return new Hashtable();
  -		}
  -		return getWordtable(new File(path, wordfile));
  -	}
  +public class WordlistLoader {
  +  /**
  +   * @param path     Path to the wordlist
  +   * @param wordfile Name of the wordlist
  +   */
  +  public static HashMap getWordtable(String path, String wordfile) {
  +    if (path == null || wordfile == null) {
  +      return new HashMap();
  +    }
  +    return getWordtable(new File(path, wordfile));
  +  }
   
  -	/**
  -	 * @param wordfile Complete path to the wordlist
  -	 */
  -	public static Hashtable getWordtable( String wordfile )
  -	{
  -		if ( wordfile == null )
  -		{
  -			return new Hashtable();
  -		}
  -		return getWordtable( new File( wordfile ) );
  -	}
  +  /**
  +   * @param wordfile Complete path to the wordlist
  +   */
  +  public static HashMap getWordtable(String wordfile) {
  +    if (wordfile == null) {
  +      return new HashMap();
  +    }
  +    return getWordtable(new File(wordfile));
  +  }
   
  -	/**
  -	 * Reads a stemsdictionary. Each line contains:
  -     * word \t stem
  -	 * i.e. tab seperated)
  -	 *
  -	 * @return Stem dictionary that overrules, the stemming algorithm
  -	 */
  -	public static Hashtable getStemDict( File wordstemfile)
  -	{
  -		if ( wordstemfile == null )
  -		{
  -			return new Hashtable();
  -		}
  -		Hashtable result = new Hashtable();
  -		try
  -		{
  -			LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
  -			String line;
  -			String[] wordstem;
  -			while ((line = lnr.readLine()) != null)
  -			{
  -				wordstem = line.split("\t", 2);
  -				result.put(wordstem[0], wordstem[1]);
  -		   }
  -		}
  -		catch (IOException e)
  -		{}
  -		return result;
  -	}
  +  /**
  +   * Reads a stemsdictionary. Each line contains:
  +   * word \t stem
  +   * i.e. tab seperated)
  +   *
  +   * @return Stem dictionary that overrules, the stemming algorithm
  +   */
  +  public static HashMap getStemDict(File wordstemfile) {
  +    if (wordstemfile == null) {
  +      return new HashMap();
  +    }
  +    HashMap result = new HashMap();
  +    try {
  +      LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
  +      String line;
  +      String[] wordstem;
  +      while ((line = lnr.readLine()) != null) {
  +        wordstem = line.split("\t", 2);
  +        result.put(wordstem[0], wordstem[1]);
  +      }
  +    } catch (IOException e) {
  +    }
  +    return result;
  +  }
   
  -	/**
  -	 * @param wordfile File containing the wordlist
  -	 */
  -	public static Hashtable getWordtable( File wordfile )
  -	{
  -		if ( wordfile == null )
  -		{
  -			return new Hashtable();
  -		}
  -		Hashtable result = null;
  -		try
  -		{
  -			LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
  -			String word = null;
  -			String[] stopwords = new String[100];
  -			int wordcount = 0;
  -			while ( ( word = lnr.readLine() ) != null )
  -			{
  -				wordcount++;
  -				if ( wordcount == stopwords.length )
  -				{
  -					String[] tmp = new String[stopwords.length + 50];
  -					System.arraycopy( stopwords, 0, tmp, 0, wordcount );
  -					stopwords = tmp;
  -				}
  -				stopwords[wordcount-1] = word;
  -			}
  -			result = makeWordTable( stopwords, wordcount );
  -		}
  -			// On error, use an empty table
  -		catch (IOException e)
  -		{
  -			result = new Hashtable();
  -		}
  -		return result;
  -	}
  +  /**
  +   * @param wordfile File containing the wordlist
  +   */
  +  public static HashMap getWordtable(File wordfile) {
  +    if (wordfile == null) {
  +      return new HashMap();
  +    }
  +    HashMap result = null;
  +    try {
  +      LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
  +      String word = null;
  +      String[] stopwords = new String[100];
  +      int wordcount = 0;
  +      while ((word = lnr.readLine()) != null) {
  +        wordcount++;
  +        if (wordcount == stopwords.length) {
  +          String[] tmp = new String[stopwords.length + 50];
  +          System.arraycopy(stopwords, 0, tmp, 0, wordcount);
  +          stopwords = tmp;
  +        }
  +        stopwords[wordcount - 1] = word;
  +      }
  +      result = makeWordTable(stopwords, wordcount);
  +    }
  +        // On error, use an empty table
  +    catch (IOException e) {
  +      result = new HashMap();
  +    }
  +    return result;
  +  }
   
  -	/**
  -	 * Builds the wordlist table.
  -	 *
  -	 * @param words Word that where read
  -	 * @param length Amount of words that where read into <tt>words</tt>
  -	 */
  -	private static Hashtable makeWordTable( String[] words, int length )
  -	{
  -		Hashtable table = new Hashtable( length );
  -		for ( int i = 0; i < length; i++ )
  -		{
  -			table.put(words[i], words[i]);
  -		}
  -		return table;
  -	}
  +  /**
  +   * Builds the wordlist table.
  +   *
  +   * @param words  Word that where read
  +   * @param length Amount of words that where read into <tt>words</tt>
  +   */
  +  private static HashMap makeWordTable(String[] words, int length) {
  +    HashMap table = new HashMap(length);
  +    for (int i = 0; i < length; i++) {
  +      table.put(words[i], words[i]);
  +    }
  +    return table;
  +  }
   }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message