lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanAnalyzer.java GermanStemFilter.java GermanStemmer.java WordlistLoader.java
Date Sun, 18 Aug 2002 17:33:16 GMT
otis        2002/08/18 10:33:16

  Modified:    src/java/org/apache/lucene/analysis/de GermanAnalyzer.java
                        GermanStemFilter.java GermanStemmer.java
                        WordlistLoader.java
  Log:
  - Fixed some funky indentation that I found while testing the contributed
    Portuguese stemmer.
  
  Revision  Changes    Path
  1.5       +101 -91   jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- GermanAnalyzer.java	21 Jul 2002 23:00:37 -0000	1.4
  +++ GermanAnalyzer.java	18 Aug 2002 17:33:16 -0000	1.5
  @@ -74,94 +74,104 @@
    * @author    Gerhard Schwarz
    * @version   $Id$
    */
  -public class GermanAnalyzer extends Analyzer {
  -
  -	/**
  -	 * List of typical german stopwords.
  -	 */
  -	private String[] GERMAN_STOP_WORDS = {
  -		"einer", "eine", "eines", "einem", "einen",
  -		"der", "die", "das", "dass", "da",
  -		"du", "er", "sie", "es",
  -		"was", "wer", "wie", "wir",
  -		"und", "oder", "ohne", "mit",
  -		"am", "im", "in", "aus", "auf",
  -		"ist", "sein", "war", "wird",
  -		"ihr", "ihre", "ihres",
  -		"als", "für", "von", "mit",
  -		"dich", "dir", "mich", "mir",
  -		"mein", "sein", "kein",
  -		"durch", "wegen", "wird"
  -	};
  -	
  -	/**
  -	 * Contains the stopwords used with the StopFilter.
  -	 */
  -	private Hashtable stoptable = new Hashtable();
  -	/**
  -	 * Contains words that should be indexed but not stemmed.
  -	 */
  -	private Hashtable excltable = new Hashtable();
  -	
  -	/**
  -	 * Builds an analyzer.
  -	 */
  -	public GermanAnalyzer() {
  -		stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 */
  -	public GermanAnalyzer( String[] stopwords ) {
  -		stoptable = StopFilter.makeStopTable( stopwords );
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 */
  -	public GermanAnalyzer( Hashtable stopwords ) {
  -		stoptable = stopwords;
  -	}
  -
  -	/**
  -	 * Builds an analyzer with the given stop words.
  -	 */
  -	public GermanAnalyzer( File stopwords ) {
  -		stoptable = WordlistLoader.getWordtable( stopwords );
  -	}
  -
  -	/**
  -	 * Builds an exclusionlist from an array of Strings.
  -	 */
  -	public void setStemExclusionTable( String[] exclusionlist ) {
  -		excltable = StopFilter.makeStopTable( exclusionlist );
  -	}
  -	/**
  -	 * Builds an exclusionlist from a Hashtable.
  -	 */
  -	public void setStemExclusionTable( Hashtable exclusionlist ) {
  -		excltable = exclusionlist;
  -	}
  -	/**
  -	 * Builds an exclusionlist from the words contained in the given file.
  -	 */
  -	public void setStemExclusionTable( File exclusionlist ) {
  -		excltable = WordlistLoader.getWordtable( exclusionlist );
  -	}
  -	
  -	/**
  -	 * Creates a TokenStream which tokenizes all the text in the provided Reader.
  -	 *
  -	 * @return  A TokenStream build from a StandardTokenizer filtered with
  -	 * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
  -	 */
  -	public TokenStream tokenStream( String fieldName, Reader reader ) {
  -		TokenStream result = new StandardTokenizer( reader );
  -		result = new StandardFilter( result );
  -		result = new StopFilter( result, stoptable );
  -		result = new GermanStemFilter( result, excltable );
  -		return result;
  -	}
  +public class GermanAnalyzer extends Analyzer
  +{
  +    /**
  +     * List of typical german stopwords.
  +     */
  +    private String[] GERMAN_STOP_WORDS = {
  +	"einer", "eine", "eines", "einem", "einen",
  +	"der", "die", "das", "dass", "da",
  +	"du", "er", "sie", "es",
  +	"was", "wer", "wie", "wir",
  +	"und", "oder", "ohne", "mit",
  +	"am", "im", "in", "aus", "auf",
  +	"ist", "sein", "war", "wird",
  +	"ihr", "ihre", "ihres",
  +	"als", "für", "von", "mit",
  +	"dich", "dir", "mich", "mir",
  +	"mein", "sein", "kein",
  +	"durch", "wegen", "wird"
  +    };
  +
  +    /**
  +     * Contains the stopwords used with the StopFilter.
  +     */
  +    private Hashtable stoptable = new Hashtable();
  +
  +    /**
  +     * Contains words that should be indexed but not stemmed.
  +     */
  +    private Hashtable excltable = new Hashtable();
  +
  +    /**
  +     * Builds an analyzer.
  +     */
  +    public GermanAnalyzer()
  +    {
  +	stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
  +    }
  +
  +    /**
  +     * Builds an analyzer with the given stop words.
  +     */
  +    public GermanAnalyzer( String[] stopwords )
  +    {
  +	stoptable = StopFilter.makeStopTable( stopwords );
  +    }
  +
  +    /**
  +     * Builds an analyzer with the given stop words.
  +     */
  +    public GermanAnalyzer( Hashtable stopwords )
  +    {
  +	stoptable = stopwords;
  +    }
  +
  +    /**
  +     * Builds an analyzer with the given stop words.
  +     */
  +    public GermanAnalyzer( File stopwords )
  +    {
  +	stoptable = WordlistLoader.getWordtable( stopwords );
  +    }
  +
  +    /**
  +     * Builds an exclusionlist from an array of Strings.
  +     */
  +    public void setStemExclusionTable( String[] exclusionlist )
  +    {
  +	excltable = StopFilter.makeStopTable( exclusionlist );
  +    }
  +
  +    /**
  +     * Builds an exclusionlist from a Hashtable.
  +     */
  +    public void setStemExclusionTable( Hashtable exclusionlist )
  +    {
  +	excltable = exclusionlist;
  +    }
  +
  +    /**
  +     * Builds an exclusionlist from the words contained in the given file.
  +     */
  +    public void setStemExclusionTable( File exclusionlist )
  +    {
  +	excltable = WordlistLoader.getWordtable( exclusionlist );
  +    }
  +
  +    /**
  +     * Creates a TokenStream which tokenizes all the text in the provided Reader.
  +     *
  +     * @return  A TokenStream build from a StandardTokenizer filtered with
  +     *		StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
  +     */
  +    public TokenStream tokenStream( String fieldName, Reader reader )
  +    {
  +	TokenStream result = new StandardTokenizer( reader );
  +	result = new StandardFilter( result );
  +	result = new StopFilter( result, stoptable );
  +	result = new GermanStemFilter( result, excltable );
  +	return result;
  +    }
   }
  -
  
  
  
  1.4       +62 -55    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- GermanStemFilter.java	19 Apr 2002 19:08:32 -0000	1.3
  +++ GermanStemFilter.java	18 Aug 2002 17:33:16 -0000	1.4
  @@ -68,62 +68,69 @@
    * @author    Gerhard Schwarz
    * @version   $Id$
    */
  -public final class GermanStemFilter extends TokenFilter {
  -
  -	/**
  -	 * The actual token in the input stream.
  -	 */
  -	private Token token = null;
  -	private GermanStemmer stemmer = null;
  -	private Hashtable exclusions = null;
  -	
  -	public GermanStemFilter( TokenStream in ) {
  -		stemmer = new GermanStemmer();
  -		input = in;
  -	}
  -	
  -	/**
  -	 * Builds a GermanStemFilter that uses an exclusiontable.
  -	 */
  -	public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) {
  -		this( in );
  -		exclusions = exclusiontable;
  +public final class GermanStemFilter extends TokenFilter
  +{
  +    /**
  +     * The actual token in the input stream.
  +     */
  +    private Token token = null;
  +    private GermanStemmer stemmer = null;
  +    private Hashtable exclusions = null;
  +    
  +    public GermanStemFilter( TokenStream in )
  +    {
  +	stemmer = new GermanStemmer();
  +	input = in;
  +    }
  +    
  +    /**
  +     * Builds a GermanStemFilter that uses an exclusiontable.
  +     */
  +    public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
  +    {
  +	this( in );
  +	exclusions = exclusiontable;
  +    }
  +    
  +    /**
  +     * @return  Returns the next token in the stream, or null at EOS
  +     */
  +    public final Token next()
  +	throws IOException
  +    {
  +	if ( ( token = input.next() ) == null ) {
  +	    return null;
   	}
  -
  -	/**
  -	 * @return  Returns the next token in the stream, or null at EOS
  -	 */
  -	public final Token next()
  -		throws IOException {
  -		if ( ( token = input.next() ) == null ) {
  -			return null;
  -		}
  -		// Check the exclusiontable
  -		else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
  -			return token;
  -		}
  -		else {
  -			String s = stemmer.stem( token.termText() );
  -			// If not stemmed, dont waste the time creating a new token
  -			if ( !s.equals( token.termText() ) ) {
  -				return new Token( s, token.startOffset(),
  -				    token.endOffset(), token.type() );
  -			}
  -			return token;
  -		}
  +	// Check the exclusiontable
  +	else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
  +	    return token;
   	}
  -	/**
  -	 * Set a alternative/custom GermanStemmer for this filter.
  -	 */
  -	public void setStemmer( GermanStemmer stemmer ) {
  -		if ( stemmer != null ) {
  -			this.stemmer = stemmer;
  -		}
  +	else {
  +	    String s = stemmer.stem( token.termText() );
  +	    // If not stemmed, dont waste the time creating a new token
  +	    if ( !s.equals( token.termText() ) ) {
  +		return new Token( s, token.startOffset(),
  +		    token.endOffset(), token.type() );
  +	    }
  +	    return token;
   	}
  -	/**
  -	 * Set an alternative exclusion list for this filter.
  -	 */
  -	public void setExclusionTable( Hashtable exclusiontable ) {
  -		exclusions = exclusiontable;
  +    }
  +
  +    /**
  +     * Set a alternative/custom GermanStemmer for this filter.
  +     */
  +    public void setStemmer( GermanStemmer stemmer )
  +    {
  +	if ( stemmer != null ) {
  +	    this.stemmer = stemmer;
   	}
  +    }
  +
  +    /**
  +     * Set an alternative exclusion list for this filter.
  +     */
  +    public void setExclusionTable( Hashtable exclusiontable )
  +    {
  +	exclusions = exclusiontable;
  +    }
   }
  
  
  
  1.6       +192 -177  jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- GermanStemmer.java	21 Jul 2002 23:16:09 -0000	1.5
  +++ GermanStemmer.java	18 Aug 2002 17:33:16 -0000	1.6
  @@ -62,17 +62,18 @@
    * @author    Gerhard Schwarz
    * @version   $Id$
    */
  -
  -public class GermanStemmer {
  -
  +public class GermanStemmer
  +{
       /**
        * Buffer for the terms while stemming them.
        */
       private StringBuffer sb = new StringBuffer();
  -	/**
  -	 * Indicates if a term is handled as a noun.
  +
  +    /**
  +     * Indicates if a term is handled as a noun.
        */
       private boolean uppercase = false;
  +
       /**
        * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
        */
  @@ -84,22 +85,24 @@
        * @param term  The term that should be stemmed.
        * @return      Discriminator for <tt>term</tt>
        */
  -    protected String stem( String term ) {
  -		// Mark a possible noun.
  -		uppercase = Character.isUpperCase( term.charAt( 0 ) );
  -		// Use lowercase for medium stemming.
  -		term = term.toLowerCase();
  -		if ( !isStemmable( term ) ) return term;
  -		// Reset the StringBuffer.
  -		sb.delete( 0, sb.length() );
  -		sb.insert( 0, term );
  -		// Stemming starts here...
  -		substitute( sb );
  -		strip( sb );
  -		optimize( sb );
  -		resubstitute( sb );
  -		removeParticleDenotion( sb );
  -		return sb.toString();
  +    protected String stem( String term )
  +    {
  +	// Mark a possible noun.
  +	uppercase = Character.isUpperCase( term.charAt( 0 ) );
  +	// Use lowercase for medium stemming.
  +	term = term.toLowerCase();
  +	if ( !isStemmable( term ) )
  +	    return term;
  +	// Reset the StringBuffer.
  +	sb.delete( 0, sb.length() );
  +	sb.insert( 0, term );
  +	// Stemming starts here...
  +	substitute( sb );
  +	strip( sb );
  +	optimize( sb );
  +	resubstitute( sb );
  +	removeParticleDenotion( sb );
  +	return sb.toString();
       }
   
       /**
  @@ -107,82 +110,90 @@
        *
        * @return  true if, and only if, the given term consists in letters.
        */
  -    private boolean isStemmable( String term ) {
  -		for ( int c = 0; c < term.length(); c++ ) {
  -			if ( !Character.isLetter( term.charAt( c ) ) ) return false;
  -		}
  -		return true;
  -    }
  -
  -	/**
  -	 * suffix stripping (stemming) on the current term. The stripping is reduced
  -	 * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
  -	 * from which all regular suffixes are build of. The simplification causes
  -	 * some overstemming, and way more irregular stems, but still provides unique.
  -	 * discriminators in the most of those cases.
  -	 * The algorithm is context free, except of the length restrictions.
  -	 */
  -	private void strip( StringBuffer buffer ) {
  -		boolean doMore = true;
  -		while ( doMore && buffer.length() > 3 ) {
  -			if ( ( buffer.length() + substCount > 5 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) {
  -				buffer.delete( buffer.length() - 2, buffer.length() );
  -			}
  -			else if ( ( buffer.length() + substCount > 4 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  -				buffer.delete( buffer.length() - 2, buffer.length() );
  -			}
  -			else if ( ( buffer.length() + substCount > 4 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  -				buffer.delete( buffer.length() - 2, buffer.length() );
  -			}
  -			else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  -				buffer.deleteCharAt( buffer.length() - 1 );
  -			}
  -			else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  -				buffer.deleteCharAt( buffer.length() - 1 );
  -			}
  -			else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  -				buffer.deleteCharAt( buffer.length() - 1 );
  -			}
  -			// "t" occurs only as suffix of verbs.
  -			else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
  -				buffer.deleteCharAt( buffer.length() - 1 );
  -			}
  -			else {
  -				doMore = false;
  -			}
  -		}
  +    private boolean isStemmable( String term )
  +    {
  +	for ( int c = 0; c < term.length(); c++ ) {
  +	    if ( !Character.isLetter( term.charAt( c ) ) ) return false;
   	}
  +	return true;
  +    }
   
  -	/**
  -	 * Does some optimizations on the term. This optimisations are
  -	 * contextual.
  -	 *
  -	 * @return  The term with the optimizations applied.
  -	 */
  -	private void optimize( StringBuffer buffer ) {
  -		// Additional step for female plurals of professions and inhabitants.
  -		if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  -			buffer.deleteCharAt( buffer.length() -1 );
  -			strip( buffer );
  -		}
  -		// Additional step for irregular plural nouns like "Matrizen -> Matrix".
  -		if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  -			buffer.setCharAt( buffer.length() - 1, 'x' );
  -		}
  +    /**
  +     * suffix stripping (stemming) on the current term. The stripping is reduced
  +     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
  +     * from which all regular suffixes are build of. The simplification causes
  +     * some overstemming, and way more irregular stems, but still provides unique.
  +     * discriminators in the most of those cases.
  +     * The algorithm is context free, except of the length restrictions.
  +     */
  +    private void strip( StringBuffer buffer )
  +    {
  +	boolean doMore = true;
  +	while ( doMore && buffer.length() > 3 ) {
  +	    if ( ( buffer.length() + substCount > 5 ) &&
  +		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
  +	    {
  +		buffer.delete( buffer.length() - 2, buffer.length() );
  +	    }
  +	    else if ( ( buffer.length() + substCount > 4 ) &&
  +		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  +		buffer.delete( buffer.length() - 2, buffer.length() );
  +	    }
  +	    else if ( ( buffer.length() + substCount > 4 ) &&
  +		buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  +		buffer.delete( buffer.length() - 2, buffer.length() );
  +	    }
  +	    else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  +		buffer.deleteCharAt( buffer.length() - 1 );
  +	    }
  +	    else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  +		buffer.deleteCharAt( buffer.length() - 1 );
  +	    }
  +	    else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  +		buffer.deleteCharAt( buffer.length() - 1 );
  +	    }
  +	    // "t" occurs only as suffix of verbs.
  +	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
  +		buffer.deleteCharAt( buffer.length() - 1 );
  +	    }
  +	    else {
  +		doMore = false;
  +	    }
   	}
  +    }
  +
  +    /**
  +     * Does some optimizations on the term. This optimisations are
  +     * contextual.
  +     *
  +     * @return  The term with the optimizations applied.
  +     */
  +    private void optimize( StringBuffer buffer )
  +    {
  +	// Additional step for female plurals of professions and inhabitants.
  +	if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  +	    buffer.deleteCharAt( buffer.length() -1 );
  +	    strip( buffer );
  +	}
  +	// Additional step for irregular plural nouns like "Matrizen -> Matrix".
  +	if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  +	    buffer.setCharAt( buffer.length() - 1, 'x' );
  +	}
  +    }
   
       /**
        * Removes a particle denotion ("ge") from a term.
        */
  -    private void removeParticleDenotion( StringBuffer buffer ) {
  -		if ( buffer.length() > 4 ) {
  -			for ( int c = 0; c < buffer.length() - 3; c++ ) {
  -				if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  -					buffer.delete( c, c + 2 );
  -					return;
  -				}
  -			}
  +    private void removeParticleDenotion( StringBuffer buffer )
  +    {
  +	if ( buffer.length() > 4 ) {
  +	    for ( int c = 0; c < buffer.length() - 3; c++ ) {
  +		if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  +		    buffer.delete( c, c + 2 );
  +		    return;
   		}
  +	    }
  +	}
       }
   
       /**
  @@ -195,63 +206,66 @@
        * - Substitute some common character combinations with a token:
        *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
        */
  -    private void substitute( StringBuffer buffer ) {
  -		substCount = 0;
  -		for ( int c = 0; c < buffer.length(); c++ ) {
  -			// Replace the second char of a pair of the equal characters with an asterisk
  -			if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  -				buffer.setCharAt( c, '*' );
  -			}
  -			// Substitute Umlauts.
  -			else if ( buffer.charAt( c ) == 'ä' ) {
  -				buffer.setCharAt( c, 'a' );
  -			}
  -			else if ( buffer.charAt( c ) == 'ö' ) {
  -				buffer.setCharAt( c, 'o' );
  -			}
  -			else if ( buffer.charAt( c ) == 'ü' ) {
  -				buffer.setCharAt( c, 'u' );
  -			}
  -			// Take care that at least one character is left left side from the current one
  -			if ( c < buffer.length() - 1 ) {
  -				if ( buffer.charAt( c ) == 'ß' ) {
  -					buffer.setCharAt( c, 's' );
  -					buffer.insert( c + 1, 's' );
  -					substCount++;
  -				}
  -				// Masking several common character combinations with an token
  -				else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) {
  -					buffer.setCharAt( c, '$' );
  -					buffer.delete( c + 1, c + 3 );
  -					substCount =+ 2;
  -				}
  -				else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  -					buffer.setCharAt( c, '§' );
  -					buffer.deleteCharAt( c + 1 );
  -					substCount++;
  -				}
  -				else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  -					buffer.setCharAt( c, '%' );
  -					buffer.deleteCharAt( c + 1 );
  -					substCount++;
  -				}
  -				else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  -					buffer.setCharAt( c, '&' );
  -					buffer.deleteCharAt( c + 1 );
  -					substCount++;
  -				}
  -				else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  -					buffer.setCharAt( c, '#' );
  -					buffer.deleteCharAt( c + 1 );
  -					substCount++;
  -				}
  -				else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  -					buffer.setCharAt( c, '!' );
  -					buffer.deleteCharAt( c + 1 );
  -					substCount++;
  -				}
  -			}
  +    private void substitute( StringBuffer buffer )
  +    {
  +	substCount = 0;
  +	for ( int c = 0; c < buffer.length(); c++ ) {
  +	    // Replace the second char of a pair of the equal characters with an asterisk
  +	    if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  +		buffer.setCharAt( c, '*' );
  +	    }
  +	    // Substitute Umlauts.
  +	    else if ( buffer.charAt( c ) == 'ä' ) {
  +		buffer.setCharAt( c, 'a' );
  +	    }
  +	    else if ( buffer.charAt( c ) == 'ö' ) {
  +		buffer.setCharAt( c, 'o' );
  +	    }
  +	    else if ( buffer.charAt( c ) == 'ü' ) {
  +		buffer.setCharAt( c, 'u' );
  +	    }
  +	    // Take care that at least one character is left left side from the current one
  +	    if ( c < buffer.length() - 1 ) {
  +		if ( buffer.charAt( c ) == 'ß' ) {
  +		    buffer.setCharAt( c, 's' );
  +		    buffer.insert( c + 1, 's' );
  +		    substCount++;
  +		}
  +		// Masking several common character combinations with an token
  +		else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
  +		    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
  +		{
  +		    buffer.setCharAt( c, '$' );
  +		    buffer.delete( c + 1, c + 3 );
  +		    substCount =+ 2;
  +		}
  +		else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  +		    buffer.setCharAt( c, '§' );
  +		    buffer.deleteCharAt( c + 1 );
  +		    substCount++;
  +		}
  +		else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  +		    buffer.setCharAt( c, '%' );
  +		    buffer.deleteCharAt( c + 1 );
  +		    substCount++;
  +		}
  +		else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  +		    buffer.setCharAt( c, '&' );
  +		    buffer.deleteCharAt( c + 1 );
  +		    substCount++;
  +		}
  +		else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  +		    buffer.setCharAt( c, '#' );
  +		    buffer.deleteCharAt( c + 1 );
  +		    substCount++;
  +		}
  +		else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  +		    buffer.setCharAt( c, '!' );
  +		    buffer.deleteCharAt( c + 1 );
  +		    substCount++;
   		}
  +	    }
  +	}
       }
   
       /**
  @@ -259,36 +273,37 @@
        * character combinations. Umlauts will remain as their corresponding vowel,
        * as "ß" remains as "ss".
        */
  -    private void resubstitute( StringBuffer buffer ) {
  -		for ( int c = 0; c < buffer.length(); c++ ) {
  -			if ( buffer.charAt( c ) == '*' ) {
  -				char x = buffer.charAt( c - 1 );
  -				buffer.setCharAt( c, x );
  -			}
  -			else if ( buffer.charAt( c ) == '$' ) {
  -				buffer.setCharAt( c, 's' );
  -				buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  -			}
  -			else if ( buffer.charAt( c ) == '§' ) {
  -				buffer.setCharAt( c, 'c' );
  -				buffer.insert( c + 1, 'h' );
  -			}
  -			else if ( buffer.charAt( c ) == '%' ) {
  -				buffer.setCharAt( c, 'e' );
  -				buffer.insert( c + 1, 'i' );
  -			}
  -			else if ( buffer.charAt( c ) == '&' ) {
  -				buffer.setCharAt( c, 'i' );
  -				buffer.insert( c + 1, 'e' );
  -			}
  -			else if ( buffer.charAt( c ) == '#' ) {
  -				buffer.setCharAt( c, 'i' );
  -				buffer.insert( c + 1, 'g' );
  -			}
  -			else if ( buffer.charAt( c ) == '!' ) {
  -				buffer.setCharAt( c, 's' );
  -				buffer.insert( c + 1, 't' );
  -			}
  -		}
  +    private void resubstitute( StringBuffer buffer )
  +    {
  +	for ( int c = 0; c < buffer.length(); c++ ) {
  +	    if ( buffer.charAt( c ) == '*' ) {
  +		char x = buffer.charAt( c - 1 );
  +		buffer.setCharAt( c, x );
  +	    }
  +	    else if ( buffer.charAt( c ) == '$' ) {
  +		buffer.setCharAt( c, 's' );
  +		buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  +	    }
  +	    else if ( buffer.charAt( c ) == '§' ) {
  +		buffer.setCharAt( c, 'c' );
  +		buffer.insert( c + 1, 'h' );
  +	    }
  +	    else if ( buffer.charAt( c ) == '%' ) {
  +		buffer.setCharAt( c, 'e' );
  +		buffer.insert( c + 1, 'i' );
  +	    }
  +	    else if ( buffer.charAt( c ) == '&' ) {
  +		buffer.setCharAt( c, 'i' );
  +		buffer.insert( c + 1, 'e' );
  +	    }
  +	    else if ( buffer.charAt( c ) == '#' ) {
  +		buffer.setCharAt( c, 'i' );
  +		buffer.insert( c + 1, 'g' );
  +	    }
  +	    else if ( buffer.charAt( c ) == '!' ) {
  +		buffer.setCharAt( c, 's' );
  +		buffer.insert( c + 1, 't' );
  +	    }
  +	}
       }
   }
  
  
  
  1.4       +62 -62    jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- WordlistLoader.java	19 Apr 2002 19:09:36 -0000	1.3
  +++ WordlistLoader.java	18 Aug 2002 17:33:16 -0000	1.4
  @@ -68,71 +68,71 @@
    * @author    Gerhard Schwarz
    * @version   $Id$
    */
  -public class WordlistLoader {
  -
  -	/**
  -	 * @param path      Path to the wordlist
  -	 * @param wordfile  Name of the wordlist
  -	 */
  -	public static Hashtable getWordtable( String path, String wordfile ) {
  -		if ( path == null || wordfile == null ) {
  -			return new Hashtable();
  -		}
  -		return getWordtable( new File( path, wordfile ) );
  +public class WordlistLoader
  +{
  +    /**
  +     * @param path      Path to the wordlist
  +     * @param wordfile  Name of the wordlist
  +     */
  +    public static Hashtable getWordtable( String path, String wordfile ) {
  +	if ( path == null || wordfile == null ) {
  +	    return new Hashtable();
   	}
  -	/**
  -	 * @param wordfile  Complete path to the wordlist
  -	 */
  -	public static Hashtable getWordtable( String wordfile ) {
  -		if ( wordfile == null ) {
  -			return new Hashtable();
  -		}
  -		return getWordtable( new File( wordfile ) );
  +	return getWordtable( new File( path, wordfile ) );
  +    }
  +
  +    /**
  +     * @param wordfile  Complete path to the wordlist
  +     */
  +    public static Hashtable getWordtable( String wordfile ) {
  +	if ( wordfile == null ) {
  +	    return new Hashtable();
   	}
  +	return getWordtable( new File( wordfile ) );
  +    }
   
  -	/**
  -	 * @param wordfile  File containing the wordlist
  -	 */
  -	public static Hashtable getWordtable( File wordfile ) {
  -		if ( wordfile == null ) {
  -			return new Hashtable();
  -		}
  -		Hashtable result = null;
  -		try {
  -			LineNumberReader lnr = new LineNumberReader( new FileReader( wordfile ) );
  -			String word = null;
  -			String[] stopwords = new String[100];
  -			int wordcount = 0;
  -			while ( ( word = lnr.readLine() ) != null ) {
  -				wordcount++;
  -				if ( wordcount == stopwords.length ) {
  -					String[] tmp = new String[stopwords.length + 50];
  -					System.arraycopy( stopwords, 0, tmp, 0, wordcount );
  -					stopwords = tmp;
  -				}
  -				stopwords[wordcount-1] = word;
  -			}
  -			result = makeWordTable( stopwords, wordcount );
  -		}
  -		// On error, use an empty table
  -		catch ( IOException e ) {
  -			result = new Hashtable();
  -		}
  -		return result;
  +    /**
  +     * @param wordfile  File containing the wordlist
  +     */
  +    public static Hashtable getWordtable( File wordfile ) {
  +	if ( wordfile == null ) {
  +	    return new Hashtable();
  +	}
  +	Hashtable result = null;
  +	try {
  +	    LineNumberReader lnr = new LineNumberReader( new FileReader( wordfile ) );
  +	    String word = null;
  +	    String[] stopwords = new String[100];
  +	    int wordcount = 0;
  +	    while ( ( word = lnr.readLine() ) != null ) {
  +		wordcount++;
  +		if ( wordcount == stopwords.length ) {
  +		    String[] tmp = new String[stopwords.length + 50];
  +		    System.arraycopy( stopwords, 0, tmp, 0, wordcount );
  +		    stopwords = tmp;
  +		}
  +		stopwords[wordcount-1] = word;
  +	    }
  +	    result = makeWordTable( stopwords, wordcount );
  +	}
  +	// On error, use an empty table
  +	catch ( IOException e ) {
  +	    result = new Hashtable();
   	}
  +	return result;
  +    }
   
  -	/**
  -	 * Builds the wordlist table.
  -	 *
  -	 * @param words   Word that where read
  -	 * @param length  Amount of words that where read into <tt>words</tt>
  -	 */
  -	private static Hashtable makeWordTable( String[] words, int length ) {
  -		Hashtable table = new Hashtable( length );
  -		for ( int i = 0; i < length; i++ ) {
  -			table.put( words[i], words[i] );
  -		}
  -		return table;
  +    /**
  +     * Builds the wordlist table.
  +     *
  +     * @param words   Word that where read
  +     * @param length  Amount of words that where read into <tt>words</tt>
  +     */
  +    private static Hashtable makeWordTable( String[] words, int length ) {
  +	Hashtable table = new Hashtable( length );
  +	for ( int i = 0; i < length; i++ ) {
  +	    table.put( words[i], words[i] );
   	}
  +	return table;
  +    }
   }
  -
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message