lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gschw...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanAnalyzer.java GermanStemFilter.java GermanStemmer.java WordlistLoader.java
Date Mon, 10 Dec 2001 21:18:24 GMT
gschwarz    01/12/10 13:18:24

  Modified:    src/java/org/apache/lucene/analysis/de GermanAnalyzer.java
                        GermanStemFilter.java GermanStemmer.java
                        WordlistLoader.java
  Log:
  Fixed Bug 4555. Possible NullPointerException when a short term with
  substitutions was being checked for a particle denotion. The length check
  was corrected, and the order of resubstitution and particle denotion removal
  was changed to prevent a denoted term from slipping past the removal because
  of its reduced length.
  Corrected and improved documentation.
  Fix in WordlistLoader: files were not read correctly, losing a line.
  Fix in GermanStemFilter: typo in the constructor that takes a custom
  exclusion table as parameter; the parameter was ignored.
  GermanStemFilter has two new methods for setting the stemmer and the
  exclusion list after creating the filter object.
  
  Revision  Changes    Path
  1.2       +7 -5      jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- GermanAnalyzer.java	2001/09/25 17:29:04	1.1
  +++ GermanAnalyzer.java	2001/12/10 21:18:24	1.2
  @@ -14,9 +14,11 @@
    * Analyzer for german language. Supports an external list of stopwords (words that
    * will not be indexed at all) and an external list of exclusions (word that will
    * not be stemmed, but indexed).
  + * A default set of stopwords is used unless an other list is specified, the
  + * exclusionlist is empty by default.
    *
    * @author    Gerhard Schwarz
  - * @version   $Id: GermanAnalyzer.java,v 1.1 2001/09/25 17:29:04 cutting Exp $
  + * @version   $Id: GermanAnalyzer.java,v 1.2 2001/12/10 21:18:24 gschwarz Exp $
    */
   public final class GermanAnalyzer extends Analyzer {
   
  @@ -35,8 +37,8 @@
   		"als", "fr", "von", "mit",
   		"dich", "dir", "mich", "mir",
   		"mein", "sein", "kein",
  -		"durch", "wegen"
  -		};
  +		"durch", "wegen", "wird"
  +	};
   	
   	/**
   	 * Contains the stopwords used with the StopFilter.
  @@ -98,9 +100,9 @@
   	 * Creates a TokenStream which tokenizes all the text in the provided Reader.
   	 *
   	 * @return  A TokenStream build from a StandardTokenizer filtered with
  -	 * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
  +	 * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
   	 */
  -	public final TokenStream tokenStream(String fieldName, Reader reader) {
  +	public final TokenStream tokenStream( String fieldName, Reader reader ) {
   		TokenStream result = new StandardTokenizer( reader );
   		result = new StandardFilter( result );
   		result = new StopFilter( result, stoptable );
  
  
  
  1.2       +21 -6     jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- GermanStemFilter.java	2001/09/25 17:29:04	1.1
  +++ GermanStemFilter.java	2001/12/10 21:18:24	1.2
  @@ -8,10 +8,11 @@
   
   /**
    * A filter that stemms german words. It supports a table of words that should
  - * not be stemmed at all.
  + * not be stemmed at all. The used stemmer can be changed at runtime after the
  + * filter object is created (as long as it is a GermanStemmer).
    *
    * @author    Gerhard Schwarz
  - * @version   $Id: GermanStemFilter.java,v 1.1 2001/09/25 17:29:04 cutting Exp $
  + * @version   $Id: GermanStemFilter.java,v 1.2 2001/12/10 21:18:24 gschwarz Exp $
    */
   public final class GermanStemFilter extends TokenFilter {
   
  @@ -32,29 +33,43 @@
   	 */
   	public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) {
   		this( in );
  -		this.exclusions = exclusions;
  +		exclusions = exclusiontable;
   	}
   
   	/**
  -	 * @return  Returns the next token in the stream, or null at EOS.
  +	 * @return  Returns the next token in the stream, or null at EOS
   	 */
   	public final Token next()
   		throws IOException {
   		if ( ( token = input.next() ) == null ) {
   			return null;
   		}
  -		// Check the exclusiontable.
  +		// Check the exclusiontable
   		else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
   			return token;
   		}
   		else {
   			String s = stemmer.stem( token.termText() );
  -			// If not stemmed, dont waste the time creating a new token.
  +			// If not stemmed, dont waste the time creating a new token
   			if ( !s.equals( token.termText() ) ) {
   				return new Token( s, 0, s.length(), token.type() );
   			}
   			return token;
   		}
  +	}
  +	/**
  +	 * Set a alternative/custom GermanStemmer for this filter.
  +	 */
  +	public void setStemmer( GermanStemmer stemmer ) {
  +		if ( stemmer != null ) {
  +			this.stemmer = stemmer;
  +		}
  +	}
  +	/**
  +	 * Set an alternative exclusion list for this filter.
  +	 */
  +	public void setExclusionTable( Hashtable exclusiontable ) {
  +		exclusions = exclusiontable;
   	}
   }
   
  
  
  
  1.2       +19 -14    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- GermanStemmer.java	2001/09/25 17:29:05	1.1
  +++ GermanStemmer.java	2001/12/10 21:18:24	1.2
  @@ -6,7 +6,7 @@
    * Caumanns (joerg.caumanns@isst.fhg.de).
    *
    * @author    Gerhard Schwarz
  - * @version   $Id: GermanStemmer.java,v 1.1 2001/09/25 17:29:05 cutting Exp $
  + * @version   $Id: GermanStemmer.java,v 1.2 2001/12/10 21:18:24 gschwarz Exp $
    */
   
   public class GermanStemmer {
  @@ -30,7 +30,7 @@
   	/**
   	 * Stemms the given term to an unique <tt>discriminator</tt>.
   	 *
  -	 * @param word  The term that should be stemmed.
  +	 * @param term  The term that should be stemmed.
   	 * @return      Discriminator for <tt>term</tt>
   	 */
   	protected String stem( String term ) {
  @@ -41,6 +41,9 @@
   		if ( Character.isUpperCase( term.charAt( 0 ) ) ) {
   			uppercase = true;
   		}
  +		else {
  +			uppercase = false;
  +		}
   		// Use lowercase for medium stemming.
   		term = term.toLowerCase();
   		// Reset the StringBuffer.
  @@ -79,9 +82,10 @@
   				sb.setCharAt( sb.length() - 1, 'x' );
   			}
   		}
  -		// Check the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" for all
  +		// Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all
   		// other terms. Adjectives, Verbs and Adverbs have a total of 52 different
  -		// possible suffixes.
  +		// possible suffixes, stripping only the characters from they are build
  +		// does mostly the same
   		else {
   			// Strip base suffixes as long as enough characters remain.
   			boolean doMore = true;
  @@ -112,10 +116,10 @@
   				}
   			}
   		}
  +		sb = resubstitute( sb );
   		if ( !uppercase ) {
   			sb = removeParticleDenotion( sb );
   		}
  -		sb = resubstitute( sb );
   		return sb.toString();
   	}
   
  @@ -127,8 +131,8 @@
   	 */
   	private StringBuffer removeParticleDenotion( StringBuffer buffer ) {
   		for ( int c = 0; c < buffer.length(); c++ ) {
  -			// Strip from the beginning of the string to the "ge" inclusive.
  -			if ( c < ( sb.length() - 3 ) && buffer.charAt( c ) == 'g' && buffer.charAt
( c + 1 ) == 'e' ) {
  +			// Strip from the beginning of the string to the "ge" inclusive
  +			if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' &&
buffer.charAt ( c + 1 ) == 'e' ) {
   				buffer.delete( 0, c + 2 );
   			}
   		}
  @@ -140,7 +144,7 @@
   	 *
   	 * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
   	 *   "ß" is substituted by "ss"
  -	 * - Substitute an second char of an pair of equal characters with
  +	 * - Substitute a second char of an pair of equal characters with
   	 *   an asterisk: ?? -> ?*
   	 * - Substitute some common character combinations with a token:
   	 *   sch/ch/ei/ie/ig/st -> $//%/&/#/!
  @@ -149,7 +153,7 @@
   	 */
   	private StringBuffer substitute( StringBuffer buffer ) {
   		for ( int c = 0; c < buffer.length(); c++ ) {
  -			// Replace the second char of a pair of the equal characters with an asterisk.
  +			// Replace the second char of a pair of the equal characters with an asterisk
   			if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
   				buffer.setCharAt( c, '*' );
   			}
  @@ -163,14 +167,14 @@
   			else if ( buffer.charAt( c ) == '' ) {
   				buffer.setCharAt( c, 'u' );
   			}
  -			// Take care that enough characters at left for search.
  +			// Take care that at least one character is left left side from the current one
   			if ( c < buffer.length() - 1 ) {
   				if ( buffer.charAt( c ) == '' ) {
   					buffer.setCharAt( c, 's' );
   					buffer.insert( c + 1, 's' );
   					substCount++;
   				}
  -				// Masking several common character combinations with an token.
  +				// Masking several common character combinations with an token
   				else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) {
   					buffer.setCharAt( c, '$' );
   					buffer.delete( c + 1, c + 3 );
  @@ -240,10 +244,11 @@
   		return true;
   	}
   	/**
  -	 * Undoes some changes made by substitute(). That are character pairs and
  -	 * character combinations.
  +	 * Undoes the changes made by substitute(). That are character pairs and
  +	 * character combinations. Umlauts will remain as their corresponding vowel,
  +	 * as "ß" remains as "ss".
   	 *
  -	 * @return  The term without the not human reaqdable substitutions.
  +	 * @return  The term without the not human readable substitutions.
   	 */
   	private StringBuffer resubstitute( StringBuffer buffer ) {
   		for ( int c = 0; c < buffer.length(); c++ ) {
  
  
  
  1.2       +13 -14    jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- WordlistLoader.java	2001/09/25 17:29:05	1.1
  +++ WordlistLoader.java	2001/12/10 21:18:24	1.2
  @@ -7,24 +7,24 @@
   import java.util.Hashtable;
   
   /**
  - * Loads a textfile and adds every entry to a Hashtable. If a file is not found
  - * or on any error, an empty table is returned.
  + * Loads a textfile and adds every line as an entry to a Hashtable. Every line
  + * should contain only one word. If a file is not found or on any error, an
  + * empty table is returned.
    *
    * @author    Gerhard Schwarz
  - * @version   $Id: WordlistLoader.java,v 1.1 2001/09/25 17:29:05 cutting Exp $
  + * @version   $Id: WordlistLoader.java,v 1.2 2001/12/10 21:18:24 gschwarz Exp $
    */
   public class WordlistLoader {
   
   	/**
  -	 * @param path      Path to the wordlist.
  -	 * @param wordfile  Name of the wordlist.
  +	 * @param path      Path to the wordlist
  +	 * @param wordfile  Name of the wordlist
   	 */
   	public static Hashtable getWordtable( String path, String wordfile ) {
   		if ( path == null || wordfile == null ) {
   			return new Hashtable();
   		}
  -		File absoluteName = new File( path, wordfile );
  -		return getWordtable( absoluteName );
  +		return getWordtable( new File( path, wordfile ) );
   	}
   	/**
   	 * @param wordfile  Complete path to the wordlist
  @@ -33,12 +33,11 @@
   		if ( wordfile == null ) {
   			return new Hashtable();
   		}
  -		File absoluteName = new File( wordfile );
  -		return getWordtable( absoluteName );
  +		return getWordtable( new File( wordfile ) );
   	}
   
   	/**
  -	 * @param wordfile  File containing the wordlist.
  +	 * @param wordfile  File containing the wordlist
   	 */
   	public static Hashtable getWordtable( File wordfile ) {
   		if ( wordfile == null ) {
  @@ -57,11 +56,11 @@
   					System.arraycopy( stopwords, 0, tmp, 0, wordcount );
   					stopwords = tmp;
   				}
  -				stopwords[wordcount] = word;
  +				stopwords[wordcount-1] = word;
   			}
   			result = makeWordTable( stopwords, wordcount );
   		}
  -		// On error, use an empty table.
  +		// On error, use an empty table
   		catch ( IOException e ) {
   			result = new Hashtable();
   		}
  @@ -71,8 +70,8 @@
   	/**
   	 * Builds the wordlist table.
   	 *
  -	 * @param words   Word that where read.
  -	 * @param length  Amount of words that where read into <tt>words</tt>.
  +	 * @param words   Word that where read
  +	 * @param length  Amount of words that where read into <tt>words</tt>
   	 */
   	private static Hashtable makeWordTable( String[] words, int length ) {
   		Hashtable table = new Hashtable( length );
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message