lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ehatc...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr FrenchAnalyzer.java FrenchStemFilter.java FrenchStemmer.java
Date Tue, 20 Jan 2004 10:07:01 GMT
ehatcher    2004/01/20 02:07:01

  Added:       contributions/analyzers/src/java/org/apache/lucene/analysis/fr
                        FrenchAnalyzer.java FrenchStemFilter.java
                        FrenchStemmer.java
  Log:
  #26268 - Add FrenchAnalyzer
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
  
  Index: FrenchAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.fr;
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.LowerCaseFilter;
  import org.apache.lucene.analysis.StopFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import java.io.File;
  import java.io.Reader;
  import java.util.Hashtable;
  import org.apache.lucene.analysis.de.WordlistLoader;
  
  /**
   * Analyzer for french language. Supports an external list of stopwords (words that
   * will not be indexed at all) and an external list of exclusions (word that will
   * not be stemmed, but indexed).
   * A default set of stopwords is used unless an other list is specified, the
   * exclusionlist is empty by default.
   *
   * @author    Patrick Talbot (based on Gerhard Schwarz work for German)
   * @version   $Id: FrenchAnalyzer.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
   */
  public final class FrenchAnalyzer extends Analyzer {
  
  	/**
  	 * Extended list of typical french stopwords.
  	 */
  	private String[] FRENCH_STOP_WORDS = {
  		"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
  		"autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
  		"c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
  		"certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
  		"combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
  		"dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
  		"desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
  		"diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
  		"en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
  		"hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
  		"le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
  		"ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
  		"moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
  		"nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
  		"partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
  		"proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
  		"qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
  		"seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
  		"son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
  		"tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
  		"voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
  		"été", "être", "ô"
  	};
  
  	/**
  	 * Contains the stopwords used with the StopFilter.
  	 */
  	private Hashtable stoptable = new Hashtable();
  	/**
  	 * Contains words that should be indexed but not stemmed.
  	 */
  	private Hashtable excltable = new Hashtable();
  
  	/**
  	 * Builds an analyzer.
  	 */
  	public FrenchAnalyzer() {
  		stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
  	}
  
  	/**
  	 * Builds an analyzer with the given stop words.
  	 */
  	public FrenchAnalyzer( String[] stopwords ) {
  		stoptable = StopFilter.makeStopTable( stopwords );
  	}
  
  	/**
  	 * Builds an analyzer with the given stop words.
  	 */
  	public FrenchAnalyzer( Hashtable stopwords ) {
  		stoptable = stopwords;
  	}
  
  	/**
  	 * Builds an analyzer with the given stop words.
  	 */
  	public FrenchAnalyzer( File stopwords ) {
  		stoptable = WordlistLoader.getWordtable( stopwords );
  	}
  
  	/**
  	 * Builds an exclusionlist from an array of Strings.
  	 */
  	public void setStemExclusionTable( String[] exclusionlist ) {
  		excltable = StopFilter.makeStopTable( exclusionlist );
  	}
  	/**
  	 * Builds an exclusionlist from a Hashtable.
  	 */
  	public void setStemExclusionTable( Hashtable exclusionlist ) {
  		excltable = exclusionlist;
  	}
  	/**
  	 * Builds an exclusionlist from the words contained in the given file.
  	 */
  	public void setStemExclusionTable( File exclusionlist ) {
  		excltable = WordlistLoader.getWordtable( exclusionlist );
  	}
  
  	/**
  	 * Creates a TokenStream which tokenizes all the text in the provided Reader.
  	 *
  	 * @return  A TokenStream build from a StandardTokenizer filtered with
  	 * 			StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
  	 */
  	public final TokenStream tokenStream( String fieldName, Reader reader ) {
  		TokenStream result = new StandardTokenizer( reader );
  		result = new StandardFilter( result );
  		result = new StopFilter( result, stoptable );
  		result = new FrenchStemFilter( result, excltable );
  		// Convert to lowercase after stemming!
  		result = new LowerCaseFilter( result );
  		return result;
  	}
  }
  
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
  
  Index: FrenchStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.fr;
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import java.io.IOException;
  import java.util.Hashtable;
  
  /**
   * A filter that stemms french words. It supports a table of words that should
   * not be stemmed at all. The used stemmer can be changed at runtime after the
   * filter object is created (as long as it is a FrenchStemmer).
   *
   * @author    Patrick Talbot (based on Gerhard Schwarz work for German)
   * @version   $Id: FrenchStemFilter.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
   */
  public final class FrenchStemFilter extends TokenFilter {
  
  	/**
  	 * The actual token in the input stream.
  	 */
  	private Token token = null;
  	private FrenchStemmer stemmer = null;
  	private Hashtable exclusions = null;
  
  	public FrenchStemFilter( TokenStream in ) {
      super(in);
  		stemmer = new FrenchStemmer();
  	}
  
  	/**
  	 * Builds a FrenchStemFilter that uses an exclusiontable.
  	 */
  	public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
  		this( in );
  		exclusions = exclusiontable;
  	}
  
  	/**
  	 * @return  Returns the next token in the stream, or null at EOS
  	 */
  	public final Token next()
  		throws IOException {
  		if ( ( token = input.next() ) == null ) {
  			return null;
  		}
  		// Check the exclusiontable
  		else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
  			return token;
  		}
  		else {
  			String s = stemmer.stem( token.termText() );
  			// If not stemmed, dont waste the time creating a new token
  			if ( !s.equals( token.termText() ) ) {
  				return new Token( s, 0, s.length(), token.type() );
  			}
  			return token;
  		}
  	}
  	/**
  	 * Set a alternative/custom FrenchStemmer for this filter.
  	 */
  	public void setStemmer( FrenchStemmer stemmer ) {
  		if ( stemmer != null ) {
  			this.stemmer = stemmer;
  		}
  	}
  	/**
  	 * Set an alternative exclusion list for this filter.
  	 */
  	public void setExclusionTable( Hashtable exclusiontable ) {
  		exclusions = exclusiontable;
  	}
  }
  
  
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
  
  Index: FrenchStemmer.java
  ===================================================================
  package org.apache.lucene.analysis.fr;
  
  /**
   * A stemmer for French words. The algorithm is based on the work of
   * Dr Martin Porter on his snowball project<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
   * (French stemming algorithm) for details
   *
   * @author    Patrick Talbot
   * @version   $Id: FrenchStemmer.java,v 1.1 2004/01/20 10:07:01 ehatcher Exp $
   */
  
  public class FrenchStemmer {
  
      /**
       * Buffer for the terms while stemming them.
       */
      private StringBuffer sb = new StringBuffer();
  
      /**
       * A temporary buffer, used to reconstruct R2
       */
       private StringBuffer tb = new StringBuffer();
  
  	/**
  	 * Region R0 is equal to the whole buffer
  	 */
  	private String R0;
  
  	/**
  	 * Region RV
  	 * "If the word begins with two vowels, RV is the region after the third letter,
  	 * otherwise the region after the first vowel not at the beginning of the word,
  	 * or the end of the word if these positions cannot be found."
  	 */
      private String RV;
  
  	/**
  	 * Region R1
  	 * "R1 is the region after the first non-vowel following a vowel
  	 * or is the null region at the end of the word if there is no such non-vowel"
  	 */
      private String R1;
  
  	/**
  	 * Region R2
  	 * "R2 is the region after the first non-vowel in R1 following a vowel
  	 * or is the null region at the end of the word if there is no such non-vowel"
  	 */
      private String R2;
  
  
  	/**
  	 * Set to true if we need to perform step 2
  	 */
      private boolean suite;
  
  	/**
  	 * Set to true if the buffer was modified
  	 */
      private boolean modified;
  
  
      /**
       * Stemms the given term to a unique <tt>discriminator</tt>.
       *
       * @param term  java.langString The term that should be stemmed
       * @return java.lang.String  Discriminator for <tt>term</tt>
       */
      protected String stem( String term ) {
  		if ( !isStemmable( term ) ) {
  			return term;
  		}
  
  		// Use lowercase for medium stemming.
  		term = term.toLowerCase();
  
  		// Reset the StringBuffer.
  		sb.delete( 0, sb.length() );
  		sb.insert( 0, term );
  
  		// reset the booleans
  		modified = false;
  		suite = false;
  
  		sb = treatVowels( sb );
  
  		setStrings();
  
  		step1();
  
  		if (!modified || suite)
  		{
  			if (RV != null)
  			{
  				suite = step2a();
  				if (!suite)
  					step2b();
  			}
  		}
  
  		if (modified || suite)
  			step3();
  		else
  			step4();
  
  		step5();
  
  		step6();
  
  		return sb.toString();
      }
  
  	/**
  	 * Sets the search region Strings<br>
  	 * it needs to be done each time the buffer was modified
  	 */
  	private void setStrings() {
  		// set the strings
  		R0 = sb.toString();
  		RV = retrieveRV( sb );
  		R1 = retrieveR( sb );
  		if ( R1 != null )
  		{
  			tb.delete( 0, tb.length() );
  			tb.insert( 0, R1 );
  			R2 = retrieveR( tb );
  		}
  		else
  			R2 = null;
  	}
  
  	/**
  	 * First step of the Porter Algorithmn<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step1( ) {
  		String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme",
"able", "iste" };
  		deleteFrom( R2, suffix );
  
  		replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
  		replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
  		replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
  
  		String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
  		deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );
  
  		deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false,
R0, "eux" );
  		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
  		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
  		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
  		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
  
  		deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0
);
  		deleteFrom( RV, new String[] { "ements", "ement" } );
  
  		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0,
"abl" );
  		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU"
);
  		deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
  
  		String[] autre = { "ifs", "ives", "if", "ive" };
  		deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
  		deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
  
  		replaceFrom( R0, new String[] { "eaux" }, "eau" );
  
  		replaceFrom( R1, new String[] { "aux" }, "al" );
  
  		deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux"
);
  
  		deleteFrom( R2, new String[] { "eux" } );
  
  		// if one of the next steps is performed, we will need to perform step2a
  		boolean temp = false;
  		temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
  		if (temp == true)
  			suite = true;
  		temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
  		if (temp == true)
  			suite = true;
  		temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
  		if (temp == true)
  			suite = true;
  
  	}
  
  	/**
  	 * Second step (A) of the Porter Algorithmn<br>
  	 * Will be performed if nothing changed from the first step
  	 * or changed were done in the amment, emment, ments or ment suffixes<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 *
  	 * @return boolean - true if something changed in the StringBuffer
  	 */
  	private boolean step2a() {
  		String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
  							"irent", "iriez", "irez", "irions", "irons", "iront",
  							"issaIent", "issais", "issantes", "issante", "issants", "issant",
  							"issait", "issais", "issions", "issons", "issiez", "issez", "issent",
  							"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
  		return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
  	}
  
  	/**
  	 * Second step (B) of the Porter Algorithmn<br>
  	 * Will be performed if step 2 A was performed unsuccessfully<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step2b() {
  		String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
  							"erons", "eront","erez", "èrent", "era", "ées", "iez",
  							"ée", "és", "er", "ez", "é" };
  		deleteFrom( RV, suffix );
  
  		String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
  							"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
  							"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
  		deleteButSuffixFrom( RV, search, "e", true );
  
  		deleteFrom( R2, new String[] { "ions" } );
  	}
  
  	/**
  	 * Third step of the Porter Algorithmn<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step3() {
  		if (sb.length()>0)
  		{
  			char ch = sb.charAt( sb.length()-1 );
  			if (ch == 'Y')
  			{
  				sb.setCharAt( sb.length()-1, 'i' );
  				setStrings();
  			}
  			else if (ch == 'ç')
  			{
  				sb.setCharAt( sb.length()-1, 'c' );
  				setStrings();
  			}
  		}
  	}
  
  	/**
  	 * Fourth step of the Porter Algorithmn<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step4() {
  		if (sb.length() > 1)
  		{
  			char ch = sb.charAt( sb.length()-1 );
  			if (ch == 's')
  			{
  				char b = sb.charAt( sb.length()-2 );
  				if (b != 'a' && b != 'i' && b != 'o' && b != 'u' &&
b != 'è' && b != 's')
  				{
  					sb.delete( sb.length() - 1, sb.length());
  					setStrings();
  				}
  			}
  		}
  		boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
  		if (!found)
  		found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
  
  		replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
  		deleteFrom( RV, new String[] { "e" } );
  		deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
  	}
  
  	/**
  	 * Fifth step of the Porter Algorithmn<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step5() {
  		if (R0 != null)
  		{
  			if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell")
|| R0.endsWith("eill"))
  			{
  				sb.delete( sb.length() - 1, sb.length() );
  				setStrings();
  			}
  		}
  	}
  
  	/**
  	 * Sixth (and last!) step of the Porter Algorithmn<br>
  	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
  	 */
  	private void step6() {
  		if (R0!=null && R0.length()>0)
  		{
  			boolean seenVowel = false;
  			boolean seenConson = false;
  			int pos = -1;
  			for (int i = R0.length()-1; i > -1; i--)
  			{
  				char ch = R0.charAt(i);
  				if (isVowel(ch))
  				{
  					if (!seenVowel)
  					{
  						if (ch == 'é' || ch == 'è')
  						{
  							pos = i;
  							break;
  						}
  					}
  					seenVowel = true;
  				}
  				else
  				{
  					if (seenVowel)
  						break;
  					else
  						seenConson = true;
  				}
  			}
  			if (pos > -1 && seenConson && !seenVowel)
  				sb.setCharAt(pos, 'e');
  		}
  	}
  
  	/**
  	 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
  	 *
  	 * @param source java.lang.String - the primary source zone for search
  	 * @param search java.lang.String[] - the strings to search for suppression
  	 * @param from java.lang.String - the secondary source zone for search
  	 * @param prefix java.lang.String - the prefix to add to the search string to test
  	 * @return boolean - true if modified
  	 */
  	private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String
prefix ) {
  		boolean found = false;
  		if (source!=null )
  		{
  			for (int i = 0; i < search.length; i++) {
  				if ( source.endsWith( search[i] ))
  				{
  					if (from!=null && from.endsWith( prefix + search[i] ))
  					{
  						sb.delete( sb.length() - search[i].length(), sb.length());
  						found = true;
  						setStrings();
  						break;
  					}
  				}
  			}
  		}
  		return found;
  	}
  
  	/**
  	 * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
  	 *
  	 * @param source java.lang.String - the primary source zone for search
  	 * @param search java.lang.String[] - the strings to search for suppression
  	 * @param vowel boolean - true if we need a vowel before the search string
  	 * @param from java.lang.String - the secondary source zone for search (where vowel could
be)
  	 * @return boolean - true if modified
  	 */
  	private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean
vowel, String from ) {
  		boolean found = false;
  		if (source!=null && from!=null)
  		{
  			for (int i = 0; i < search.length; i++) {
  				if ( source.endsWith( search[i] ))
  				{
  					if ((search[i].length() + 1) <= from.length())
  					{
  						boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
  						if (test == vowel)
  						{
  							sb.delete( sb.length() - search[i].length(), sb.length());
  							modified = true;
  							found = true;
  							setStrings();
  							break;
  						}
  					}
  				}
  			}
  		}
  		return found;
  	}
  
  	/**
  	 * Delete a suffix searched in zone "source" if preceded by the prefix
  	 *
  	 * @param source java.lang.String - the primary source zone for search
  	 * @param search java.lang.String[] - the strings to search for suppression
  	 * @param prefix java.lang.String - the prefix to add to the search string to test
  	 * @param without boolean - true if it will be deleted even without prefix found
  	 */
  	private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean
without ) {
  		if (source!=null)
  		{
  			for (int i = 0; i < search.length; i++) {
  				if ( source.endsWith( prefix + search[i] ))
  				{
  					sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
  					modified = true;
  					setStrings();
  					break;
  				}
  				else if ( without && source.endsWith( search[i] ))
  				{
  					sb.delete( sb.length() - search[i].length(), sb.length() );
  					modified = true;
  					setStrings();
  					break;
  				}
  			}
  		}
  	}
  
  	/**
  	 * Delete a suffix searched in zone "source" if preceded by prefix<br>
  	 * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
  	 * or delete the suffix if specified
  	 *
  	 * @param source java.lang.String - the primary source zone for search
  	 * @param search java.lang.String[] - the strings to search for suppression
  	 * @param prefix java.lang.String - the prefix to add to the search string to test
  	 * @param without boolean - true if it will be deleted even without prefix found
  	 */
  	private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix,
boolean without, String from, String replace ) {
  		if (source!=null)
  		{
  			for (int i = 0; i < search.length; i++) {
  				if ( source.endsWith( prefix + search[i] ))
  				{
  					sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
  					modified = true;
  					setStrings();
  					break;
  				}
  				else if ( from!=null && from.endsWith( prefix + search[i] ))
  				{
  					sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace
);
  					modified = true;
  					setStrings();
  					break;
  				}
  				else if ( without && source.endsWith( search[i] ))
  				{
  					sb.delete( sb.length() - search[i].length(), sb.length() );
  					modified = true;
  					setStrings();
  					break;
  				}
  			}
  		}
  	}
  
  	/**
  	 * Replace a search string with another within the source zone
  	 *
  	 * @param source java.lang.String - the source zone for search
  	 * @param search java.lang.String[] - the strings to search for replacement
  	 * @param replace java.lang.String - the replacement string
  	 */
  	private boolean replaceFrom( String source, String[] search, String replace ) {
  		boolean found = false;
  		if (source!=null)
  		{
  			for (int i = 0; i < search.length; i++) {
  				if ( source.endsWith( search[i] ))
  				{
  					sb.replace( sb.length() - search[i].length(), sb.length(), replace );
  					modified = true;
  					found = true;
  					setStrings();
  					break;
  				}
  			}
  		}
  		return found;
  	}
  
  	/**
  	 * Delete a search string within the source zone
  	 *
  	 * @param source the source zone for search
  	 * @param suffix the strings to search for suppression
  	 */
  	private void deleteFrom(String source, String[] suffix ) {
  		if (source!=null)
  		{
  			for (int i = 0; i < suffix.length; i++) {
  				if (source.endsWith( suffix[i] ))
  				{
  					sb.delete( sb.length() - suffix[i].length(), sb.length());
  					modified = true;
  					setStrings();
  					break;
  				}
  			}
  		}
  	}
  
  	/**
  	 * Test if a char is a french vowel, including accentuated ones
  	 *
  	 * @param ch the char to test
  	 * @return boolean - true if the char is a vowel
  	 */
  	private boolean isVowel(char ch) {
  		switch (ch)
  		{
  			case 'a':
  			case 'e':
  			case 'i':
  			case 'o':
  			case 'u':
  			case 'y':
  			case 'â':
  			case 'à':
  			case 'ë':
  			case 'é':
  			case 'ê':
  			case 'è':
  			case 'ï':
  			case 'î':
  			case 'ô':
  			case 'ü':
  			case 'ù':
  			case 'û':
  				return true;
  			default:
  				return false;
  		}
  	}
  
  	/**
  	 * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding
string<br>
  	 * "R is the region after the first non-vowel following a vowel
  	 * or is the null region at the end of the word if there is no such non-vowel"<br>
  	 * @param buffer java.lang.StringBuffer - the in buffer
  	 * @return java.lang.String - the resulting string
  	 */
  	private String retrieveR( StringBuffer buffer ) {
  		int len = buffer.length();
  		int pos = -1;
  		for (int c = 0; c < len; c++) {
  			if (isVowel( buffer.charAt( c )))
  			{
  				pos = c;
  				break;
  			}
  		}
  		if (pos > -1)
  		{
  			int consonne = -1;
  			for (int c = pos; c < len; c++) {
  				if (!isVowel(buffer.charAt( c )))
  				{
  					consonne = c;
  					break;
  				}
  			}
  			if (consonne > -1 && (consonne+1) < len)
  				return buffer.substring( consonne+1, len );
  			else
  				return null;
  		}
  		else
  			return null;
  	}
  
  	/**
  	 * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
  	 * "If the word begins with two vowels, RV is the region after the third letter,
  	 * otherwise the region after the first vowel not at the beginning of the word,
  	 * or the end of the word if these positions cannot be found."<br>
  	 * @param buffer java.lang.StringBuffer - the in buffer
  	 * @return java.lang.String - the resulting string
  	 */
  	private String retrieveRV( StringBuffer buffer ) {
  		int len = buffer.length();
  		if ( buffer.length() > 3)
  		{
  			if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
  				return buffer.substring(3,len);
  			}
  			else
  			{
  				int pos = 0;
  				for (int c = 1; c < len; c++) {
  					if (isVowel( buffer.charAt( c )))
  					{
  						pos = c;
  						break;
  					}
  				}
  				if ( pos+1 < len )
  					return buffer.substring( pos+1, len );
  				else
  					return null;
  			}
  		}
  		else
  			return null;
  	}
  
  
  
      /**
  	 * Turns u and i preceded AND followed by a vowel to UpperCase<br>
  	 * Turns y preceded OR followed by a vowel to UpperCase<br>
  	 * Turns u preceded by q to UpperCase<br>
       *
       * @param buffer java.util.StringBuffer - the buffer to treat
       * @return java.util.StringBuffer - the treated buffer
       */
      private StringBuffer treatVowels( StringBuffer buffer ) {
  		for ( int c = 0; c < buffer.length(); c++ ) {
  			char ch = buffer.charAt( c );
  
  			if (c == 0) // first char
  			{
  				if (buffer.length()>1)
  				{
  					if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
  						buffer.setCharAt( c, 'Y' );
  				}
  			}
  			else if (c == buffer.length()-1) // last char
  			{
  				if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
  					buffer.setCharAt( c, 'U' );
  				if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
  					buffer.setCharAt( c, 'Y' );
  			}
  			else // other cases
  			{
  				if (ch == 'u')
  				{
  					if (buffer.charAt( c - 1) == 'q')
  						buffer.setCharAt( c, 'U' );
  					else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
  						buffer.setCharAt( c, 'U' );
  				}
  				if (ch == 'i')
  				{
  					if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
  						buffer.setCharAt( c, 'I' );
  				}
  				if (ch == 'y')
  				{
  					if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
  						buffer.setCharAt( c, 'Y' );
  				}
  			}
  		}
  
  		return buffer;
      }
  
      /**
       * Checks a term if it can be processed correctly.
       *
       * @return boolean - true if, and only if, the given term consists in letters.
       */
      private boolean isStemmable( String term ) {
  		boolean upper = false;
  		int first = -1;
  		for ( int c = 0; c < term.length(); c++ ) {
  			// Discard terms that contain non-letter characters.
  			if ( !Character.isLetter( term.charAt( c ) ) ) {
  				return false;
  			}
  			// Discard terms that contain multiple uppercase letters.
  			if ( Character.isUpperCase( term.charAt( c ) ) ) {
  				if ( upper ) {
  					return false;
  				}
  			// First encountered uppercase letter, set flag and save
  			// position.
  				else {
  					first = c;
  					upper = true;
  				}
  			}
  		}
  		// Discard the term if it contains a single uppercase letter that
  		// is not starting the term.
  		if ( first > 0 ) {
  			return false;
  		}
  		return true;
      }
  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message