Date: Thu, 5 May 2016 14:21:18 -0500 (CDT)
From: Daniel Bigham
To: java-user@lucene.apache.org
Subject: Re: StopFilterFactory with french_stop.txt

For the time being I seem to be able to do this by using a custom TokenFilterFactory class as follows. If there is a better approach, or if this approach seems flawed, let me know. Thanks.

package com.wolfram.textsearch;

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;

public class MultiLanguageStopWordFilterFactory extends TokenFilterFactory {

    String language = "English";
    private CharArraySet stopWords;
    private final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

    public MultiLanguageStopWordFilterFactory(Map<String, String> args) throws IOException {
        super(args);
        language = get(args, "language");
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }

        // 0 = load "stopwords.txt" from the analyzer's package,
        // 1 = load a Snowball-format list from the snowball package,
        // 2 = use StandardAnalyzer's built-in English stop set.
        int stopwordStyle = 0;
        String commentChar = "#";
        Class<?> analyzerClass = null;
        String stopwordFile = DEFAULT_STOPWORD_FILE;

        switch (language) {
            case "Arabic": analyzerClass = ArabicAnalyzer.class; break;
            case "Bulgarian": analyzerClass = BulgarianAnalyzer.class; break;
            case "Catalan": analyzerClass = CatalanAnalyzer.class; break;
            case "Chinese": analyzerClass = CJKAnalyzer.class; break;
            case "Japanese": analyzerClass = CJKAnalyzer.class; break;
            case "Korean": analyzerClass = CJKAnalyzer.class; break;
            case "KurdishCentral": analyzerClass = SoraniAnalyzer.class; break;
            case "Czech": analyzerClass = CzechAnalyzer.class; break;
            case "Danish": stopwordStyle = 1; stopwordFile = "danish_stop.txt"; break;
            case "German": stopwordStyle = 1; stopwordFile = "german_stop.txt"; break;
            case "Greek": analyzerClass = GreekAnalyzer.class; break;
            case "English": stopwordStyle = 2; break;
            case "Spanish": stopwordStyle = 1; stopwordFile = "spanish_stop.txt"; break;
            case "Basque": analyzerClass = BasqueAnalyzer.class; break;
            case "Persian": analyzerClass = PersianAnalyzer.class; break;
            case "Finnish": stopwordStyle = 1; stopwordFile = "finnish_stop.txt"; break;
            case "French": stopwordStyle = 1; stopwordFile = "french_stop.txt"; break;
            case "GaelicIrish": analyzerClass = IrishAnalyzer.class; break;
            case "Galician": analyzerClass = GalicianAnalyzer.class; break;
            case "Hindi": analyzerClass = HindiAnalyzer.class; break;
            case "Hungarian": stopwordStyle = 1; stopwordFile = "hungarian_stop.txt"; break;
            case "Armenian": analyzerClass = ArmenianAnalyzer.class; break;
            case "Indonesian": analyzerClass = IndonesianAnalyzer.class; break;
            case "Italian": stopwordStyle = 1; stopwordFile = "italian_stop.txt"; break;
            case "Lithuanian": analyzerClass = LithuanianAnalyzer.class; break;
            case "Latvian": analyzerClass = LatvianAnalyzer.class; break;
            case "Dutch": stopwordStyle = 1; stopwordFile = "dutch_stop.txt"; break;
            case "Norwegian": stopwordStyle = 1; stopwordFile = "norwegian_stop.txt"; break;
            case "Portuguese": stopwordStyle = 1; stopwordFile = "portuguese_stop.txt"; break;
            case "Romanian": analyzerClass = RomanianAnalyzer.class; break;
            case "Russian": stopwordStyle = 1; stopwordFile = "russian_stop.txt"; break;
            case "Swedish": stopwordStyle = 1; stopwordFile = "swedish_stop.txt"; break;
            case "Thai": analyzerClass = ThaiAnalyzer.class; break;
            case "Turkish": analyzerClass = TurkishAnalyzer.class; break;
        }

        if (stopwordStyle == 0) {
            stopWords = loadStopwordSet(false, analyzerClass, stopwordFile, commentChar);
        } else if (stopwordStyle == 1) {
            stopWords = WordlistLoader.getSnowballWordSet(
                IOUtils.getDecodingReader(SnowballFilter.class, stopwordFile, StandardCharsets.UTF_8));
        } else if (stopwordStyle == 2) {
            stopWords = StandardAnalyzer.STOP_WORDS_SET;
        }
    }

    /**
     * Load a stop word set.
     *
     * @param ignoreCase whether the resulting set should ignore case.
     * @param aClass the associated analyzer; the resource is resolved relative to its package.
     * @param resource the stopword file.
     * @param comment the character used in the file to indicate a comment.
     *
     * @return a set of stopwords.
     *
     * @throws IOException if the resource cannot be read.
     */
    static CharArraySet loadStopwordSet(boolean ignoreCase, final Class<?> aClass,
            final String resource, final String comment) throws IOException {
        Reader reader = null;
        try {
            reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8);
            return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
        } finally {
            IOUtils.close(reader);
        }
    }

    @Override
    public TokenStream create(TokenStream input) {
        StopFilter stopFilter = new StopFilter(input, stopWords);
        return stopFilter;
    }
}
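In case it helps, this is roughly how the factory gets wired up on my end. It is only a minimal sketch: the standard tokenizer and lower-case filter are placeholders for whatever your real chain uses, and "French" is just the value my "language" parameter expects.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;

// Build an analyzer whose stop filter is driven by the "language" parameter.
Analyzer analyzer = CustomAnalyzer.builder()
    .withTokenizer(StandardTokenizerFactory.class)
    .addTokenFilter(LowerCaseFilterFactory.class)
    .addTokenFilter(MultiLanguageStopWordFilterFactory.class, "language", "French")
    .build();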
----- On May 5, 2016, at 2:02 PM, danielb wrote:

> I'd like to use CustomAnalyzer to create an analyzer that is much like
> the FrenchAnalyzer. In doing that, I'm using StopFilterFactory, but I'm
> unsure how to point it at "french_stop.txt", i.e. what FrenchAnalyzer is
> using here:
>
>     public final class FrenchAnalyzer extends StopwordAnalyzerBase {
>         public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
>         ...
>
> The typical use of StopFilterFactory:
>
>     .addTokenFilter(StopFilterFactory.class, "ignoreCase", "false",
>         "words", "french_stop.txt", "format", "wordset")
>
> But this looks for a file "french_stop.txt" and can't find it
> (presumably it's looking in a completely different location from
> FrenchAnalyzer).
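And to double-check that the factory actually picks up the right list, I've been dumping the token stream like this (the field name and sample text are arbitrary):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Print the surviving tokens; French stop words such as "le" and "est" should be dropped.
try (TokenStream ts = analyzer.tokenStream("body", "le chat est sur la table")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();
}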