lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Paul Taylor <paul_t...@fastmail.fm>
Subject Re: Dealing with special cases in analyser
Date Wed, 17 Mar 2010 15:34:40 GMT
Grant Ingersoll wrote:
> What's your current chain of TokenFilters?  How many exceptions do you expect?
> That is, could you enumerate them?
>   
Very few; yes, I could enumerate them, but I'm not sure exactly what you are 
suggesting. What I was going to do was add them to the charConvertMap 
(when I posted, I thought this was only for individual chars, not strings).


This is my analyzer:

package org.musicbrainz.search.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;


import java.io.IOException;
import java.io.Reader;

import com.ibm.icu.text.Transliterator;
import org.apache.lucene.util.Version;
import org.musicbrainz.search.LuceneVersion;

/**
* Filters StandardTokenizer with StandardFilter, ICUTransformFilter, 
AccentFilter, LowerCaseFilter
* and no stop words.
*/
public class StandardUnaccentAnalyzer extends Analyzer {

private NormalizeCharMap charConvertMap;

private void setCharConvertMap() {
charConvertMap = new NormalizeCharMap();
charConvertMap.add("&","and");

//Hebrew chars converted to western cases so matches both
charConvertMap.add("\u05f3","'");
charConvertMap.add("\u05be","-");
charConvertMap.add("\u05f4","\"");


}

public StandardUnaccentAnalyzer() {
setCharConvertMap();
}

public TokenStream tokenStream(String fieldName, Reader reader) {
CharFilter mappingCharFilter = new MappingCharFilter(charConvertMap,reader);
StandardTokenizer tokenStream = new 
StandardTokenizer(LuceneVersion.LUCENE_VERSION, mappingCharFilter);
TokenStream result = new ICUTransformFilter(tokenStream, 
Transliterator.getInstance("[ー[:Script=Katakana:]]Katakana-Hiragana"));
result = new ICUTransformFilter(result, 
Transliterator.getInstance("Traditional-Simplified"));
result = new StandardFilter(result);
result = new AccentFilter(result);
result = new LowercaseFilter(result);
return result;
}

private static final class SavedStreams {
StandardTokenizer tokenStream;
TokenStream filteredTokenStream;
}

public TokenStream reusableTokenStream(String fieldName, Reader reader) 
throws IOException {
SavedStreams streams = (SavedStreams)getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
setPreviousTokenStream(streams);
streams.tokenStream = new 
StandardTokenizer(LuceneVersion.LUCENE_VERSION, new 
MappingCharFilter(charConvertMap, reader));
streams.filteredTokenStream = new 
ICUTransformFilter(streams.tokenStream, Transliterator.getInstance("[ー 
[:Script=Katakana:]]Katakana-Hiragana"));
streams.filteredTokenStream = new 
ICUTransformFilter(streams.filteredTokenStream, 
Transliterator.getInstance("Traditional-Simplified"));
streams.filteredTokenStream = new 
StandardFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new AccentFilter(streams.filteredTokenStream);
streams.filteredTokenStream = new 
LowercaseFilter(streams.filteredTokenStream);
}
else {
streams.tokenStream.reset(new MappingCharFilter(charConvertMap,reader));
}
return streams.filteredTokenStream;
}
}


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message