lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jm <jmugur...@gmail.com>
Subject Re: analizer not doing the same thing at index and query time?
Date Tue, 12 Jul 2011 12:52:15 GMT
*Here is a self-contained code example:
*
*
*
*
I verified with Luke that no 's' is indexed in the index. The output I get is:
testChars
bbbb:(bloom's*) got 0 Query is: bbbb:bloom's*
bbbb:(bloom) got 1 Query is: bbbb:bloom
bbbb:(bloom AND b*) got 1 Query is: +bbbb:bloom +bbbb:b*

So what I don't understand is why the ' in the first query is not being
removed.
thanks


/**
 * Self-contained reproduction: indexes two small documents with
 * {@link MyAnalyzer} and prints the hit count of three parsed queries so that
 * index-time and query-time analysis can be compared.
 */
public class AnalyzerTest {

    /** Field every sample document is indexed under; also the parser's default field. */
    private static final String FIELD = "bbbb";

    /**
     * Builds the index, then runs the three sample queries against it.
     *
     * @param args unused
     * @throws IOException    if the index cannot be written or read
     * @throws ParseException if one of the query strings cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        System.out.println("testChars ");
        Analyzer analyzer = getAnalyzer();

        // On-disk index; substitute `new RAMDirectory()` to keep it in memory.
        // NOTE(review): this IndexWriter constructor presumably appends to an
        // existing index, so re-running against the same path would accumulate
        // duplicate documents and inflate the hit counts - confirm.
        Directory directory = FSDirectory.open(new File("d:\\temp\\lucene.index"));

        IndexWriter writer = new IndexWriter(directory, analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            addAnalyzedDocument(writer, "bloom's bird");
            addAnalyzedDocument(writer, "ungry abloom card");
        } finally {
            // Always release the writer, even when adding a document fails.
            writer.close();
        }

        // Same analyzer instance is used at query time as at index time.
        // NOTE(review): QueryParser appears not to pass wildcard/prefix terms
        // through the analyzer, which would explain why "bloom's*" keeps its
        // apostrophe and matches nothing - confirm against the QueryParser docs.
        QueryParser qp = new QueryParser(Version.LUCENE_24, FIELD, analyzer);
        printHitCountQP(directory, qp, "bbbb:(bloom's*)");
        printHitCountQP(directory, qp, "bbbb:(bloom)");
        printHitCountQP(directory, qp, "bbbb:(bloom AND b*)");
    }

    /** Returns the analyzer shared by indexing and query parsing. */
    private static Analyzer getAnalyzer() {
        return new MyAnalyzer();
    }

    /** Adds one document whose only field is {@code text}, analyzed and not stored. */
    private static void addAnalyzedDocument(IndexWriter writer, String text)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field(FIELD, text, Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }

    /**
     * Parses {@code searchString}, runs it against {@code directory} and prints
     * the total hit count together with the parsed query.
     *
     * @param directory    index to search
     * @param qp           parser used to turn the string into a query
     * @param searchString query in QueryParser syntax
     * @throws IOException    if the index cannot be opened or searched
     * @throws ParseException if {@code searchString} is not a valid query
     */
    protected static void printHitCountQP(Directory directory, QueryParser qp,
            String searchString) throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher(directory, true);
        Query query;
        int hitCount;
        try {
            query = qp.parse(searchString);
            // Only the total is needed, so a single top hit is requested.
            hitCount = searcher.search(query, 1).totalHits;
        } finally {
            // Close the searcher even when parsing or searching throws.
            searcher.close();
        }
        System.out.println(searchString + " got " + hitCount + " Query is: "
                + query.toString());
    }
}

/**
 * Analyzer that tokenizes with {@link MyLowerCaseLetterNumberTokenizer}, then
 * applies a {@code LengthFilter} and a {@code StopFilter}.
 *
 * <p>Both {@link #tokenStream} and {@link #reusableTokenStream} build their
 * chain through the same helper so the two paths cannot drift apart.
 */
class MyAnalyzer extends Analyzer {

    private static final String[] STOPS = { "i", "s" };

    /** Length bounds handed to {@code LengthFilter}. */
    private static final int MIN_WORD_LENGTH = 2;
    private static final int MAX_WORD_LENGTH = 2000;

    private final Set<?> stopWords;
    private final boolean enablePositionIncrements;

    /** Creates the analyzer with the built-in stop list and the LUCENE_24 position-increment default. */
    public MyAnalyzer() {
        this.stopWords = StopFilter.makeStopSet(STOPS);
        this.enablePositionIncrements =
                StopFilter.getEnablePositionIncrementsVersionDefault(Version.LUCENE_24);
    }

    /**
     * Returns the stop-word set built from the built-in list.
     * The raw return type is kept so existing callers continue to compile.
     */
    public Set getStopWords() {
        return stopWords;
    }

    /**
     * Builds a fresh tokenizer and filter chain over {@code reader}.
     *
     * @param fieldName ignored; every field is analyzed the same way
     * @param reader    text to analyze
     * @return the end of the filter chain
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return buildFilterChain(new MyLowerCaseLetterNumberTokenizer(reader));
    }

    /**
     * Returns the chain saved via {@code setPreviousTokenStream}, re-pointed at
     * {@code reader}; builds and saves a new one when none has been stored yet.
     *
     * @param fieldName ignored; every field is analyzed the same way
     * @param reader    text to analyze
     * @return the end of the (possibly reused) filter chain
     * @throws IOException if resetting the saved tokenizer fails
     */
    @Override
    public TokenStream reusableTokenStream(String fieldName, Reader reader)
            throws IOException {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream();
        if (streams == null) {
            Tokenizer source = new MyLowerCaseLetterNumberTokenizer(reader);
            streams = new SavedStreams(source, buildFilterChain(source));
            setPreviousTokenStream(streams);
        } else {
            // Reuse the saved chain: only the tokenizer's input changes.
            streams.source.reset(reader);
        }
        return streams.result;
    }

    /** Wraps {@code source} in the length filter followed by the stop filter. */
    private TokenStream buildFilterChain(TokenStream source) {
        TokenStream lengthFiltered =
                new LengthFilter(source, MIN_WORD_LENGTH, MAX_WORD_LENGTH);
        // Final argument is passed as `true`; presumably StopFilter's
        // ignoreCase flag - confirm against the StopFilter constructor docs.
        return new StopFilter(enablePositionIncrements, lengthFiltered, stopWords, true);
    }

    /** Tokenizer plus the end of its filter chain, kept together for reuse. */
    private static final class SavedStreams {
        final Tokenizer source;
        final TokenStream result;

        SavedStreams(Tokenizer source, TokenStream result) {
            this.source = source;
            this.result = result;
        }
    }
}

class MyLowerCaseLetterNumberTokenizer extends LetterTokenizer {

    public MyLowerCaseLetterNumberTokenizer(Reader in) {
        super(in);
    }

    public MyLowerCaseLetterNumberTokenizer(AttributeSource source, Reader
in) {
        super(source, in);
    }

    public MyLowerCaseLetterNumberTokenizer(AttributeFactory factory, Reader
in) {
        super(factory, in);
    }

    protected boolean isTokenChar(char c) {
        return Character.isLetterOrDigit(c);
    }

    protected char normalize(char c) {
        return Character.toLowerCase(c);
    }
}

*

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message