lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Paul Taylor <paul_t...@fastmail.fm>
Subject Not getting matches for analyzers using CharMappingFilter with Lucene 4.1
Date Wed, 20 Feb 2013 11:28:35 GMT
Just updating codebase from Lucene 3.6 to Lucene 4.1 and seems my tests 
that use NormalizeCharMap for replacing characters in the anyalzers are 
not working.

Below Ive created a self-contained test case, this is the output when I 
run it


     --term=and--
     --term=gold--
     --term=platinum--
     name:"platinum and gold"
     Size1
     name:"platinum & gold"
     Size0

     java.lang.AssertionError:
     Expected :1
     Actual   :0
      <Click to see difference>
         at org.junit.Assert.fail(Assert.java:93)
         at org.junit.Assert.failNotEquals(Assert.java:647)
         at org.junit.Assert.assertEquals(Assert.java:128)
         at org.junit.Assert.assertEquals(Assert.java:472)
         at org.junit.Assert.assertEquals(Assert.java:456)
         at 
org.musicbrainz.search.analysis.Lucene41CharFilterTest.testAmpersandSearching(Lucene41CharFilterTest.java:89)

As you can see the charfilter does seem to work because the the text 
'platinum & gold' is converted to three terms 'platnum, and , gold'. In 
fact search is working for 'platinum and gold' but not working for the 
original "platinum & gold" even though both index and search using same 
analyzer. Maybe the problem is with the query parser, but its certainly 
related to 4.1 because worked previously.

thanks Paul


     package org.musicbrainz.search.analysis;

     import org.apache.lucene.analysis.Analyzer;
     import org.apache.lucene.analysis.TokenStream;
     import org.apache.lucene.analysis.Tokenizer;
     import org.apache.lucene.analysis.charfilter.MappingCharFilter;
     import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
     import org.apache.lucene.analysis.core.LowerCaseFilter;
     import org.apache.lucene.document.Document;
     import org.apache.lucene.document.Field;
     import org.apache.lucene.index.*;
     import org.apache.lucene.queryparser.classic.QueryParser;
     import org.apache.lucene.search.IndexSearcher;
     import org.apache.lucene.search.Query;
     import org.apache.lucene.search.TopDocs;
     import org.apache.lucene.store.RAMDirectory;
     import org.apache.lucene.util.BytesRef;
     import org.apache.lucene.util.Version;
     import org.junit.Test;
     import java.io.Reader;

     import static org.junit.Assert.assertEquals;

     public class Lucene41CharFilterTest
     {
         class SimpleAnalyzer extends Analyzer {

             protected NormalizeCharMap charConvertMap;

             protected void setCharConvertMap() {

                 NormalizeCharMap.Builder builder = new 
NormalizeCharMap.Builder();
                 builder.add("&","and");
                 charConvertMap = builder.build();
             }

             public SimpleAnalyzer() {
                 setCharConvertMap();
             }

             @Override
             protected TokenStreamComponents createComponents(String 
fieldName, Reader reader) {
                 Tokenizer source = new 
MusicbrainzTokenizer(Version.LUCENE_41,
                         new MappingCharFilter(charConvertMap, reader));
                 TokenStream filter = new 
LowerCaseFilter(Version.LUCENE_41,source);
                 return new TokenStreamComponents(source, filter);
             }
         }

         @Test
         public void testAmpersandSearching() throws Exception {

             Analyzer analyzer = new SimpleAnalyzer();
             RAMDirectory dir = new RAMDirectory();
             IndexWriterConfig writerConfig = new 
IndexWriterConfig(Version.LUCENE_41,analyzer);
             IndexWriter writer = new IndexWriter(dir, writerConfig);
             {
                 Document doc = new Document();
                 doc.add(new Field("name", "platinum & gold", 
Field.Store.YES, Field.Index.ANALYZED));
                 writer.addDocument(doc);
             }
             writer.close();

             IndexReader ir = DirectoryReader.open(dir);
             Fields fields = MultiFields.getFields(ir);
             Terms terms = fields.terms("name");
             TermsEnum termsEnum = terms.iterator(null);
             BytesRef text;
             while((text = termsEnum.next()) != null) {
                 System.out.println("--term=" + text.utf8ToString()+"--");
             }
             ir.close();

             IndexSearcher searcher = new 
IndexSearcher(IndexReader.open(dir));
             {
                 Query q = new QueryParser(Version.LUCENE_41, "name", 
analyzer).parse("\"platinum and gold\"");
                 System.out.println(q);
                 TopDocs td = searcher.search(q, 10);
                 System.out.println("Size"+td.scoreDocs.length);
                 assertEquals(1, searcher.search(q, 10).totalHits);
             }

             searcher = new IndexSearcher(IndexReader.open(dir));
             {
                 Query q = new QueryParser(Version.LUCENE_41, "name", 
analyzer).parse("\"platinum & gold\"");
                 System.out.println(q);
                 TopDocs td = searcher.search(q, 10);
                 System.out.println("Size"+td.scoreDocs.length);
                 assertEquals(1, searcher.search(q, 10).totalHits);
             }
         }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message