lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Paul Taylor <paul_t...@fastmail.fm>
Subject Re: Not getting matches for analyzers using CharMappingFilter with Lucene 4.1
Date Mon, 25 Feb 2013 21:16:59 GMT
On 20/02/2013 11:28, Paul Taylor wrote:
> Just updating the codebase from Lucene 3.6 to Lucene 4.1, and it seems my 
> tests that use NormalizeCharMap for replacing characters in the 
> analyzers are not working.
>
> Below Ive created a self-contained test case, this is the output when 
> I run it
>
>
>     --term=and--
>     --term=gold--
>     --term=platinum--
>     name:"platinum and gold"
>     Size1
>     name:"platinum & gold"
>     Size0
>
>     java.lang.AssertionError:
>     Expected :1
>     Actual   :0
>      <Click to see difference>
>         at org.junit.Assert.fail(Assert.java:93)
>         at org.junit.Assert.failNotEquals(Assert.java:647)
>         at org.junit.Assert.assertEquals(Assert.java:128)
>         at org.junit.Assert.assertEquals(Assert.java:472)
>         at org.junit.Assert.assertEquals(Assert.java:456)
>         at 
> org.musicbrainz.search.analysis.Lucene41CharFilterTest.testAmpersandSearching(Lucene41CharFilterTest.java:89)
>
> As you can see, the charfilter does seem to work because the text 
> 'platinum & gold' is converted to three terms: 'platinum', 'and', 'gold'. 
> In fact, search is working for 'platinum and gold' but not working for 
> the original "platinum & gold", even though both index and search use the 
> same analyzer. Maybe the problem is with the query parser, but it's 
> certainly related to 4.1 because it worked previously.
>
> thanks Paul
>
>
>     package org.musicbrainz.search.analysis;
>
>     import org.apache.lucene.analysis.Analyzer;
>     import org.apache.lucene.analysis.TokenStream;
>     import org.apache.lucene.analysis.Tokenizer;
>     import org.apache.lucene.analysis.charfilter.MappingCharFilter;
>     import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
>     import org.apache.lucene.analysis.core.LowerCaseFilter;
>     import org.apache.lucene.document.Document;
>     import org.apache.lucene.document.Field;
>     import org.apache.lucene.index.*;
>     import org.apache.lucene.queryparser.classic.QueryParser;
>     import org.apache.lucene.search.IndexSearcher;
>     import org.apache.lucene.search.Query;
>     import org.apache.lucene.search.TopDocs;
>     import org.apache.lucene.store.RAMDirectory;
>     import org.apache.lucene.util.BytesRef;
>     import org.apache.lucene.util.Version;
>     import org.junit.Test;
>     import java.io.Reader;
>
>     import static org.junit.Assert.assertEquals;
>
>     public class Lucene41CharFilterTest
>     {
>         class SimpleAnalyzer extends Analyzer {
>
>             protected NormalizeCharMap charConvertMap;
>
>             protected void setCharConvertMap() {
>
>                 NormalizeCharMap.Builder builder = new 
> NormalizeCharMap.Builder();
>                 builder.add("&","and");
>                 charConvertMap = builder.build();
>             }
>
>             public SimpleAnalyzer() {
>                 setCharConvertMap();
>             }
>
>             @Override
>             protected TokenStreamComponents createComponents(String 
> fieldName, Reader reader) {
>                 Tokenizer source = new 
> MusicbrainzTokenizer(Version.LUCENE_41,
>                         new MappingCharFilter(charConvertMap, reader));
>                 TokenStream filter = new 
> LowerCaseFilter(Version.LUCENE_41,source);
>                 return new TokenStreamComponents(source, filter);
>             }
>         }
>
>         @Test
>         public void testAmpersandSearching() throws Exception {
>
>             Analyzer analyzer = new SimpleAnalyzer();
>             RAMDirectory dir = new RAMDirectory();
>             IndexWriterConfig writerConfig = new 
> IndexWriterConfig(Version.LUCENE_41,analyzer);
>             IndexWriter writer = new IndexWriter(dir, writerConfig);
>             {
>                 Document doc = new Document();
>                 doc.add(new Field("name", "platinum & gold", 
> Field.Store.YES, Field.Index.ANALYZED));
>                 writer.addDocument(doc);
>             }
>             writer.close();
>
>             IndexReader ir = DirectoryReader.open(dir);
>             Fields fields = MultiFields.getFields(ir);
>             Terms terms = fields.terms("name");
>             TermsEnum termsEnum = terms.iterator(null);
>             BytesRef text;
>             while((text = termsEnum.next()) != null) {
>                 System.out.println("--term=" + text.utf8ToString()+"--");
>             }
>             ir.close();
>
>             IndexSearcher searcher = new 
> IndexSearcher(IndexReader.open(dir));
>             {
>                 Query q = new QueryParser(Version.LUCENE_41, "name", 
> analyzer).parse("\"platinum and gold\"");
>                 System.out.println(q);
>                 TopDocs td = searcher.search(q, 10);
>                 System.out.println("Size"+td.scoreDocs.length);
>                 assertEquals(1, searcher.search(q, 10).totalHits);
>             }
>
>             searcher = new IndexSearcher(IndexReader.open(dir));
>             {
>                 Query q = new QueryParser(Version.LUCENE_41, "name", 
> analyzer).parse("\"platinum & gold\"");
>                 System.out.println(q);
>                 TopDocs td = searcher.search(q, 10);
>                 System.out.println("Size"+td.scoreDocs.length);
>                 assertEquals(1, searcher.search(q, 10).totalHits);
>             }
>         }
>     }
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
>
FWIW

I found the issue: I had to override the new initReader() method for it to be 
applied when the analyzer is used with QueryParser. Not sure if the filter needs 
to be constructed in createComponents() as well, but tests are working if I 
just add it to initReader.

class SimpleAnalyzer extends Analyzer {

         protected NormalizeCharMap charConvertMap;

         protected void setCharConvertMap() {

             NormalizeCharMap.Builder builder = new 
NormalizeCharMap.Builder();
             builder.add("&","and");
             charConvertMap = builder.build();
         }

         public SimpleAnalyzer() {
             setCharConvertMap();
         }

         @Override
         protected TokenStreamComponents createComponents(String 
fieldName, Reader reader) {
             Tokenizer source = new MusicbrainzTokenizer(Version.LUCENE_35,
                     reader);
             TokenStream filter = new 
LowerCaseFilter(Version.LUCENE_35,source);
             return new TokenStreamComponents(source, filter);
         }

         @Override
         protected Reader initReader(String fieldName,
                                     Reader reader)
         {
             return new MappingCharFilter(charConvertMap, reader);
         }
     }

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message