lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Paul Taylor <paul_t...@fastmail.fm>
Subject Re: Uable to extends TopTermsRewrite in Lucene 4.1
Date Wed, 27 Feb 2013 09:52:39 GMT
On 26/02/2013 18:01, Paul Taylor wrote:
> On 26/02/2013 17:22, Uwe Schindler wrote:
>>> Hi,
>>>
>>> You cannot override rewrite() because you could easily break the logic
>>> behind TopTermsRewrite. If you want another behavior, subclass another
>>> base class and wrap the TopTermsRewrite instead of subclassing it (the
>>> generics also enforce that the rewrite needs to rewrite() to a class 
>>> that’s
>>> specified in the generics parameter).
>>>
>>> addClause() is not final, its abstract. There is one "final" helper 
>>> method used
>>> by the rewrite itself, but the methods you need to override are 
>>> abstract.
>>>
>>> Also your generics seem to be wrong, leading to the above question...
>> In addition, you cast the call to super.rewrite() to DisjMaxQuery, so 
>> it is definitely a DisjMaxQuery (because getTopLevelQuery() always 
>> returns one, see generics). You then pass this DisjMaxQuery to this 
>> "getQueryBoostMethod", which checks for instanceof PrefixQuery. This 
>> can never return true, so the boost is always 1. You can therefore 
>> nuke the whole rewrite method (as it changes nothing) and only 
>> implement getToplevelQuery() and addClause().
>>
>> Uwe
I'm not making much sense of this. I'm trying to use the same rewrite method for

QueryParser

and

FuzzyQuery
PrefixQuery

I'm confused as to whether I should be applying it at both stages, and what 
the generic parameter should be, 
as the javadoc for QueryParser.setMultiTermRewriteMethod() implies you 
need to change this to use a different rewrite for fuzzy and prefix 
queries, but you seem to be saying I should be using FuzzyQuery as the 
generic type, which would prevent this, wouldn't it?

Is there a fuller explanation of rewrite methods anywhere?

Full class below if it makes things clearer

Paul

package org.musicbrainz.search.servlet;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.musicbrainz.search.LuceneVersion;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class DismaxQueryParser {

     // Placeholder field name: passed as the default field of the wrapped parser and
     // used by parse() as the field prefix put in front of every query string.
     public static String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";
     // Parser that parse() and addAlias() delegate to.
     protected DisjunctionQueryParser dqp;

     /**
      * No-argument constructor. It does not assign {@link #dqp}
      * (presumably intended for subclasses that install their own parser; confirm).
      */
     protected DismaxQueryParser() {
     }

     /**
      * Creates a parser backed by a {@link DisjunctionQueryParser} whose default field
      * is {@link #IMPOSSIBLE_FIELD_NAME}.
      *
      * @param analyzer analyzer handed to the underlying query parser
      */
     public DismaxQueryParser(org.apache.lucene.analysis.Analyzer analyzer) {
         dqp = new DisjunctionQueryParser(IMPOSSIBLE_FIELD_NAME, analyzer);
         //TODO FIXME
         //dqp.setMultiTermRewriteMethod(new MultiTermUseIdfOfSearchTerm(100));
     }

     /**
      * Builds the top-level query for a raw query string.
      * <p>
      * The text is parsed twice against {@link #IMPOSSIBLE_FIELD_NAME}: first wrapped in
      * parentheses (the term form), then wrapped in double quotes (the phrase form).
      * The two results are combined by {@link #buildTopQuery(Query, Query)}.
      *
      * @param query raw query text; it is concatenated into the parsed expression as-is
      * @return the query returned by {@code buildTopQuery} for the two parsed forms
      * @throws org.apache.lucene.queryparser.classic.ParseException if either parse fails
      */
     public Query parse(String query)
             throws org.apache.lucene.queryparser.classic.ParseException {
         final String fieldPrefix = DismaxQueryParser.IMPOSSIBLE_FIELD_NAME + ":";
         final Query termQuery = dqp.parse(fieldPrefix + "(" + query + ")");
         final Query phraseQuery = dqp.parse(fieldPrefix + "\"" + query + "\"");
         return buildTopQuery(termQuery, phraseQuery);
     }

     /**
      * Combines the term query and the phrase query into the final query.
      * <p>
      * If {@code phrase} is a {@link DisjunctionMaxQuery}, the result is a
      * {@link BooleanQuery} in which {@code term} must match and {@code phrase} is an
      * optional clause. Under normal circumstances, if nothing matches the term query
      * nothing will match the phrase query either. In every other case {@code term} is
      * returned unchanged.
      *
      * @param term   query built from the individual terms
      * @param phrase query built from the quoted form of the input
      * @return the combined boolean query, or {@code term} when no phrase query was built
      */
     protected Query buildTopQuery(Query term, Query phrase) {
         if (!(phrase instanceof DisjunctionMaxQuery)) {
             return term;
         }
         final BooleanQuery combined = new BooleanQuery(true);
         combined.add(term, BooleanClause.Occur.MUST);
         combined.add(phrase, BooleanClause.Occur.SHOULD);
         return combined;
     }


     /**
      * Registers an alias on the underlying {@link DisjunctionQueryParser}.
      *
      * @param field       name the alias is registered under
      * @param dismaxAlias alias definition to associate with {@code field}
      */
     public void addAlias(String field, DismaxAlias dismaxAlias) {
         this.dqp.addAlias(field, dismaxAlias);
     }

     static class DisjunctionQueryParser extends QueryParser {

         //Only make search terms that are this length fuzzy searchable 
and only match to terms that are also this length
         protected static final int MIN_FIELD_LENGTH_TO_MAKE_FUZZY = 4;
         protected static final float FUZZY_SIMILARITY = 0.5f;

         //Reduce boost of wildcard/fuzzy matches compared to exact matches
         protected static final float WILDCARD_BOOST_REDUCER = 0.8f;

         //Reduce phrase query scores otherwise there is too much 
difference between a document that matches on
         //phrase and one that doesn't quite.
         protected static final float PHRASE_BOOST_REDUCER = 0.2f;


         /**
          * Creates the parser for the project's configured Lucene version.
          *
          * @param defaultField field used when a query clause names no field
          * @param analyzer     analyzer passed through to the parent parser
          */
         public DisjunctionQueryParser(
                 String defaultField,
                 org.apache.lucene.analysis.Analyzer analyzer) {
             super(LuceneVersion.LUCENE_VERSION, defaultField, analyzer);
         }

         // Alias name -> DismaxAlias; consulted by getFieldQuery() to expand a field.
         protected Map<String, DismaxAlias> aliases =
                 new HashMap<String, DismaxAlias>(3);

         /**
          * Registers (or replaces) the alias stored under {@code field}.
          *
          * @param field       alias name used as the map key
          * @param dismaxAlias alias definition stored for that name
          */
         public void addAlias(String field, DismaxAlias dismaxAlias) {
             this.aliases.put(field, dismaxAlias);
         }

         // TODO FIXME: unable to create a rewrite method that uses the original idf.
         // Rewrite method used by prefix search and fuzzy search; uses the idf of the
         // original term.
         //MultiTermQuery.RewriteMethod fuzzyAndPrefixQueryRewriteMethod
         //        = new MultiTermUseIdfOfSearchTerm(200);

         protected boolean checkQuery(DisjunctionMaxQuery q, Query 
querySub, boolean quoted, DismaxAlias a, String f) {
             if (querySub != null) {
                 //if query was quoted but doesn't generate a phrase 
query we reject it
                 if ((!quoted) || (querySub instanceof PhraseQuery)) {
                     //Reduce phrase because will have matched both 
parts giving far too much score differential
                     if (quoted) {
                         querySub.setBoost(PHRASE_BOOST_REDUCER);
                     } else {
querySub.setBoost(a.getFields().get(f).getBoost());
                     }
                     q.add(querySub);
                     return true;
                 }
             }
             return false;
         }

         @Override
         //TODO FIXME was using a FLOAT similarity value of 0.5 but now 
chnaged to integral
         protected Query getFuzzyQuery(String field, String termStr, 
float minSimilarity) {
             Term t = new Term(field, termStr);
             FuzzyQuery fq = new FuzzyQuery(t,  2, 
MIN_FIELD_LENGTH_TO_MAKE_FUZZY);
             //TODO FIXME
             //fq.setRewriteMethod(fuzzyAndPrefixQueryRewriteMethod);
             return fq;
         }


         /**
          * Builds the query for one field, expanding registered aliases.
          * <p>
          * If {@code field} is an alias, a {@link DisjunctionMaxQuery} is built with one
          * sub-query per field of the alias. For unquoted text of at least
          * {@link #MIN_FIELD_LENGTH_TO_MAKE_FUZZY} characters whose sub-query is a
          * {@link TermQuery} on a field marked fuzzy, a fuzzy query and a prefix query
          * are added as well, each boosted by the field boost times
          * {@link #WILDCARD_BOOST_REDUCER}.
          * <p>
          * If {@code field} is not an alias, the call is delegated to the parent parser
          * and any exception is turned into a {@code null} result.
          *
          * @param field     field or alias name
          * @param queryText text to search for
          * @param quoted    whether the text was quoted in the original query
          * @return the query for the field, or {@code null} if no query could be built
          * @throws ParseException declared by the overridden method
          */
         @Override
         protected Query getFieldQuery(String field, String queryText, boolean quoted)
                 throws ParseException {
             if (!aliases.containsKey(field)) {
                 // Usual field. Failures are deliberately mapped to "no query": a null
                 // sub-query is rejected by checkQuery(), so the field is simply skipped.
                 try {
                     return super.getFieldQuery(field, queryText, quoted);
                 } catch (Exception ignored) {
                     return null;
                 }
             }

             DismaxAlias a = aliases.get(field);
             DisjunctionMaxQuery q = new DisjunctionMaxQuery(a.getTie());
             boolean ok = false;

             for (String f : a.getFields().keySet()) {
                 DismaxAlias.AliasField af = a.getFields().get(f);
                 boolean fuzzyCandidate =
                         !quoted && queryText.length() >= MIN_FIELD_LENGTH_TO_MAKE_FUZZY;

                 // f is a concrete field name here, so this recursion takes the
                 // non-alias path above unless f is itself registered as an alias.
                 Query querySub = getFieldQuery(f, queryText, quoted);

                 if (fuzzyCandidate && querySub instanceof TermQuery && af.isFuzzy()) {
                     Term t = ((TermQuery) querySub).getTerm();
                     Query queryWildcard = newPrefixQuery(new Term(t.field(), t.text()));
                     Query queryFuzzy = getFuzzyQuery(t.field(), t.text(), FUZZY_SIMILARITY);
                     queryFuzzy.setBoost(af.getBoost() * WILDCARD_BOOST_REDUCER);
                     q.add(queryFuzzy);
                     queryWildcard.setBoost(af.getBoost() * WILDCARD_BOOST_REDUCER);
                     q.add(queryWildcard);
                 }

                 if (checkQuery(q, querySub, quoted, a, f)) {
                     ok = true;
                 }
             }

             // Only return the disjunction if checkQuery() accepted at least one sub-query.
             return ok ? q : null;
         }

         /**
          * Creates the {@link PrefixQuery} for a prefix term. No rewrite method is set
          * on the query (the call that would do so is disabled below).
          *
          * @param prefix term holding the field and the prefix text
          * @return a new {@code PrefixQuery} for {@code prefix}
          */
         protected Query newPrefixQuery(Term prefix){
             final PrefixQuery prefixQuery = new PrefixQuery(prefix);
             //TODO FIXME
             //prefixQuery.setRewriteMethod(fuzzyAndPrefixQueryRewriteMethod);
             return prefixQuery;
         }
     }

     /*
     TODO FIXME WAS Overriding methods that are now final
     public static class MultiTermUseIdfOfSearchTerm<Q extends 
DisjunctionMaxQuery> extends TopTermsRewrite<Query> {

     //public static final class MultiTermUseIdfOfSearchTerm extends 
TopTermsRewrite<BooleanQuery> {
         private final TFIDFSimilarity similarity;

         public MultiTermUseIdfOfSearchTerm(int size) {
             super(size);
             this.similarity = new DefaultSimilarity();

         }

         @Override
         protected int getMaxSize() {
             return BooleanQuery.getMaxClauseCount();
         }

         @Override
         protected DisjunctionMaxQuery getTopLevelQuery() {
             return new DisjunctionMaxQuery(0.1f);
         }

         @Override
         protected void addClause(Query topLevel, Term term, float boost) {
             final Query tq = new ConstantScoreQuery(new TermQuery(term));
             tq.setBoost(boost);
             ((DisjunctionMaxQuery)topLevel).add(tq);
         }

         protected float getQueryBoost(final IndexReader reader, final 
MultiTermQuery query)
                 throws IOException {
             float idf = 1f;
             float df;
             if (query instanceof PrefixQuery)
             {
                 PrefixQuery fq = (PrefixQuery) query;
                 df = reader.docFreq(fq.getPrefix());
                 if(df>=1)
                 {
                     //Same as idf value for search term, 0.5 acts as 
length norm
                     idf = (float)Math.pow(similarity.idf((int) df, 
reader.numDocs()),2) * 0.5f;
                 }
             }
             return idf;
         }

         @Override
         public Query rewrite(final IndexReader reader, final 
MultiTermQuery query) throws IOException {
             DisjunctionMaxQuery  bq = 
(DisjunctionMaxQuery)super.rewrite(reader, query);

             float idfBoost = getQueryBoost(reader, query);
             Iterator<Query> iterator = bq.iterator();
             while(iterator.hasNext())
             {
                 Query next = iterator.next();
                 next.setBoost(next.getBoost() * idfBoost);
             }
             return bq;
         }

     }
     */
}

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message