lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Michael Bell" <...@gwava.com>
Subject Reverse wildcarding
Date Mon, 27 Feb 2012 20:33:45 GMT
(This is an expanded version of the post I made before in the hopes someone will comment)

I am trying to port the reverse wildcard support from SOLR to base Lucene.

In broad strokes, I will use a per-field analyzer map with the IndexWriter, such that the fields
I want indexed both ways (forward and reversed)
will use my "SuperAnalyzer" (StandardAnalyzer + ReversedWildcardFilter).

When using IndexSearcher, I use an extended version of QueryParser that, for these fields, reverses
the term as necessary. The Analyzer passed here is JUST StandardAnalyzer, not SuperAnalyzer.

1. Is this the right approach?
2. Please glance at SuperAnalyzer. Am I implementing things right? It's hard to know what
to override when extending the Analyzer class.

This SEEMS to work in tests, but obviously I am concerned about missing a subtlety.

Superanalyzer
=========
/**
 * Analyzer decorator: tokenization is delegated to a wrapped analyzer and the
 * resulting stream is passed through the filter produced by a
 * {@link ReversedWildcardFilterFactory}.
 */
public class SuperAnalyzer extends Analyzer {

	private final Analyzer base;
	private final ReversedWildcardFilterFactory filter;

	/**
	 * @param base   analyzer that produces the underlying token stream
	 * @param filter factory whose {@code create} method wraps every stream obtained from {@code base}
	 */
	public SuperAnalyzer(Analyzer base, ReversedWildcardFilterFactory filter) {
		this.base = base;
		this.filter = filter;
	}

	/** Returns whatever gap the wrapped analyzer reports for {@code fieldName}. */
	@Override
	public int getPositionIncrementGap(String fieldName) {
		return this.base.getPositionIncrementGap(fieldName);
	}

	/** Builds the wrapped analyzer's stream for the field and hands it to the filter factory. */
	@Override
	public TokenStream tokenStream(String fieldName, Reader reader) {
		return this.filter.create(this.base.tokenStream(fieldName, reader));
	}

	/**
	 * Same as {@link #tokenStream(String, Reader)} but starts from the wrapped
	 * analyzer's reusable stream; the factory is still invoked on every call.
	 *
	 * @throws IOException if the wrapped analyzer fails to provide its stream
	 */
	@Override
	public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
		return this.filter.create(this.base.reusableTokenStream(fieldName, reader));
	}

	// No other Analyzer methods are overridden here.
}

ReversedWildcardFilterFactory (greatly simplified from SOLR)
========================================

/**
 * Creates {@code ReversedWildcardFilter} instances for indexing and decides,
 * at query time, whether a wildcard term should be looked up in reversed form.
 */
public class ReversedWildcardFilterFactory {

  private final char markerChar = ReverseStringFilter.START_OF_HEADING_MARKER;
  private final boolean withOriginal;
  private final int maxPosAsterisk;
  private final int maxPosQuestion;
  private final int minTrailing;
  private final float maxFractionAsterisk;

  /**
   * @param withOriginal        forwarded unchanged to every filter this factory creates
   * @param maxPosAsterisk      a first {@code '*'} at an index below this value triggers reversal
   * @param maxPosQuestion      a first {@code '?'} at an index below this value triggers reversal
   * @param minTrailing         reversal is refused when {@code length - lastWildcardIndex}
   *                            is smaller than this value
   * @param maxFractionAsterisk when positive, reversal is also triggered if the first wildcard
   *                            index is below {@code length * maxFractionAsterisk}
   */
  public ReversedWildcardFilterFactory(boolean withOriginal, int maxPosAsterisk, int maxPosQuestion,
      int minTrailing, float maxFractionAsterisk) {
    this.withOriginal = withOriginal;
    this.maxPosAsterisk = maxPosAsterisk;
    this.maxPosQuestion = maxPosQuestion;
    this.minTrailing = minTrailing;
    this.maxFractionAsterisk = maxFractionAsterisk;
  }

  /** Uses the defaults {@code (true, 2, 1, 2, 0.0f)}. */
  public ReversedWildcardFilterFactory() {
    this(true, 2, 1, 2, 0.0f);
  }

  /** Wraps {@code input} in a new {@code ReversedWildcardFilter} using this factory's marker. */
  public TokenStream create(TokenStream input) {
    return new ReversedWildcardFilter(input, this.withOriginal, this.markerChar);
  }

  /**
   * Decides whether a query token should be reversed so that it can be
   * matched against the reversed terms in the index.
   *
   * @param token input token.
   * @return true if input token should be reversed, false otherwise.
   */
  public boolean shouldReverse(String token) {
    final int firstQuestion = token.indexOf('?');
    final int firstAsterisk = token.indexOf('*');
    final boolean hasQuestion = firstQuestion != -1;
    final boolean hasAsterisk = firstAsterisk != -1;

    // Not a wildcard query at all.
    if (!hasQuestion && !hasAsterisk) {
      return false;
    }

    // Too few characters counted from the last wildcard to the end of the token.
    final int lastWildcard = Math.max(token.lastIndexOf('?'), token.lastIndexOf('*'));
    if (token.length() - lastWildcard < this.minTrailing) {
      return false;
    }

    // Leading '?'.
    if (hasQuestion && firstQuestion < this.maxPosQuestion) {
      return true;
    }

    // Leading '*'.
    if (hasAsterisk && firstAsterisk < this.maxPosAsterisk) {
      return true;
    }

    // Otherwise: is the earliest wildcard within the configured leading fraction?
    final int firstWildcard;
    if (hasQuestion && hasAsterisk) {
      firstWildcard = Math.min(firstQuestion, firstAsterisk);
    } else if (hasQuestion) {
      firstWildcard = firstQuestion;
    } else {
      firstWildcard = firstAsterisk;
    }
    return this.maxFractionAsterisk > 0.0f
        && firstWildcard < token.length() * this.maxFractionAsterisk;
  }

  /** Returns the marker character that prefixes reversed terms. */
  public char getMarkerChar() {
    return this.markerChar;
  }

}


SolrQueryParser (greatly simplified from SOLR)
================================
/**
 * QueryParser extension that rewrites wildcard terms into their reversed,
 * marker-prefixed form for a configured set of fields.
 */
public class SolrQueryParser extends QueryParser {

  protected final ReversedWildcardFilterFactory reverseFactory;
  protected final Set<String> fieldsToReverse;

  /** Creates a parser with no reverse factory and no reversible fields. */
  public SolrQueryParser(Version version, String defaultField, Analyzer analyzer) {
    this(version, defaultField, analyzer, null, null);
  }

  /**
   * @param version                forwarded to {@code QueryParser}
   * @param defaultField           forwarded to {@code QueryParser}
   * @param analyzer               forwarded to {@code QueryParser}
   * @param reverseFactory         decides when to reverse and supplies the marker character; may be null
   * @param fieldsToSupportReverse fields eligible for reversal; copied, and null is treated as empty
   */
  public SolrQueryParser(Version version, String defaultField, Analyzer analyzer,
      ReversedWildcardFilterFactory reverseFactory, Set<String> fieldsToSupportReverse) {
    super(version, defaultField, analyzer);
    this.reverseFactory = reverseFactory;
    if (fieldsToSupportReverse == null) {
      this.fieldsToReverse = new HashSet<String>();
    } else {
      this.fieldsToReverse = new HashSet<String>(fieldsToSupportReverse);
    }
    //setLowercaseExpandedTerms(false);
    //setEnablePositionIncrements(true);
    checkAllowLeadingWildcards();
  }

  /**
   * Turns on leading wildcards when a reverse factory is present and at least
   * one field is registered for reversal; otherwise leaves the setting untouched.
   */
  protected void checkAllowLeadingWildcards() {
    // should be enabled on a per-field basis
    final boolean reversalConfigured =
        this.reverseFactory != null && !this.fieldsToReverse.isEmpty();
    if (reversalConfigured) {
      setAllowLeadingWildcard(true);
    }
  }

  /**
   * Builds the wildcard query for {@code field}. {@code *:*} becomes a
   * match-all query. For reversible fields where the factory asks for it, the
   * term is replaced by the marker character followed by the reversed term
   * before being handed to the superclass.
   */
  @Override
  protected Query getWildcardQuery(String field, String termStrp) throws ParseException {
    // *:* -> MatchAllDocsQuery
    if ("*".equals(field) && "*".equals(termStrp)) {
      return newMatchAllDocsQuery();
    }

    String effectiveTerm = termStrp;
    final ReversedWildcardFilterFactory factory = this.reverseFactory;
    final boolean reverse = factory != null
        && this.fieldsToReverse.contains(field)
        && factory.shouldReverse(termStrp);
    if (reverse) {
      // Layout of the buffer: [marker][reversed characters of the original term]
      final int length = termStrp.length();
      final char[] buffer = new char[length + 1];
      buffer[0] = factory.getMarkerChar();
      termStrp.getChars(0, length, buffer, 1);
      ReversedWildcardFilter.reverse(buffer, 1, length);
      effectiveTerm = new String(buffer);
    }

    final Query parsed = super.getWildcardQuery(field, effectiveTerm);
    if (!(parsed instanceof WildcardQuery)) {
      return parsed;
    }
    // The parser-produced WildcardQuery is replaced by one rebuilt from its term alone.
    return new WildcardQuery(((WildcardQuery) parsed).getTerm());
  }
}

(ReversedWildcardFilter itself is more or less as it is in Lucene-Contrib)



---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message