lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Paul Elschot <paul.elsc...@xs4all.nl>
Subject Re: Regexp Query
Date Wed, 25 Aug 2004 23:11:48 GMT
On Wednesday 25 August 2004 22:06, Naseeruddin Mohammad wrote:
> I want to develop regular expression based search for lucene. Figured
> that I need to write a Query which processes it for regular
> expression. I am planning to use apache regexp api.
> I need suggestions and tips in doing so.
>
> I think it is analogs as WildCharQuery.

Or PrefixQuery.
One possibility is to split the regex into a constant prefix
and the remainder. Then walk the terms (much as in PrefixQuery)
that have the prefix and match their remainders to the remaining regex.
You then have all the terms to pass to a BooleanQuery.
There is a default maximum of 1024 clauses there, so it might
make sense to have a minimum length for the constant prefix.
(The maximum nr. of clauses exists because each search Term
requires a buffer.)

Regards,
Paul Elschot

P.S. This might be useful, just ignore the defined class name and its superclass.
(It's a part of earlier posted code.)

/* Licensed under the Apache License, Version 2.0 */

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;

import java.io.IOException;

import java.util.regex.Pattern;
import java.util.regex.Matcher;


/**
 * A truncated (wildcard) term that is expanded against the terms of an index.
 *
 * <p>The truncated text is split once, at construction time, into a constant
 * prefix (everything before the first wildcard character) and a regular
 * expression built from the remainder. The prefix is used to position the
 * term enumeration; the regular expression is matched against what follows
 * the prefix in each enumerated term.
 */
public class SrndTruncQuery extends SimpleTerm {
  /**
   * @param truncated the term text, possibly containing wildcard characters
   * @param unlimited wildcard character translated to the regular expression ".*"
   * @param mask wildcard character translated to the regular expression "."
   */
  public SrndTruncQuery(String truncated, char unlimited, char mask) {
    super(false); /* not quoted */
    this.truncated = truncated;
    this.unlimited = unlimited;
    this.mask = mask;
    truncatedToPrefixAndPattern();
  }

  private final String truncated;
  private final char unlimited;
  private final char mask;

  /* Both are derived from truncated by truncatedToPrefixAndPattern(). */
  private String prefix;
  private Pattern pattern;


  /** Returns the truncated text exactly as passed to the constructor. */
  public String getTruncated() {return truncated;}

  /** Returns the same text as {@link #getTruncated()}. */
  public String toStringUnquoted() {return getTruncated();}


  /** Returns true when c is neither of the two wildcard characters. */
  protected boolean matchingChar(char c) {
    return (c != unlimited) && (c != mask);
  }

  /**
   * Appends to re the regular expression fragment for a single character of
   * the truncated text: ".*" for the unlimited wildcard, "." for the mask
   * wildcard, and otherwise the character itself as a literal.
   *
   * <p>Literal characters that could be regular expression metacharacters
   * are escaped, so that for example a '.' or '(' in the truncated text only
   * matches itself and cannot make the pattern invalid.
   */
  protected void appendRegExpForChar(char c, StringBuffer re) {
    if (c == unlimited) {
      re.append(".*");
    } else if (c == mask) {
      re.append(".");
    } else {
      /* A backslash may precede any non-alphabetic character to quote it.
       * Letters and digits are never escaped (that would form an escape
       * construct or a back reference), and non-ASCII characters are left
       * alone so that surrogate pairs stay adjacent.
       */
      if ((c < 128) && (! Character.isLetterOrDigit(c))) {
        re.append('\\');
      }
      re.append(c);
    }
  }

  /**
   * Splits the truncated text into the constant prefix before the first
   * wildcard character and a compiled pattern for everything from that
   * wildcard onwards. Without any wildcard the prefix is the whole text and
   * the pattern is empty, so only the exact term matches.
   */
  protected void truncatedToPrefixAndPattern() {
    int i = 0;
    while ((i < truncated.length()) && matchingChar(truncated.charAt(i))) {
      i++;
    }
    prefix = truncated.substring(0, i);

    StringBuffer re = new StringBuffer();
    while (i < truncated.length()) {
      appendRegExpForChar(truncated.charAt(i), re);
      i++;
    }
    pattern = Pattern.compile(re.toString());
  }

  /**
   * Enumerates the terms of the given field starting at the prefix, and
   * passes each term whose text after the prefix matches the pattern to the
   * visitor. Enumeration stops at the first term that no longer has the
   * prefix or that belongs to another field.
   *
   * <p>When no term matched, a message is printed to standard output.
   *
   * @param reader the index reader providing the term enumeration
   * @param fieldName the field whose terms are visited
   * @param mtv receives every matching term
   * @throws IOException when enumerating the terms fails
   */
  public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException
  {
    boolean expanded = false;
    int prefixLength = prefix.length();
    TermEnum enumerator = reader.terms(new Term(fieldName, prefix));
    /* One matcher, reset for each term, instead of a new one per term. */
    Matcher matcher = pattern.matcher("");
    try {
      do {
        Term term = enumerator.term();
        if (term != null) {
          String text = term.text();
          if ((! text.startsWith(prefix)) || (! term.field().equals(fieldName))) {
            break; /* past the last term that can match */
          }
          matcher.reset(text.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(term);
            expanded = true;
          }
        }
      } while (enumerator.next());
    } finally {
      enumerator.close();
      matcher.reset();
    }
    if (! expanded) {
      System.out.println("No terms in " + fieldName + " field for: " + toString());
    }
  }
}


---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message