lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gol...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/search FuzzyQuery.java SloppyPhraseScorer.java PhraseScorer.java FuzzyTermEnum.java PhrasePrefixQuery.java PhraseQuery.java PhrasePositions.java ExactPhraseScorer.java
Date Fri, 01 Oct 2004 09:56:49 GMT
goller      2004/10/01 02:56:49

  Modified:    src/java/org/apache/lucene/search Tag: lucene_1_4_2_dev
                        FuzzyQuery.java SloppyPhraseScorer.java
                        PhraseScorer.java FuzzyTermEnum.java
                        PhrasePrefixQuery.java PhraseQuery.java
                        PhrasePositions.java ExactPhraseScorer.java
  Log:
  Extensions to FuzzyQuery, PhraseQuery, and
  PhrasePrefixQuery. (Backport)
  
  Revision  Changes    Path
  No                   revision
  No                   revision
  1.4.2.1   +70 -4     jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
  
  Index: FuzzyQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v
  retrieving revision 1.4
  retrieving revision 1.4.2.1
  diff -u -r1.4 -r1.4.2.1
  --- FuzzyQuery.java	29 Mar 2004 22:48:03 -0000	1.4
  +++ FuzzyQuery.java	1 Oct 2004 09:56:49 -0000	1.4.2.1
  @@ -20,17 +20,83 @@
   import org.apache.lucene.index.Term;
   import java.io.IOException;
   
  -/** Implements the fuzzy search query */
  +/** Implements the fuzzy search query. The similiarity measurement
  + * is based on the Levenshtein (edit distance) algorithm.
  + */
   public final class FuzzyQuery extends MultiTermQuery {
  -  public FuzzyQuery(Term term) {
  +  
  +  public final static float defaultMinSimilarity = 0.5f;
  +  private float minimumSimilarity;
  +  private int prefixLength;
  +  
  +  /**
  +   * Create a new FuzzyQuery that will match terms with a similarity 
  +   * of at least <code>minimumSimilarity</code> to <code>term</code>.
  +   * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
  +   * of that length is also required.
  +   * 
  +   * @param term the term to search for
  +   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
  +   *  between the query term and the matching terms. For example, for a
  +   *  <code>minimumSimilarity</code> of <code>0.5</code> a term
of the same length
  +   *  as the query term is considered similar to the query term if the edit distance
  +   *  between both terms is less than <code>length(term)*0.5</code>
  +   * @param prefixLength length of common (non-fuzzy) prefix
  +   * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
  +   * or if prefixLength &lt; 0 or &gt; <code>term.text().length()</code>.
  +   */
  +  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException
{
       super(term);
  +    
  +    if (minimumSimilarity > 1.0f)
  +      throw new IllegalArgumentException("minimumSimilarity > 1");
  +    else if (minimumSimilarity < 0.0f)
  +      throw new IllegalArgumentException("minimumSimilarity < 0");
  +    this.minimumSimilarity = minimumSimilarity;
  +    
  +    if(prefixLength < 0)
  +        throw new IllegalArgumentException("prefixLength < 0");
  +    else if(prefixLength >= term.text().length())
  +        throw new IllegalArgumentException("prefixLength >= term.text().length()");
  +    this.prefixLength = prefixLength;
  +  }
  +  
  +  /**
  +   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
  +   */
  +  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException
{
  +      this(term, minimumSimilarity, 0);
  +  }
  +
  +  /**
  +   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
  +   */
  +  public FuzzyQuery(Term term) {
  +    this(term, defaultMinSimilarity, 0);
  +  }
  +  
  +  /**
  +   * Returns the minimum similarity that is required for this query to match.
  +   * @return float value between 0.0 and 1.0
  +   */
  +  public float getMinSimilarity() {
  +    return minimumSimilarity;
     }
       
  +  /**
  +   * Returns the prefix length, i.e. the number of characters at the start
  +   * of a term that must be identical (not fuzzy) to the query term if the query
  +   * is to match that term. 
  +   */
  +  public int getPrefixLength() {
  +    return prefixLength;
  +  }
  +
     protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
  -    return new FuzzyTermEnum(reader, getTerm());
  +    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
     }
       
     public String toString(String field) {
  -    return super.toString(field) + '~';
  +    return super.toString(field) + '~' + Float.toString(minimumSimilarity);
     }
   }
  
  
  
  1.6.2.1   +3 -3      jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
  
  Index: SloppyPhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java,v
  retrieving revision 1.6
  retrieving revision 1.6.2.1
  diff -u -r1.6 -r1.6.2.1
  --- SloppyPhraseScorer.java	29 Mar 2004 22:48:04 -0000	1.6
  +++ SloppyPhraseScorer.java	1 Oct 2004 09:56:49 -0000	1.6.2.1
  @@ -23,9 +23,9 @@
   final class SloppyPhraseScorer extends PhraseScorer {
       private int slop;
   
  -    SloppyPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
  -                       int slop, byte[] norms) throws IOException {
  -        super(weight, tps, similarity, norms);
  +    SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity
similarity,
  +                       int slop, byte[] norms) {
  +        super(weight, tps, positions, similarity, norms);
           this.slop = slop;
       }
   
  
  
  
  1.14.2.1  +4 -3      jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java
  
  Index: PhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v
  retrieving revision 1.14
  retrieving revision 1.14.2.1
  diff -u -r1.14 -r1.14.2.1
  --- PhraseScorer.java	11 May 2004 17:18:04 -0000	1.14
  +++ PhraseScorer.java	1 Oct 2004 09:56:49 -0000	1.14.2.1
  @@ -32,8 +32,9 @@
   
     private float freq;
   
  -  PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
  -               byte[] norms) throws IOException {
  +
  +  PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
  +               byte[] norms) {
       super(similarity);
       this.norms = norms;
       this.weight = weight;
  @@ -41,7 +42,7 @@
   
       // convert tps to a list
       for (int i = 0; i < tps.length; i++) {
  -      PhrasePositions pp = new PhrasePositions(tps[i], i);
  +      PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
         if (last != null) {			  // add next to end of list
           last.next = pp;
         } else
  
  
  
  1.6.2.1   +55 -9     jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java
  
  Index: FuzzyTermEnum.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v
  retrieving revision 1.6
  retrieving revision 1.6.2.1
  diff -u -r1.6 -r1.6.2.1
  --- FuzzyTermEnum.java	11 May 2004 17:23:21 -0000	1.6
  +++ FuzzyTermEnum.java	1 Oct 2004 09:56:49 -0000	1.6.2.1
  @@ -26,21 +26,69 @@
     the enumeration is greater than all that precede it.  */
   public final class FuzzyTermEnum extends FilteredTermEnum {
       double distance;
  -    boolean fieldMatch = false;
       boolean endEnum = false;
   
       Term searchTerm = null;
       String field = "";
       String text = "";
       int textlen;
  +    String prefix = "";
  +    int prefixLength = 0;
  +    float minimumSimilarity;
  +    double scale_factor;
       
  +    
  +    /**
  +     * Empty prefix and minSimilarity of 0.5f are used.
  +     * 
  +     * @param reader
  +     * @param term
  +     * @throws IOException
  +     * @see #FuzzyTermEnum(IndexReader, Term, float, int)
  +     */
       public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
  +      this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
  +    }
  +    
  +    /**
  +     * This is the standard FuzzyTermEnum with an empty prefix.
  +     * 
  +     * @param reader
  +     * @param term
  +     * @param minSimilarity
  +     * @throws IOException
  +     * @see #FuzzyTermEnum(IndexReader, Term, float, int)
  +     */
  +    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException
{
  +      this(reader, term, minSimilarity, 0);
  +    }
  +    
  +    /**
  +     * Constructor for enumeration of all terms from specified <code>reader</code>
which share a prefix of
  +     * length <code>prefixLength</code> with <code>term</code>
and which have a fuzzy similarity &gt;
  +     * <code>minSimilarity</code>. 
  +     * 
  +     * @param reader Delivers terms.
  +     * @param term Pattern term.
  +     * @param minSimilarity Minimum required similarity for terms from the reader. Default
value is 0.5f.
  +     * @param prefixLength Length of required common prefix. Default value is 0.
  +     * @throws IOException
  +     */
  +    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength)
throws IOException {
           super();
  +        minimumSimilarity = minSimilarity;
  +        scale_factor = 1.0f / (1.0f - minimumSimilarity);
           searchTerm = term;
           field = searchTerm.field();
           text = searchTerm.text();
           textlen = text.length();
  -        setEnum(reader.terms(new Term(searchTerm.field(), "")));
  +        if(prefixLength > 0 && prefixLength < textlen){
  +            this.prefixLength = prefixLength;
  +            prefix = text.substring(0, prefixLength);
  +            text = text.substring(prefixLength);
  +            textlen = text.length();
  +        }
  +        setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
       }
       
       /**
  @@ -48,19 +96,20 @@
        calculate the distance between the given term and the comparing term. 
        */
       protected final boolean termCompare(Term term) {
  -        if (field == term.field()) {
  -            String target = term.text();
  +        String termText = term.text();
  +        if (field == term.field() && termText.startsWith(prefix)) {
  +            String target = termText.substring(prefixLength);
               int targetlen = target.length();
               int dist = editDistance(text, target, textlen, targetlen);
               distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
  -            return (distance > FUZZY_THRESHOLD);
  +            return (distance > minimumSimilarity);
           }
           endEnum = true;
           return false;
       }
       
       protected final float difference() {
  -        return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
  +        return (float)((distance - minimumSimilarity) * scale_factor);
       }
       
       public final boolean endEnum() {
  @@ -70,9 +119,6 @@
       /******************************
        * Compute Levenshtein distance
        ******************************/
  -    
  -    public static final double FUZZY_THRESHOLD = 0.5;
  -    public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
       
       /**
        Finds and returns the smallest of three integers 
  
  
  
  1.12.2.1  +42 -12    jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
  
  Index: PhrasePrefixQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v
  retrieving revision 1.12
  retrieving revision 1.12.2.1
  diff -u -r1.12 -r1.12.2.1
  --- PhrasePrefixQuery.java	29 Mar 2004 22:48:03 -0000	1.12
  +++ PhrasePrefixQuery.java	1 Oct 2004 09:56:49 -0000	1.12.2.1
  @@ -19,6 +19,7 @@
   import java.io.IOException;
   import java.util.ArrayList;
   import java.util.Iterator;
  +import java.util.Vector;
   
   import org.apache.lucene.index.IndexReader;
   import org.apache.lucene.index.MultipleTermPositions;
  @@ -40,42 +41,69 @@
   public class PhrasePrefixQuery extends Query {
     private String field;
     private ArrayList termArrays = new ArrayList();
  +  private Vector positions = new Vector();
   
     private int slop = 0;
   
  -  /* Sets the phrase slop for this query.
  +  /** Sets the phrase slop for this query.
      * @see PhraseQuery#setSlop(int)
      */
     public void setSlop(int s) { slop = s; }
   
  -  /* Sets the phrase slop for this query.
  +  /** Sets the phrase slop for this query.
      * @see PhraseQuery#getSlop()
      */
     public int getSlop() { return slop; }
   
  -  /* Add a single term at the next position in the phrase.
  +  /** Add a single term at the next position in the phrase.
      * @see PhraseQuery#add(Term)
      */
     public void add(Term term) { add(new Term[]{term}); }
   
  -  /* Add multiple terms at the next position in the phrase.  Any of the terms
  +  /** Add multiple terms at the next position in the phrase.  Any of the terms
      * may match.
      *
      * @see PhraseQuery#add(Term)
      */
     public void add(Term[] terms) {
  +    int position = 0;
  +    if (positions.size() > 0)
  +      position = ((Integer) positions.lastElement()).intValue() + 1;
  +
  +    add(terms, position);
  +  }
  +  
  +  /**
  +   * Allows to specify the relative position of terms within the phrase.
  +   * 
  +   * @see PhraseQuery#add(Term, int)
  +   * @param terms
  +   * @param position
  +   */
  +  public void add(Term[] terms, int position) {
       if (termArrays.size() == 0)
         field = terms[0].field();
  -    
  -    for (int i=0; i<terms.length; i++) {
  +
  +    for (int i = 0; i < terms.length; i++) {
         if (terms[i].field() != field) {
  -        throw new IllegalArgumentException
  -          ("All phrase terms must be in the same field (" + field + "): "
  -           + terms[i]);
  +        throw new IllegalArgumentException(
  +            "All phrase terms must be in the same field (" + field + "): "
  +                + terms[i]);
         }
       }
   
       termArrays.add(terms);
  +    positions.addElement(new Integer(position));
  +  }
  +  
  +  /**
  +   * Returns the relative positions of terms in this phrase.
  +   */
  +  public int[] getPositions() {
  +    int[] result = new int[positions.size()];
  +    for (int i = 0; i < positions.size(); i++)
  +      result[i] = ((Integer) positions.elementAt(i)).intValue();
  +    return result;
     }
   
     private class PhrasePrefixWeight implements Weight {
  @@ -131,10 +159,10 @@
         }
       
         if (slop == 0)
  -        return new ExactPhraseScorer(this, tps, getSimilarity(searcher),
  +        return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
                                        reader.norms(field));
         else
  -        return new SloppyPhraseScorer(this, tps, getSimilarity(searcher),
  +        return new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
                                         slop, reader.norms(field));
       }
       
  @@ -222,7 +250,9 @@
       Iterator i = termArrays.iterator();
       while (i.hasNext()) {
         Term[] terms = (Term[])i.next();
  -      buffer.append(terms[0].text() + (terms.length > 0 ? "*" : ""));
  +      buffer.append(terms[0].text() + (terms.length > 1 ? "*" : ""));
  +      if (i.hasNext())
  +        buffer.append(" ");
       }
       buffer.append("\"");
   
  
  
  
  1.15.2.1  +45 -12    jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java
  
  Index: PhraseQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java,v
  retrieving revision 1.15
  retrieving revision 1.15.2.1
  diff -u -r1.15 -r1.15.2.1
  --- PhraseQuery.java	29 Mar 2004 22:48:03 -0000	1.15
  +++ PhraseQuery.java	1 Oct 2004 09:56:49 -0000	1.15.2.1
  @@ -29,6 +29,7 @@
   public class PhraseQuery extends Query {
     private String field;
     private Vector terms = new Vector();
  +  private Vector positions = new Vector();
     private int slop = 0;
   
     /** Constructs an empty phrase query. */
  @@ -52,21 +53,51 @@
     /** Returns the slop.  See setSlop(). */
     public int getSlop() { return slop; }
   
  -  /** Adds a term to the end of the query phrase. */
  +  /**
  +   * Adds a term to the end of the query phrase.
  +   * The relative position of the term is the one immediately after the last term added.
  +   */
     public void add(Term term) {
  -    if (terms.size() == 0)
  -      field = term.field();
  -    else if (term.field() != field)
  -      throw new IllegalArgumentException
  -	("All phrase terms must be in the same field: " + term);
  -
  -    terms.addElement(term);
  +    int position = 0;
  +    if(positions.size() > 0)
  +        position = ((Integer) positions.lastElement()).intValue() + 1;
  +    
  +    add(term, position);
  +  }
  +  
  +  /**
  +   * Adds a term to the end of the query phrase.
  +   * The relative position of the term within the phrase is specified explicitly.
  +   * This allows e.g. phrases with more than one term at the same position
  +   * or phrases with gaps (e.g. in connection with stopwords).
  +   * 
  +   * @param term
  +   * @param position
  +   */
  +  public void add(Term term, int position) {
  +      if (terms.size() == 0)
  +          field = term.field();
  +      else if (term.field() != field)
  +          throw new IllegalArgumentException("All phrase terms must be in the same field:
" + term);
  +      
  +      terms.addElement(term);
  +      positions.addElement(new Integer(position));
     }
   
     /** Returns the set of terms in this phrase. */
     public Term[] getTerms() {
       return (Term[])terms.toArray(new Term[0]);
     }
  +  
  +  /**
  +   * Returns the relative positions of terms in this phrase.
  +   */
  +  public int[] getPositions() {
  +      int[] result = new int[positions.size()];
  +      for(int i = 0; i < positions.size(); i++)
  +          result[i] = ((Integer) positions.elementAt(i)).intValue();
  +      return result;
  +  }
   
     private class PhraseWeight implements Weight {
       private Searcher searcher;
  @@ -109,11 +140,11 @@
         }
   
         if (slop == 0)				  // optimize exact case
  -        return new ExactPhraseScorer(this, tps, getSimilarity(searcher),
  +        return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
                                        reader.norms(field));
         else
           return
  -          new SloppyPhraseScorer(this, tps, getSimilarity(searcher), slop,
  +          new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), slop,
                                    reader.norms(field));
         
       }
  @@ -244,14 +275,16 @@
       PhraseQuery other = (PhraseQuery)o;
       return (this.getBoost() == other.getBoost())
         && (this.slop == other.slop)
  -      &&  this.terms.equals(other.terms);
  +      &&  this.terms.equals(other.terms)
  +      && this.positions.equals(other.positions);
     }
   
     /** Returns a hash code value for this object.*/
     public int hashCode() {
       return Float.floatToIntBits(getBoost())
         ^ Float.floatToIntBits(slop)
  -      ^ terms.hashCode();
  +      ^ terms.hashCode()
  +      ^ positions.hashCode();
     }
   
   }
  
  
  
  1.3.2.1   +1 -1      jakarta-lucene/src/java/org/apache/lucene/search/PhrasePositions.java
  
  Index: PhrasePositions.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePositions.java,v
  retrieving revision 1.3
  retrieving revision 1.3.2.1
  diff -u -r1.3 -r1.3.2.1
  --- PhrasePositions.java	29 Mar 2004 22:48:03 -0000	1.3
  +++ PhrasePositions.java	1 Oct 2004 09:56:49 -0000	1.3.2.1
  @@ -27,7 +27,7 @@
     TermPositions tp;				  // stream of positions
     PhrasePositions next;				  // used to make lists
   
  -  PhrasePositions(TermPositions t, int o) throws IOException {
  +  PhrasePositions(TermPositions t, int o) {
       tp = t;
       offset = o;
     }
  
  
  
  1.6.2.1   +2 -2      jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
  
  Index: ExactPhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java,v
  retrieving revision 1.6
  retrieving revision 1.6.2.1
  diff -u -r1.6 -r1.6.2.1
  --- ExactPhraseScorer.java	11 May 2004 17:18:03 -0000	1.6
  +++ ExactPhraseScorer.java	1 Oct 2004 09:56:49 -0000	1.6.2.1
  @@ -21,9 +21,9 @@
   
   final class ExactPhraseScorer extends PhraseScorer {
   
  -  ExactPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
  +  ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
                       byte[] norms) throws IOException {
  -    super(weight, tps, similarity, norms);
  +    super(weight, tps, positions, similarity, norms);
     }
   
     protected final float phraseFreq() throws IOException {
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message