lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dor...@apache.org
Subject svn commit: r531733 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/search/ src/test/org/apache/lucene/search/
Date Tue, 24 Apr 2007 05:32:48 GMT
Author: doronc
Date: Mon Apr 23 22:32:47 2007
New Revision: 531733

URL: http://svn.apache.org/viewvc?view=rev&rev=531733
Log:
LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
For example query "B C B"~2 matches the doc "A B C D E".
Search time cost of this fix is 4%, for sloppy phrase search. 
This fix also partially brings back the the tests checkSkipTo() in QueryUtils, 
(which was disabled by LUCENE-730), but this is now invoked 
only for non Boolean scorers.


Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java
    lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java
    lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java
    lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java
    lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
    lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Apr 23 22:32:47 2007
@@ -98,6 +98,10 @@
     the instance of IndexWriter (but, not the index itself) by
     referencing already deleted segments.  This bug was only present
     in 2.2 (trunk), ie was never released.  (Mike McCandless)
+    
+13. LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs.
+    For example query "B C B"~2 matches the doc "A B C D E". (Doron Cohen)
+    
 
 New features
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/ExactPhraseScorer.java Mon Apr 23
22:32:47 2007
@@ -22,27 +22,30 @@
 
 final class ExactPhraseScorer extends PhraseScorer {
 
-  ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+  ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
                     byte[] norms) {
-    super(weight, tps, positions, similarity, norms);
+    super(weight, tps, offsets, similarity, norms);
   }
 
   protected final float phraseFreq() throws IOException {
     // sort list with pq
+    pq.clear();
     for (PhrasePositions pp = first; pp != null; pp = pp.next) {
       pp.firstPosition();
       pq.put(pp);				  // build pq from list
     }
     pqToList();					  // rebuild list from pq
 
+    // for counting how many times the exact phrase is found in current document,
+    // just count how many times all PhrasePosition's have exactly the same position.   
     int freq = 0;
     do {					  // find position w/ all terms
       while (first.position < last.position) {	  // scan forward in first
-	do {
-	  if (!first.nextPosition())
-	    return (float)freq;
-	} while (first.position < last.position);
-	firstToLast();
+	    do {
+	      if (!first.nextPosition())
+	        return (float)freq;
+	    } while (first.position < last.position);
+	      firstToLast();
       }
       freq++;					  // all equal: a match
     } while (last.nextPosition());

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhrasePositions.java Mon Apr 23 22:32:47
2007
@@ -20,6 +20,9 @@
 import java.io.IOException;
 import org.apache.lucene.index.*;
 
+/**
+ * Position of a term in a document that takes into account the term offset within the phrase.

+ */
 final class PhrasePositions {
   int doc;					  // current doc
   int position;					  // position in doc
@@ -27,6 +30,7 @@
   int offset;					  // position in phrase
   TermPositions tp;				  // stream of positions
   PhrasePositions next;				  // used to make lists
+  boolean repeats;       // there's other pp for same term (e.g. query="1st word 2nd word"~1)

 
   PhrasePositions(TermPositions t, int o) {
     tp = t;
@@ -61,6 +65,12 @@
     nextPosition();
   }
 
+  /**
+   * Go to next location of this term current document, and set 
+   * <code>position</code> as <code>location - offset</code>, so
that a 
+   * matching exact phrase is easily identified when all PhrasePositions 
+   * have exactly the same <code>position</code>.
+   */
   final boolean nextPosition() throws IOException {
     if (count-- > 0) {				  // read subsequent pos's
       position = tp.nextPosition() - offset;

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQueue.java Mon Apr 23 22:32:47
2007
@@ -28,7 +28,12 @@
     PhrasePositions pp1 = (PhrasePositions)o1;
     PhrasePositions pp2 = (PhrasePositions)o2;
     if (pp1.doc == pp2.doc) 
-      return pp1.position < pp2.position;
+      if (pp1.position == pp2.position)
+        // same doc and pp.position, so decide by actual term positions. 
+        // rely on: pp.position == tp.position - offset. 
+        return pp1.offset < pp2.offset;
+      else
+        return pp1.position < pp2.position;
     else
       return pp1.doc < pp2.doc;
   }

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhraseScorer.java Mon Apr 23 22:32:47
2007
@@ -21,6 +21,16 @@
 
 import org.apache.lucene.index.*;
 
+/** Expert: Scoring functionality for phrase queries.
+ * <br>A document is considered matching if it contains the phrase-query terms  
+ * at "valid" positons. What "valid positions" are
+ * depends on the type of the phrase query: for an exact phrase query terms are required

+ * to appear in adjacent locations, while for a sloppy phrase query some distance between

+ * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
+ * is invoked for each document containing all the phrase query terms, in order to 
+ * compute the frequency of the phrase query in that document. A non zero frequency
+ * means a match. 
+ */
 abstract class PhraseScorer extends Scorer {
   private Weight weight;
   protected byte[] norms;
@@ -31,19 +41,23 @@
   protected PhraseQueue pq;
   protected PhrasePositions first, last;
 
-  private float freq;
+  private float freq; //prhase frequency in current doc as computed by phraseFreq().
 
 
-  PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+  PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
                byte[] norms) {
     super(similarity);
     this.norms = norms;
     this.weight = weight;
     this.value = weight.getValue();
 
-    // convert tps to a list
+    // convert tps to a list of phrase positions.
+    // note: phrase-position differs from term-position in that its position
+    // reflects the phrase offset: pp.pos = tp.pos - offset.
+    // this allows to easily identify a matching (exact) phrase 
+    // when all PhrasePositions have exactly the same position.
     for (int i = 0; i < tps.length; i++) {
-      PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
+      PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
       if (last != null) {			  // add next to end of list
         last.next = pp;
       } else
@@ -94,6 +108,7 @@
   }
 
   public boolean skipTo(int target) throws IOException {
+    firstTime = false;
     for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
       more = pp.skipTo(target);
     }
@@ -102,6 +117,13 @@
     return doNext();
   }
 
+  /**
+   * For a document containing all the phrase query terms, compute the
+   * frequency of the phrase in that document. 
+   * A non zero frequency means a match.
+   * <br>Note, that containing all phrase terms does not guarantee a match - they have
to be found in matching locations.  
+   * @return frequency of the phrase in current doc, 0 if not found. 
+   */
   protected abstract float phraseFreq() throws IOException;
 
   private void init() throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Mon Apr 23
22:32:47 2007
@@ -20,38 +20,58 @@
 import org.apache.lucene.index.TermPositions;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
 
 final class SloppyPhraseScorer extends PhraseScorer {
     private int slop;
+    private PhrasePositions repeats[];
+    private boolean checkedRepeats;
 
-    SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+    SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity,
                        int slop, byte[] norms) {
-        super(weight, tps, positions, similarity, norms);
+        super(weight, tps, offsets, similarity, norms);
         this.slop = slop;
     }
 
+    /**
+     * Score a candidate doc for all slop-valid position-combinations (matches) 
+     * encountered while traversing/hopping the PhrasePositions.
+     * <br> The score contribution of a match depends on the distance: 
+     * <br> - highest score for distance=0 (exact match).
+     * <br> - score gets lower as distance gets higher.
+     * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:

+     * once for "a b" (distance=0), and once for "b a" (distance=2).
+     * <br>Pssibly not all valid combinations are encountered, because for efficiency
 
+     * we always propagate the least PhrasePosition. This allows to base on 
+     * PriorityQueue and move forward faster. 
+     * As result, for example, document "a b c b a"
+     * would score differently for queries "a b c"~4 and "c b a"~4, although 
+     * they really are equivalent. 
+     * Similarly, for doc "a b c b a f g", query "c b"~2 
+     * would get same score as "g f"~2, although "c b"~2 could be matched twice.
+     * We may want to fix this in the future (currently not, for performance reasons).
+     */
     protected final float phraseFreq() throws IOException {
-        pq.clear();
-        int end = 0;
-        for (PhrasePositions pp = first; pp != null; pp = pp.next) {
-            pp.firstPosition();
-            if (pp.position > end)
-                end = pp.position;
-            pq.put(pp);				  // build pq from list
-        }
-
+        int end = initPhrasePositions();
+        
         float freq = 0.0f;
-        boolean done = false;
-        do {
+        boolean done = (end<0);
+        while (!done) {
             PhrasePositions pp = (PhrasePositions) pq.pop();
             int start = pp.position;
             int next = ((PhrasePositions) pq.top()).position;
-            for (int pos = start; pos <= next; pos = pp.position) {
-                start = pos;				  // advance pp to min window
+
+            boolean tpsDiffer = true;
+            for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
+                if (pos<=next && tpsDiffer)
+                    start = pos;				  // advance pp to min window
                 if (!pp.nextPosition()) {
-                    done = true;				  // ran out of a term -- done
+                    done = true;          // ran out of a term -- done
                     break;
                 }
+                tpsDiffer = !pp.repeats || termPositionsDiffer(pp);
             }
 
             int matchLength = end - start;
@@ -61,8 +81,113 @@
             if (pp.position > end)
                 end = pp.position;
             pq.put(pp);				  // restore pq
-        } while (!done);
+        }
 
         return freq;
+    }
+    
+    
+    /**
+     * Init PhrasePositions in place.
+     * There is a one time initializatin for this scorer:
+     * <br>- Put in repeats[] each pp that has another pp with same position in the
doc.
+     * <br>- Also mark each such pp by pp.repeats = true.
+     * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that
check efficient.
+     * In particular, this allows to score queries with no repetiotions with no overhead
due to this computation.
+     * <br>- Example 1 - query with no repetitions: "ho my"~2
+     * <br>- Example 2 - query with repetitions: "ho my my"~2
+     * <br>- Example 3 - query with repetitions: "my ho my"~2
+     * <br>Init per doc w/repeats in query, includes propagating some repeating pp's
to avoid false phrase detection.  
+     * @return end (max position), or -1 if any term ran out (i.e. done) 
+     * @throws IOException 
+     */
+    private int initPhrasePositions() throws IOException {
+        int end = 0;
+        
+        // no repeats at all (most common case is also the simplest one)
+        if (checkedRepeats && repeats==null) {
+            // build queue from list
+            pq.clear();
+            for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+                pp.firstPosition();
+                if (pp.position > end)
+                    end = pp.position;
+                pq.put(pp);         // build pq from list
+            }
+            return end;
+        }
+        
+        // position the pp's
+        for (PhrasePositions pp = first; pp != null; pp = pp.next)
+            pp.firstPosition();
+        
+        // one time initializatin for this scorer
+        if (!checkedRepeats) {
+            checkedRepeats = true;
+            // check for repeats
+            HashMap m = null;
+            for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+                int tpPos = pp.position + pp.offset;
+                for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
+                    int tpPos2 = pp2.position + pp2.offset;
+                    if (tpPos2 == tpPos) { 
+                        if (m == null)
+                            m = new HashMap();
+                        pp.repeats = true;
+                        pp2.repeats = true;
+                        m.put(pp,null);
+                        m.put(pp2,null);
+                    }
+                }
+            }
+            if (m!=null)
+                repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]);
+        }
+        
+        // with repeats must advance some repeating pp's so they all start with differing
tp's       
+        if (repeats!=null) {
+            // must propagate higher offsets first (otherwise might miss matches).
+            Arrays.sort(repeats,  new Comparator() {
+                public int compare(Object x, Object y) {
+                    return ((PhrasePositions) y).offset - ((PhrasePositions) x).offset;
+                }});
+            // now advance them
+            for (int i = 0; i < repeats.length; i++) {
+                PhrasePositions pp = repeats[i];
+                while (!termPositionsDiffer(pp)) {
+                  if (!pp.nextPosition())
+                      return -1;    // ran out of a term -- done  
+                } 
+            }
+        }
+      
+        // build queue from list
+        pq.clear();
+        for (PhrasePositions pp = first; pp != null; pp = pp.next) {
+            if (pp.position > end)
+                end = pp.position;
+            pq.put(pp);         // build pq from list
+        }
+
+        return end;
+    }
+
+    // disalow two pp's to have the same tp position, so that same word twice 
+    // in query would go elswhere in the matched doc
+    private boolean termPositionsDiffer(PhrasePositions pp) {
+        // efficiency note: a more efficient implemention could keep a map between repeating

+        // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats

+        // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b,
pp1c. 
+        // However this would complicate code, for a rather rare case, so choice is to compromise
here.
+        int tpPos = pp.position + pp.offset;
+        for (int i = 0; i < repeats.length; i++) {
+            PhrasePositions pp2 = repeats[i];
+            if (pp2 == pp)
+                continue;
+            int tpPos2 = pp2.position + pp2.offset;
+            if (tpPos2 == tpPos)
+                return false;
+        }
+        return true;
     }
 }

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/QueryUtils.java Mon Apr 23 22:32:47
2007
@@ -68,18 +68,15 @@
 
   /** various query sanity checks on a searcher */
   public static void check(Query q1, Searcher s) {
-// Disabled because this started failing after LUCENE-730 patch was applied
-//     try {
+    try {
       check(q1);
-/* disabled for use of BooleanScorer in BooleanScorer2.
       if (s!=null && s instanceof IndexSearcher) {
         IndexSearcher is = (IndexSearcher)s;
-//         checkSkipTo(q1,is);
+        checkSkipTo(q1,is);
       }
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
- */
   }
 
   /** alternate scorer skipTo(),skipTo(),next(),next(),skipTo(),skipTo(), etc
@@ -87,42 +84,71 @@
    */
   public static void checkSkipTo(final Query q, final IndexSearcher s) throws IOException
{
     //System.out.println("Checking "+q);
-    final Weight w = q.weight(s);
-    final Scorer scorer = w.scorer(s.getIndexReader());
-
-    // FUTURE: ensure scorer.doc()==-1
-    
+   
     if (BooleanQuery.getUseScorer14()) return;  // 1.4 doesn't support skipTo
 
-    final int[] which = new int[1];
-    final int[] sdoc = new int[] {-1};
-    final float maxDiff = 1e-5f;
-    s.search(q,new HitCollector() {
-      public void collect(int doc, float score) {
-        try {
-          boolean more = (which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next();
-          sdoc[0] = scorer.doc();
-          float scorerScore = scorer.score();
-          float scoreDiff = Math.abs(score-scorerScore);
-          scoreDiff=0; // TODO: remove this go get LUCENE-697 failures 
-          if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
-            throw new RuntimeException("ERROR matching docs:"
-                    +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
-                    +" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
-                    +"\n\thitCollector.doc=" + doc + " score="+score
-                    +"\n\t Scorer=" + scorer
-                    +"\n\t Query=" + q
-                    +"\n\t Searcher=" + s
-            );
-          }
-        } catch (IOException e) {
-          throw new RuntimeException(e);
-        }
+    final int skip_op = 0;
+    final int next_op = 1;
+    final int orders [][] = {
+        {skip_op},
+        {next_op},
+        {skip_op, next_op},
+        {next_op, skip_op},
+        {skip_op, skip_op, next_op, next_op},
+        {next_op, next_op, skip_op, skip_op},
+        {skip_op, skip_op, skip_op, next_op, next_op},
+    };
+    for (int k = 0; k < orders.length; k++) {
+      final int order[] = orders[k];
+      //System.out.print("Order:");for (int i = 0; i < order.length; i++) System.out.print(order[i]==skip_op
? " skip()":" next()"); System.out.println();
+      final int opidx[] = {0};
+
+      final Weight w = q.weight(s);
+      final Scorer scorer = w.scorer(s.getIndexReader());
+      
+      if (scorer instanceof BooleanScorer || scorer instanceof BooleanScorer2) {
+        return; // TODO change this if BooleanScorers would once again guarantee docs in
order. 
       }
-    });
 
-    // make sure next call to scorer is false.
-    TestCase.assertFalse((which[0]++&0x02)==0 ? scorer.skipTo(sdoc[0]+1) : scorer.next());
+      // FUTURE: ensure scorer.doc()==-1
+
+      final int[] sdoc = new int[] {-1};
+      final float maxDiff = 1e-5f;
+      s.search(q,new HitCollector() {
+        public void collect(int doc, float score) {
+          try {
+            int op = order[(opidx[0]++)%order.length];
+            //System.out.println(op==skip_op ? "skip("+(sdoc[0]+1)+")":"next()");
+            boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
+            sdoc[0] = scorer.doc();
+            float scorerScore = scorer.score();
+            float scoreDiff = Math.abs(score-scorerScore);
+            if (more==false || doc != sdoc[0] || scoreDiff>maxDiff) {
+              StringBuffer sbord = new StringBuffer();
+              for (int i = 0; i < order.length; i++) 
+                sbord.append(order[i]==skip_op ? " skip()":" next()");
+              throw new RuntimeException("ERROR matching docs:"
+                  +"\n\tscorer.more=" + more + " doc="+sdoc[0] + " scorerScore="+scorerScore
+                  +" scoreDiff="+scoreDiff + " maxDiff="+maxDiff
+                  +"\n\thitCollector.doc=" + doc + " score="+score
+                  +"\n\t Scorer=" + scorer
+                  +"\n\t Query=" + q
+                  +"\n\t Searcher=" + s
+                  +"\n\t Order=" + sbord
+              );
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      });
+      
+      // make sure next call to scorer is false.
+      int op = order[(opidx[0]++)%order.length];
+      //System.out.println(op==skip_op ? "last: skip()":"last: next()");
+      boolean more = op==skip_op ? scorer.skipTo(sdoc[0]+1) : scorer.next();
+      TestCase.assertFalse(more);
+    }
   }
 
 }

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java?view=diff&rev=531733&r1=531732&r2=531733
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestPhraseQuery.java Mon Apr 23 22:32:47
2007
@@ -35,6 +35,10 @@
  * @author Erik Hatcher
  */
 public class TestPhraseQuery extends TestCase {
+
+  /** threshold for comparing floats */
+  public static final float SCORE_COMP_THRESH = 1e-6f;
+  
   private IndexSearcher searcher;
   private PhraseQuery query;
   private RAMDirectory directory;
@@ -57,8 +61,17 @@
     doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES,
Field.Index.TOKENIZED));
     Fieldable repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES,
Field.Index.TOKENIZED);
     doc.add(repeatedField);
+    doc.add(new Field("palindrome", "one two three two one", Field.Store.YES, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
+    
+    doc = new Document();
+    doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
     writer.addDocument(doc);
     
+    doc = new Document();
+    doc.add(new Field("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
+
     writer.optimize();
     writer.close();
 
@@ -341,12 +354,188 @@
     query.add(new Term("repeated", "part"));
     query.add(new Term("repeated", "second"));
     query.add(new Term("repeated", "part"));
+    query.setSlop(100);
+
+    Hits hits = searcher.search(query);
+    assertEquals("slop of 100 just right", 1, hits.length());
+    QueryUtils.check(query,searcher);
+
     query.setSlop(99);
 
+    hits = searcher.search(query);
+    assertEquals("slop of 99 not enough", 0, hits.length());
+    QueryUtils.check(query,searcher);
+  }
+
+  // work on two docs like this: "phrase exist notexist exist found"
+  public void testNonExistingPhrase() throws IOException {
+    // phrase without repetitions that exists in 2 docs
+    query.add(new Term("nonexist", "phrase"));
+    query.add(new Term("nonexist", "notexist"));
+    query.add(new Term("nonexist", "found"));
+    query.setSlop(2); // would be found this way
+
     Hits hits = searcher.search(query);
-    assertEquals(0, hits.length());
+    assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length());
+    QueryUtils.check(query,searcher);
+
+    // phrase with repetitions that exists in 2 docs
+    query = new PhraseQuery();
+    query.add(new Term("nonexist", "phrase"));
+    query.add(new Term("nonexist", "exist"));
+    query.add(new Term("nonexist", "exist"));
+    query.setSlop(1); // would be found 
+
+    hits = searcher.search(query);
+    assertEquals("phrase with repetitions exists in two docs", 2, hits.length());
+    QueryUtils.check(query,searcher);
+
+    // phrase I with repetitions that does not exist in any doc
+    query = new PhraseQuery();
+    query.add(new Term("nonexist", "phrase"));
+    query.add(new Term("nonexist", "notexist"));
+    query.add(new Term("nonexist", "phrase"));
+    query.setSlop(1000); // would not be found no matter how high the slop is
+
+    hits = searcher.search(query);
+    assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
+    QueryUtils.check(query,searcher);
+
+    // phrase II with repetitions that does not exist in any doc
+    query = new PhraseQuery();
+    query.add(new Term("nonexist", "phrase"));
+    query.add(new Term("nonexist", "exist"));
+    query.add(new Term("nonexist", "exist"));
+    query.add(new Term("nonexist", "exist"));
+    query.setSlop(1000); // would not be found no matter how high the slop is
+
+    hits = searcher.search(query);
+    assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length());
+    QueryUtils.check(query,searcher);
+
+  }
+
+  /**
+   * Working on a 2 fields like this:
+   *    Field("field", "one two three four five")
+   *    Field("palindrome", "one two three two one")
+   * Phrase of size 2 occuriong twice, once in order and once in reverse, 
+   * because doc is a palyndrome, is counted twice. 
+   * Also, in this case order in query does not matter. 
+   * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
  
+   */
+  public void testPalyndrome2() throws Exception {
+    
+    // search on non palyndrome, find phrase with no slop, using exact phrase scorer
+    query.setSlop(0); // to use exact phrase scorer
+    query.add(new Term("field", "two"));
+    query.add(new Term("field", "three"));
+    Hits hits = searcher.search(query);
+    assertEquals("phrase found with exact phrase scorer", 1, hits.length());
+    float score0 = hits.score(0);
+    //System.out.println("(exact) field: two three: "+score0);
+    QueryUtils.check(query,searcher);
+
+    // search on non palyndrome, find phrase with slop 2, though no slop required here.
+    query.setSlop(2); // to use sloppy scorer 
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score1 = hits.score(0);
+    //System.out.println("(sloppy) field: two three: "+score1);
+    assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0,
score1, SCORE_COMP_THRESH);
     QueryUtils.check(query,searcher);
 
+    // search ordered in palyndrome, find it twice
+    query = new PhraseQuery();
+    query.setSlop(2); // must be at least two for both ordered and reversed to match
+    query.add(new Term("palindrome", "two"));
+    query.add(new Term("palindrome", "three"));
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score2 = hits.score(0);
+    //System.out.println("palindrome: two three: "+score2);
+    QueryUtils.check(query,searcher);
+    
+    //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

+    //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
+
+    // search reveresed in palyndrome, find it twice
+    query = new PhraseQuery();
+    query.setSlop(2); // must be at least two for both ordered and reversed to match
+    query.add(new Term("palindrome", "three"));
+    query.add(new Term("palindrome", "two"));
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score3 = hits.score(0);
+    //System.out.println("palindrome: three two: "+score3);
+    QueryUtils.check(query,searcher);
+
+    //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

+    //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
+    //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
   }
 
+  /**
+   * Working on a 2 fields like this:
+   *    Field("field", "one two three four five")
+   *    Field("palindrome", "one two three two one")
+   * Phrase of size 3 occuriong twice, once in order and once in reverse, 
+   * because doc is a palyndrome, is counted twice. 
+   * Also, in this case order in query does not matter. 
+   * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
  
+   */
+  public void testPalyndrome3() throws Exception {
+    
+    // search on non palyndrome, find phrase with no slop, using exact phrase scorer
+    query.setSlop(0); // to use exact phrase scorer
+    query.add(new Term("field", "one"));
+    query.add(new Term("field", "two"));
+    query.add(new Term("field", "three"));
+    Hits hits = searcher.search(query);
+    assertEquals("phrase found with exact phrase scorer", 1, hits.length());
+    float score0 = hits.score(0);
+    //System.out.println("(exact) field: one two three: "+score0);
+    QueryUtils.check(query,searcher);
+
+    // search on non palyndrome, find phrase with slop 3, though no slop required here.
+    query.setSlop(4); // to use sloppy scorer 
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score1 = hits.score(0);
+    //System.out.println("(sloppy) field: one two three: "+score1);
+    assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0,
score1, SCORE_COMP_THRESH);
+    QueryUtils.check(query,searcher);
+
+    // search ordered in palyndrome, find it twice
+    query = new PhraseQuery();
+    query.setSlop(4); // must be at least four for both ordered and reversed to match
+    query.add(new Term("palindrome", "one"));
+    query.add(new Term("palindrome", "two"));
+    query.add(new Term("palindrome", "three"));
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score2 = hits.score(0);
+    //System.out.println("palindrome: one two three: "+score2);
+    QueryUtils.check(query,searcher);
+    
+    //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

+    //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
+
+    // search reveresed in palyndrome, find it twice
+    query = new PhraseQuery();
+    query.setSlop(4); // must be at least four for both ordered and reversed to match
+    query.add(new Term("palindrome", "three"));
+    query.add(new Term("palindrome", "two"));
+    query.add(new Term("palindrome", "one"));
+    hits = searcher.search(query);
+    assertEquals("just sloppy enough", 1, hits.length());
+    float score3 = hits.score(0);
+    //System.out.println("palindrome: three two one: "+score3);
+    QueryUtils.check(query,searcher);
+
+    //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().

+    //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
+    //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
+  }
+  
 }



Mime
View raw message