lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r833544 - in /lucene/java/trunk/src: java/org/apache/lucene/search/FuzzyQuery.java test/org/apache/lucene/search/TestFuzzyQuery.java
Date Fri, 06 Nov 2009 20:15:23 GMT
Author: uschindler
Date: Fri Nov  6 20:15:23 2009
New Revision: 833544

URL: http://svn.apache.org/viewvc?rev=833544&view=rev
Log:
LUCENE-504: Change FuzzyQuery to use java.utilPriorityQueue which grows dynamically to support
BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory.

Modified:
    lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=833544&r1=833543&r2=833544&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java Fri Nov  6 20:15:23
2009
@@ -19,10 +19,10 @@
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.ToStringUtils;
 
 import java.io.IOException;
+import java.util.PriorityQueue;
 
 /** Implements the fuzzy search query. The similarity measurement
  * is based on the Levenshtein (edit distance) algorithm.
@@ -132,40 +132,40 @@
       return new TermQuery(term);
     }
 
+    int maxSize = BooleanQuery.getMaxClauseCount();
+    PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(1024);
     FilteredTermEnum enumerator = getEnum(reader);
-    int maxClauseCount = BooleanQuery.getMaxClauseCount();
-    ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
-    ScoreTerm reusableST = null;
-
     try {
+      ScoreTerm bottomSt = null;
       do {
-        float score = 0.0f;
-        Term t = enumerator.term();
-        if (t != null) {
-          score = enumerator.difference();
-          if (reusableST == null) {
-            reusableST = new ScoreTerm(t, score);
-          } else if (score >= reusableST.score) {
-            // reusableST holds the last "rejected" entry, so, if
-            // this new score is not better than that, there's no
-            // need to try inserting it
-            reusableST.score = score;
-            reusableST.term = t;
-          } else {
-            continue;
+        final Term t = enumerator.term();
+        if (t == null) break;
+        ScoreTerm st = new ScoreTerm(t, enumerator.difference());
+        if (stQueue.size() < maxSize) {
+          // record the current bottom item
+          if (bottomSt == null || st.compareTo(bottomSt) > 0) {
+            bottomSt = st;
+          }
+          // add to PQ, as it is not yet filled up
+          stQueue.offer(st);
+        } else {
+          assert bottomSt != null;
+          // only add to PQ, if the ScoreTerm is greater than the current bottom,
+          // as all entries will be enqueued after the current bottom and will never be visible
+          if (st.compareTo(bottomSt) < 0) {
+            stQueue.offer(st);
           }
-
-          reusableST = stQueue.insertWithOverflow(reusableST);
         }
+        //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
       } while (enumerator.next());
     } finally {
       enumerator.close();
     }
     
     BooleanQuery query = new BooleanQuery(true);
-    int size = stQueue.size();
+    int size = Math.min(stQueue.size(), maxSize);
     for(int i = 0; i < size; i++){
-      ScoreTerm st = stQueue.pop();
+      ScoreTerm st = stQueue.poll();
       TermQuery tq = new TermQuery(st.term);      // found a match
       tq.setBoost(getBoost() * st.score); // set the boost
       query.add(tq, BooleanClause.Occur.SHOULD);          // add to query
@@ -173,10 +173,28 @@
 
     return query;
   }
+  
+  protected static class ScoreTerm implements Comparable<ScoreTerm> {
+    public Term term;
+    public float score;
+    
+    public ScoreTerm(Term term, float score){
+      this.term = term;
+      this.score = score;
+    }
+    
+    public int compareTo(ScoreTerm other) {
+      if (this.score == other.score)
+        return this.term.compareTo(other.term);
+      else
+        // inverse ordering!!!
+        return Float.compare(other.score, this.score);
+    }
+  }
     
   @Override
   public String toString(String field) {
-    StringBuilder buffer = new StringBuilder();
+    final StringBuilder buffer = new StringBuilder();
     if (!term.field().equals(field)) {
         buffer.append(term.field());
         buffer.append(":");
@@ -188,32 +206,6 @@
     return buffer.toString();
   }
   
-  protected static class ScoreTerm {
-    public Term term;
-    public float score;
-    
-    public ScoreTerm(Term term, float score){
-      this.term = term;
-      this.score = score;
-    }
-  }
-  
-  protected static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
-    
-    public ScoreTermQueue(int size){
-      initialize(size);
-    }
-    
-    @Override
-    protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
-      if (termA.score == termB.score)
-        return termA.term.compareTo(termB.term) > 0;
-      else
-        return termA.score < termB.score;
-    }
-    
-  }
-
   @Override
   public int hashCode() {
     final int prime = 31;

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java?rev=833544&r1=833543&r2=833544&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java Fri Nov  6 20:15:23
2009
@@ -17,6 +17,9 @@
  * limitations under the License.
  */
 
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
 import java.io.IOException;
 
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -76,6 +79,23 @@
     query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
  
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
+    
+    // test BooleanQuery.maxClauseCount
+    int savedClauseCount = BooleanQuery.getMaxClauseCount();
+    try {
+      BooleanQuery.setMaxClauseCount(2);
+      // This query would normally return 3 documents, because 3 terms match:
+      query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity,
3);   
+      hits = searcher.search(query, null, 1000).scoreDocs;
+      assertEquals("only 2 documents should match", 2, hits.length);
+      Set<String> possibleTerms = new HashSet<String>(Arrays.asList("aaaaa","aaaab"));
+      for (int i = 0; i < hits.length; i++) {
+        final String term = searcher.doc(hits[i].doc).get("field");
+        assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term));
+      }
+    } finally {
+      BooleanQuery.setMaxClauseCount(savedClauseCount);
+    }
 
     // not similar enough:
     query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
 	



Mime
View raw message