Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 85189 invoked from network); 6 Nov 2009 20:15:48 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 6 Nov 2009 20:15:48 -0000 Received: (qmail 15980 invoked by uid 500); 6 Nov 2009 20:15:48 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 15909 invoked by uid 500); 6 Nov 2009 20:15:47 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 15900 invoked by uid 99); 6 Nov 2009 20:15:47 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Nov 2009 20:15:47 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Nov 2009 20:15:45 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id E3F802388893; Fri, 6 Nov 2009 20:15:23 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r833544 - in /lucene/java/trunk/src: java/org/apache/lucene/search/FuzzyQuery.java test/org/apache/lucene/search/TestFuzzyQuery.java Date: Fri, 06 Nov 2009 20:15:23 -0000 To: java-commits@lucene.apache.org From: uschindler@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091106201523.E3F802388893@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: uschindler Date: Fri Nov 6 20:15:23 2009 New Revision: 833544 URL: http://svn.apache.org/viewvc?rev=833544&view=rev Log: LUCENE-504: Change FuzzyQuery to use java.utilPriorityQueue which grows dynamically to support BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory. Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java Modified: lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=833544&r1=833543&r2=833544&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java (original) +++ lucene/java/trunk/src/java/org/apache/lucene/search/FuzzyQuery.java Fri Nov 6 20:15:23 2009 @@ -19,10 +19,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; +import java.util.PriorityQueue; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. @@ -132,40 +132,40 @@ return new TermQuery(term); } + int maxSize = BooleanQuery.getMaxClauseCount(); + PriorityQueue stQueue = new PriorityQueue(1024); FilteredTermEnum enumerator = getEnum(reader); - int maxClauseCount = BooleanQuery.getMaxClauseCount(); - ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); - ScoreTerm reusableST = null; - try { + ScoreTerm bottomSt = null; do { - float score = 0.0f; - Term t = enumerator.term(); - if (t != null) { - score = enumerator.difference(); - if (reusableST == null) { - reusableST = new ScoreTerm(t, score); - } else if (score >= reusableST.score) { - // reusableST holds the last "rejected" entry, so, if - // this new score is not better than that, there's no - // need to try inserting it - reusableST.score = score; - reusableST.term = t; - } else { - continue; + final Term t = enumerator.term(); + if (t == null) break; + ScoreTerm st = new ScoreTerm(t, enumerator.difference()); + if (stQueue.size() < maxSize) { + // record the current bottom item + if (bottomSt == null || st.compareTo(bottomSt) > 0) { + bottomSt = st; + } + // add to PQ, as it is not yet filled up + stQueue.offer(st); + } else { + assert bottomSt != null; + // only add to PQ, if the ScoreTerm is greater than the current bottom, + // as all entries will be enqueued after the current bottom and will never be visible + if (st.compareTo(bottomSt) < 0) { + stQueue.offer(st); } - - reusableST = stQueue.insertWithOverflow(reusableST); } + //System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")"); } while (enumerator.next()); } finally { enumerator.close(); } BooleanQuery query = new BooleanQuery(true); - int size = stQueue.size(); + int size = Math.min(stQueue.size(), maxSize); for(int i = 0; i < size; i++){ - ScoreTerm st = stQueue.pop(); + ScoreTerm st = stQueue.poll(); TermQuery tq = new TermQuery(st.term); // found a match tq.setBoost(getBoost() * st.score); // set the boost query.add(tq, BooleanClause.Occur.SHOULD); // add to query @@ -173,10 +173,28 @@ return query; } + + protected static class ScoreTerm implements Comparable { + public Term term; + public float score; + + public ScoreTerm(Term term, float score){ + this.term = term; + this.score = score; + } + + public int compareTo(ScoreTerm other) { + if (this.score == other.score) + return this.term.compareTo(other.term); + else + // inverse ordering!!! + return Float.compare(other.score, this.score); + } + } @Override public String toString(String field) { - StringBuilder buffer = new StringBuilder(); + final StringBuilder buffer = new StringBuilder(); if (!term.field().equals(field)) { buffer.append(term.field()); buffer.append(":"); @@ -188,32 +206,6 @@ return buffer.toString(); } - protected static class ScoreTerm { - public Term term; - public float score; - - public ScoreTerm(Term term, float score){ - this.term = term; - this.score = score; - } - } - - protected static class ScoreTermQueue extends PriorityQueue { - - public ScoreTermQueue(int size){ - initialize(size); - } - - @Override - protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) { - if (termA.score == termB.score) - return termA.term.compareTo(termB.term) > 0; - else - return termA.score < termB.score; - } - - } - @Override public int hashCode() { final int prime = 31; Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java?rev=833544&r1=833543&r2=833544&view=diff ============================================================================== --- lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java (original) +++ lucene/java/trunk/src/test/org/apache/lucene/search/TestFuzzyQuery.java Fri Nov 6 20:15:23 2009 @@ -17,6 +17,9 @@ * limitations under the License. */ +import java.util.Set; +import java.util.HashSet; +import java.util.Arrays; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -76,6 +79,23 @@ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); + + // test BooleanQuery.maxClauseCount + int savedClauseCount = BooleanQuery.getMaxClauseCount(); + try { + BooleanQuery.setMaxClauseCount(2); + // This query would normally return 3 documents, because 3 terms match: + query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("only 2 documents should match", 2, hits.length); + Set possibleTerms = new HashSet(Arrays.asList("aaaaa","aaaab")); + for (int i = 0; i < hits.length; i++) { + final String term = searcher.doc(hits[i].doc).get("field"); + assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term)); + } + } finally { + BooleanQuery.setMaxClauseCount(savedClauseCount); + } // not similar enough: query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);