Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 68853 invoked from network); 12 Jun 2006 19:38:58 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur.apache.org with SMTP; 12 Jun 2006 19:38:58 -0000 Received: (qmail 3488 invoked by uid 500); 12 Jun 2006 19:38:58 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 3411 invoked by uid 500); 12 Jun 2006 19:38:58 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 3393 invoked by uid 99); 12 Jun 2006 19:38:57 -0000 Received: from asf.osuosl.org (HELO asf.osuosl.org) (140.211.166.49) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 12 Jun 2006 12:38:57 -0700 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received-SPF: pass (asf.osuosl.org: local policy) Received: from [140.211.166.113] (HELO eris.apache.org) (140.211.166.113) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 12 Jun 2006 12:38:56 -0700 Received: by eris.apache.org (Postfix, from userid 65534) id B06A91A983A; Mon, 12 Jun 2006 12:38:36 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r413732 - /lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java Date: Mon, 12 Jun 2006 19:38:36 -0000 To: java-commits@lucene.apache.org From: mharwood@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20060612193836.B06A91A983A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org X-Spam-Rating: minotaur.apache.org 1.6.2 0/1000/N Author: mharwood Date: Mon Jun 12 12:38:36 2006 New Revision: 413732 URL: http://svn.apache.org/viewvc?rev=413732&view=rev Log: Added optimization to ignore duplicate terms Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java?rev=413732&r1=413731&r2=413732&view=diff ============================================================================== --- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (original) +++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java Mon Jun 12 12:38:36 2006 @@ -4,6 +4,7 @@ import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; @@ -89,57 +90,61 @@ Token token=ts.next(); int corpusNumDocs=reader.numDocs(); Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects - + HashSet processedTerms=new HashSet(); while(token!=null) { - ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term - float minScore=0; - Term startTerm=internSavingTemplateTerm.createTerm(token.termText()); - FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength); - TermEnum origEnum = reader.terms(startTerm); - int df=0; - if(startTerm.equals(origEnum.term())) - { - df=origEnum.docFreq(); //store the df so all variants use same idf - } - int numVariants=0; - int totalVariantDocFreqs=0; - do - { - Term possibleMatch=fe.term(); - if(possibleMatch!=null) + if(!processedTerms.contains(token.termText())) + { + processedTerms.add(token.termText()); + ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term + float minScore=0; + Term startTerm=internSavingTemplateTerm.createTerm(token.termText()); + FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength); + TermEnum origEnum = reader.terms(startTerm); + int df=0; + if(startTerm.equals(origEnum.term())) { - numVariants++; - totalVariantDocFreqs+=fe.docFreq(); - float score=fe.difference(); - if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm); - variantsQ.insert(st); - minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore - } + df=origEnum.docFreq(); //store the df so all variants use same idf } - } - while(fe.next()); - if(numVariants==0) - { - //no variants to rank here - break; - } - int avgDf=totalVariantDocFreqs/numVariants; - if(df==0)//no direct match we can use as df for all variants - { - df=avgDf; //use avg df of all variants - } - - // take the top variants (scored by edit distance) and reset the score - // to include an IDF factor then add to the global queue for ranking overall top query terms - int size = variantsQ.size(); - for(int i = 0; i < size; i++) - { - ScoreTerm st = (ScoreTerm) variantsQ.pop(); - st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs); - q.insert(st); - } + int numVariants=0; + int totalVariantDocFreqs=0; + do + { + Term possibleMatch=fe.term(); + if(possibleMatch!=null) + { + numVariants++; + totalVariantDocFreqs+=fe.docFreq(); + float score=fe.difference(); + if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ + ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm); + variantsQ.insert(st); + minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore + } + } + } + while(fe.next()); + if(numVariants==0) + { + //no variants to rank here + break; + } + int avgDf=totalVariantDocFreqs/numVariants; + if(df==0)//no direct match we can use as df for all variants + { + df=avgDf; //use avg df of all variants + } + + // take the top variants (scored by edit distance) and reset the score + // to include an IDF factor then add to the global queue for ranking overall top query terms + int size = variantsQ.size(); + for(int i = 0; i < size; i++) + { + ScoreTerm st = (ScoreTerm) variantsQ.pop(); + st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs); + q.insert(st); + } + } token=ts.next(); } }