Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 69269 invoked from network); 16 Aug 2006 21:42:40 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur.apache.org with SMTP; 16 Aug 2006 21:42:40 -0000 Received: (qmail 15894 invoked by uid 500); 16 Aug 2006 21:42:40 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 15880 invoked by uid 500); 16 Aug 2006 21:42:40 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 15869 invoked by uid 99); 16 Aug 2006 21:42:39 -0000 Received: from asf.osuosl.org (HELO asf.osuosl.org) (140.211.166.49) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 16 Aug 2006 14:42:39 -0700 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received-SPF: pass (asf.osuosl.org: local policy) Received: from [140.211.166.113] (HELO eris.apache.org) (140.211.166.113) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 16 Aug 2006 14:42:39 -0700 Received: by eris.apache.org (Postfix, from userid 65534) id 014981A981A; Wed, 16 Aug 2006 14:42:19 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r432042 - in /lucene/java/trunk/contrib/highlighter/src: java/org/apache/lucene/search/highlight/Highlighter.java test/org/apache/lucene/search/highlight/HighlighterTest.java Date: Wed, 16 Aug 2006 21:42:18 -0000 To: java-commits@lucene.apache.org From: mharwood@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20060816214219.014981A981A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org X-Spam-Rating: minotaur.apache.org 1.6.2 0/1000/N Author: mharwood Date: Wed Aug 16 14:42:18 2006 New Revision: 432042 URL: 
http://svn.apache.org/viewvc?rev=432042&view=rev Log: Fix for http://issues.apache.org/jira/browse/LUCENE-645 with added Junit tests for this bug and related problem where last fragment can be huge if highlighting huge documents. Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=432042&r1=432041&r2=432042&view=diff ============================================================================== --- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original) +++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Wed Aug 16 14:42:18 2006 @@ -21,6 +21,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.PriorityQueue; @@ -221,8 +222,8 @@ textFragmenter.start(text); TokenGroup tokenGroup=new TokenGroup(); - - while ((token = tokenStream.next()) != null) + token = tokenStream.next(); + while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze)) { if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) { @@ -251,12 +252,13 @@ } } - tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); + tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); - if(lastEndOffset>maxDocBytesToAnalyze) - { - break; - } +// if(lastEndOffset>maxDocBytesToAnalyze) +// { +// break; +// } + token = tokenStream.next(); } currentFrag.setScore(fragmentScorer.getFragmentScore()); @@ -274,9 +276,18 @@ lastEndOffset=Math.max(lastEndOffset,endOffset); } - // append text after end of last 
token -// if (lastEndOffset < text.length()) -// newText.append(encoder.encodeText(text.substring(lastEndOffset))); + //Test what remains of the original text beyond the point where we stopped analyzing + if ( +// if there is text beyond the last token considered.. + (lastEndOffset < text.length()) + && +// and that text is not too large... + (text.length()help me [54-65]", match); + } public void testGetBestFragmentsFilteredQuery() throws Exception { RangeFilter rf=new RangeFilter("contents","john","john",true,true); @@ -338,6 +350,40 @@ "us from finding matches for this record: " + numHighlights + " found", numHighlights == 0); } + public void testMaxSizeHighlightTruncates() throws IOException + { + String goodWord="goodtoken"; + String stopWords[]={"stoppedtoken"}; + + TermQuery query= new TermQuery( new Term( "data", goodWord )); + SimpleHTMLFormatter fm=new SimpleHTMLFormatter(); + Highlighter hg = new Highlighter(fm, new QueryScorer( query )); + hg.setTextFragmenter( new NullFragmenter() ); + + String match = null; + StringBuffer sb=new StringBuffer(); + sb.append(goodWord); + for(int i=0;i<10000;i++) + { + sb.append(" "); + sb.append(stopWords[0]); + } + + hg.setMaxDocBytesToAnalyze(100); + match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", + match.length()