Return-Path: X-Original-To: apmail-lucene-commits-archive@www.apache.org Delivered-To: apmail-lucene-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 1419817451 for ; Sat, 3 Jan 2015 23:25:39 +0000 (UTC) Received: (qmail 70782 invoked by uid 500); 3 Jan 2015 23:25:34 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 70485 invoked by uid 99); 3 Jan 2015 23:25:34 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 03 Jan 2015 23:25:34 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id 06859AC0535; Sat, 3 Jan 2015 23:25:31 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1649264 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/highlight/ Date: Sat, 03 Jan 2015 23:25:31 -0000 To: commits@lucene.apache.org From: dsmiley@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150103232533.06859AC0535@hades.apache.org> Author: dsmiley Date: Sat Jan 3 23:25:30 2015 New Revision: 1649264 URL: http://svn.apache.org/r1649264 Log: LUCENE-6139: TokenGroup start/end offset getters should have been returning offsets of matching tokens when there are some. Also made the Highlighter use the getters instead of direct field access. 
Modified: lucene/dev/branches/branch_5x/ (props changed) lucene/dev/branches/branch_5x/lucene/ (props changed) lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed) lucene/dev/branches/branch_5x/lucene/highlighter/ (props changed) lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1649264&r1=1649263&r2=1649264&view=diff ============================================================================== --- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original) +++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Sat Jan 3 23:25:30 2015 @@ -390,6 +390,10 @@ Bug Fixes * LUCENE-6152: Fix double close problems in OutputStreamIndexOutput. (Uwe Schindler) + +* LUCENE-6139: Highlighter: TokenGroup start & end offset getters should have + been returning the offsets of just the matching tokens in the group when + there's a distinction. 
(David Smiley) Documentation Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=1649264&r1=1649263&r2=1649264&view=diff ============================================================================== --- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original) +++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Sat Jan 3 23:25:30 2015 @@ -225,12 +225,12 @@ public class Highlighter throw new InvalidTokenOffsetsException("Token "+ termAtt.toString() +" exceeds length of provided text sized "+text.length()); } - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) + if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct())) { //the current token is distinct from previous tokens - // markup the cached token group info - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; + startOffset = tokenGroup.getStartOffset(); + endOffset = tokenGroup.getEndOffset(); tokenText = text.substring(startOffset, endOffset); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); //store any whitespace etc from between this and last group @@ -261,11 +261,11 @@ public class Highlighter } currentFrag.setScore(fragmentScorer.getFragmentScore()); - if(tokenGroup.numTokens>0) + if(tokenGroup.getNumTokens() >0) { //flush the accumulated text (same code as in above loop) - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; + startOffset = tokenGroup.getStartOffset(); + endOffset = tokenGroup.getEndOffset(); tokenText = text.substring(startOffset, endOffset); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); //store any 
whitespace etc from between this and last group Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java?rev=1649264&r1=1649263&r2=1649264&view=diff ============================================================================== --- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (original) +++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java Sat Jan 3 23:25:30 2015 @@ -24,18 +24,20 @@ import org.apache.lucene.analysis.tokena /** * One, or several overlapping tokens, along with the score(s) and the scope of - * the original text + * the original text. */ public class TokenGroup { private static final int MAX_NUM_TOKENS_PER_GROUP = 50; - Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; - float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; - int numTokens = 0; - int startOffset = 0; - int endOffset = 0; - float tot; - int matchStartOffset, matchEndOffset; + + private Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP]; + private float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; + private int numTokens = 0; + private int startOffset = 0; + private int endOffset = 0; + private float tot; + private int matchStartOffset; + private int matchEndOffset; private OffsetAttribute offsetAtt; private CharTermAttribute termAtt; @@ -47,8 +49,8 @@ public class TokenGroup { void addToken(float score) { if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { - int termStartOffset = offsetAtt.startOffset(); - int termEndOffset = offsetAtt.endOffset(); + final int termStartOffset = offsetAtt.startOffset(); + final int termEndOffset = offsetAtt.endOffset(); if (numTokens == 0) { startOffset = matchStartOffset = termStartOffset; endOffset = matchEndOffset = 
termEndOffset; @@ -58,8 +60,8 @@ public class TokenGroup { endOffset = Math.max(endOffset, termEndOffset); if (score > 0) { if (tot == 0) { - matchStartOffset = offsetAtt.startOffset(); - matchEndOffset = offsetAtt.endOffset(); + matchStartOffset = termStartOffset; + matchEndOffset = termEndOffset; } else { matchStartOffset = Math.min(matchStartOffset, termStartOffset); matchEndOffset = Math.max(matchEndOffset, termEndOffset); @@ -84,15 +86,14 @@ public class TokenGroup { numTokens = 0; tot = 0; } - - /* - * @param index a value between 0 and numTokens -1 - * @return the "n"th token - */ - public Token getToken(int index) - { - return tokens[index]; - } + + /** + * @param index a value between 0 and numTokens -1 + * @return the "n"th token + */ + public Token getToken(int index) { + return tokens[index]; + } /** * @@ -104,24 +105,26 @@ public class TokenGroup { } /** - * @return the end position in the original text + * @return the earliest start offset in the original text of a matching token in this group (score > 0), or + * if there are none then the earliest offset of any token in the group. */ - public int getEndOffset() { - return endOffset; + public int getStartOffset() { + return matchStartOffset; } /** - * @return the number of tokens in this group + * @return the latest end offset in the original text of a matching token in this group (score > 0), or + * if there are none then the end offset of the first token in the group. */ - public int getNumTokens() { - return numTokens; + public int getEndOffset() { + return matchEndOffset; } /** - * @return the start position in the original text + * @return the number of tokens in this group */ - public int getStartOffset() { - return startOffset; + public int getNumTokens() { + return numTokens; } /** @@ -130,4 +133,5 @@ public class TokenGroup { public float getTotalScore() { return tot; } + }