Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm
Precedence: bulk
Reply-To: dev@lucene.apache.org
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: svn commit: r1649264 - in /lucene/dev/branches/branch_5x: ./ lucene/
 lucene/highlighter/
 lucene/highlighter/src/java/org/apache/lucene/search/highlight/
Date: Sat, 03 Jan 2015 23:25:31 -0000
To: commits@lucene.apache.org
From: dsmiley@apache.org
Message-Id: <20150103232533.06859AC0535@hades.apache.org>

Author: dsmiley
Date: Sat Jan  3 23:25:30 2015
New Revision: 1649264

URL: http://svn.apache.org/r1649264
Log:
LUCENE-6139: TokenGroup start/end offset getters should have been returning offsets of matching tokens when there are some.

Also made the Highlighter use the getters instead of direct field access.

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
    lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1649264&r1=1649263&r2=1649264&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Sat Jan  3 23:25:30 2015
@@ -390,6 +390,10 @@ Bug Fixes
   
 * LUCENE-6152: Fix double close problems in OutputStreamIndexOutput.
   (Uwe Schindler)
+
+* LUCENE-6139: Highlighter: TokenGroup start & end offset getters should have
+  been returning the offsets of just the matching tokens in the group when
+  there's a distinction. (David Smiley)
   
 Documentation
 

Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=1649264&r1=1649263&r2=1649264&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Sat Jan  3 23:25:30 2015
@@ -225,12 +225,12 @@ public class Highlighter
           throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
               +" exceeds length of provided text sized "+text.length());
         }
-        if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
+        if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct()))
         {
           //the current token is distinct from previous tokens -
           // markup the cached token group info
-          startOffset = tokenGroup.matchStartOffset;
-          endOffset = tokenGroup.matchEndOffset;
+          startOffset = tokenGroup.getStartOffset();
+          endOffset = tokenGroup.getEndOffset();
           tokenText = text.substring(startOffset, endOffset);
           String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
           //store any whitespace etc from between this and last group
@@ -261,11 +261,11 @@ public class Highlighter
       }
       currentFrag.setScore(fragmentScorer.getFragmentScore());
 
-      if(tokenGroup.numTokens>0)
+      if(tokenGroup.getNumTokens() >0)
       {
         //flush the accumulated text (same code as in above loop)
-        startOffset = tokenGroup.matchStartOffset;
-        endOffset = tokenGroup.matchEndOffset;
+        startOffset = tokenGroup.getStartOffset();
+        endOffset = tokenGroup.getEndOffset();
         tokenText = text.substring(startOffset, endOffset);
         String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
         //store any whitespace etc from between this and last group

Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java?rev=1649264&r1=1649263&r2=1649264&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java Sat Jan  3 23:25:30 2015
@@ -24,18 +24,20 @@ import org.apache.lucene.analysis.tokena
 
 /**
  * One, or several overlapping tokens, along with the score(s) and the scope of
- * the original text
+ * the original text.
  */
 public class TokenGroup {
 
   private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
-  Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
-  float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
-  int numTokens = 0;
-  int startOffset = 0;
-  int endOffset = 0;
-  float tot;
-  int matchStartOffset, matchEndOffset;
+
+  private Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP];
+  private float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
+  private int numTokens = 0;
+  private int startOffset = 0;
+  private int endOffset = 0;
+  private float tot;
+  private int matchStartOffset;
+  private int matchEndOffset;
 
   private OffsetAttribute offsetAtt;
   private CharTermAttribute termAtt;
@@ -47,8 +49,8 @@ public class TokenGroup {
 
   void addToken(float score) {
     if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
-      int termStartOffset = offsetAtt.startOffset();
-      int termEndOffset = offsetAtt.endOffset();
+      final int termStartOffset = offsetAtt.startOffset();
+      final int termEndOffset = offsetAtt.endOffset();
       if (numTokens == 0) {
         startOffset = matchStartOffset = termStartOffset;
         endOffset = matchEndOffset = termEndOffset;
@@ -58,8 +60,8 @@ public class TokenGroup {
         endOffset = Math.max(endOffset, termEndOffset);
         if (score > 0) {
           if (tot == 0) {
-            matchStartOffset = offsetAtt.startOffset();
-            matchEndOffset = offsetAtt.endOffset();
+            matchStartOffset = termStartOffset;
+            matchEndOffset = termEndOffset;
           } else {
             matchStartOffset = Math.min(matchStartOffset, termStartOffset);
             matchEndOffset = Math.max(matchEndOffset, termEndOffset);
@@ -84,15 +86,14 @@ public class TokenGroup {
     numTokens = 0;
     tot = 0;
   }
-  
-  /* 
-  * @param index a value between 0 and numTokens -1
-  * @return the "n"th token
-  */
- public Token getToken(int index)
- {
-     return tokens[index];
- }
+
+  /**
+   * @param index a value between 0 and numTokens -1
+   * @return the "n"th token
+   */
+  public Token getToken(int index) {
+    return tokens[index];
+  }
 
   /**
    * 
@@ -104,24 +105,26 @@ public class TokenGroup {
   }
 
   /**
-   * @return the end position in the original text
+   * @return the earliest start offset in the original text of a matching token in this group (score &gt; 0), or
+   * if there are none then the earliest offset of any token in the group.
    */
-  public int getEndOffset() {
-    return endOffset;
+  public int getStartOffset() {
+    return matchStartOffset;
   }
 
   /**
-   * @return the number of tokens in this group
+   * @return the latest end offset in the original text of a matching token in this group (score &gt; 0), or
+   * if there are none then {@link #getEndOffset()}.
    */
-  public int getNumTokens() {
-    return numTokens;
+  public int getEndOffset() {
+    return matchEndOffset;
   }
 
   /**
-   * @return the start position in the original text
+   * @return the number of tokens in this group
    */
-  public int getStartOffset() {
-    return startOffset;
+  public int getNumTokens() {
+    return numTokens;
   }
 
   /**
@@ -130,4 +133,5 @@ public class TokenGroup {
   public float getTotalScore() {
     return tot;
   }
+
 }