lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nkn...@apache.org
Subject [30/50] [abbrv] lucene-solr git commit: LUCENE-2229: Fix SimpleSpanFragmenter bug with adjacent stop-words
Date Mon, 08 Feb 2016 22:36:05 GMT
LUCENE-2229: Fix SimpleSpanFragmenter bug with adjacent stop-words


git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_5_4@1724097 13f79535-47bb-0310-9956-ffa450edef68


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ccd7ef23
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ccd7ef23
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ccd7ef23

Branch: refs/heads/branch_5_4
Commit: ccd7ef23ecbce8bde8a80d207096c412b5cc000e
Parents: eef1314
Author: Adrien Grand <jpountz@apache.org>
Authored: Mon Jan 11 17:38:19 2016 +0000
Committer: Adrien Grand <jpountz@apache.org>
Committed: Mon Jan 11 17:38:19 2016 +0000

----------------------------------------------------------------------
 .../search/highlight/SimpleSpanFragmenter.java  | 10 ++++----
 .../search/highlight/HighlighterTest.java       | 27 +++++++++++++++++++-
 2 files changed, 31 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ccd7ef23/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
index 4ca4bb2..0b46a8a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
@@ -51,7 +51,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
 
   /**
    * @param queryScorer QueryScorer that was used to score hits
-   * @param fragmentSize size in bytes of each fragment
+   * @param fragmentSize size in chars of each fragment
    */
   public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize) {
     this.fragmentSize = fragmentSize;
@@ -65,7 +65,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
   public boolean isNewFragment() {
     position += posIncAtt.getPositionIncrement();
 
-    if (waitForPos == position) {
+    if (waitForPos <= position) {
       waitForPos = -1;
     } else if (waitForPos != -1) {
       return false;
@@ -76,9 +76,9 @@ public class SimpleSpanFragmenter implements Fragmenter {
     if (wSpanTerm != null) {
       List<PositionSpan> positionSpans = wSpanTerm.getPositionSpans();
 
-      for (int i = 0; i < positionSpans.size(); i++) {
-        if (positionSpans.get(i).start == position) {
-          waitForPos = positionSpans.get(i).end + 1;
+      for (PositionSpan positionSpan : positionSpans) {
+        if (positionSpan.start == position) {
+          waitForPos = positionSpan.end + 1;
           break;
         }
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ccd7ef23/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
index db239ff..51d43bf 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@@ -130,7 +130,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
       "This piece of text refers to Kennedy at the beginning then has a longer piece of text
that is very long in the middle and finally ends with another reference to Kennedy",
       "JFK has been shot", "John Kennedy has been shot",
       "This text has a typo in referring to Keneddy",
-      "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets
is a the lets is a the lets is a the lets" };
+      "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets
is a the lets is a the lets is a the lets",
+      "Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter
needs to update the appropriate Attribute(s) in incrementToken(). The consumer, commonly the
Lucene indexer, consumes the data in the Attributes and then calls incrementToken() again
until it retuns false, which indicates that the end of the stream was reached. This means
that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data
in the Attribute instances. "
+  };
 
   // Convenience method for succinct tests; doesn't represent "best practice"
   private TokenStream getAnyTokenStream(String fieldName, int docId)
@@ -348,6 +350,29 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
     // throw any exceptions
   }
 
+  // LUCENE-2229
+  public void testSimpleSpanHighlighterWithStopWordsStraddlingFragmentBoundaries() throws
Exception {
+    doSearching(new PhraseQuery(FIELD_NAME, "all", "tokens"));
+
+    int maxNumFragmentsRequired = 1;
+
+    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+    Highlighter highlighter = new Highlighter(scorer);
+
+    assertEquals("Must have one hit", 1, hits.totalHits);
+    for (int i = 0; i < hits.totalHits; i++) {
+      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+
+      highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 36));
+
+      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
+      if (VERBOSE) System.out.println("\t" + result);
+
+      assertTrue("Fragment must be less than 60 characters long", result.length() < 60);
+    }
+  }
+
   // LUCENE-1752
   public void testRepeatingTermsInMultBooleans() throws Exception {
     String content = "x y z a b c d e f g b c g";


Mime
View raw message