lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From k...@apache.org
Subject svn commit: r1166530 - in /lucene/dev/trunk/lucene/contrib: ./ highlighter/src/java/org/apache/lucene/search/vectorhighlight/ highlighter/src/test/org/apache/lucene/search/vectorhighlight/
Date Thu, 08 Sep 2011 05:35:03 GMT
Author: koji
Date: Thu Sep  8 05:35:02 2011
New Revision: 1166530

URL: http://svn.apache.org/viewvc?rev=1166530&view=rev
Log:
LUCENE-1824: add BoundaryScanner

Added:
    lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java
    lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1166530&r1=1166529&r2=1166530&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Sep  8 05:35:02 2011
@@ -103,6 +103,10 @@ New Features
    added support for simple numeric queries, such as <age:4>, in contrib
    query parser (Vinicius Barros via Uwe Schindler)
 
+ * LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
+   SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
+   can find a "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
+
 Changes in runtime behavior:
 
  * LUCENE-1768: StandardQueryConfigHandler now uses NumericFieldConfigListener

Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java?rev=1166530&r1=1166529&r2=1166530&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
(original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
Thu Sep  8 05:35:02 2011
@@ -48,14 +48,24 @@ public abstract class BaseFragmentsBuild
   };
   public static final String[] COLORED_POST_TAGS = { "</b>" };
   private char multiValuedSeparator = ' ';
+  private final BoundaryScanner boundaryScanner;
   
   protected BaseFragmentsBuilder(){
     this( new String[]{ "<b>" }, new String[]{ "</b>" } );
   }
   
   protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
+    this(preTags, postTags, new SimpleBoundaryScanner());
+  }
+  
+  protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
+    this( new String[]{ "<b>" }, new String[]{ "</b>" }, boundaryScanner );
+  }
+  
+  protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
     this.preTags = preTags;
     this.postTags = postTags;
+    this.boundaryScanner = boundaryScanner;
   }
   
   static Object checkTagsArgument( Object tags ){
@@ -135,28 +145,35 @@ public abstract class BaseFragmentsBuild
 
   protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo
fragInfo,
       String[] preTags, String[] postTags, Encoder encoder ){
-    final int s = fragInfo.startOffset;
-    return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset
), s,
-        preTags, postTags, encoder );
-  }
-  
-  private String makeFragment( WeightedFragInfo fragInfo, String src, int s,
-      String[] preTags, String[] postTags, Encoder encoder ){
     StringBuilder fragment = new StringBuilder();
+    final int s = fragInfo.getStartOffset();
+    int[] modifiedStartOffset = { s };
+    String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(),
modifiedStartOffset );
     int srcIndex = 0;
-    for( SubInfo subInfo : fragInfo.subInfos ){
-      for( Toffs to : subInfo.termsOffsets ){
+    for( SubInfo subInfo : fragInfo.getSubInfos() ){
+      for( Toffs to : subInfo.getTermsOffsets() ){
         fragment
-          .append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) )
-          .append( getPreTag( preTags, subInfo.seqnum ) )
-          .append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset -
s ) ) )
-          .append( getPostTag( postTags, subInfo.seqnum ) );
-        srcIndex = to.endOffset - s;
+          .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0]
) ) )
+          .append( getPreTag( preTags, subInfo.getSeqnum() ) )
+          .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0],
to.getEndOffset() - modifiedStartOffset[0] ) ) )
+          .append( getPostTag( postTags, subInfo.getSeqnum() ) );
+        srcIndex = to.getEndOffset() - modifiedStartOffset[0];
       }
     }
     fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
     return fragment.toString();
   }
+
+  protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
+      int startOffset, int endOffset, int[] modifiedStartOffset ){
+    while( buffer.length() < endOffset && index[0] < values.length ){
+      buffer.append( values[index[0]++].stringValue() );
+      buffer.append( getMultiValuedSeparator() );
+    }
+    int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset );
+    modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
+    return buffer.substring( modifiedStartOffset[0], eo );
+  }
   
   protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
       int startOffset, int endOffset ){

Added: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java?rev=1166530&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
(added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java
Thu Sep  8 05:35:02 2011
@@ -0,0 +1,40 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ */
+public interface BoundaryScanner {
+
+  /**
+   * Scan backward to find start offset.
+   * @param buffer scanned object
+   * @param start start offset to begin
+   * @return the found start offset
+   */
+  public int findStartOffset( StringBuilder buffer, int start );
+
+  /**
+   * Scan forward to find end offset.
+   * @param buffer scanned object
+   * @param start start offset to begin
+   * @return the found end offset
+   */
+  public int findEndOffset( StringBuilder buffer, int start );
+}

Added: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java?rev=1166530&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
(added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java
Thu Sep  8 05:35:02 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+
+/**
+ * A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find
+ * boundaries in the text; the supplied {@link BreakIterator} determines the boundary type.
+ */
+public class BreakIteratorBoundaryScanner implements BoundaryScanner {
+  
+  final BreakIterator bi;
+
+  public BreakIteratorBoundaryScanner(BreakIterator bi){
+    this.bi = bi;
+  }
+
+  public int findStartOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 1 ) return start;
+    bi.setText(buffer.substring(0, start));
+    bi.last();
+    return bi.previous();
+  }
+
+  public int findEndOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 0 ) return start;
+    bi.setText(buffer.substring(start));
+    return bi.next() + start;
+  }
+}

Added: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java?rev=1166530&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
(added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java
Thu Sep  8 05:35:02 2011
@@ -0,0 +1,81 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+public class SimpleBoundaryScanner implements BoundaryScanner {
+  
+  public static final int DEFAULT_MAX_SCAN = 20;
+  public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t',
'\n'};
+
+  protected int maxScan;
+  protected Set<Character> boundaryChars;
+  
+  public SimpleBoundaryScanner(){
+    this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS );
+  }
+  
+  public SimpleBoundaryScanner( int maxScan ){
+    this( maxScan, DEFAULT_BOUNDARY_CHARS );
+  }
+  
+  public SimpleBoundaryScanner( Character[] boundaryChars ){
+    this( DEFAULT_MAX_SCAN, boundaryChars );
+  }
+  
+  public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){
+    this.maxScan = maxScan;
+    this.boundaryChars = new HashSet<Character>();
+    this.boundaryChars.addAll(Arrays.asList(boundaryChars));
+  }
+  
+  public SimpleBoundaryScanner( int maxScan, Set<Character> boundaryChars ){
+    this.maxScan = maxScan;
+    this.boundaryChars = boundaryChars;
+  }
+
+  public int findStartOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 1 ) return start;
+    int offset, count = maxScan;
+    for( offset = start; offset > 0 && count > 0; count-- ){
+      // found?
+      if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset;
+      offset--;
+    }
+    // not found
+    return start;
+  }
+
+  public int findEndOffset(StringBuilder buffer, int start) {
+    // avoid illegal start offset
+    if( start > buffer.length() || start < 0 ) return start;
+    int offset, count = maxScan;
+    //for( offset = start; offset <= buffer.length() && count > 0; count-- ){
+    for( offset = start; offset < buffer.length() && count > 0; count-- ){
+      // found?
+      if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset;
+      offset++;
+    }
+    // not found
+    return start;
+  }
+}

Added: lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java?rev=1166530&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java
(added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java
Thu Sep  8 05:35:02 2011
@@ -0,0 +1,91 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class BreakIteratorBoundaryScannerTest extends LuceneTestCase {
+  static final String TEXT =
+    "Apache Lucene(TM) is a high-performance, full-featured text search engine library written
entirely in Java." +
+    "\nIt is a technology suitable for nearly any application that requires\n" +
+    "full-text search, especially cross-platform. \nApache Lucene is an open source project
available for free download.";
+
+  public void testOutOfRange() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH);
+    BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+    
+    int start = TEXT.length() + 1;
+    assertEquals(start, scanner.findStartOffset(text, start));
+    assertEquals(start, scanner.findEndOffset(text, start));
+    start = 0;
+    assertEquals(start, scanner.findStartOffset(text, start));
+    start = -1;
+    assertEquals(start, scanner.findEndOffset(text, start));
+  }
+
+  public void testWordBoundary() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH);
+    BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+    
+    int start = TEXT.indexOf("formance");
+    int expected = TEXT.indexOf("high-performance");
+    testFindStartOffset(text, start, expected, scanner);
+
+    expected = TEXT.indexOf(", full");
+    testFindEndOffset(text, start, expected, scanner);
+  }
+
+  public void testSentenceBoundary() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BreakIterator bi = BreakIterator.getSentenceInstance();
+    BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+    
+    int start = TEXT.indexOf("any application");
+    int expected = TEXT.indexOf("It is a");
+    testFindStartOffset(text, start, expected, scanner);
+
+    expected = TEXT.indexOf("Apache Lucene is an open source");
+    testFindEndOffset(text, start, expected, scanner);
+  }
+
+  public void testLineBoundary() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BreakIterator bi = BreakIterator.getLineInstance();
+    BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
+    
+    int start = TEXT.indexOf("any application");
+    int expected = TEXT.indexOf("nearly");
+    testFindStartOffset(text, start, expected, scanner);
+
+    expected = TEXT.indexOf("application that requires");
+    testFindEndOffset(text, start, expected, scanner);
+  }
+  
+  private void testFindStartOffset(StringBuilder text, int start, int expected, BoundaryScanner
scanner) throws Exception {
+    assertEquals(expected, scanner.findStartOffset(text, start));
+  }
+  
+  private void testFindEndOffset(StringBuilder text, int start, int expected, BoundaryScanner
scanner) throws Exception {
+    assertEquals(expected, scanner.findEndOffset(text, start));
+  }
+}

Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java?rev=1166530&r1=1166529&r2=1166530&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java
(original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java
Thu Sep  8 05:35:02 2011
@@ -36,8 +36,8 @@ public class ScoreOrderFragmentsBuilderT
     assertEquals( 3, f.length );
     // check score order
     assertEquals( "<b>c</b> <b>a</b> <b>a</b> b b ",
f[0] );
-    assertEquals( "b b <b>a</b> b <b>a</b> b b b b b ", f[1] );
-    assertEquals( "<b>a</b> b b b b b b b b b ", f[2] );
+    assertEquals( "b b <b>a</b> b <b>a</b> b b b b b c", f[1] );
+    assertEquals( "<b>a</b> b b b b b b b b b b", f[2] );
   }
 
   private FieldFragList ffl(Query query, String indexValue ) throws Exception {

Added: lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java?rev=1166530&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java
(added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java
Thu Sep  8 05:35:02 2011
@@ -0,0 +1,55 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class SimpleBoundaryScannerTest extends LuceneTestCase {
+  static final String TEXT =
+    "Apache Lucene(TM) is a high-performance, full-featured\ntext search engine library written
entirely in Java.";
+
+  public void testFindStartOffset() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BoundaryScanner scanner = new SimpleBoundaryScanner();
+    
+    // test out of range
+    int start = TEXT.length() + 1;
+    assertEquals(start, scanner.findStartOffset(text, start));
+    start = 0;
+    assertEquals(start, scanner.findStartOffset(text, start));
+    
+    start = TEXT.indexOf("formance");
+    int expected = TEXT.indexOf("high-performance");
+    assertEquals(expected, scanner.findStartOffset(text, start));
+  }
+
+  public void testFindEndOffset() throws Exception {
+    StringBuilder text = new StringBuilder(TEXT);
+    BoundaryScanner scanner = new SimpleBoundaryScanner();
+    
+    // test out of range
+    int start = TEXT.length() + 1;
+    assertEquals(start, scanner.findEndOffset(text, start));
+    start = -1;
+    assertEquals(start, scanner.findEndOffset(text, start));
+    
+    start = TEXT.indexOf("full-");
+    int expected = TEXT.indexOf("\ntext");
+    assertEquals(expected, scanner.findEndOffset(text, start));
+  }
+}

Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java?rev=1166530&r1=1166529&r2=1166530&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
(original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
Thu Sep  8 05:35:02 2011
@@ -50,7 +50,7 @@ public class SimpleFragmentsBuilderTest 
     String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
     // 3 snippets requested, but should be 2
     assertEquals( 2, f.length );
-    assertEquals( "<b>a</b> b b b b b b b b b ", f[0] );
+    assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
     assertEquals( "b b <b>a</b> b <b>a</b> b ", f[1] );
   }
   
@@ -63,8 +63,8 @@ public class SimpleFragmentsBuilderTest 
     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
     String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
     assertEquals( 3, f.length );
-    assertEquals( "<b>a</b> b b b b b b b b b ", f[0] );
-    assertEquals( "b b <b>a</b> b <b>a</b> b b b b b ", f[1] );
+    assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
+    assertEquals( "b b <b>a</b> b <b>a</b> b b b b b c", f[1] );
     assertEquals( "<b>c</b> <b>a</b> <b>a</b> b b ",
f[2] );
   }
   
@@ -94,7 +94,7 @@ public class SimpleFragmentsBuilderTest 
     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
-    assertEquals( " b c  <b>d</b> e ", sfb.createFragment( reader, 0, F, ffl
) );
+    assertEquals( "a b c  <b>d</b> e ", sfb.createFragment( reader, 0, F, ffl
) );
   }
   
   public void test1PhraseLongMV() throws Exception {
@@ -106,7 +106,7 @@ public class SimpleFragmentsBuilderTest 
     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
-    assertEquals( " most <b>search engines</b> use only one of these methods.
Even the <b>search engines</b> that says they can use t",
+    assertEquals( "The most <b>search engines</b> use only one of these methods.
Even the <b>search engines</b> that says they can use the",
         sfb.createFragment( reader, 0, F, ffl ) );
   }
 
@@ -119,7 +119,7 @@ public class SimpleFragmentsBuilderTest 
     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
-    assertEquals( "ssing <b>speed</b>, the ", sfb.createFragment( reader, 0,
F, ffl ) );
+    assertEquals( "processing <b>speed</b>, the ", sfb.createFragment( reader,
0, F, ffl ) );
   }
   
   public void testUnstoredField() throws Exception {



Mime
View raw message