lucene-java-commits mailing list archives

From markrmil...@apache.org
Subject svn commit: r896624 - in /lucene/java/trunk/contrib: ./ highlighter/src/java/org/apache/lucene/search/highlight/ highlighter/src/test/org/apache/lucene/search/highlight/
Date Wed, 06 Jan 2010 19:08:36 GMT
Author: markrmiller
Date: Wed Jan  6 19:08:36 2010
New Revision: 896624

URL: http://svn.apache.org/viewvc?rev=896624&view=rev
Log:
TokenSources.getTokenStream() does not assign positionIncrement.
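
The heart of the fix is converting the absolute token positions stored in a
TermPositionVector into the position increments a TokenStream must report. A
minimal sketch of that delta encoding (illustrative code, not part of the
commit; it mirrors the lastPosition loop in the new
TokenStreamFromTermPositionVector added below):

    import java.util.Arrays;

    // Illustrative only: absolute positions -> position increments, the same
    // normalization TokenStreamFromTermPositionVector performs after sorting
    // its tokens by position.
    public class PositionIncrementSketch {
      static int[] toIncrements(int[] sortedPositions) {
        int[] increments = new int[sortedPositions.length];
        int lastPosition = -1; // same starting value as the committed code
        for (int i = 0; i < sortedPositions.length; i++) {
          increments[i] = sortedPositions[i] - lastPosition;
          lastPosition = sortedPositions[i];
        }
        return increments;
      }

      public static void main(String[] args) {
        // Two tokens sharing position 1 and a gap before position 3:
        // prints [1, 1, 0, 2]. An increment of 0 marks an overlap; an
        // increment greater than 1 marks a gap (e.g. a removed stopword).
        System.out.println(Arrays.toString(toIncrements(new int[] {0, 1, 1, 3})));
      }
    }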

Added:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=896624&r1=896623&r2=896624&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Wed Jan  6 19:08:36 2010
@@ -28,6 +28,9 @@
    correctly (enumerate all non-deleted docs).  (Karl Wettin via Mike
    McCandless)
    
+ * LUCENE-2035: TokenSources.getTokenStream() does not assign positionIncrement.
+   (Christopher Morris via Mark Miller)
+   
 API Changes
 
  * LUCENE-2108: Add SpellChecker.close, to close the underlying

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=896624&r1=896623&r2=896624&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Wed Jan  6 19:08:36 2010
@@ -38,234 +38,248 @@
 import org.apache.lucene.index.TermVectorOffsetInfo;
 
 /**
- * Hides implementation issues associated with obtaining a TokenStream for use with
- * the highlighter - can obtain from TermFreqVectors with offsets and (optionally) positions or
- * from Analyzer class reparsing the stored content.
+ * Hides implementation issues associated with obtaining a TokenStream for use
+ * with the highlighter - can obtain from TermFreqVectors with offsets and
+ * (optionally) positions or from Analyzer class reparsing the stored content.
  */
-public class TokenSources
-{
+public class TokenSources {
   /**
-   * A convenience method that tries to first get a TermPositionVector for the specified docId, then, falls back to
-   * using the passed in {@link org.apache.lucene.document.Document} to retrieve the TokenStream.  This is useful when
-   * you already have the document, but would prefer to use the vector first.
-   * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try and get the vector from
+   * A convenience method that tries to first get a TermPositionVector for the
+   * specified docId, then, falls back to using the passed in
+   * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
+   * This is useful when you already have the document, but would prefer to use
+   * the vector first.
+   * 
+   * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
+   *        and get the vector from
    * @param docId The docId to retrieve.
    * @param field The field to retrieve on the document
    * @param doc The document to fall back on
-   * @param analyzer The analyzer to use for creating the TokenStream if the vector doesn't exist
-   * @return The {@link org.apache.lucene.analysis.TokenStream} for the {@link org.apache.lucene.document.Fieldable} on the {@link org.apache.lucene.document.Document}
+   * @param analyzer The analyzer to use for creating the TokenStream if the
+   *        vector doesn't exist
+   * @return The {@link org.apache.lucene.analysis.TokenStream} for the
+   *         {@link org.apache.lucene.document.Fieldable} on the
+   *         {@link org.apache.lucene.document.Document}
    * @throws IOException if there was an error loading
    */
-  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Document doc, Analyzer analyzer) throws IOException{
-    TokenStream ts=null;
+  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+      String field, Document doc, Analyzer analyzer) throws IOException {
+    TokenStream ts = null;
+
+    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
+    if (tfv != null) {
+      if (tfv instanceof TermPositionVector) {
+        ts = getTokenStream((TermPositionVector) tfv);
+      }
+    }
+    // No token info stored so fall back to analyzing raw content
+    if (ts == null) {
+      ts = getTokenStream(doc, field, analyzer);
+    }
+    return ts;
+  }
+
+  /**
+   * A convenience method that tries a number of approaches to getting a token
+   * stream. The cost of finding there are no termVectors in the index is
+   * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
+   * approach to coding is probably acceptable
+   * 
+   * @param reader
+   * @param docId
+   * @param field
+   * @param analyzer
+   * @return null if field not stored correctly
+   * @throws IOException
+   */
+  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+      String field, Analyzer analyzer) throws IOException {
+    TokenStream ts = null;
+
+    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
+    if (tfv != null) {
+      if (tfv instanceof TermPositionVector) {
+        ts = getTokenStream((TermPositionVector) tfv);
+      }
+    }
+    // No token info stored so fall back to analyzing raw content
+    if (ts == null) {
+      ts = getTokenStream(reader, docId, field, analyzer);
+    }
+    return ts;
+  }
+
+  public static TokenStream getTokenStream(TermPositionVector tpv) {
+    // assumes the worst and makes no assumptions about token position
+    // sequences.
+    return getTokenStream(tpv, false);
+  }
+
+  /**
+   * Low level api. Returns a token stream or null if no offset info available
+   * in index. This can be used to feed the highlighter with a pre-parsed token
+   * stream
+   * 
+   * In my tests the speeds to recreate 1000 token streams using this method
+   * are: - with TermVector offset only data stored - 420 milliseconds - with
+   * TermVector offset AND position data stored - 271 milliseconds (nb timings
+   * for TermVector with position data are based on a tokenizer with contiguous
+   * positions - no overlaps or gaps) The cost of not using TermPositionVector
+   * to store pre-parsed content and using an analyzer to re-parse the original
+   * content: - reanalyzing the original content - 980 milliseconds
+   * 
+   * The re-analyze timings will typically vary depending on - 1) The complexity
+   * of the analyzer code (timings above were using a
+   * stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
+   * reads ALL fields off the disk when accessing just one document field - can
+   * cost dear!) 3) Use of compression on field storage - could be faster due to
+   * compression (less disk IO) or slower (more CPU burn) depending on the
+   * content.
+   * 
+   * @param tpv
+   * @param tokenPositionsGuaranteedContiguous true if the token position
+   *        numbers have no overlaps or gaps. If looking to eke out the last
+   *        drops of performance, set to true. If in doubt, set to false.
+   */
+  public static TokenStream getTokenStream(TermPositionVector tpv,
+      boolean tokenPositionsGuaranteedContiguous) {
+    if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
+      return new TokenStreamFromTermPositionVector(tpv);
+    }
 
-		TermFreqVector tfv = reader.getTermFreqVector(docId,field);
-		if(tfv!=null)
-		{
-		    if(tfv instanceof TermPositionVector)
-		    {
-		        ts=getTokenStream((TermPositionVector) tfv);
-		    }
-		}
-		//No token info stored so fall back to analyzing raw content
-		if(ts==null)
-		{
-		    ts=getTokenStream(doc,field,analyzer);
-		}
-		return ts;
-  }
-    /**
-     * A convenience method that tries a number of approaches to getting a token stream.
-     * The cost of finding there are no termVectors in the index is minimal (1000 invocations still
-     * registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
-     * @param reader
-     * @param docId
-     * @param field
-     * @param analyzer
-     * @return null if field not stored correctly 
-     * @throws IOException
-     */
-    public static TokenStream getAnyTokenStream(IndexReader reader,int docId, String field,Analyzer analyzer) throws IOException
-    {
-		TokenStream ts=null;
-
-		TermFreqVector tfv = reader.getTermFreqVector(docId,field);
-		if(tfv!=null)
-		{
-		    if(tfv instanceof TermPositionVector)
-		    {
-		        ts=getTokenStream((TermPositionVector) tfv);
-		    }
-		}
-		//No token info stored so fall back to analyzing raw content
-		if(ts==null)
-		{
-		    ts=getTokenStream(reader,docId,field,analyzer);
-		}
-		return ts;
-    }
-    
-    
-    public static TokenStream getTokenStream(TermPositionVector tpv)
-    {
-        //assumes the worst and makes no assumptions about token position sequences.
-         return getTokenStream(tpv,false);   
-    }
-    /**
-     * Low level api.
-     * Returns a token stream or null if no offset info available in index.
-     * This can be used to feed the highlighter with a pre-parsed token stream 
-     * 
-     * In my tests the speeds to recreate 1000 token streams using this method are:
-     * - with TermVector offset only data stored - 420  milliseconds 
-     * - with TermVector offset AND position data stored - 271 milliseconds
-     *  (nb timings for TermVector with position data are based on a tokenizer with contiguous
-     *  positions - no overlaps or gaps)
-     * The cost of not using TermPositionVector to store
-     * pre-parsed content and using an analyzer to re-parse the original content: 
-     * - reanalyzing the original content - 980 milliseconds
-     * 
-     * The re-analyze timings will typically vary depending on -
-     * 	1) The complexity of the analyzer code (timings above were using a 
-     * 	   stemmer/lowercaser/stopword combo)
-     *  2) The  number of other fields (Lucene reads ALL fields off the disk 
-     *     when accessing just one document field - can cost dear!)
-     *  3) Use of compression on field storage - could be faster due to compression (less disk IO)
-     *     or slower (more CPU burn) depending on the content.
-     *
-     * @param tpv
-     * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
-     * to eke out the last drops of performance, set to true. If in doubt, set to false.
-     */
-    public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
-        //an object used to iterate across an array of tokens
-        class StoredTokenStream extends TokenStream {
-          Token tokens[];
-          int currentToken = 0;
-          TermAttribute termAtt;
-          OffsetAttribute offsetAtt;
-    
-          StoredTokenStream(Token tokens[]) {
-            this.tokens = tokens;
-            termAtt = addAttribute(TermAttribute.class);
-            offsetAtt = addAttribute(OffsetAttribute.class);
-          }
-    
-          @Override
-          public boolean incrementToken() throws IOException {
-            if (currentToken >= tokens.length) {
-              return false;
-            }
-            Token token = tokens[currentToken++];
-            termAtt.setTermBuffer(token.term());
-            offsetAtt.setOffset(token.startOffset(), token.endOffset());
-            return true;
-          }
-        }      
-        //code to reconstruct the original sequence of Tokens
-        String[] terms=tpv.getTerms();          
-        int[] freq=tpv.getTermFrequencies();
-        int totalTokens=0;
-
-        for (int t = 0; t < freq.length; t++)
-        {
-            totalTokens+=freq[t];
+    // an object used to iterate across an array of tokens
+    class StoredTokenStream extends TokenStream {
+      Token tokens[];
+
+      int currentToken = 0;
+
+      TermAttribute termAtt;
+
+      OffsetAttribute offsetAtt;
+
+      StoredTokenStream(Token tokens[]) {
+        this.tokens = tokens;
+        termAtt = addAttribute(TermAttribute.class);
+        offsetAtt = addAttribute(OffsetAttribute.class);
+      }
+
+      @Override
+      public boolean incrementToken() throws IOException {
+        if (currentToken >= tokens.length) {
+          return false;
         }
-        Token tokensInOriginalOrder[]=new Token[totalTokens];
-        ArrayList<Token> unsortedTokens = null;
-        for (int t = 0; t < freq.length; t++)
-        {
-            TermVectorOffsetInfo[] offsets=tpv.getOffsets(t);
-            if(offsets==null)
-            {
-                return null;
-            }
-            
-            int[] pos=null;
-            if(tokenPositionsGuaranteedContiguous)
-            {
-                //try get the token position info to speed up assembly of tokens into sorted sequence
-                pos=tpv.getTermPositions(t);
-            }
-            if(pos==null)
-            {	
-                //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
-                if(unsortedTokens==null)
-                {
-                    unsortedTokens=new ArrayList<Token>();
-                }
-                for (int tp = 0; tp < offsets.length; tp++)
-                {
-                  Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
-                  token.setTermBuffer(terms[t]);
-                  unsortedTokens.add(token);
-                }
-            }
-            else
-            {
-                //We have positions stored and a guarantee that the token position information is contiguous
-                
-                // This may be fast BUT won't work if Tokenizers used which create >1 token in same position or
-                // creates jumps in position numbers - this code would fail under those circumstances
-                
-                //tokens stored with positions - can use this to index straight into sorted array
-                for (int tp = 0; tp < pos.length; tp++)
-                {
-                  Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
-                  tokensInOriginalOrder[pos[tp]] = token;
-                }                
-            }
+        Token token = tokens[currentToken++];
+        termAtt.setTermBuffer(token.term());
+        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        return true;
+      }
+    }
+    // code to reconstruct the original sequence of Tokens
+    String[] terms = tpv.getTerms();
+    int[] freq = tpv.getTermFrequencies();
+    int totalTokens = 0;
+
+    for (int t = 0; t < freq.length; t++) {
+      totalTokens += freq[t];
+    }
+    Token tokensInOriginalOrder[] = new Token[totalTokens];
+    ArrayList<Token> unsortedTokens = null;
+    for (int t = 0; t < freq.length; t++) {
+      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
+      if (offsets == null) {
+        throw new IllegalArgumentException("Required TermVector Offset information was not
found");
+      }
+
+      int[] pos = null;
+      if (tokenPositionsGuaranteedContiguous) {
+        // try get the token position info to speed up assembly of tokens into
+        // sorted sequence
+        pos = tpv.getTermPositions(t);
+      }
+      if (pos == null) {
+        // tokens NOT stored with positions or not guaranteed contiguous - must
+        // add to list and sort later
+        if (unsortedTokens == null) {
+          unsortedTokens = new ArrayList<Token>();
+        }
+        for (int tp = 0; tp < offsets.length; tp++) {
+          Token token = new Token(offsets[tp].getStartOffset(), offsets[tp]
+              .getEndOffset());
+          token.setTermBuffer(terms[t]);
+          unsortedTokens.add(token);
+        }
+      } else {
+        // We have positions stored and a guarantee that the token position
+        // information is contiguous
+
+        // This may be fast BUT won't work if Tokenizers used which create >1
+        // token in same position or
+        // creates jumps in position numbers - this code would fail under those
+        // circumstances
+
+        // tokens stored with positions - can use this to index straight into
+        // sorted array
+        for (int tp = 0; tp < pos.length; tp++) {
+          Token token = new Token(terms[t], offsets[tp].getStartOffset(),
+              offsets[tp].getEndOffset());
+          tokensInOriginalOrder[pos[tp]] = token;
         }
-        //If the field has been stored without position data we must perform a sort
-        if(unsortedTokens!=null) {
-            tokensInOriginalOrder= unsortedTokens.toArray(new Token[unsortedTokens.size()]);
-            Arrays.sort(tokensInOriginalOrder, new Comparator<Token>(){
-                public int compare(Token t1, Token t2) {
-                    if(t1.startOffset()>t2.endOffset())
-                        return 1;
-                    if(t1.startOffset()<t2.startOffset())
-                        return -1;
-                    return 0;
-                }});
+      }
+    }
+    // If the field has been stored without position data we must perform a sort
+    if (unsortedTokens != null) {
+      tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
+          .size()]);
+      Arrays.sort(tokensInOriginalOrder, new Comparator<Token>() {
+        public int compare(Token t1, Token t2) {
+          if (t1.startOffset() > t2.endOffset())
+            return 1;
+          if (t1.startOffset() < t2.startOffset())
+            return -1;
+          return 0;
         }
-        return new StoredTokenStream(tokensInOriginalOrder);
+      });
+    }
+    return new StoredTokenStream(tokensInOriginalOrder);
+  }
+
+  public static TokenStream getTokenStream(IndexReader reader, int docId,
+      String field) throws IOException {
+    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
+    if (tfv == null) {
+      throw new IllegalArgumentException(field + " in doc #" + docId
+          + "does not have any term position data stored");
     }
+    if (tfv instanceof TermPositionVector) {
+      TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
+          docId, field);
+      return getTokenStream(tpv);
+    }
+    throw new IllegalArgumentException(field + " in doc #" + docId
+        + "does not have any term position data stored");
+  }
+
+  // convenience method
+  public static TokenStream getTokenStream(IndexReader reader, int docId,
+      String field, Analyzer analyzer) throws IOException {
+    Document doc = reader.document(docId);
+    return getTokenStream(doc, field, analyzer);
+  }
+
+  public static TokenStream getTokenStream(Document doc, String field,
+      Analyzer analyzer) {
+    String contents = doc.get(field);
+    if (contents == null) {
+      throw new IllegalArgumentException("Field " + field
+          + " in document is not stored and cannot be analyzed");
+    }
+    return getTokenStream(field, contents, analyzer);
+  }
 
-    public static TokenStream getTokenStream(IndexReader reader,int docId, String field) throws IOException
-    {
-		TermFreqVector tfv = reader.getTermFreqVector(docId,field);
-		if(tfv==null)
-		{
-		    throw new IllegalArgumentException(field+" in doc #"+docId
-		            	+"does not have any term position data stored");
-		}
-	    if(tfv instanceof TermPositionVector)
-	    {
-			TermPositionVector tpv=(TermPositionVector) reader.getTermFreqVector(docId,field);
-	        return getTokenStream(tpv);	        
-	    }
-	    throw new IllegalArgumentException(field+" in doc #"+docId
-            	+"does not have any term position data stored");
-    }
-
-    //convenience method
-    public static TokenStream getTokenStream(IndexReader reader,int docId, String field,Analyzer analyzer) throws IOException
-    {
-		  Document doc=reader.document(docId);
-		  return getTokenStream(doc, field, analyzer);
-    }
-    
-  public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer){
-    String contents=doc.get(field);
-		if(contents==null)
-		{
-		    throw new IllegalArgumentException("Field "+field +" in document is not stored and cannot be analyzed");
-		}
-        return getTokenStream(field, contents, analyzer);
-  }
-  //convenience method
-  public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){
-    return analyzer.tokenStream(field,new StringReader(contents));
+  // convenience method
+  public static TokenStream getTokenStream(String field, String contents,
+      Analyzer analyzer) {
+    return analyzer.tokenStream(field, new StringReader(contents));
   }
 
 }
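
For orientation, a typical call into the methods above looks like the
following sketch (not part of the commit; the surrounding class and variable
names are illustrative). Highlighter, QueryScorer, SimpleHTMLFormatter and
SimpleHTMLEncoder are used the same way in the tests added below:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
    import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
    import org.apache.lucene.search.highlight.TokenSources;

    public class HighlightSketch {
      // Prefers the stored term vector when one exists, otherwise
      // re-analyzes the stored field text.
      static String bestFragment(IndexReader reader, int docId, String field,
          Query query, Analyzer analyzer) throws IOException,
          InvalidTokenOffsetsException {
        String text = reader.document(docId).get(field); // field must be stored
        TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId,
            field, analyzer);
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(),
            new SimpleHTMLEncoder(), new QueryScorer(query));
        return highlighter.getBestFragment(tokenStream, text);
      }
    }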

Added: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java?rev=896624&view=auto
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (added)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java Wed Jan  6 19:08:36 2010
@@ -0,0 +1,118 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+/**
+ * @author CMorris
+ */
+public class TokenStreamFromTermPositionVector extends TokenStream {
+
+  private final List<Token> positionedTokens = new ArrayList<Token>();
+
+  private Iterator<Token> tokensAtCurrentPosition;
+
+  private TermAttribute termAttribute;
+
+  private PositionIncrementAttribute positionIncrementAttribute;
+
+  private OffsetAttribute offsetAttribute;
+
+  /**
+   * Constructor.
+   * 
+   * @param termPositionVector TermPositionVector that contains the data for
+   *        creating the TokenStream. Must have positions and offsets.
+   */
+  public TokenStreamFromTermPositionVector(
+      final TermPositionVector termPositionVector) {
+    termAttribute = addAttribute(TermAttribute.class);
+    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
+    offsetAttribute = addAttribute(OffsetAttribute.class);
+    final String[] terms = termPositionVector.getTerms();
+    for (int i = 0; i < terms.length; i++) {
+      final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
+      final int[] termPositions = termPositionVector.getTermPositions(i);
+      for (int j = 0; j < termPositions.length; j++) {
+        Token token;
+        if (offsets != null) {
+          token = new Token(terms[i].toCharArray(), 0, terms[i].length(),
+              offsets[j].getStartOffset(), offsets[j].getEndOffset());
+        } else {
+          token = new Token();
+          token.setTermBuffer(terms[i]);
+        }
+        // Yes - this is the position, not the increment! This is for
+        // sorting. This value
+        // will be corrected before use.
+        token.setPositionIncrement(termPositions[j]);
+        this.positionedTokens.add(token);
+      }
+    }
+    final Comparator<Token> tokenComparator = new Comparator<Token>() {
+      public int compare(final Token o1, final Token o2) {
+        if (o1.getPositionIncrement() < o2.getPositionIncrement()) {
+          return -1;
+        }
+        if (o1.getPositionIncrement() > o2.getPositionIncrement()) {
+          return 1;
+        }
+        return 0;
+      }
+    };
+    Collections.sort(this.positionedTokens, tokenComparator);
+    int lastPosition = -1;
+    for (final Token token : this.positionedTokens) {
+      int thisPosition = token.getPositionIncrement();
+      token.setPositionIncrement(thisPosition - lastPosition);
+      lastPosition = thisPosition;
+    }
+    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (this.tokensAtCurrentPosition.hasNext()) {
+      final Token next = this.tokensAtCurrentPosition.next();
+      termAttribute.setTermBuffer(next.term());
+      positionIncrementAttribute.setPositionIncrement(next
+          .getPositionIncrement());
+      offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
+  }
+}
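
A sketch of driving the new class directly (mirroring what the tests below
do; reader, docId and field are assumed to come from the caller). The field
must have been indexed with positions, and with offsets if the highlighter is
to use the resulting stream:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermFreqVector;
    import org.apache.lucene.index.TermPositionVector;
    import org.apache.lucene.search.highlight.TokenStreamFromTermPositionVector;

    public class TermVectorStreamSketch {
      // Illustrative only: wrap a document's stored term vector as a
      // TokenStream for the highlighter.
      static TokenStream fromVector(IndexReader reader, int docId, String field)
          throws IOException {
        TermFreqVector tfv = reader.getTermFreqVector(docId, field);
        if (!(tfv instanceof TermPositionVector)) {
          throw new IllegalArgumentException(field + " in doc #" + docId
              + " was not indexed with term positions");
        }
        return new TokenStreamFromTermPositionVector((TermPositionVector) tfv);
      }
    }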

Added: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java?rev=896624&view=auto
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java (added)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java Wed Jan  6 19:08:36 2010
@@ -0,0 +1,378 @@
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.OpenBitSet;
+
+import junit.framework.TestCase;
+
+public class HighlighterPhraseTest extends TestCase {
+  private static final String FIELD = "text";
+
+  public void testConcurrentPhrase() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox jumped";
+    final Directory directory = new RAMDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamConcurrent(),
+          TermVector.WITH_POSITIONS_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final PhraseQuery phraseQuery = new PhraseQuery();
+        phraseQuery.add(new Term(FIELD, "fox"));
+        phraseQuery.add(new Term(FIELD, "jumped"));
+        phraseQuery.setSlop(0);
+        TopDocs hits = indexSearcher.search(phraseQuery, 1);
+        assertEquals(1, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(phraseQuery));
+
+        final TokenStream tokenStream = TokenSources
+            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+                0, FIELD), false);
+        assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
+            TEXT), highlighter.getBestFragment(tokenStream, TEXT));
+
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+    }
+  }
+
+  public void testConcurrentSpan() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox jumped";
+    final Directory directory = new RAMDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamConcurrent(),
+          TermVector.WITH_POSITIONS_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
+            new SpanTermQuery(new Term(FIELD, "fox")),
+            new SpanTermQuery(new Term(FIELD, "jumped")) }, 0, true);
+        final OpenBitSet bitset = new OpenBitSet();
+        indexSearcher.search(phraseQuery, new Collector() {
+          private int baseDoc;
+
+          public boolean acceptsDocsOutOfOrder() {
+            return true;
+          }
+
+          public void collect(int i) throws IOException {
+            bitset.set(this.baseDoc + i);
+          }
+
+          public void setNextReader(IndexReader indexreader, int i)
+              throws IOException {
+            this.baseDoc = i;
+          }
+
+          public void setScorer(org.apache.lucene.search.Scorer scorer)
+              throws IOException {
+            // Do Nothing
+          }
+        });
+        assertEquals(1, bitset.cardinality());
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(phraseQuery));
+        for (int position = bitset.nextSetBit(0); position >= 0; position = bitset
+            .nextSetBit(position + 1)) {
+          assertEquals(0, position);
+          final TokenStream tokenStream = TokenSources.getTokenStream(
+              (TermPositionVector) indexReader.getTermFreqVector(position,
+                  FIELD), false);
+          assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
+              TEXT), highlighter.getBestFragment(tokenStream, TEXT));
+        }
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+    }
+  }
+
+  public void testSparsePhrase() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = new RAMDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamSparse(),
+          TermVector.WITH_POSITIONS_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final PhraseQuery phraseQuery = new PhraseQuery();
+        phraseQuery.add(new Term(FIELD, "did"));
+        phraseQuery.add(new Term(FIELD, "jump"));
+        phraseQuery.setSlop(0);
+        TopDocs hits = indexSearcher.search(phraseQuery, 1);
+        assertEquals(0, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(phraseQuery));
+        final TokenStream tokenStream = TokenSources
+            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+                0, FIELD), false);
+        assertEquals(
+            highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
+            highlighter.getBestFragment(tokenStream, TEXT));
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+    }
+  }
+
+  public void testSparsePhraseWithNoPositions() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = new RAMDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, TEXT, Store.YES, Index.ANALYZED,
+          TermVector.WITH_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final PhraseQuery phraseQuery = new PhraseQuery();
+        phraseQuery.add(new Term(FIELD, "did"));
+        phraseQuery.add(new Term(FIELD, "jump"));
+        phraseQuery.setSlop(1);
+        TopDocs hits = indexSearcher.search(phraseQuery, 1);
+        assertEquals(1, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(phraseQuery));
+        final TokenStream tokenStream = TokenSources.getTokenStream(
+            (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), true);
+        assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
+            .getBestFragment(tokenStream, TEXT));
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+    }
+  }
+
+  public void testSparseSpan() throws CorruptIndexException,
+      LockObtainFailedException, IOException, InvalidTokenOffsetsException {
+    final String TEXT = "the fox did not jump";
+    final Directory directory = new RAMDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory,
+        new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    try {
+      final Document document = new Document();
+      document.add(new Field(FIELD, new TokenStreamSparse(),
+          TermVector.WITH_POSITIONS_OFFSETS));
+      indexWriter.addDocument(document);
+    } finally {
+      indexWriter.close();
+    }
+    final IndexReader indexReader = IndexReader.open(directory, true);
+    try {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      try {
+        final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
+            new SpanTermQuery(new Term(FIELD, "did")),
+            new SpanTermQuery(new Term(FIELD, "jump")) }, 0, true);
+
+        TopDocs hits = indexSearcher.search(phraseQuery, 1);
+        assertEquals(0, hits.totalHits);
+        final Highlighter highlighter = new Highlighter(
+            new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+            new QueryScorer(phraseQuery));
+        final TokenStream tokenStream = TokenSources
+            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+                0, FIELD), false);
+        assertEquals(
+            highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
+            highlighter.getBestFragment(tokenStream, TEXT));
+      } finally {
+        indexSearcher.close();
+      }
+    } finally {
+      indexReader.close();
+    }
+  }
+
+  private static final class TokenStreamSparse extends TokenStream {
+    private Token[] tokens;
+
+    private int i = -1;
+
+    private TermAttribute termAttribute;
+
+    private OffsetAttribute offsetAttribute;
+
+    private PositionIncrementAttribute positionIncrementAttribute;
+
+    public TokenStreamSparse() {
+      termAttribute = addAttribute(TermAttribute.class);
+      offsetAttribute = addAttribute(OffsetAttribute.class);
+      positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
+      reset();
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      this.i++;
+      if (this.i >= this.tokens.length) {
+        return false;
+      }
+      termAttribute.setTermBuffer(this.tokens[i].term(), 0, this.tokens[i]
+          .term().length());
+      offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
+          .endOffset());
+      positionIncrementAttribute.setPositionIncrement(this.tokens[i]
+          .getPositionIncrement());
+      return true;
+    }
+
+    public void reset() {
+      this.i = -1;
+      this.tokens = new Token[] {
+          new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
+          new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
+          new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
+          new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
+      this.tokens[3].setPositionIncrement(2);
+    }
+  }
+
+  private static final class TokenStreamConcurrent extends TokenStream {
+    private Token[] tokens;
+
+    private int i = -1;
+
+    private TermAttribute termAttribute;
+
+    private OffsetAttribute offsetAttribute;
+
+    private PositionIncrementAttribute positionIncrementAttribute;
+
+    public TokenStreamConcurrent() {
+      termAttribute = addAttribute(TermAttribute.class);
+      offsetAttribute = addAttribute(OffsetAttribute.class);
+      positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
+      reset();
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      this.i++;
+      if (this.i >= this.tokens.length) {
+        return false;
+      }
+      termAttribute.setTermBuffer(this.tokens[i].term(), 0, this.tokens[i]
+          .term().length());
+      offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
+          .endOffset());
+      positionIncrementAttribute.setPositionIncrement(this.tokens[i]
+          .getPositionIncrement());
+      return true;
+    }
+
+    public void reset() {
+      this.i = -1;
+      this.tokens = new Token[] {
+          new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
+          new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
+          new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 8, 14),
+          new Token(new char[] { 'j', 'u', 'm', 'p', 'e', 'd' }, 0, 6, 8, 14) };
+      this.tokens[3].setPositionIncrement(0);
+    }
+  }
+
+}
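
The two helper streams above pin down the cases the fix must survive:
TokenStreamConcurrent emits "jumped" at the same position as "jump"
(increment 0), while TokenStreamSparse leaves a gap before "jump"
(increment 2). A throwaway helper like the following sketch (not part of the
commit) makes those increments visible:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class DumpIncrements {
      // Illustrative only: print each token with its position increment.
      // For TokenStreamConcurrent the last two lines read "jump +1" and
      // "jumped +0"; for TokenStreamSparse the last line reads "jump +2".
      static void dump(TokenStream stream) throws IOException {
        TermAttribute term = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncr = stream
            .addAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
          System.out.println(term.term() + " +" + posIncr.getPositionIncrement());
        }
      }
    }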


