lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r799455 - in /lucene/java/trunk/contrib: ./ highlighter/src/java/org/apache/lucene/search/highlight/ highlighter/src/test/org/apache/lucene/search/highlight/
Date Thu, 30 Jul 2009 22:00:48 GMT
Author: markrmiller
Date: Thu Jul 30 22:00:47 2009
New Revision: 799455

URL: http://svn.apache.org/viewvc?rev=799455&view=rev
Log:
LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards compatibility with some public classes. If you have implemented custom Fragmenters or Scorers,  you will need to adjust them to work with the new TokenStream API. Rather than getting passed a Token at a time, you will be given a TokenStream to init your impl with - store the Attributes you are interested in locally and access them on each call to the method that used to pass a new Token. Look at the included updated impls for examples.

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Jul 30 22:00:47 2009
@@ -11,7 +11,12 @@
 
 API Changes
 
- (None)
+ 1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards
 +    compatibility with some public classes. If you have implemented custom Fragmenters or Scorers, 
+    you will need to adjust them to work with the new TokenStream API. Rather than getting passed a 
+    Token at a time, you will be given a TokenStream to init your impl with - store the Attributes 
+    you are interested in locally and access them on each call to the method that used to pass a new 
+    Token. Look at the included updated impls for examples.  (Mark Miller)
 
 Bug fixes
 
@@ -41,9 +46,6 @@
 
  8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
     (Todd Feak via Otis Gospodnetic)
-    
- 9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or 
-    disjunction queries. (Koji Sekiguchi, Mark Miller)
 
 New features
 

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java Thu Jul 30 22:00:47 2009
@@ -16,24 +16,31 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
 
 /**
- * Implements the policy for breaking text into multiple fragments for consideration
- * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
- * of detecting end of sentences in the text.
+ * Implements the policy for breaking text into multiple fragments for
+ * consideration by the {@link Highlighter} class. A sophisticated
+ * implementation may do this on the basis of detecting end of sentences in the
+ * text.
  */
-public interface Fragmenter
-{
-	/**
-	 * Initializes the Fragmenter
-	 * @param originalText
-	 */
-	public void start(String originalText);
+public interface Fragmenter {
 
-	/**
-	 * Test to see if this token from the stream should be held in a new TextFragment
-	 * @param nextToken
-	 */
-	public boolean isNewFragment(Token nextToken);
+  /**
+   * Initializes the Fragmenter. You can grab references to the Attributes you are
+   * interested in from tokenStream and then access the values in isNewFragment.
+   * 
+   * @param originalText
+   * @param tokenStream
+   */
+  public void start(String originalText, TokenStream tokenStream);
+
+
+  /**
+   * Test to see if this token from the stream should be held in a new
+   * TextFragment. Every time this is called, the TokenStream
+   * passed to start(String, TokenStream) will have been incremented.
+   * 
+   */
+  public boolean isNewFragment();
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Thu Jul 30 22:00:47 2009
@@ -22,8 +22,10 @@
 import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.PriorityQueue;
 
 /**
@@ -214,8 +216,14 @@
 	{
 		ArrayList docFrags = new ArrayList();
 		StringBuffer newText=new StringBuffer();
-
+		
+	    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
+	    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
+	    tokenStream.addAttribute(PositionIncrementAttribute.class);
+	    tokenStream.reset();
+	    
 		TextFragment currentFrag =	new TextFragment(newText,newText.length(), docFrags.size());
+		fragmentScorer.init(tokenStream);
 		fragmentScorer.startFragment(currentFrag);
 		docFrags.add(currentFrag);
 
@@ -223,28 +231,27 @@
 
 		try
 		{
-                  final Token reusableToken = new Token();
+
 			String tokenText;
 			int startOffset;
 			int endOffset;
 			int lastEndOffset = 0;
-			textFragmenter.start(text);
+			textFragmenter.start(text, tokenStream);
+
+			TokenGroup tokenGroup=new TokenGroup(tokenStream);
 
-			TokenGroup tokenGroup=new TokenGroup();
-			
-			for (Token nextToken = tokenStream.next(reusableToken);
-			     (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
-			     nextToken = tokenStream.next(reusableToken))
+			for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
+			      next = tokenStream.incrementToken())
 			{
-				if(	(nextToken.endOffset()>text.length())
+				if(	(offsetAtt.endOffset()>text.length())
 					||
-					(nextToken.startOffset()>text.length())
+					(offsetAtt.startOffset()>text.length())
 					)						
 				{
-					throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
+					throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
 							+" exceeds length of provided text sized "+text.length());
 				}
-				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
+				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
 				{
 					//the current token is distinct from previous tokens -
 					// markup the cached token group info
@@ -260,7 +267,7 @@
 					tokenGroup.clear();
 
 					//check if current token marks the start of a new fragment
-					if(textFragmenter.isNewFragment(nextToken))
+					if(textFragmenter.isNewFragment())
 					{
 						currentFrag.setScore(fragmentScorer.getFragmentScore());
 						//record stats for a new fragment
@@ -271,7 +278,7 @@
 					}
 				}
 
-				tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken));
+				tokenGroup.addToken(fragmentScorer.getTokenScore());
 
 //				if(lastEndOffset>maxDocBytesToAnalyze)
 //				{
@@ -332,7 +339,7 @@
 				//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
 				//fix to PriorityQueue. The correct method to use here is the new "insert" method
 				// USE ABOVE CODE IF THIS DOES NOT COMPILE!
-				fragQueue.insert(currentFrag);
+				fragQueue.insertWithOverflow(currentFrag);
 			}
 
 			//return the most relevant fragments

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java Thu Jul 30 22:00:47 2009
@@ -16,17 +16,18 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
 
 /**
  * {@link Fragmenter} implementation which does not fragment the text.
  * This is useful for highlighting the entire content of a document or field.
  */
 public class NullFragmenter implements Fragmenter {
-  public void start(String s) {
+  public void start(String s, TokenStream tokenStream) {
   }
 
-  public boolean isNewFragment(Token token) {
+  public boolean isNewFragment() {
     return false;
   }
+
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java Thu Jul 30 22:00:47 2009
@@ -1,4 +1,5 @@
 package org.apache.lucene.search.highlight;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,134 +20,142 @@
 import java.util.HashMap;
 import java.util.HashSet;
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Query;
 
 /**
- * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
- * This class uses the {@link QueryTermExtractor} class to process determine the query terms and 
- * their boosts to be used.
+ * {@link Scorer} implementation which scores text fragments by the number of
+ * unique query terms found. This class uses the {@link QueryTermExtractor}
+ * class to process determine the query terms and their boosts to be used.
  */
-//TODO: provide option to boost score of fragments near beginning of document 
+// TODO: provide option to boost score of fragments near beginning of document
 // based on fragment.getFragNum()
-public class QueryScorer implements Scorer
-{
-	TextFragment currentTextFragment=null;
-	HashSet uniqueTermsInFragment;
-	float totalScore=0;
-	float maxTermWeight=0;
-	private HashMap termsToFind;
-	
-
-	/**
-	 * 
-	 * @param query a Lucene query (ideally rewritten using query.rewrite 
-	 * before being passed to this class and the searcher)
-	 */
-	public QueryScorer(Query query)
-	{
-		this(QueryTermExtractor.getTerms(query));
-	}
-	
-	/**
-	 * 
-	 * @param query a Lucene query (ideally rewritten using query.rewrite 
-	 * before being passed to this class and the searcher)
-	 * @param fieldName the Field name which is used to match Query terms
-	 */
-	public QueryScorer(Query query, String fieldName)
-	{
-		this(QueryTermExtractor.getTerms(query, false,fieldName));
-	}	
-
-	/**
-	 * 
-	 * @param query a Lucene query (ideally rewritten using query.rewrite 
-	 * before being passed to this class and the searcher)
-	 * @param reader used to compute IDF which can be used to a) score selected fragments better 
-	 * b) use graded highlights eg set font color intensity
-	 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
-	 */
-	public QueryScorer(Query query, IndexReader reader, String fieldName)
-	{
-		this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); 
-	}
-
-	public QueryScorer(WeightedTerm []weightedTerms	)
-	{
-		termsToFind = new HashMap();
-		for (int i = 0; i < weightedTerms.length; i++)
-		{
-			WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term);
-			if( (existingTerm==null) ||(existingTerm.weight<weightedTerms[i].weight) )
-			{
-				//if a term is defined more than once, always use the highest scoring weight
-				termsToFind.put(weightedTerms[i].term,weightedTerms[i]);
-				maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight());
-			}
-		}
-	}
-	
-
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
-	 */
-	public void startFragment(TextFragment newFragment)
-	{
-		uniqueTermsInFragment = new HashSet();
-		currentTextFragment=newFragment;
-		totalScore=0;
-		
-	}
-	
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
-	 */
-	public float getTokenScore(Token token)
-	{
-		String termText=token.term();
-		
-		WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
-		if(queryTerm==null)
-		{
-			//not a query term - return
-			return 0;
-		}
-		//found a query term - is it unique in this doc?
-		if(!uniqueTermsInFragment.contains(termText))
-		{
-			totalScore+=queryTerm.getWeight();
-			uniqueTermsInFragment.add(termText);
-		}
-		return queryTerm.getWeight();
-	}
-	
-	
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
-	 */
-	public float getFragmentScore()
-	{
-		return totalScore;		
-	}
-
-
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
-	 */
-	public void allFragmentsProcessed()
-	{
-		//this class has no special operations to perform at end of processing
-	}
-
-	/**
-	 * 
-	 * @return The highest weighted term (useful for passing to GradientFormatter to set
-	 * top end of coloring scale.  
-	 */
-    public float getMaxTermWeight()
-    {
-        return maxTermWeight;
+public class QueryScorer implements Scorer {
+  
+  TextFragment currentTextFragment = null;
+  HashSet uniqueTermsInFragment;
+
+  float totalScore = 0;
+  float maxTermWeight = 0;
+  private HashMap termsToFind;
+
+  private TermAttribute termAtt;
+
+  /**
+   * 
+   * @param query a Lucene query (ideally rewritten using query.rewrite before
+   *        being passed to this class and the searcher)
+   */
+  public QueryScorer(Query query) {
+    this(QueryTermExtractor.getTerms(query));
+  }
+
+  /**
+   * 
+   * @param query a Lucene query (ideally rewritten using query.rewrite before
+   *        being passed to this class and the searcher)
+   * @param fieldName the Field name which is used to match Query terms
+   */
+  public QueryScorer(Query query, String fieldName) {
+    this(QueryTermExtractor.getTerms(query, false, fieldName));
+  }
+
+  /**
+   * 
+   * @param query a Lucene query (ideally rewritten using query.rewrite before
+   *        being passed to this class and the searcher)
+   * @param reader used to compute IDF which can be used to a) score selected
+   *        fragments better b) use graded highlights eg set font color
+   *        intensity
+   * @param fieldName the field on which Inverse Document Frequency (IDF)
+   *        calculations are based
+   */
+  public QueryScorer(Query query, IndexReader reader, String fieldName) {
+    this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
+  }
+
+  public QueryScorer(WeightedTerm[] weightedTerms) {
+    termsToFind = new HashMap();
+    for (int i = 0; i < weightedTerms.length; i++) {
+      WeightedTerm existingTerm = (WeightedTerm) termsToFind
+          .get(weightedTerms[i].term);
+      if ((existingTerm == null)
+          || (existingTerm.weight < weightedTerms[i].weight)) {
+        // if a term is defined more than once, always use the highest scoring
+        // weight
+        termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
+        maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
+      }
+    }
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
+   */
+  public void init(TokenStream tokenStream) {
+    termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see
+   * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache
+   * .lucene.search.highlight.TextFragment)
+   */
+  public void startFragment(TextFragment newFragment) {
+    uniqueTermsInFragment = new HashSet();
+    currentTextFragment = newFragment;
+    totalScore = 0;
+
+  }
+
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
+   */
+  public float getTokenScore() {
+    String termText = termAtt.term();
+
+    WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText);
+    if (queryTerm == null) {
+      // not a query term - return
+      return 0;
     }
+    // found a query term - is it unique in this doc?
+    if (!uniqueTermsInFragment.contains(termText)) {
+      totalScore += queryTerm.getWeight();
+      uniqueTermsInFragment.add(termText);
+    }
+    return queryTerm.getWeight();
+  }
+
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
+   */
+  public float getFragmentScore() {
+    return totalScore;
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see
+   * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
+   */
+  public void allFragmentsProcessed() {
+    // this class has no special operations to perform at end of processing
+  }
+
+  /**
+   * 
+   * @return The highest weighted term (useful for passing to GradientFormatter
+   *         to set top end of coloring scale.
+   */
+  public float getMaxTermWeight() {
+    return maxTermWeight;
+  }
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java Thu Jul 30 22:00:47 2009
@@ -1,4 +1,5 @@
 package org.apache.lucene.search.highlight;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,34 +17,45 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
 
 /**
  * Adds to the score for a fragment based on its tokens
  */
-public interface Scorer
-{
-	/**
-	 * called when a new fragment is started for consideration
-	 * @param newFragment
-	 */
-	public void startFragment(TextFragment newFragment);
-
-	/**
-	 * Called for each token in the current fragment
-	 * @param token The token to be scored
-	 * @return a score which is passed to the Highlighter class to influence the mark-up of the text
-	 * (this return value is NOT used to score the fragment)
-	 */
-	public float getTokenScore(Token token);
-	
-
-	/**
-	 * Called when the highlighter has no more tokens for the current fragment - the scorer returns
-	 * the weighting it has derived for the most recent fragment, typically based on the tokens
-	 * passed to getTokenScore(). 
-	 *
-	 */	
-	public float getFragmentScore();
+public interface Scorer {
+
+  /**
+   * Called to init the Scorer with a TokenStream. You can grab references to
+   * the attributes you are interested in here and access them from
+   * getTokenScore().
+   * 
+   * @param tokenStream
+   */
+  public void init(TokenStream tokenStream);
+
+  /**
+   * called when a new fragment is started for consideration
+   * 
+   * @param newFragment
+   */
+  public void startFragment(TextFragment newFragment);
+
+  /**
+   * Called for each token in the current fragment. The Highlighter will
+   * increment the TokenStream passed to init on every call.
+   * 
+   * @return a score which is passed to the Highlighter class to influence the
+   *         mark-up of the text (this return value is NOT used to score the
+   *         fragment)
+   */
+  public float getTokenScore();
+
+  /**
+   * Called when the highlighter has no more tokens for the current fragment -
+   * the scorer returns the weighting it has derived for the most recent
+   * fragment, typically based on the tokens passed to getTokenScore().
+   * 
+   */
+  public float getFragmentScore();
 
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java Thu Jul 30 22:00:47 2009
@@ -1,4 +1,5 @@
 package org.apache.lucene.search.highlight;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,69 +17,64 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 
 /**
- * {@link Fragmenter} implementation which breaks text up into same-size 
+ * {@link Fragmenter} implementation which breaks text up into same-size
  * fragments with no concerns over spotting sentence boundaries.
  */
-public class SimpleFragmenter implements Fragmenter
-{
-	private static final int DEFAULT_FRAGMENT_SIZE =100;
-	private int currentNumFrags;
-	private int fragmentSize;
-
-
-	public SimpleFragmenter()
-	{
-		this(DEFAULT_FRAGMENT_SIZE);
-	}
-
-
-	/**
-	 * 
-	 * @param fragmentSize size in number of characters of each fragment
-	 */
-	public SimpleFragmenter(int fragmentSize)
-	{
-		this.fragmentSize=fragmentSize;
-	}
-
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
-	 */
-	public void start(String originalText)
-	{
-		currentNumFrags=1;
-	}
-
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
-	 */
-	public boolean isNewFragment(Token token)
-	{
-		boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags);
-		if(isNewFrag)
-		{
-			currentNumFrags++;
-		}
-		return isNewFrag;
-	}
-
-	/**
-	 * @return size in number of characters of each fragment
-	 */
-	public int getFragmentSize()
-	{
-		return fragmentSize;
-	}
-
-	/**
-	 * @param size size in characters of each fragment
-	 */
-	public void setFragmentSize(int size)
-	{
-		fragmentSize = size;
-	}
+public class SimpleFragmenter implements Fragmenter {
+  private static final int DEFAULT_FRAGMENT_SIZE = 100;
+  private int currentNumFrags;
+  private int fragmentSize;
+  private OffsetAttribute offsetAtt;
+
+  public SimpleFragmenter() {
+    this(DEFAULT_FRAGMENT_SIZE);
+  }
+
+  /**
+   * 
+   * @param fragmentSize size in number of characters of each fragment
+   */
+  public SimpleFragmenter(int fragmentSize) {
+    this.fragmentSize = fragmentSize;
+  }
+
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
+   */
+  public void start(String originalText, TokenStream stream) {
+    offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
+    currentNumFrags = 1;
+  }
+
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
+   */
+  public boolean isNewFragment() {
+    boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags);
+    if (isNewFrag) {
+      currentNumFrags++;
+    }
+    return isNewFrag;
+  }
+
+  /**
+   * @return size in number of characters of each fragment
+   */
+  public int getFragmentSize() {
+    return fragmentSize;
+  }
+
+  /**
+   * @param size size in characters of each fragment
+   */
+  public void setFragmentSize(int size) {
+    fragmentSize = size;
+  }
 
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java Thu Jul 30 22:00:47 2009
@@ -17,10 +17,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-import org.apache.lucene.analysis.Token;
-
 import java.util.List;
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 
 /**
  * {@link Fragmenter} implementation which breaks text up into same-size
@@ -34,6 +37,9 @@
   private SpanScorer spanScorer;
   private int waitForPos = -1;
   private int textSize;
+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncAtt;
+  private OffsetAttribute offsetAtt;
 
   /**
    * @param spanscorer SpanScorer that was used to score hits
@@ -50,12 +56,12 @@
     this.fragmentSize = fragmentSize;
     this.spanScorer = spanscorer;
   }
-
+  
   /* (non-Javadoc)
-   * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
+   * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
    */
-  public boolean isNewFragment(Token token) {
-    position += token.getPositionIncrement();
+  public boolean isNewFragment() {
+    position += posIncAtt.getPositionIncrement();
 
     if (waitForPos == position) {
       waitForPos = -1;
@@ -63,7 +69,7 @@
       return false;
     }
 
-    WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
+    WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term());
 
     if (wSpanTerm != null) {
       List positionSpans = wSpanTerm.getPositionSpans();
@@ -76,8 +82,8 @@
       }
     }
 
-    boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags)
-        && (textSize - token.endOffset()) >= (fragmentSize >>> 1);
+    boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags)
+        && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1);
     
     if (isNewFrag) {
       currentNumFrags++;
@@ -86,12 +92,16 @@
     return isNewFrag;
   }
 
+
   /* (non-Javadoc)
-   * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
+   * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
    */
-  public void start(String originalText) {
+  public void start(String originalText, TokenStream tokenStream) {
     position = -1;
     currentNumFrags = 1;
     textSize = originalText.length();
+    termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class);
+    offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
   }
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java Thu Jul 30 22:00:47 2009
@@ -7,9 +7,10 @@
 import java.util.Set;
 
 import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.ConstantScoreRangeQuery;
 import org.apache.lucene.search.Query;
 
 
@@ -26,6 +27,8 @@
   private float maxTermWeight;
   private int position = -1;
   private String defaultField;
+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncAtt;
   private static boolean highlightCnstScrRngQuery;
 
   /**
@@ -176,9 +179,9 @@
    * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
    *      int)
    */
-  public float getTokenScore(Token token) {
-    position += token.getPositionIncrement();
-    String termText = token.term();
+  public float getTokenScore() {
+    position += posIncAtt.getPositionIncrement();
+    String termText = termAtt.term();
 
     WeightedSpanTerm weightedSpanTerm;
 
@@ -203,6 +206,11 @@
     return score;
   }
 
+  public void init(TokenStream tokenStream) {
+    termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class);
+  }
+  
   /**
    * Retrieve the WeightedSpanTerm for the specified token. Useful for passing
    * Span information to a Fragmenter.

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java Thu Jul 30 22:00:47 2009
@@ -1,4 +1,5 @@
 package org.apache.lucene.search.highlight;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -15,118 +16,117 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
- * One, or several overlapping tokens, along with the score(s) and the
- * scope of the original text
+ * One, or several overlapping tokens, along with the score(s) and the scope of
+ * the original text
  */
-public class TokenGroup
-{
-	
-	private static final int MAX_NUM_TOKENS_PER_GROUP=50;
-	Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
-	float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
-	int numTokens=0;
-	int startOffset=0;
-	int endOffset=0;
-	float tot;
+public class TokenGroup {
 
+  private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
+  Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
+  float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
+  int numTokens = 0;
+  int startOffset = 0;
+  int endOffset = 0;
+  float tot;
   int matchStartOffset, matchEndOffset;
 
+  private OffsetAttribute offsetAtt;
+  private TermAttribute termAtt;
 
-  void addToken(Token token, float score)
-	{
-	    if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
-        {	    
-			if(numTokens==0)
-			{
-				startOffset=matchStartOffset=token.startOffset();
-				endOffset=matchEndOffset=token.endOffset();
-				tot += score;
-			}
-			else
-			{
-				startOffset=Math.min(startOffset,token.startOffset());
-				endOffset=Math.max(endOffset,token.endOffset());
-        if (score>0) {
-          if (tot==0) {
-            matchStartOffset=token.startOffset();
-            matchEndOffset=token.endOffset();
+  public TokenGroup(TokenStream tokenStream) {
+    offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+  }
+
+  void addToken(float score) {
+    if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
+      int termStartOffset = offsetAtt.startOffset();
+      int termEndOffset = offsetAtt.endOffset();
+      if (numTokens == 0) {
+        startOffset = matchStartOffset = termStartOffset;
+        endOffset = matchEndOffset = termEndOffset;
+        tot += score;
+      } else {
+        startOffset = Math.min(startOffset, termStartOffset);
+        endOffset = Math.max(endOffset, termEndOffset);
+        if (score > 0) {
+          if (tot == 0) {
+            matchStartOffset = offsetAtt.startOffset();
+            matchEndOffset = offsetAtt.endOffset();
           } else {
-            matchStartOffset=Math.min(matchStartOffset,token.startOffset());
-            matchEndOffset=Math.max(matchEndOffset,token.endOffset());
+            matchStartOffset = Math.min(matchStartOffset, termStartOffset);
+            matchEndOffset = Math.max(matchEndOffset, termEndOffset);
           }
-          tot+=score;
+          tot += score;
         }
       }
-			tokens[numTokens]= (Token) token.clone();
-			scores[numTokens]=score;
-			numTokens++;
-        }
-	}
-
-	boolean isDistinct(Token token)
-	{
-		return token.startOffset()>=endOffset;
-	}
-
-
-	void clear()
-	{
-		numTokens=0;
-		tot=0;
-	}
-	
-	/**
-	 * 
-	 * @param index a value between 0 and numTokens -1
-	 * @return the "n"th token
-	 */
-	public Token getToken(int index)
-	{
-		return tokens[index];
-	}
-
-	/**
-	 * 
-	 * @param index a value between 0 and numTokens -1
-	 * @return the "n"th score
-	 */
-	public float getScore(int index)
-	{
-		return scores[index];
-	}
-
-	/**
-	 * @return the end position in the original text
-	 */
-	public int getEndOffset()
-	{
-		return endOffset;
-	}
-
-	/**
-	 * @return the number of tokens in this group
-	 */
-	public int getNumTokens()
-	{
-		return numTokens;
-	}
-
-	/**
-	 * @return the start position in the original text
-	 */
-	public int getStartOffset()
-	{
-		return startOffset;
-	}
-
-	/**
-	 * @return all tokens' scores summed up
-	 */
-	public float getTotalScore()
-	{
-		return tot;
-	}
+      Token token = new Token(termStartOffset, termEndOffset);
+      token.setTermBuffer(termAtt.term());
+      tokens[numTokens] = token;
+      scores[numTokens] = score;
+      numTokens++;
+    }
+  }
+
+  boolean isDistinct() {
+    return offsetAtt.startOffset() >= endOffset;
+  }
+
+  void clear() {
+    numTokens = 0;
+    tot = 0;
+  }
+  
+  /**
+   * @param index a value between 0 and numTokens -1
+   * @return the "n"th token
+   */
+ public Token getToken(int index)
+ {
+     return tokens[index];
+ }
+
+  /**
+   * 
+   * @param index a value between 0 and numTokens -1
+   * @return the "n"th score
+   */
+  public float getScore(int index) {
+    return scores[index];
+  }
+
+  /**
+   * @return the end position in the original text
+   */
+  public int getEndOffset() {
+    return endOffset;
+  }
+
+  /**
+   * @return the number of tokens in this group
+   */
+  public int getNumTokens() {
+    return numTokens;
+  }
+
+  /**
+   * @return the start position in the original text
+   */
+  public int getStartOffset() {
+    return startOffset;
+  }
+
+  /**
+   * @return all tokens' scores summed up
+   */
+  public float getTotalScore() {
+    return tot;
+  }
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Thu Jul 30 22:00:47 2009
@@ -29,6 +29,8 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.TermFreqVector;
@@ -135,32 +137,45 @@
      * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
      * to eek out the last drops of performance, set to true. If in doubt, set to false.
      */
-    public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous)
-    {
+    public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
         //an object used to iterate across an array of tokens
-        class StoredTokenStream extends TokenStream
-        {
-            Token tokens[];
-            int currentToken=0;
-            StoredTokenStream(Token tokens[])
-            {
-                this.tokens=tokens;
+        class StoredTokenStream extends TokenStream {
+          Token tokens[];
+          int currentToken = 0;
+          TermAttribute termAtt;
+          OffsetAttribute offsetAtt;
+    
+          StoredTokenStream(Token tokens[]) {
+            this.tokens = tokens;
+            termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+            offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+          }
+    
+          public Token next(final Token reusableToken) {
+            System.out.println("next token");
+            assert reusableToken != null;
+            if (currentToken >= tokens.length) {
+              return null;
             }
-            public Token next(final Token reusableToken)
-            {
-                assert reusableToken != null;
-                if(currentToken>=tokens.length)
-                {
-                    return null;
-                }
-                return tokens[currentToken++];
-            }            
-        }        
+            return tokens[currentToken++];
+          }
+    
+          public boolean incrementToken() throws IOException {
+            System.out.println("inc token");
+            if (currentToken >= tokens.length) {
+              return false;
+            }
+            Token token = tokens[currentToken++];
+            termAtt.setTermBuffer(token.term());
+            offsetAtt.setOffset(token.startOffset(), token.endOffset());
+            return true;
+          }
+        }      
         //code to reconstruct the original sequence of Tokens
         String[] terms=tpv.getTerms();          
         int[] freq=tpv.getTermFrequencies();
         int totalTokens=0;
-        Token newToken = new Token();
+
         for (int t = 0; t < freq.length; t++)
         {
             totalTokens+=freq[t];
@@ -190,8 +205,9 @@
                 }
                 for (int tp = 0; tp < offsets.length; tp++)
                 {
-                  newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
-                  unsortedTokens.add(newToken.clone());
+                  Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+                  token.setTermBuffer(terms[t]);
+                  unsortedTokens.add(token);
                 }
             }
             else
@@ -204,8 +220,8 @@
                 //tokens stored with positions - can use this to index straight into sorted array
                 for (int tp = 0; tp < pos.length; tp++)
                 {
-                  newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
-                  tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
+                  Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+                  tokensInOriginalOrder[pos[tp]] = token;
                 }                
             }
         }
@@ -218,7 +234,7 @@
                 {
                     Token t1=(Token) o1;
                     Token t2=(Token) o2;
-                    if(t1.startOffset()>t2.startOffset())
+                    if(t1.startOffset()>t2.endOffset())
                         return 1;
                     if(t1.startOffset()<t2.startOffset())
                         return -1;

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Thu Jul 30 22:00:47 2009
@@ -42,8 +42,8 @@
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
@@ -98,7 +98,7 @@
   private void extract(Query query, Map terms) throws IOException {
     if (query instanceof BooleanQuery) {
       BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
-  
+
       for (int i = 0; i < queryClauses.length; i++) {
         if (!queryClauses[i].isProhibited()) {
           extract(queryClauses[i].getQuery(), terms);
@@ -441,7 +441,7 @@
    * This class makes sure that if both position sensitive and insensitive
    * versions of the same term are added, the position insensitive one wins.
    */
-  private class PositionCheckingMap extends HashMap {
+  static private class PositionCheckingMap extends HashMap {
 
     public void putAll(Map m) {
       Iterator it = m.keySet().iterator();

Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=799455&r1=799454&r2=799455&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Thu Jul 30 22:00:47 2009
@@ -38,10 +38,14 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
@@ -62,9 +66,8 @@
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeFilter;
-import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeFilter;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
@@ -75,6 +78,7 @@
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
 
@@ -87,7 +91,7 @@
   static final String FIELD_NAME = "contents";
   private Query query;
   RAMDirectory ramDir;
-  public Searcher searcher = null;
+  public IndexSearcher searcher = null;
   public Hits hits = null;
   int numHighlights = 0;
   Analyzer analyzer = new StandardAnalyzer();
@@ -108,11 +112,40 @@
     super(arg0);
   }
 
+  public void testHits() throws Exception {
+    Analyzer analyzer = new SimpleAnalyzer();
+    QueryParser qp = new QueryParser(FIELD_NAME, analyzer);
+    query = qp.parse("\"very long\"");
+    searcher = new IndexSearcher(ramDir, false);
+    TopDocs hits = searcher.search(query, 10);
+
+    Highlighter highlighter = new Highlighter(null);
+
+
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      Document doc = searcher.doc(hits.scoreDocs[i].doc);
+      String storedField = doc.get(FIELD_NAME);
+
+      TokenStream stream = TokenSources.getAnyTokenStream(searcher
+          .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+      CachingTokenFilter ctf = new CachingTokenFilter(stream);
+      SpanScorer scorer = new SpanScorer(query, FIELD_NAME, ctf);
+     // ctf.reset();
+      Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
+      highlighter.setFragmentScorer(scorer);
+      highlighter.setTextFragmenter(fragmenter);
+
+      String fragment = highlighter.getBestFragment(ctf, storedField);
+
+      System.out.println(fragment);
+    }
+  }
+  
   public void testHighlightingWithDefaultField() throws Exception {
 
     String s1 = "I call our world Flatland, not because we call it so,";
 
-    QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
+    QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer(Version.LUCENE_CURRENT));
 
     // Verify that a query against the default field results in text being
     // highlighted
@@ -144,7 +177,7 @@
    */
   private static String highlightField(Query query, String fieldName, String text)
       throws IOException, InvalidTokenOffsetsException {
-    CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
+    CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream(
         fieldName, new StringReader(text)));
     // Assuming "<B>", "</B>" used to highlight
     SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@@ -908,10 +941,12 @@
         Query query = parser.parse(srchkey);
 
         TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
         Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this);
 
         // Get 3 best fragments and seperate with a "..."
         tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
         String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
         String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
         assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult
@@ -1075,10 +1110,11 @@
   }
 
   public void testUnRewrittenQuery() throws Exception {
-    TestHighlightRunner helper = new TestHighlightRunner() {
+    final TestHighlightRunner helper = new TestHighlightRunner() {
 
       public void run() throws Exception {
         numHighlights = 0;
+        SpanScorer.setHighlightCnstScrRngQuery(false);
         // test to show how rewritten query can still be used
         searcher = new IndexSearcher(ramDir);
         Analyzer analyzer = new StandardAnalyzer();
@@ -1154,13 +1190,17 @@
       public void startFragment(TextFragment newFragment) {
       }
 
-      public float getTokenScore(Token token) {
+      public float getTokenScore() {
         return 0;
       }
 
       public float getFragmentScore() {
         return 1;
       }
+
+      public void init(TokenStream tokenStream) {
+        
+      }
     });
     highlighter.setTextFragmenter(new SimpleFragmenter(2000));
     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent));
@@ -1292,27 +1332,44 @@
     return new TokenStream() {
       Iterator iter;
       List lst;
+      private TermAttribute termAtt;
+      private PositionIncrementAttribute posIncrAtt;
+      private OffsetAttribute offsetAtt;
       {
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
         lst = new ArrayList();
         Token t;
         t = createToken("hi", 0, 2);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("hispeed", 0, 8);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("speed", 3, 8);
         t.setPositionIncrement(0);
         lst.add(t);
         t = createToken("10", 8, 10);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("foo", 11, 14);
+        t.setPositionIncrement(1);
         lst.add(t);
         iter = lst.iterator();
       }
 
-      public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        return iter.hasNext() ? (Token) iter.next() : null;
+      public boolean incrementToken() throws IOException {
+        if(iter.hasNext()) {
+          Token token = (Token) iter.next();
+          termAtt.setTermBuffer(token.term());
+          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+          offsetAtt.setOffset(token.startOffset(), token.endOffset());
+          return true;
+        }
+        return false;
       }
+     
     };
   }
 
@@ -1322,26 +1379,42 @@
     return new TokenStream() {
       Iterator iter;
       List lst;
+      private TermAttribute termAtt;
+      private PositionIncrementAttribute posIncrAtt;
+      private OffsetAttribute offsetAtt;
       {
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
         lst = new ArrayList();
         Token t;
         t = createToken("hispeed", 0, 8);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("hi", 0, 2);
         t.setPositionIncrement(0);
         lst.add(t);
         t = createToken("speed", 3, 8);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("10", 8, 10);
+        t.setPositionIncrement(1);
         lst.add(t);
         t = createToken("foo", 11, 14);
+        t.setPositionIncrement(1);
         lst.add(t);
         iter = lst.iterator();
       }
 
-      public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        return iter.hasNext() ? (Token) iter.next() : null;
+      public boolean incrementToken() throws IOException {
+        if(iter.hasNext()) {
+          Token token = (Token) iter.next();
+          termAtt.setTermBuffer(token.term());
+          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+          offsetAtt.setOffset(token.startOffset(), token.endOffset());
+          return true;
+        }
+        return false;
       }
     };
   }
@@ -1611,7 +1684,11 @@
    *      java.io.Reader)
    */
   public TokenStream tokenStream(String arg0, Reader arg1) {
-    return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
+    LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1);
+    stream.addAttribute(TermAttribute.class);
+    stream.addAttribute(PositionIncrementAttribute.class);
+    stream.addAttribute(OffsetAttribute.class);
+    return new SynonymTokenizer(stream, synonyms);
   }
 }
 
@@ -1622,47 +1699,70 @@
 class SynonymTokenizer extends TokenStream {
   private TokenStream realStream;
   private Token currentRealToken = null;
+  private org.apache.lucene.analysis.Token cRealToken = null;
   private Map synonyms;
   StringTokenizer st = null;
+  private TermAttribute realTermAtt;
+  private PositionIncrementAttribute realPosIncrAtt;
+  private OffsetAttribute realOffsetAtt;
+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncrAtt;
+  private OffsetAttribute offsetAtt;
 
   public SynonymTokenizer(TokenStream realStream, Map synonyms) {
     this.realStream = realStream;
     this.synonyms = synonyms;
+    realTermAtt = (TermAttribute) realStream.getAttribute(TermAttribute.class);
+    realPosIncrAtt = (PositionIncrementAttribute) realStream.getAttribute(PositionIncrementAttribute.class);
+    realOffsetAtt = (OffsetAttribute) realStream.getAttribute(OffsetAttribute.class);
+
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  public boolean incrementToken() throws IOException {
+
     if (currentRealToken == null) {
-      Token nextRealToken = realStream.next(reusableToken);
-      if (nextRealToken == null) {
-        return null;
-      }
-      String expansions = (String) synonyms.get(nextRealToken.term());
+      boolean next = realStream.incrementToken();
+      if (!next) {
+        return false;
+      }
+      // mirror the wrapped stream's term/offset/posIncr attributes into this stream's attributes
+      termAtt.setTermBuffer(realTermAtt.term());
+      offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
+      posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());
+
+      String expansions = (String) synonyms.get(realTermAtt.term());
       if (expansions == null) {
-        return nextRealToken;
+        return true;
       }
       st = new StringTokenizer(expansions, ",");
       if (st.hasMoreTokens()) {
-        currentRealToken = (Token) nextRealToken.clone();
+        currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
+        currentRealToken.setTermBuffer(realTermAtt.term());
       }
-      return currentRealToken;
+      
+      return true;
     } else {
-      reusableToken.reinit(st.nextToken(),
-                           currentRealToken.startOffset(),
-                           currentRealToken.endOffset());
-      reusableToken.setPositionIncrement(0);
+      String tok = st.nextToken();
+      termAtt.setTermBuffer(tok);
+      offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
+      posIncrAtt.setPositionIncrement(0);
       if (!st.hasMoreTokens()) {
         currentRealToken = null;
         st = null;
       }
-      return reusableToken;
+      return true;
     }
+    
   }
 
   static abstract class TestHighlightRunner {
     static final int STANDARD = 0;
     static final int SPAN = 1;
     int mode = STANDARD;
+    Fragmenter frag = new SimpleFragmenter(20);
 
     public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream,
         Formatter formatter) {
@@ -1725,7 +1825,7 @@
         if (mode == SPAN) {
           ((CachingTokenFilter) tokenStream).reset();
         }
-        highlighter.setTextFragmenter(new SimpleFragmenter(20));
+        highlighter.setTextFragmenter(frag);
 
         String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
             fragmentSeparator);



Mime
View raw message