lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From busc...@apache.org
Subject svn commit: r797665 [1/3] - in /lucene/java/trunk: ./ src/java/org/apache/lucene/analysis/ src/java/org/apache/lucene/analysis/standard/ src/java/org/apache/lucene/analysis/tokenattributes/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/q...
Date Fri, 24 Jul 2009 21:45:50 GMT
Author: buschmi
Date: Fri Jul 24 21:45:48 2009
New Revision: 797665

URL: http://svn.apache.org/viewvc?rev=797665&view=rev
Log:
LUCENE-1693: Various improvements to the new TokenStream API.

Added:
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java   (with props)
    lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/util/TestAttributeSource.java   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
    lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java
    lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java
    lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj
    lucene/java/trunk/src/java/org/apache/lucene/search/QueryTermVector.java
    lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
    lucene/java/trunk/src/java/org/apache/lucene/util/AttributeSource.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java
    lucene/java/trunk/src/test/org/apache/lucene/util/LuceneTestCase.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Jul 24 21:45:48 2009
@@ -64,6 +64,22 @@
     process. It is not recommended to implement it, but rather extend
     Searcher.  (Shai Erera via Mike McCandless)
 
+ 4. LUCENE-1422, LUCENE-1693: The new TokenStream API (see below) using
+    Attributes has some backwards breaks in rare cases.
+    We did our best to make the transition as easy as possible. You should
+    not have problems, if your tokenizers still implement next(Token) or
+    next(), the calls are automatically wrapped. The indexer and query parser
+    use the new API using incrementToken() calls. All core TokenStreams
+    are implemented using the new API. You can mix old and new API
+    style TokenFilters/TokenStream. Problems only occur when you have done
+    the following:
+    You have overridden next(Token) or next() in one of the non-abstract core
+    TokenStreams/-Filters. These classes should normally be final, but some
+    of them are not. In this case next(Token)/next() would never be called.
+    To fail early with a hard compile/runtime error, the next(Token)/next()
+    methods in these TokenStreams/-Filters were made final.
+    (Michael Busch, Uwe Schindler)
+
 Changes in runtime behavior
 
  1. LUCENE-1424: QueryParser now by default uses constant score query
@@ -156,14 +172,16 @@
    and deprecate FSDirectory.getDirectory().  FSDirectory instances
    are not required to be singletons per path. (yonik)
 
-4. LUCENE-1422: New TokenStream API that uses a new class called 
+4. LUCENE-1422, LUCENE-1693: New TokenStream API that uses a new class called 
    AttributeSource instead of the now deprecated Token class. All attributes
    that the Token class had have been moved into separate classes:
    TermAttribute, OffsetAttribute, PositionIncrementAttribute, 
    PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
    is much more flexible; it allows to combine the Attributes arbitrarily 
    and also to define custom Attributes. The new API has the same performance
-   as the old next(Token) approach. (Michael Busch)
+   as the old next(Token) approach.
+   For conformance with this new API, Tee-/SinkTokenizer was deprecated
+   and replaced by a new TeeSinkTokenFilter. (Michael Busch, Uwe Schindler)
 
 5. LUCENE-1467: Add nextDoc() and next(int) methods to OpenBitSetIterator.
    These methods can be used to avoid additional calls to doc(). 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java Fri Jul 24 21:45:48 2009
@@ -1,5 +1,8 @@
 package org.apache.lucene.analysis;
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.ArrayUtil;
 
 /**
@@ -53,24 +56,21 @@
  * accents from Latin1 characters.  For example, 'à' will be replaced by
  * 'a'.
  */
-public class ASCIIFoldingFilter extends TokenFilter {
+public final class ASCIIFoldingFilter extends TokenFilter {
   public ASCIIFoldingFilter(TokenStream input)
   {
     super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   private char[] output = new char[512];
   private int outputPos;
+  private TermAttribute termAtt;
 
-  public Token next(Token result)
-      throws java.io.IOException
-  {
-    result = input.next(result);
-
-    if (result != null)
-    {
-      final char[] buffer = result.termBuffer();
-      final int length = result.termLength();
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      final char[] buffer = termAtt.termBuffer();
+      final int length = termAtt.termLength();
 
       // If no characters actually require rewriting then we
       // just return token as-is:
@@ -79,13 +79,13 @@
         if (c >= '\u0080')
         {
           foldToASCII(buffer, length);
-          result.setTermBuffer(output, 0, outputPos);
+          termAtt.setTermBuffer(output, 0, outputPos);
           break;
         }
       }
-      return result;
+      return true;
     } else {
-      return null;
+      return false;
     }
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -25,24 +25,35 @@
 import org.apache.lucene.util.AttributeSource;
 
 /**
- * This class can be used if the Tokens of a TokenStream
+ * This class can be used if the token attributes of a TokenStream
  * are intended to be consumed more than once. It caches
- * all Tokens locally in a List.
+ * all token attribute states locally in a List.
  * 
- * CachingTokenFilter implements the optional method
+ * <P>CachingTokenFilter implements the optional method
  * {@link TokenStream#reset()}, which repositions the
  * stream to the first Token. 
- *
  */
 public class CachingTokenFilter extends TokenFilter {
-  private List cache;
-  private Iterator iterator; 
+  private List cache = null;
+  private Iterator iterator = null; 
   
   public CachingTokenFilter(TokenStream input) {
     super(input);
   }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
+  }
   
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     if (cache == null) {
       // fill cache lazily
       cache = new LinkedList();
@@ -51,34 +62,14 @@
     }
     
     if (!iterator.hasNext()) {
-      // the cache is exhausted, return null
+      // the cache is exhausted, return false
       return false;
     }
     // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
-    AttributeSource state = (AttributeSource) iterator.next();
-    state.restoreState(this);
+    restoreState((AttributeSource.State) iterator.next());
     return true;
   }
-  
-  /** @deprecated */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (cache == null) {
-      // fill cache lazily
-      cache = new LinkedList();
-      fillCache(reusableToken);
-      iterator = cache.iterator();
-    }
-    
-    if (!iterator.hasNext()) {
-      // the cache is exhausted, return null
-      return null;
-    }
-    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
-    Token nextToken = (Token) iterator.next();
-    return (Token) nextToken.clone();
-  }
-  
+
   public void reset() throws IOException {
     if(cache != null) {
       iterator = cache.iterator();
@@ -90,12 +81,5 @@
       cache.add(captureState());
     }
   }
-  
-  /** @deprecated */
-  private void fillCache(final Token reusableToken) throws IOException {
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-      cache.add(nextToken.clone());
-    }
-  }
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Jul 24 21:45:48 2009
@@ -94,49 +94,16 @@
     return true;
   }
 
-  /** @deprecated */
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
   public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    reusableToken.clear();
-    int length = 0;
-    int start = bufferIndex;
-    char[] buffer = reusableToken.termBuffer();
-    while (true) {
-
-      if (bufferIndex >= dataLen) {
-        offset += dataLen;
-        dataLen = input.read(ioBuffer);
-        if (dataLen == -1) {
-          if (length > 0)
-            break;
-          else
-            return null;
-        }
-        bufferIndex = 0;
-      }
-
-      final char c = ioBuffer[bufferIndex++];
-
-      if (isTokenChar(c)) {               // if it's a token char
-
-        if (length == 0)			           // start of token
-          start = offset + bufferIndex - 1;
-        else if (length == buffer.length)
-          buffer = reusableToken.resizeTermBuffer(1+length);
-
-        buffer[length++] = normalize(c); // buffer it, normalized
-
-        if (length == MAX_WORD_LEN)		   // buffer overflow!
-          break;
-
-      } else if (length > 0)             // at non-Letter w/ chars
-        break;                           // return 'em
-    }
+    return super.next(reusableToken);
+  }
 
-    reusableToken.setTermLength(length);
-    reusableToken.setStartOffset(input.correctOffset(start));
-    reusableToken.setEndOffset(input.correctOffset(start+length));
-    return reusableToken;
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
   }
 
   public void reset(Reader input) throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java Fri Jul 24 21:45:48 2009
@@ -57,27 +57,17 @@
     } else
       return false;
   }
-  
-  /** @deprecated */
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
   public final Token next(final Token reusableToken) throws java.io.IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-      final char[] buffer = nextToken.termBuffer();
-      final int length = nextToken.termLength();
-      // If no characters actually require rewriting then we
-      // just return token as-is:
-      for(int i=0;i<length;i++) {
-        final char c = buffer[i];
-        if (c >= '\u00c0' && c <= '\uFB06') {
-          removeAccents(buffer, length);
-          nextToken.setTermBuffer(output, 0, outputPos);
-          break;
-        }
-      }
-      return nextToken;
-    } else
-      return null;
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 
   /**

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Fri Jul 24 21:45:48 2009
@@ -45,7 +45,7 @@
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
   
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     if (!done) {
       done = true;
       int upto = 0;
@@ -65,28 +65,16 @@
     return false;
   }
 
-  /** @deprecated */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (!done) {
-      done = true;
-      int upto = 0;
-      reusableToken.clear();
-      char[] buffer = reusableToken.termBuffer();
-      while (true) {
-        final int length = input.read(buffer, upto, buffer.length-upto);
-        if (length == -1) break;
-        upto += length;
-        if (upto == buffer.length)
-          buffer = reusableToken.resizeTermBuffer(1+buffer.length);
-      }
-      reusableToken.setTermLength(upto);
-      reusableToken.setStartOffset(input.correctOffset(0));
-      reusableToken.setEndOffset(input.correctOffset(upto));
-      
-      return reusableToken;
-    }
-    return null;
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
   }
 
   public void reset(Reader input) throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java Fri Jul 24 21:45:48 2009
@@ -61,24 +61,4 @@
     // reached EOS -- return null
     return false;
   }
-
-  /**
-   * Returns the next input Token whose term() is the right len
-   * @deprecated
-   */
-  public final Token next(final Token reusableToken) throws IOException
-  {
-    assert reusableToken != null;
-    // return the first non-stop word found
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken))
-    {
-      int len = nextToken.termLength();
-      if (len >= min && len <= max) {
-          return nextToken;
-      }
-      // note: else we ignore it but should we index each part of it?
-    }
-    // reached EOS -- return null
-    return null;
-  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java Fri Jul 24 21:45:48 2009
@@ -46,20 +46,4 @@
     } else
       return false;
   }
-  
-  /** @deprecated */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-
-      final char[] buffer = nextToken.termBuffer();
-      final int length = nextToken.termLength();
-      for(int i=0;i<length;i++)
-        buffer[i] = Character.toLowerCase(buffer[i]);
-
-      return nextToken;
-    } else
-      return null;
-  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Jul 24 21:45:48 2009
@@ -206,40 +206,6 @@
     shift += precisionStep;
     return true;
   }
-
-  // @Override
-  /** @deprecated Will be removed in Lucene 3.0 */
-  public Token next(final Token reusableToken) {
-    assert reusableToken != null;
-    if (valSize == 0)
-      throw new IllegalStateException("call set???Value() before usage");
-    if (shift >= valSize)
-      return null;
-    
-    reusableToken.clear();
-
-    final char[] buffer;
-    switch (valSize) {
-      case 64:
-        buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
-        reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
-        break;
-      
-      case 32:
-        buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
-        reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
-        break;
-      
-      default:
-        // should not happen
-        throw new IllegalArgumentException("valSize must be 32 or 64");
-    }
-
-    reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
-    reusableToken.setPositionIncrement((shift == 0) ? 1 : 0);
-    shift += precisionStep;
-    return reusableToken;
-  }
   
   // @Override
   public String toString() {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java Fri Jul 24 21:45:48 2009
@@ -57,16 +57,4 @@
       termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
     return true;
   }
-  
-  /** @deprecated */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength()))
-      nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
-    return nextToken;
-  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java Fri Jul 24 21:45:48 2009
@@ -22,19 +22,21 @@
 import java.util.Iterator;
 import java.util.List;
 
-import org.apache.lucene.util.AttributeSource;
-
 
 /**
  * A SinkTokenizer can be used to cache Tokens for use in an Analyzer
- *
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers 
+ * the same functionality.
  * @see TeeTokenFilter
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
  *
  **/
 public class SinkTokenizer extends Tokenizer {
   protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
   protected Iterator/*<Token>*/ iter;
-  
+
   public SinkTokenizer(List/*<Token>*/ input) {
     this.lst = input;
     if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
@@ -64,29 +66,9 @@
   }
 
   /**
-   * Increments this stream to the next token out of the list of cached tokens
-   * @throws IOException
-   */
-  public boolean incrementToken() throws IOException {
-    if (iter == null) iter = lst.iterator();
-    // Since this TokenStream can be reset we have to maintain the tokens as immutable
-    if (iter.hasNext()) {
-      AttributeSource state = (AttributeSource) iter.next();
-      state.restoreState(this);
-      return true;
-    }
-    return false;
-  }
-
-  public void add(AttributeSource source) throws IOException {
-    lst.add(source); 
-  }
-  
-  /**
    * Returns the next token out of the list of cached tokens
    * @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
    * @throws IOException
-   * @deprecated
    */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Fri Jul 24 21:45:48 2009
@@ -235,27 +235,6 @@
   }
 
   /**
-   * Returns the next input Token whose term() is not a stop word.
-   * @deprecated
-   */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    // return the first non-stop word found
-    int skippedPositions = 0;
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-      if (!stopWords.contains(nextToken.termBuffer(), 0, nextToken.termLength())) {
-        if (enablePositionIncrements) {
-          nextToken.setPositionIncrement(nextToken.getPositionIncrement() + skippedPositions);
-        }
-        return nextToken;
-      }
-      skippedPositions += nextToken.getPositionIncrement();
-    }
-    // reached EOS -- return null
-    return null;
-  }
-
-  /**
    * @see #setEnablePositionIncrementsDefault(boolean). 
    * @deprecated Please specify this when you create the StopFilter
    */

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,206 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Collections;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * This TokenFilter provides the ability to set aside attribute states
+ * that have already been analyzed.  This is useful in situations where multiple fields share
+ * many common analysis steps and then go their separate ways.
+ * <p/>
+ * It is also useful for doing things like entity extraction or proper noun analysis as
+ * part of the analysis workflow and saving off those tokens for use in another field.
+ *
+ * <pre>
+TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
+TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+
+TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
+source2.addSinkTokenStream(sink1);
+source2.addSinkTokenStream(sink2);
+
+TokenStream final1 = new LowerCaseFilter(source1);
+TokenStream final2 = source2;
+TokenStream final3 = new EntityDetect(sink1);
+TokenStream final4 = new URLDetect(sink2);
+
+d.add(new Field("f1", final1));
+d.add(new Field("f2", final2));
+d.add(new Field("f3", final3));
+d.add(new Field("f4", final4));
+ * </pre>
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ * <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
+ * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ * It is important that tees are consumed before sinks (in the above example, the field names must be
+ * less than the sink's field names). If you are not sure which stream is consumed first, you can simply
+ * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
+ * This TokenFilter is exhausted after this. In the above example, change
+ * the example above to:
+ * <pre>
+...
+TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
+TokenStream final2 = source2.newSinkTokenStream();
+sink1.consumeAllTokens();
+sink2.consumeAllTokens();
+...
+ * </pre>
+ * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
+ * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
+ */
+public final class TeeSinkTokenFilter extends TokenFilter {
+  private final List sinks = new LinkedList();
+  
+  /**
+   * Instantiates a new TeeSinkTokenFilter.
+   */
+  public TeeSinkTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
+   */
+  public SinkTokenStream newSinkTokenStream() {
+    return newSinkTokenStream(ACCEPT_ALL_FILTER);
+  }
+  
+  /**
+   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
+   * that pass the supplied filter.
+   * @see SinkFilter
+   */
+  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
+    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
+    this.sinks.add(new WeakReference(sink));
+    return sink;
+  }
+  
+  /**
+   * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
+   * to this one. The supplied stream will also receive all consumed tokens.
+   * This method can be used to pass tokens from two different tees to one sink.
+   */
+  public void addSinkTokenStream(final SinkTokenStream sink) {
+    // check that sink has correct factory
+    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
+      throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
+    }
+    // add eventually missing attribute impls to the existing sink
+    for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
+      sink.addAttributeImpl((AttributeImpl) it.next());
+    }
+    this.sinks.add(new WeakReference(sink));
+  }
+  
+  /**
+   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
+   * when itself is consumed. To be sure that all tokens from the input
+   * stream are passed to the sinks, you can call this method.
+   * This instance is exhausted after this, but all sinks are instantly available.
+   */
+  public void consumeAllTokens() throws IOException {
+    while (incrementToken());
+  }
+  
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      // capture state lazily - maybe no SinkFilter accepts this state
+      AttributeSource.State state = null;
+      for (Iterator it = sinks.iterator(); it.hasNext(); ) {
+        final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
+        if (sink != null) {
+          if (sink.accept(this)) {
+            if (state == null) {
+              state = this.captureState();
+            }
+            sink.addState(state);
+          }
+        }
+      }
+      return true;
+    }
+    
+    return false;
+  }
+  
+  /**
+   * A filter that decides which {@link AttributeSource} states are accepted
+   * by a {@link SinkTokenStream}.
+   */
+  public static interface SinkFilter {
+    boolean accept(AttributeSource source);
+  }
+  
+  public static final class SinkTokenStream extends TokenStream {
+    private final List cachedStates = new LinkedList();
+    private Iterator it = null;
+    private SinkFilter filter;
+    
+    private SinkTokenStream(AttributeSource source, SinkFilter filter) {
+      super(source);
+      this.filter = filter;
+    }
+    
+    private boolean accept(AttributeSource source) {
+      return filter.accept(source);
+    }
+    
+    private void addState(AttributeSource.State state) {
+      if (it != null) {
+        throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
+      }
+      cachedStates.add(state);
+    }
+    
+    public final boolean incrementToken() throws IOException {
+      // lazy init the iterator
+      if (it == null) {
+        it = cachedStates.iterator();
+      }
+    
+      if (!it.hasNext()) {
+        return false;
+      }
+      
+      AttributeSource.State state = (State) it.next();
+      restoreState(state);
+      return true;
+    }
+    
+    public final void reset() {
+      it = cachedStates.iterator();
+    }
+  }
+  
+  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
+    public boolean accept(AttributeSource source) {
+      return true;
+    }
+  };
+  
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -18,7 +18,6 @@
 package org.apache.lucene.analysis;
 
 import java.io.IOException;
-import java.util.Iterator;
 
 
 /**
@@ -30,8 +29,8 @@
  * part of the analysis workflow and saving off those tokens for use in another field.
  *
  * <pre>
-SinkTokenizer sink1 = new SinkTokenizer(null);
-SinkTokenizer sink2 = new SinkTokenizer(null);
+SinkTokenizer sink1 = new SinkTokenizer();
+SinkTokenizer sink2 = new SinkTokenizer();
 
 TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
 TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
@@ -46,14 +45,22 @@
 d.add(new Field("f3", final3));
 d.add(new Field("f4", final4));
  * </pre>
- * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after whitespace tokenizer
-   and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
- Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ * <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
+ * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ * It is important that tees are consumed before sinks (in the above example, the field names must be
+ * less than the sink's field names).
+ * Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
  <p/>
  *
- * See http://issues.apache.org/jira/browse/LUCENE-1058
+ * See <a href="http://issues.apache.org/jira/browse/LUCENE-1058">LUCENE-1058</a>.
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers 
+ * the same functionality.
+
  * @see SinkTokenizer
- *
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
  **/
 public class TeeTokenFilter extends TokenFilter {
   SinkTokenizer sink;
@@ -61,21 +68,8 @@
   public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
     super(input);
     this.sink = sink;
-    Iterator it = getAttributesIterator();
-    while (it.hasNext()) {
-      sink.addAttribute(it.next().getClass());
-    }
   }
   
-  public boolean incrementToken() throws IOException {
-    if (input.incrementToken()) {
-      sink.add(captureState());
-      return true;
-    }
-    return false;
-  }
-
-  /** @deprecated */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Fri Jul 24 21:45:48 2009
@@ -17,14 +17,19 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.TermPositions;     // for javadoc
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
 
 /** 
-  This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
-  See Javadocs in {@link TokenStream} for further details.
-  <p> 
   A Token is an occurrence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
@@ -44,11 +49,13 @@
   {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
   
   <br><br>
-  <p><font color="#FF0000">
-  WARNING: The status of the <b>Payloads</b> feature is experimental. 
-  The APIs introduced here might change in the future and will not be 
-  supported anymore in such a case.</font>
-
+  
+  <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
+  that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
+  Even though it is not necessary to use Token anymore, with the new TokenStream API it can
+  be used as convenience class that implements all {@link Attribute}s, which is especially useful
+  to easily switch from the old to the new TokenStream API.
+  
   <br><br>
 
   <p><b>NOTE:</b> As of 2.3, Token stores the term text
@@ -118,10 +125,10 @@
   </p>
 
   @see org.apache.lucene.index.Payload
-  @deprecated A new TokenStream API was introduced with Lucene 2.9.
-              See javadocs in {@link TokenStream} for further details.
 */
-public class Token implements Cloneable {
+public class Token extends AttributeImpl 
+                   implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
 
   public static final String DEFAULT_TYPE = "word";
 
@@ -134,7 +141,7 @@
   /**
    * Characters for the term text.
    * @deprecated This will be made private. Instead, use:
-   * {@link termBuffer()}, 
+   * {@link #termBuffer()}, 
    * {@link #setTermBuffer(char[], int, int)},
    * {@link #setTermBuffer(String)}, or
    * {@link #setTermBuffer(String, int, int)}
@@ -144,28 +151,28 @@
   /**
    * Length of term text in the buffer.
    * @deprecated This will be made private. Instead, use:
-   * {@link termLength()}, or @{link setTermLength(int)}.
+   * {@link #termLength()}, or @{link setTermLength(int)}.
    */
   int termLength;
 
   /**
    * Start in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link startOffset()}, or @{link setStartOffset(int)}.
+   * {@link #startOffset()}, or @{link setStartOffset(int)}.
    */
   int startOffset;
 
   /**
    * End in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link endOffset()}, or @{link setEndOffset(int)}.
+   * {@link #endOffset()}, or @{link setEndOffset(int)}.
    */
   int endOffset;
 
   /**
    * The lexical type of the token.
    * @deprecated This will be made private. Instead, use:
-   * {@link type()}, or @{link setType(String)}.
+   * {@link #type()}, or @{link setType(String)}.
    */
   String type = DEFAULT_TYPE;
 
@@ -173,13 +180,13 @@
   
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPayload()}, or @{link setPayload(Payload)}.
+   * {@link #getPayload()}, or @{link setPayload(Payload)}.
    */
   Payload payload;
   
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}.
+   * {@link #getPositionIncrement()}, or @{link setPositionIncrement(String)}.
    */
   int positionIncrement = 1;
 
@@ -561,6 +568,13 @@
   public void setEndOffset(int offset) {
     this.endOffset = offset;
   }
+  
+  /** Set the starting and ending offset.
+  @see #startOffset() and #endOffset()*/
+  public void setOffset(int startOffset, int endOffset) {
+    this.startOffset = startOffset;
+    this.endOffset = endOffset;
+  }
 
   /** Returns this Token's lexical type.  Defaults to "word". */
   public final String type() {
@@ -640,19 +654,15 @@
   }
 
   public Object clone() {
-    try {
-      Token t = (Token)super.clone();
-      // Do a deep clone
-      if (termBuffer != null) {
-        t.termBuffer = (char[]) termBuffer.clone();
-      }
-      if (payload != null) {
-        t.setPayload((Payload) payload.clone());
-      }
-      return t;
-    } catch (CloneNotSupportedException e) {
-      throw new RuntimeException(e);  // shouldn't happen
+    Token t = (Token)super.clone();
+    // Do a deep clone
+    if (termBuffer != null) {
+      t.termBuffer = (char[]) termBuffer.clone();
+    }
+    if (payload != null) {
+      t.setPayload((Payload) payload.clone());
     }
+    return t;
   }
 
   /** Makes a clone, but replaces the term buffer &
@@ -862,4 +872,9 @@
     type = prototype.type;
     payload =  prototype.payload;
   }
+
+  public void copyTo(AttributeImpl target) {
+    Token to = (Token) target;
+    to.reinit(this);
+  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java Fri Jul 24 21:45:48 2009
@@ -42,7 +42,7 @@
     super(input);
     this.input = input;
   }
-    
+  
   /** Close the input TokenStream. */
   public void close() throws IOException {
     input.close();
@@ -50,20 +50,6 @@
 
   /** Reset the filter as well as the input TokenStream. */
   public void reset() throws IOException {
-    super.reset();
     input.reset();
   }
-  
-  public boolean useNewAPI() {
-    return input.useNewAPI();
-  }
-
-  /**
-   * Sets whether or not to use the new TokenStream API. Settings this
-   * will apply to this Filter and all TokenStream/Filters upstream.
-   */
-  public void setUseNewAPI(boolean use) {
-    input.setUseNewAPI(use);
-  }
-
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java Fri Jul 24 21:45:48 2009
@@ -18,10 +18,15 @@
  */
 
 import java.io.IOException;
-import java.util.Iterator;
 
-import org.apache.lucene.index.Payload;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 
 /** A TokenStream enumerates the sequence of tokens, either from
@@ -36,13 +41,13 @@
   </ul>
   A new TokenStream API is introduced with Lucene 2.9. Since
   2.9 Token is deprecated and the preferred way to store
-  the information of a token is to use {@link Attribute}s.
+  the information of a token is to use {@link AttributeImpl}s.
   <p>
   For that reason TokenStream extends {@link AttributeSource}
-  now. Note that only one instance per {@link Attribute} is
+  now. Note that only one instance per {@link AttributeImpl} is
   created and reused for every token. This approach reduces
   object creations and allows local caching of references to
-  the {@link Attribute}s. See {@link #incrementToken()} for further details.
+  the {@link AttributeImpl}s. See {@link #incrementToken()} for further details.
   <p>
   <b>The workflow of the new TokenStream API is as follows:</b>
   <ol>
@@ -60,19 +65,8 @@
   <p>
   Sometimes it is desirable to capture a current state of a
   TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
-  {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
-  {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.  
-  <p>
-  <b>NOTE:</b> In order to enable the new API the method
-  {@link #useNewAPI()} has to be called with useNewAPI=true.
-  Otherwise the deprecated method {@link #next(Token)} will 
-  be used by Lucene consumers (indexer and queryparser) to
-  consume the tokens. {@link #next(Token)} will be removed
-  in Lucene 3.0.
-  <p>
-  NOTE: To use the old API subclasses must override {@link #next(Token)}.
-  It's also OK to instead override {@link #next()} but that
-  method is slower compared to {@link #next(Token)}.
+  {@link TeeSinkTokenFilter}). For this usecase
+  {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used.  
  * <p><font color="#FF0000">
  * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. 
  * The APIs introduced in these classes with Lucene 2.9 might change in the future. 
@@ -80,110 +74,203 @@
   */
 
 public abstract class TokenStream extends AttributeSource {
-  private static boolean useNewAPIDefault = false;
-  private boolean useNewAPI = useNewAPIDefault;
+
+  /** @deprecated Remove this when old API is removed! */
+  private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+    = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+  
+  /** @deprecated Remove this when old API is removed! */
+  private static final Class[] METHOD_NO_PARAMS = new Class[0];
+
+  /** @deprecated Remove this when old API is removed! */
+  private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class};
   
+  /** @deprecated Remove this when old API is removed! */
+  private final TokenWrapper tokenWrapper;
+  
+  /** @deprecated Remove this when old API is removed! */
+  private static boolean onlyUseNewAPI = false;
+
+  /** @deprecated Remove this when old API is removed! */
+  private final boolean
+    hasIncrementToken = isMethodOverridden("incrementToken", METHOD_NO_PARAMS),
+    hasReusableNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_TOKEN_PARAM),
+    hasNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_NO_PARAMS);
+  
+  /** @deprecated Remove this when old API is removed! */
+  private boolean isMethodOverridden(String name, Class[] params) {
+    try {
+      return this.getClass().getMethod(name, params).getDeclaringClass() != TokenStream.class;
+    } catch (NoSuchMethodException e) {
+      // should not happen
+      throw new RuntimeException(e);
+    }
+  }
+  
+  /** @deprecated Remove this when old API is removed! */
+  private static final class TokenWrapperAttributeFactory extends AttributeFactory {
+    private final AttributeFactory delegate;
+  
+    private TokenWrapperAttributeFactory(AttributeFactory delegate) {
+      this.delegate = delegate;
+    }
+  
+    public AttributeImpl createAttributeInstance(Class attClass) {
+      return attClass.isAssignableFrom(TokenWrapper.class)
+        ? new TokenWrapper()
+        : delegate.createAttributeInstance(attClass);
+    }
+    
+    // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource,
+    // so two TokenStreams using old API have the same AttributeFactory wrapped by this one.
+    public boolean equals(Object other) {
+      if (this == other) return true;
+      if (other instanceof TokenWrapperAttributeFactory) {
+        final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other;
+        return this.delegate.equals(af.delegate);
+      }
+      return false;
+    }
+  }
+
+  /**
+   * A TokenStream using the default attribute factory.
+   */
   protected TokenStream() {
-    super();
+    super(onlyUseNewAPI
+      ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
+      : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+    );
+    tokenWrapper = initTokenWrapper(null);
+    check();
   }
   
+  /**
+   * A TokenStream that uses the same attributes as the supplied one.
+   */
   protected TokenStream(AttributeSource input) {
     super(input);
+    tokenWrapper = initTokenWrapper(input);
+    check();
   }
-
+  
   /**
-   * Returns whether or not the new TokenStream APIs are used
-   * by default. 
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
    */
-  public static boolean useNewAPIDefault() {
-    return useNewAPIDefault;
+  protected TokenStream(AttributeFactory factory) {
+    super(onlyUseNewAPI
+      ? factory
+      : new TokenWrapperAttributeFactory(factory)
+    );
+    tokenWrapper = initTokenWrapper(null);
+    check();
   }
 
+  /** @deprecated Remove this when old API is removed! */
+  private TokenWrapper initTokenWrapper(AttributeSource input) {
+    if (onlyUseNewAPI) {
+      // no wrapper needed
+      return null;
+    } else {
+      // if possible get the wrapper from the filter's input stream
+      if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) {
+        return ((TokenStream) input).tokenWrapper;
+      }
+      // check that all attributes are implemented by the same TokenWrapper instance
+      final AttributeImpl att = addAttribute(TermAttribute.class);
+      if (att instanceof TokenWrapper &&
+        addAttribute(TypeAttribute.class) == att &&
+        addAttribute(PositionIncrementAttribute.class) == att &&
+        addAttribute(FlagsAttribute.class) == att &&
+        addAttribute(OffsetAttribute.class) == att &&
+        addAttribute(PayloadAttribute.class) == att
+      ) {
+        return (TokenWrapper) att;
+      } else {
+        throw new UnsupportedOperationException(
+          "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+
+          "TokenWrapper. Please make sure, that all TokenStreams/TokenFilters in this chain have been "+
+          "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!"
+        );
+      }
+    }
+  }
+
+  /** @deprecated Remove this when old API is removed! */
+  private void check() {
+    if (onlyUseNewAPI && !hasIncrementToken) {
+      throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.");
+    }
+
+    // a TokenStream subclass must at least implement one of the methods!
+    if (!(hasIncrementToken || hasNext || hasReusableNext)) {
+      throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next().");
+    }
+  }
+  
   /**
-   * Use this API to enable or disable the new TokenStream API.
-   * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}. 
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
-   * <p>
-   * If set to true, the indexer will call {@link #incrementToken()} 
-   * to consume Tokens from this stream.
-   * <p>
-   * If set to false, the indexer will call {@link #next(Token)}
-   * instead. 
+   * For extra performance you can globally enable the new {@link #incrementToken}
+   * API using {@link Attribute}s. There will be a small, but in most cases negligible performance
+   * increase by enabling this, but it only works if <b>all</b> TokenStreams and -Filters
+   * use the new API and implement {@link #incrementToken}. This setting can only be enabled
+   * globally.
+   * <P>This setting only affects TokenStreams instantiated after this call. All TokenStreams
+   * already created use the other setting.
+   * <P>All core analyzers are compatible with this setting, if you have own
+   * TokenStreams/-Filters, that are also compatible, enable this.
+   * <P>When enabled, tokenization may throw {@link UnsupportedOperationException}s,
+   * if the whole tokenizer chain is not compatible.
+   * <P>The default is <code>false</code>, so there is the fallback to the old API available.
+   * @deprecated This setting will be <code>true</code> per default in Lucene 3.0,
+   * when {@link #incrementToken} is abstract and must be always implemented.
    */
-  public static void setUseNewAPIDefault(boolean use) {
-    useNewAPIDefault = use;
+  public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) {
+    TokenStream.onlyUseNewAPI = onlyUseNewAPI;
   }
   
-  /**
-   * Returns whether or not the new TokenStream APIs are used 
-   * for this stream.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
+  /** Returns if only the new API is used.
+   * @see #setOnlyUseNewAPI
+   * @deprecated This setting will be <code>true</code> per default in Lucene 3.0,
+   * when {@link #incrementToken} is abstract and must be always implemented.
    */
-  public boolean useNewAPI() {
-    return useNewAPI;
+  public static boolean getOnlyUseNewAPI() {
+    return onlyUseNewAPI;
   }
-
+  
   /**
-   * Use this API to enable or disable the new TokenStream API
-   * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   * Consumers (e. g. the indexer) use this method to advance the stream 
+   * to the next token. Implementing classes must implement this method 
+   * and update the appropriate {@link AttributeImpl}s with content of the 
+   * next token.
    * <p>
-   * If set to true, the indexer will call {@link #incrementToken()} 
-   * to consume Tokens from this stream.
+   * This method is called for every token of a document, so an efficient
+   * implementation is crucial for good performance. To avoid calls to 
+   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+   * downcasts, references to all {@link AttributeImpl}s that this stream uses 
+   * should be retrieved during instantiation.   
    * <p>
-   * If set to false, the indexer will call {@link #next(Token)}
-   * instead. 
-   * <p>
-   * <b>NOTE: All streams and filters in one chain must use the
-   * same API. </b>
-   */
-  public void setUseNewAPI(boolean use) {
-    useNewAPI = use;
-  }
-    	
-	/**
-	 * Consumers (e. g. the indexer) use this method to advance the stream 
-	 * to the next token. Implementing classes must implement this method 
-	 * and update the appropriate {@link Attribute}s with content of the 
-	 * next token.
-	 * <p>
-	 * This method is called for every token of a document, so an efficient
-	 * implementation is crucial for good performance. To avoid calls to 
-	 * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
-	 * downcasts, references to all {@link Attribute}s that this stream uses 
-	 * should be retrieved during instantiation.   
-	 * <p>
-	 * To make sure that filters and consumers know which attributes are available
+   * To make sure that filters and consumers know which attributes are available
    * the attributes must be added during instantiation. Filters and 
    * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
-	 * 
-	 * @return false for end of stream; true otherwise
-	 *
-	 * <p>
-	 * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
-	 */
-	public boolean incrementToken() throws IOException {
-	  // subclasses must implement this method; will be made abstract in Lucene 3.0
-	  return false;
-	}
-	
-  /** Returns the next token in the stream, or null at EOS.
-   *  @deprecated The returned Token is a "full private copy" (not
-   *  re-used across calls to next()) but will be slower
-   *  than calling {@link #next(Token)} instead.. */
-  public Token next() throws IOException {
-    final Token reusableToken = new Token();
-    Token nextToken = next(reusableToken);
-
-    if (nextToken != null) {
-      Payload p = nextToken.getPayload();
-      if (p != null) {
-        nextToken.setPayload((Payload) p.clone());
-      }
+   * 
+   * @return false for end of stream; true otherwise
+   *
+   * <p>
+   * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
+   */
+  public boolean incrementToken() throws IOException {
+    assert !onlyUseNewAPI && tokenWrapper != null;
+    
+    final Token token;
+    if (hasReusableNext) {
+      token = next(tokenWrapper.delegate);
+    } else {
+      assert hasNext;
+      token = next();
     }
-
-    return nextToken;
+    if (token == null) return false;
+    tokenWrapper.delegate = token;
+    return true;
   }
 
   /** Returns the next token in the stream, or null at EOS.
@@ -215,12 +302,46 @@
    *  good idea to assert that it is not null.)
    *  @return next token in the stream or null if end-of-stream was hit
    *  @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
-   *  APIs should be used instead. See also {@link #useNewAPI()}.
+   *  APIs should be used instead.
    */
   public Token next(final Token reusableToken) throws IOException {
-    // We don't actually use inputToken, but still add this assert
     assert reusableToken != null;
-    return next();
+    
+    if (onlyUseNewAPI)
+      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+    
+    if (hasIncrementToken) {
+      tokenWrapper.delegate = reusableToken;
+      return incrementToken() ? tokenWrapper.delegate : null;
+    } else {
+      assert hasNext;
+      final Token token = next();
+      if (token == null) return null;
+      tokenWrapper.delegate = token;
+      return token;
+    }
+  }
+
+  /** Returns the next token in the stream, or null at EOS.
+   * @deprecated The returned Token is a "full private copy" (not
+   * re-used across calls to next()) but will be slower
+   * than calling {@link #next(Token)} or using the new
+   * {@link #incrementToken()} method with the new
+   * {@link AttributeSource} API.
+   */
+  public Token next() throws IOException {
+    if (onlyUseNewAPI)
+      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+    
+    if (hasIncrementToken) {
+      return incrementToken() ? ((Token) tokenWrapper.delegate.clone()) : null;
+    } else {
+      assert hasReusableNext;
+      final Token token = next(tokenWrapper.delegate);
+      if (token == null) return null;
+      tokenWrapper.delegate = token;
+      return (Token) token.clone();
+    }
   }
 
   /** Resets this stream to the beginning. This is an
@@ -240,24 +361,4 @@
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
   
-  public String toString() {
-    StringBuffer sb = new StringBuffer();
-    sb.append('(');
-    
-    if (hasAttributes()) {
-      // TODO Java 1.5
-      //Iterator<Attribute> it = attributes.values().iterator();
-      Iterator it = getAttributesIterator();
-      if (it.hasNext()) {
-        sb.append(it.next().toString());
-      }
-      while (it.hasNext()) {
-        sb.append(',');
-        sb.append(it.next().toString());
-      }
-    }
-    sb.append(')');
-    return sb.toString();
-  }
-
 }

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,163 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.AttributeImpl;
+
+/** 
+ * This class wraps a Token and supplies a single attribute instance
+ * where the delegate token can be replaced.
+ * @deprecated Will be removed, when old TokenStream API is removed.
+ */
+final class TokenWrapper extends AttributeImpl 
+                   implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
+
+  Token delegate;
+
+  TokenWrapper() {
+    this(new Token());
+  }
+  
+  TokenWrapper(Token delegate) {
+    this.delegate = delegate;
+  }
+  
+  // TermAttribute:
+
+  public String term() {
+    return delegate.term();
+  }
+
+  public void setTermBuffer(char[] buffer, int offset, int length) {
+    delegate.setTermBuffer(buffer, offset, length);
+  }
+
+  public void setTermBuffer(String buffer) {
+    delegate.setTermBuffer(buffer);
+  }
+
+  public void setTermBuffer(String buffer, int offset, int length) {
+    delegate.setTermBuffer(buffer, offset, length);
+  }
+  
+  public char[] termBuffer() {
+    return delegate.termBuffer();
+  }
+
+  public char[] resizeTermBuffer(int newSize) {
+    return delegate.resizeTermBuffer(newSize);
+  }
+
+  public int termLength() {
+    return delegate.termLength();
+  }
+  
+  public void setTermLength(int length) {
+    delegate.setTermLength(length);
+  }
+  
+  // TypeAttribute:
+  
+  public String type() {
+    return delegate.type();
+  }
+
+  public void setType(String type) {
+    delegate.setType(type);
+  }
+
+  public void setPositionIncrement(int positionIncrement) {
+    delegate.setPositionIncrement(positionIncrement);
+  }
+
+  public int getPositionIncrement() {
+    return delegate.getPositionIncrement();
+  }
+  
+  // FlagsAttribute
+  
+  public int getFlags() {
+    return delegate.getFlags();
+  }
+
+  public void setFlags(int flags) {
+    delegate.setFlags(flags);
+  }
+  
+  // OffsetAttribute
+  
+  public int startOffset() {
+    return delegate.startOffset();
+  }
+
+  public void setOffset(int startOffset, int endOffset) {
+    delegate.setOffset(startOffset, endOffset);
+  }
+  
+  public int endOffset() {
+    return delegate.endOffset();
+  }
+  
+  // PayloadAttribute
+  public Payload getPayload() {
+    return delegate.getPayload();
+  }
+
+  public void setPayload(Payload payload) {
+    delegate.setPayload(payload);
+  }
+  
+  // AttributeImpl
+  
+  public void clear() {
+    delegate.clear();
+  }
+
+  // AttributeImpl
+
+  public String toString() {
+    return delegate.toString();
+  }
+  
+  public int hashCode() {
+    return delegate.hashCode();
+  }
+  
+  public boolean equals(Object other) {
+    if (other instanceof TokenWrapper) {
+      return ((TokenWrapper) other).delegate.equals(this.delegate);
+    }
+    return false;
+  }
+  
+  public Object clone() {
+    return new TokenWrapper((Token) delegate.clone());
+  }
+
+  public void copyTo(AttributeImpl target) {
+    ((TokenWrapper) target).delegate = (Token) this.delegate.clone();
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Jul 24 21:45:48 2009
@@ -24,17 +24,10 @@
   <p>
   This is an abstract class.
   <p>
-  <b>NOTE:</b> In order to enable the new API the method
-  {@link #useNewAPI()} has to be called with useNewAPI=true.
-  Otherwise the deprecated method {@link #next(Token)} will 
-  be used by Lucene consumers (indexer and queryparser) to
-  consume the tokens. {@link #next(Token)} will be removed
-  in Lucene 3.0.
-  <p>
   NOTE: To use the old API subclasses must override {@link #next(Token)}.
   It's also OK to instead override {@link #next()} but that
   method is slower compared to {@link #next(Token)}.
- <p>
+  <p>
   NOTE: subclasses overriding {@link #next(Token)} must  
   call {@link Token#clear()}.
  * <p><font color="#FF0000">

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html Fri Jul 24 21:45:48 2009
@@ -442,57 +442,73 @@
 in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped. 
 Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
+
 <h4>Adding a custom Attribute</h4>
 Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently 
-<code>PartOfSpeechAttribute</code>:
+<code>PartOfSpeechAttribute</code>. First we need to define the interface of the new Attribute:
 <pre>
-  public static enum PartOfSpeech {
-    Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+  public interface PartOfSpeechAttribute extends Attribute {
+    public static enum PartOfSpeech {
+      Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+    }
+  
+    public void setPartOfSpeech(PartOfSpeech pos);
+  
+    public PartOfSpeech getPartOfSpeech();
   }
+</pre>
+
+Now we also need to write the implementing class. The name of that class is important here: By default, Lucene
+checks if there is a class with the name of the Attribute with the postfix 'Impl'. In this example, we would
+consequently call the implementing class <code>PartOfSpeechAttributeImpl</code>. <br/>
+This should be the usual behavior. However, there is also an expert-API that allows changing these naming conventions:
+{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument
+and returns an actual instance. You can implement your own factory if you need to change the default behavior. <br/><br/>
+
+Now here is the actual class that implements our new Attribute. Notice that the class has to extend
+{@link org.apache.lucene.util.AttributeSource.AttributeImpl}:
+
+<pre>
+public final class PartOfSpeechAttributeImpl extends AttributeImpl 
+                            implements PartOfSpeechAttribute{
   
-  public static final class PartOfSpeechAttribute extends Attribute {
-    
-    private PartOfSpeech pos = PartOfSpeech.Unknown;
-    
-    public void setPartOfSpeech(PartOfSpeech pos) {
-      this.pos = pos;
-    }
-    
-    public PartOfSpeech getPartOfSpeech() {
-      return pos;
-    }
+  private PartOfSpeech pos = PartOfSpeech.Unknown;
+  
+  public void setPartOfSpeech(PartOfSpeech pos) {
+    this.pos = pos;
+  }
+  
+  public PartOfSpeech getPartOfSpeech() {
+    return pos;
+  }
 
-    public void clear() {
-      pos = PartOfSpeech.Unknown;
-    }
+  public void clear() {
+    pos = PartOfSpeech.Unknown;
+  }
 
-    public void copyTo(Attribute target) {
-      ((PartOfSpeechAttribute) target).pos = pos;
-    }
+  public void copyTo(AttributeImpl target) {
+    ((PartOfSpeechAttributeImpl) target).pos = pos;
+  }
 
-    public boolean equals(Object other) {
-      if (other == this) {
-        return true;
-      }
-      
-      if (other instanceof PartOfSpeechAttribute) {
-        return pos == ((PartOfSpeechAttribute) other).pos;
-      }
-   
-      return false;
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
     }
-
-    public int hashCode() {
-      return pos.ordinal();
+    
+    if (other instanceof PartOfSpeechAttributeImpl) {
+      return pos == ((PartOfSpeechAttributeImpl) other).pos;
     }
+ 
+    return false;
+  }
 
-    public String toString() {
-      return "PartOfSpeech=" + pos;
-    }
+  public int hashCode() {
+    return pos.ordinal();
   }
+}
 </pre>
-This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
-new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.
+This is a simple Attribute implementation that has only a single variable that stores the part-of-speech of a token. It extends the
+new <code>AttributeImpl</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode()</code>.
 Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
 that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
 <pre>
@@ -523,7 +539,9 @@
   }
 </pre>
 Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
-stores references in instance variables. Now we need to add the filter to the chain:
+stores references in instance variables. Notice how you only need to pass in the interface of the new
+Attribute and instantiating the correct class is automatically taken care of.
+Now we need to add the filter to the chain:
 <pre>
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream stream = new WhitespaceTokenizer(reader);
@@ -582,7 +600,8 @@
 as nouns if not the first word of a sentence (we know, this is still not a correct behavior, but hey, it's a good exercise). 
 As a small hint, this is how the new Attribute class could begin:
 <pre>
-  public class FirstTokenOfSentenceAttribute extends Attribute {
+  public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
+                   implements FirstTokenOfSentenceAttribute {
     
     private boolean firstToken;
     

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java Fri Jul 24 21:45:48 2009
@@ -73,39 +73,4 @@
 
     return true;
   }
-  
-  /** Returns the next token in the stream, or null at EOS.
-   * <p>Removes <tt>'s</tt> from the end of words.
-   * <p>Removes dots from acronyms.
-   * @deprecated
-   */
-  public final Token next(final Token reusableToken) throws java.io.IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-
-    if (nextToken == null)
-      return null;
-
-    char[] buffer = nextToken.termBuffer();
-    final int bufferLength = nextToken.termLength();
-    final String type = nextToken.type();
-
-    if (type == APOSTROPHE_TYPE &&		  // remove 's
-	bufferLength >= 2 &&
-        buffer[bufferLength-2] == '\'' &&
-        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
-      // Strip last 2 characters off
-      nextToken.setTermLength(bufferLength - 2);
-    } else if (type == ACRONYM_TYPE) {		  // remove dots
-      int upto = 0;
-      for(int i=0;i<bufferLength;i++) {
-        char c = buffer[i];
-        if (c != '.')
-          buffer[upto++] = c;
-      }
-      nextToken.setTermLength(upto);
-    }
-
-    return nextToken;
-  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Jul 24 21:45:48 2009
@@ -147,7 +147,7 @@
    *
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     int posIncr = 1;
 
     while(true) {
@@ -183,66 +183,33 @@
         posIncr++;
     }
   }
-  
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
+  }
+
   /*
    * (non-Javadoc)
    *
-   * @see org.apache.lucene.analysis.TokenStream#next()
+   * @see org.apache.lucene.analysis.TokenStream#reset()
    */
-  /** @deprecated */
-  public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      int posIncr = 1;
-
-      while(true) {
-        int tokenType = scanner.getNextToken();
-
-        if (tokenType == StandardTokenizerImpl.YYEOF) {
-          return null;
-        }
-
-        if (scanner.yylength() <= maxTokenLength) {
-          reusableToken.clear();
-          reusableToken.setPositionIncrement(posIncr);
-          scanner.getText(reusableToken);
-          final int start = scanner.yychar();
-          reusableToken.setStartOffset(input.correctOffset(start));
-          reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
-          // This 'if' should be removed in the next release. For now, it converts
-          // invalid acronyms to HOST. When removed, only the 'else' part should
-          // remain.
-          if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-            if (replaceInvalidAcronym) {
-              reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-              reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
-            } else {
-              reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
-            }
-          } else {
-            reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-          }
-          return reusableToken;
-        } else
-          // When we skip a too-long term, we still increment the
-          // position increment
-          posIncr++;
-      }
-    }
-
-    /*
-     * (non-Javadoc)
-     *
-     * @see org.apache.lucene.analysis.TokenStream#reset()
-     */
-    public void reset() throws IOException {
-      super.reset();
-      scanner.yyreset(input);
-    }
+  public void reset() throws IOException {
+    super.reset();
+    scanner.yyreset(input);
+  }
 
-    public void reset(Reader reader) throws IOException {
-      setInput(reader);
-      reset();
-    }
+  public void reset(Reader reader) throws IOException {
+    setInput(reader);
+    reset();
+  }
 
   /**
    * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java Fri Jul 24 21:45:48 2009
@@ -17,8 +17,6 @@
  * limitations under the License.
  */
 
-import java.io.Serializable;
-
 import org.apache.lucene.util.Attribute;
 
 /**
@@ -31,9 +29,7 @@
  * We will make our best efforts to keep the APIs backwards-compatible.</font>
 
  */
-public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
-  private int flags = 0;
-  
+public interface FlagsAttribute extends Attribute {
   /**
    * EXPERIMENTAL:  While we think this is here to stay, we may want to change it to be a long.
    * <p/>
@@ -44,43 +40,10 @@
    *
    * @return The bits
    */
-  public int getFlags() {
-    return flags;
-  }
+  public int getFlags();
 
   /**
    * @see #getFlags()
    */
-  public void setFlags(int flags) {
-    this.flags = flags;
-  }
-  
-  public void clear() {
-    flags = 0;
-  }
-
-  public String toString() {
-    return "flags=" + flags;
-  }
-
-  public boolean equals(Object other) {
-    if (this == other) {
-      return true;
-    }
-    
-    if (other instanceof FlagsAttribute) {
-      return ((FlagsAttribute) other).flags == flags;
-    }
-    
-    return false;
-  }
-
-  public int hashCode() {
-    return flags;
-  }
-  
-  public void copyTo(Attribute target) {
-    FlagsAttribute t = (FlagsAttribute) target;
-    t.setFlags(flags);
-  }
+  public void setFlags(int flags);  
 }

Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * This attribute can be used to pass different flags down the tokenizer chain,
+ * e. g. from one TokenFilter to another one. 
+ * 
+ * <p><font color="#FF0000">
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. 
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future. 
+ * We will make our best efforts to keep the APIs backwards-compatible.</font>
+
+ */
+public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
+  private int flags = 0;
+  
+  /**
+   * EXPERIMENTAL:  While we think this is here to stay, we may want to change it to be a long.
+   * <p/>
+   *
+   * Get the bitset for any bits that have been set.  This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
+   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
+   *
+   *
+   * @return The bits
+   */
+  public int getFlags() {
+    return flags;
+  }
+
+  /**
+   * @see #getFlags()
+   */
+  public void setFlags(int flags) {
+    this.flags = flags;
+  }
+  
+  public void clear() {
+    flags = 0;
+  }
+
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+    
+    if (other instanceof FlagsAttributeImpl) {
+      return ((FlagsAttributeImpl) other).flags == flags;
+    }
+    
+    return false;
+  }
+
+  public int hashCode() {
+    return flags;
+  }
+  
+  public void copyTo(AttributeImpl target) {
+    FlagsAttribute t = (FlagsAttribute) target;
+    t.setFlags(flags);
+  }
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message