lucene-java-commits mailing list archives

From: busc...@apache.org
Subject: svn commit: r806942 - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ contrib/...
Date: Sun, 23 Aug 2009 08:34:24 GMT
Author: buschmi
Date: Sun Aug 23 08:34:22 2009
New Revision: 806942

URL: http://svn.apache.org/viewvc?rev=806942&view=rev
Log:
LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer
implementations.
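
For illustration, a minimal usage sketch (not part of the commit) of what the new
constructors enable: a tokenizer can be created on top of another tokenizer's
AttributeSource, so both return the very same attribute instances. The Lucene
classes are the ones touched by this commit; the wrapper class name and sample
input are hypothetical.

import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class SharedAttributeSourceExample {
  public static void main(String[] args) {
    // The first tokenizer owns the AttributeSource.
    WhitespaceTokenizer ws = new WhitespaceTokenizer(new StringReader("Hello World"));
    // The second tokenizer reuses it via the new (AttributeSource, Reader) constructor.
    LowerCaseTokenizer lc = new LowerCaseTokenizer(ws, new StringReader("hello world"));
    // Both now share one TermAttribute instance.
    TermAttribute wsTerm = (TermAttribute) ws.addAttribute(TermAttribute.class);
    TermAttribute lcTerm = (TermAttribute) lc.addAttribute(TermAttribute.class);
    System.out.println(wsTerm == lcTerm); // prints true
  }
}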

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Aug 23 08:34:22 2009
@@ -402,6 +402,10 @@
 36. LUCENE-1808: Query.createWeight has been changed from protected to
     public. (Tim Smith, Shai Erera via Mark Miller)
 
+37. LUCENE-1826: Add constructors that take AttributeSource and
+    AttributeFactory to all Tokenizer implementations.
+    (Michael Busch)
+
 Bug fixes
 
 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,7 @@
 import java.io.Reader;
 
 import org.apache.lucene.analysis.LetterTokenizer;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * The problem with the standard Letter tokenizer is that it fails on diacritics.
@@ -32,6 +33,14 @@
     super(in);
   }
 
+  public ArabicLetterTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  public ArabicLetterTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+  
   /** 
    * Allows for Letter category or NonspacingMark category
    * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,8 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
 
 
 /**
@@ -111,11 +113,25 @@
      */
     public CJKTokenizer(Reader in) {
       super(in);
+      init();
+    }
+
+    public CJKTokenizer(AttributeSource source, Reader in) {
+      super(source, in);
+      init();
+    }
+
+    public CJKTokenizer(AttributeFactory factory, Reader in) {
+      super(factory, in);
+      init();
+    }
+    
+    private void init() {
       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
       offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
       typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }
-
+    
     //~ Methods ----------------------------------------------------------------
 
     /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 
 /**
@@ -59,10 +60,24 @@
 
     public ChineseTokenizer(Reader in) {
       super(in);
+      init();
+    }
+
+    public ChineseTokenizer(AttributeSource source, Reader in) {
+      super(source, in);
+      init();
+    }
+
+    public ChineseTokenizer(AttributeFactory factory, Reader in) {
+      super(factory, in);
+      init();
+    }
+    
+    private void init() {
       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
       offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     }
-
+    
     private int offset = 0, bufferIndex=0, dataLen=0;
     private final static int MAX_WORD_LEN = 255;
     private final static int IO_BUFFER_SIZE = 1024;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -88,7 +89,76 @@
    */
   public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
     super(input);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
+    super(source, input);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   * 
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
+    super(factory, input);
+    init(side, minGram, maxGram);
+  }
+  
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
+    this(input, Side.getSide(sideLabel), minGram, maxGram);
+  }
 
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
+    this(source, input, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   * 
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
+    this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
+  }
+  
+  private void init(Side side, int minGram, int maxGram) {
     if (side == null) {
       throw new IllegalArgumentException("sideLabel must be either front or back");
     }
@@ -107,17 +177,7 @@
     
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  }
-  /**
-   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
-   *
-   * @param input {@link Reader} holding the input to be tokenized
-   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
-   * @param minGram the smallest n-gram to generate
-   * @param maxGram the largest n-gram to generate
-   */
-  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
-    this(input, Side.getSide(sideLabel), minGram, maxGram);
+
   }
 
   /** Returns the next token in the stream, or null at EOS. */

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -50,6 +51,42 @@
    */
   public NGramTokenizer(Reader input, int minGram, int maxGram) {
     super(input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams.
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
+    super(source, input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams.
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
+    super(factory, input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with default min and max n-grams.
+   * @param input {@link Reader} holding the input to be tokenized
+   */
+  public NGramTokenizer(Reader input) {
+    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+  }
+  
+  private void init(int minGram, int maxGram) {
     if (minGram < 1) {
       throw new IllegalArgumentException("minGram must be greater than zero");
     }
@@ -60,14 +97,7 @@
     this.maxGram = maxGram;
     
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  }
-  /**
-   * Creates NGramTokenizer with default min and max n-grams.
-   * @param input {@link Reader} holding the input to be tokenized
-   */
-  public NGramTokenizer(Reader input) {
-    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);    
   }
 
   /** Returns the next token in the stream, or null at EOS. */

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer; // for javadocs
 import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
@@ -57,6 +58,18 @@
     	this(in, RussianCharsets.UnicodeRussian);
     }
 
+    public RussianLetterTokenizer(AttributeSource source, Reader in, char[] charset)
+    {
+        super(source, in);
+        this.charset = charset;
+    }
+
+    public RussianLetterTokenizer(AttributeFactory factory, Reader in, char[] charset)
+    {
+        super(factory, in);
+        this.charset = charset;
+    }
+    
     /**
      * Collects only characters which satisfy
      * {@link Character#isLetter(char)}.

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Tokenizes input text into sentences.
@@ -48,11 +49,25 @@
 
   public SentenceTokenizer(Reader reader) {
     super(reader);
+    init();
+  }
+
+  public SentenceTokenizer(AttributeSource source, Reader reader) {
+    super(source, reader);
+    init();
+  }
+
+  public SentenceTokenizer(AttributeFactory factory, Reader reader) {
+    super(factory, reader);
+    init();
+  }
+  
+  private void init() {
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);    
   }
-
+  
   public boolean incrementToken() throws IOException {
     clearAttributes();
     buffer.setLength(0);

Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Sun Aug 23 08:34:22 2009
@@ -151,14 +151,46 @@
    */
   public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
     super(input);
-    this.tokenOutput = tokenOutput;
     this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches
+   * the <code>input</code> to the newly created JFlex scanner. Uses the given {@link AttributeFactory}.
+   *
+   * @param input The input
+   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
+   * @param untokenizedTypes
+   */
+  public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) {
+    super(factory, input);
+    this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches
+   * the <code>input</code> to the newly created JFlex scanner. Uses the given {@link AttributeSource}.
+   *
+   * @param input The input
+   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
+   * @param untokenizedTypes
+   */
+  public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) {
+    super(source, input);
+    this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+  
+  private void init(int tokenOutput, Set untokenizedTypes) {
+    this.tokenOutput = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
     this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);    
   }
 
   /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Sun Aug 23 08:34:22 2009
@@ -22,6 +22,8 @@
 
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
 
 /** An abstract base class for simple, character-oriented tokenizers.*/
 public abstract class CharTokenizer extends Tokenizer {
@@ -31,6 +33,18 @@
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
+  public CharTokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  public CharTokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+  
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Sun Aug 23 08:34:22 2009
@@ -22,6 +22,7 @@
 
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Emits the entire input as a single token.
@@ -41,10 +42,24 @@
 
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
+    init(bufferSize);
+  }
+
+  public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
+    super(source, input);
+    init(bufferSize);
+  }
+
+  public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
+    super(factory, input);
+    init(bufferSize);
+  }
+  
+  private void init(int bufferSize) {
     this.done = false;
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-    termAtt.resizeTermBuffer(bufferSize);
+    termAtt.resizeTermBuffer(bufferSize);    
   }
   
   public final boolean incrementToken() throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,9 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
   to say, it defines tokens as maximal strings of adjacent letters, as defined
   by java.lang.Character.isLetter() predicate.
@@ -31,6 +34,16 @@
   public LetterTokenizer(Reader in) {
     super(in);
   }
+  
+  /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
+  public LetterTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+  
+  /** Construct a new LetterTokenizer using a given {@link AttributeFactory}. */
+  public LetterTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
 
   /** Collects only characters which satisfy
    * {@link Character#isLetter(char)}.*/

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,9 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /**
  * LowerCaseTokenizer performs the function of LetterTokenizer
  * and LowerCaseFilter together.  It divides text at non-letters and converts
@@ -35,6 +38,16 @@
     super(in);
   }
 
+  /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
+  public LowerCaseTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  /** Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}. */
+  public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+  
   /** Converts char to lower case
    * {@link Character#toLowerCase(char)}.*/
   protected char normalize(char c) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Sun Aug 23 08:34:22 2009
@@ -77,6 +77,18 @@
     super(source);
   }
 
+  /** Construct a token stream processing the given input using the given AttributeSource. */
+  protected Tokenizer(AttributeSource source, Reader input) {
+    super(source);
+    this.input = CharReader.get(input);
+  }
+  
+  /** Construct a token stream processing the given input using the given AttributeSource. */
+  protected Tokenizer(AttributeSource source, CharStream input) {
+    super(source);
+    this.input = input;
+  }
+  
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
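
A hedged sketch (not from the commit) of what the new protected Tokenizer
constructors allow: subclasses can now be wired to an existing AttributeSource.
The SingleTokenTokenizer class below is hypothetical; Tokenizer, TermAttribute,
and AttributeSource are the real Lucene classes shown in the diffs above.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

// Toy tokenizer: emits up to the first 255 chars of the input as one token,
// reusing a caller-supplied AttributeSource.
public class SingleTokenTokenizer extends Tokenizer {
  private final TermAttribute termAtt;
  private boolean done = false;

  public SingleTokenTokenizer(AttributeSource source, Reader in) {
    super(source, in); // the constructor added by this commit
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (done) return false;
    done = true;
    clearAttributes();
    char[] buffer = termAtt.resizeTermBuffer(255);
    int len = input.read(buffer, 0, buffer.length); // single read keeps the sketch short
    if (len <= 0) return false;
    termAtt.setTermLength(len);
    return true;
  }
}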

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,15 +19,28 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
  * Adjacent sequences of non-Whitespace characters form tokens. */
 
 public class WhitespaceTokenizer extends CharTokenizer {
-  /** Construct a new WhitespaceTokenizer. */
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
   public WhitespaceTokenizer(Reader in) {
     super(in);
   }
 
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
+  public WhitespaceTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeFactory}. */
+  public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+  
   /** Collects only characters which do not satisfy
    * {@link Character#isWhitespace(char)}.*/
   protected boolean isTokenChar(char c) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Sun Aug 23 08:34:22 2009
@@ -27,6 +27,7 @@
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /** A grammar-based tokenizer constructed with JFlex
  *
@@ -126,15 +127,38 @@
    * See http://issues.apache.org/jira/browse/LUCENE-1068
    */
   public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
-    this.replaceInvalidAcronym = replaceInvalidAcronym;
-    setInput(input);
+    super();
+    this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  /**
+   * Creates a new StandardTokenizer with a given {@link AttributeSource}. 
+   */
+  public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) {
+    super(source);
+    this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  /**
+   * Creates a new StandardTokenizer with a given {@link AttributeFactory}. 
+   */
+  public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) {
+    super(factory);
     this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  private void init(Reader input, boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+    setInput(input);    
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
-
+  
   // this tokenizer generates three attributes:
   // offset, positionIncrement and type
   private TermAttribute termAtt;
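
A hedged usage sketch for the new AttributeFactory constructor: the factory
decides which implementation class backs each attribute. Token.TOKEN_ATTRIBUTE_FACTORY
(from LUCENE-1693) is assumed here as the stock factory that backs every attribute
with a Token instance; the example class and input string are hypothetical.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class AttributeFactoryExample {
  public static void main(String[] args) throws IOException {
    // Assumed: Token.TOKEN_ATTRIBUTE_FACTORY creates Token-based attribute instances.
    StandardTokenizer tokenizer = new StandardTokenizer(
        Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("Hello, Lucene 2.9!"), true);
    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
    while (tokenizer.incrementToken()) { // prints each term on its own line
      System.out.println(termAtt.term());
    }
    tokenizer.close();
  }
}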


