Subject: svn commit: r806942 - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ contrib/...
Date: Sun, 23 Aug 2009 08:34:24 -0000
To: java-commits@lucene.apache.org
From: buschmi@apache.org
Reply-To: java-dev@lucene.apache.org

Author: buschmi
Date: Sun Aug 23 08:34:22 2009
New Revision: 806942

URL: http://svn.apache.org/viewvc?rev=806942&view=rev
Log:
LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations.
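The point of the new constructors: a caller can hand a tokenizer an existing AttributeSource (so the caller and the stream share the same attribute instances) or an AttributeFactory (to control which AttributeImpl classes get instantiated). A minimal sketch against the 2.9-era API follows; the input text and the println consumer are illustrative only, not part of this commit:

  import java.io.StringReader;

  import org.apache.lucene.analysis.WhitespaceTokenizer;
  import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  import org.apache.lucene.util.AttributeSource;

  public class SharedSourceExample {
    public static void main(String[] args) throws Exception {
      // One AttributeSource shared by the caller and the tokenizer:
      // both see the very same TermAttribute instance.
      AttributeSource source = new AttributeSource();
      WhitespaceTokenizer tok =
          new WhitespaceTokenizer(source, new StringReader("shared attribute source"));

      TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term());   // prints: shared, attribute, source
      }
      tok.close();
    }
  }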
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Aug 23 08:34:22 2009
@@ -402,6 +402,10 @@
 36. LUCENE-1808: Query.createWeight has been changed from protected to public.
     (Tim Smith, Shai Erera via Mark Miller)
 
+37. LUCENE-1826: Add constructors that take AttributeSource and
+    AttributeFactory to all Tokenizer implementations.
+    (Michael Busch)
+
 Bug fixes
 
 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,7 @@
 import java.io.Reader;
 
 import org.apache.lucene.analysis.LetterTokenizer;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * The problem with the standard Letter tokenizer is that it fails on diacritics.
@@ -32,6 +33,14 @@
     super(in);
   }
 
+  public ArabicLetterTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  public ArabicLetterTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+
   /**
    * Allows for Letter category or NonspacingMark category
    * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,8 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
 
 
 /**
@@ -111,11 +113,25 @@
      */
     public CJKTokenizer(Reader in) {
         super(in);
+        init();
+    }
+
+    public CJKTokenizer(AttributeSource source, Reader in) {
+        super(source, in);
+        init();
+    }
+
+    public CJKTokenizer(AttributeFactory factory, Reader in) {
+        super(factory, in);
+        init();
+    }
+
+    private void init() {
         termAtt = (TermAttribute) addAttribute(TermAttribute.class);
         offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
         typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }
-    
     //~ Methods ----------------------------------------------------------------
 
     /**

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 
 /**
@@ -59,10 +60,24 @@
 
     public ChineseTokenizer(Reader in) {
         super(in);
+        init();
+    }
+
+    public ChineseTokenizer(AttributeSource source, Reader in) {
+        super(source, in);
+        init();
+    }
+
+    public ChineseTokenizer(AttributeFactory factory, Reader in) {
+        super(factory, in);
+        init();
+    }
+
+    private void init() {
         termAtt = (TermAttribute) addAttribute(TermAttribute.class);
         offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     }
-    
+
     private int offset = 0, bufferIndex=0, dataLen=0;
     private final static int MAX_WORD_LEN = 255;
     private final static int IO_BUFFER_SIZE = 1024;
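All three tokenizers above follow the same pattern: the new constructors delegate shared setup to a private init(). A sketch of the AttributeFactory overload in use; DEFAULT_ATTRIBUTE_FACTORY reproduces the old instantiation behavior, and the sample text plus println loop are illustrative assumptions, not part of the commit:

  import java.io.StringReader;

  import org.apache.lucene.analysis.cjk.CJKTokenizer;
  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  import org.apache.lucene.util.AttributeSource.AttributeFactory;

  public class FactoryExample {
    public static void main(String[] args) throws Exception {
      // The default factory creates the standard AttributeImpl classes;
      // a custom factory could substitute specialized implementations.
      AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
      CJKTokenizer tok = new CJKTokenizer(factory, new StringReader("lucene in action"));

      // init() registered these attributes, so getAttribute succeeds.
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      OffsetAttribute offsetAtt = (OffsetAttribute) tok.getAttribute(OffsetAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term()
            + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
      }
      tok.close();
    }
  }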
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -88,7 +89,76 @@
    */
   public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
     super(input);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeSource source, Reader input, Side side, int minGram, int maxGram) {
+    super(source, input);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
+    super(factory, input);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
+    this(input, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeSource source, Reader input, String sideLabel, int minGram, int maxGram) {
+    this(source, input, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public EdgeNGramTokenizer(AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
+    this(factory, input, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
+  private void init(Side side, int minGram, int maxGram) {
     if (side == null) {
       throw new IllegalArgumentException("sideLabel must be either front or back");
     }
@@ -107,17 +177,7 @@
 
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  }
-
-  /**
-   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
-   *
-   * @param input {@link Reader} holding the input to be tokenized
-   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
-   * @param minGram the smallest n-gram to generate
-   * @param maxGram the largest n-gram to generate
-   */
-  public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
-    this(input, Side.getSide(sideLabel), minGram, maxGram);
+  }
 
   /** Returns the next token in the stream, or null at EOS. */
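A quick sketch of the string-label overload retained above. The expected output assumes the pre-3.0 front-gram behavior; input and loop are illustrative:

  import java.io.StringReader;

  import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
  import org.apache.lucene.analysis.tokenattributes.TermAttribute;

  public class EdgeNGramExample {
    public static void main(String[] args) throws Exception {
      // "front" grams of sizes 1..3 for "abcde" should be: a, ab, abc
      EdgeNGramTokenizer tok =
          new EdgeNGramTokenizer(new StringReader("abcde"), "front", 1, 3);
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term());
      }
      tok.close();
    }
  }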
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -50,6 +51,42 @@
    */
   public NGramTokenizer(Reader input, int minGram, int maxGram) {
     super(input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams.
+   * @param source {@link AttributeSource} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
+    super(source, input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams.
+   * @param factory {@link AttributeFactory} to use
+   * @param input {@link Reader} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
+    super(factory, input);
+    init(minGram, maxGram);
+  }
+
+  /**
+   * Creates NGramTokenizer with default min and max n-grams.
+   * @param input {@link Reader} holding the input to be tokenized
+   */
+  public NGramTokenizer(Reader input) {
+    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+  }
+
+  private void init(int minGram, int maxGram) {
     if (minGram < 1) {
       throw new IllegalArgumentException("minGram must be greater than zero");
     }
@@ -60,14 +97,7 @@
     this.maxGram = maxGram;
 
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-  }
-
-  /**
-   * Creates NGramTokenizer with default min and max n-grams.
-   * @param input {@link Reader} holding the input to be tokenized
-   */
-  public NGramTokenizer(Reader input) {
-    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
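For completeness, the inner-gram variant. If the old gram ordering still holds (all grams of one size are emitted before the next size), the comment below gives the expected output; treat that ordering as an assumption, not a contract:

  import java.io.StringReader;

  import org.apache.lucene.analysis.ngram.NGramTokenizer;
  import org.apache.lucene.analysis.tokenattributes.TermAttribute;

  public class NGramExample {
    public static void main(String[] args) throws Exception {
      // 1- and 2-grams of "abc"; expected: a, b, c, ab, bc
      NGramTokenizer tok = new NGramTokenizer(new StringReader("abc"), 1, 2);
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term());
      }
      tok.close();
    }
  }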
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.Tokenizer; // for javadocs
 import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
@@ -57,6 +58,18 @@
         this(in, RussianCharsets.UnicodeRussian);
     }
 
+    public RussianLetterTokenizer(AttributeSource source, Reader in, char[] charset)
+    {
+        super(source, in);
+        this.charset = charset;
+    }
+
+    public RussianLetterTokenizer(AttributeFactory factory, Reader in, char[] charset)
+    {
+        super(factory, in);
+        this.charset = charset;
+    }
+
     /**
      * Collects only characters which satisfy
     * {@link Character#isLetter(char)}.
Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Sun Aug 23 08:34:22 2009
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Tokenizes input text into sentences.
@@ -48,11 +49,25 @@
 
   public SentenceTokenizer(Reader reader) {
     super(reader);
+    init();
+  }
+
+  public SentenceTokenizer(AttributeSource source, Reader reader) {
+    super(source, reader);
+    init();
+  }
+
+  public SentenceTokenizer(AttributeFactory factory, Reader reader) {
+    super(factory, reader);
+    init();
+  }
+
+  private void init() {
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
-  
+
   public boolean incrementToken() throws IOException {
     clearAttributes();
     buffer.setLength(0);

Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Sun Aug 23 08:34:22 2009
@@ -151,14 +151,46 @@
    */
   public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
     super(input);
-    this.tokenOutput = tokenOutput;
     this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
+   * input to the newly created JFlex scanner. Uses the given {@link AttributeFactory}.
+   *
+   * @param input The input
+   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
+   * @param untokenizedTypes
+   */
+  public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) {
+    super(factory, input);
+    this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+
+  /**
+   * Creates a new instance of the {@link org.apache.lucene.wikipedia.analysis.WikipediaTokenizer}. Attaches the
+   * input to the newly created JFlex scanner. Uses the given {@link AttributeSource}.
+   *
+   * @param input The input
+   * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
+   * @param untokenizedTypes
+   */
+  public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) {
+    super(source, input);
+    this.scanner = new WikipediaTokenizerImpl(input);
+    init(tokenOutput, untokenizedTypes);
+  }
+
+  private void init(int tokenOutput, Set untokenizedTypes) {
+    this.tokenOutput = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
     this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
-    this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
   }
 
   /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
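A sketch of the three-argument WikipediaTokenizer constructor that the new overloads wrap. The wiki-markup sample and empty untokenized-types set are illustrative choices, not from this commit:

  import java.io.StringReader;
  import java.util.Collections;

  import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

  public class WikipediaExample {
    public static void main(String[] args) throws Exception {
      // TOKENS_ONLY: emit plain tokens; no types go out untokenized here.
      WikipediaTokenizer tok = new WikipediaTokenizer(
          new StringReader("[[Main Page]] is '''bold'''"),
          WikipediaTokenizer.TOKENS_ONLY, Collections.EMPTY_SET);
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      TypeAttribute typeAtt = (TypeAttribute) tok.getAttribute(TypeAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term() + " / " + typeAtt.type());
      }
      tok.close();
    }
  }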
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Sun Aug 23 08:34:22 2009
@@ -22,6 +22,8 @@
 
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
 
 /** An abstract base class for simple, character-oriented tokenizers.*/
 public abstract class CharTokenizer extends Tokenizer {
@@ -31,6 +33,18 @@
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
+  public CharTokenizer(AttributeSource source, Reader input) {
+    super(source, input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  public CharTokenizer(AttributeFactory factory, Reader input) {
+    super(factory, input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
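With CharTokenizer exposing both constructors, a subclass only needs to forward them and override isTokenChar. CommaTokenizer below is a made-up example, not a class in Lucene:

  import java.io.Reader;

  import org.apache.lucene.analysis.CharTokenizer;
  import org.apache.lucene.util.AttributeSource;

  // Hypothetical subclass: splits the input on commas.
  public class CommaTokenizer extends CharTokenizer {
    public CommaTokenizer(Reader in) {
      super(in);
    }

    // Pass-through for the new AttributeSource constructor added above.
    public CommaTokenizer(AttributeSource source, Reader in) {
      super(source, in);
    }

    // Every non-comma character is part of a token.
    protected boolean isTokenChar(char c) {
      return c != ',';
    }
  }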
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Sun Aug 23 08:34:22 2009
@@ -22,6 +22,7 @@
 
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Emits the entire input as a single token.
@@ -41,10 +42,24 @@
 
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
+    init(bufferSize);
+  }
+
+  public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
+    super(source, input);
+    init(bufferSize);
+  }
+
+  public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
+    super(factory, input);
+    init(bufferSize);
+  }
+
+  private void init(int bufferSize) {
    this.done = false;
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
-    termAtt.resizeTermBuffer(bufferSize);
+    termAtt.resizeTermBuffer(bufferSize);
   }
 
   public final boolean incrementToken() throws IOException {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LetterTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,9 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
   to say, it defines tokens as maximal strings of adjacent letters, as defined
   by java.lang.Character.isLetter() predicate.
@@ -31,6 +34,16 @@
   public LetterTokenizer(Reader in) {
     super(in);
   }
+  
+  /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */
+  public LetterTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  /** Construct a new LetterTokenizer using a given {@link AttributeFactory}. */
+  public LetterTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
 
   /** Collects only characters which satisfy
    * {@link Character#isLetter(char)}.*/
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,6 +19,9 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /**
  * LowerCaseTokenizer performs the function of LetterTokenizer
  * and LowerCaseFilter together.  It divides text at non-letters and converts
@@ -35,6 +38,16 @@
     super(in);
   }
 
+  /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */
+  public LowerCaseTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  /** Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}. */
+  public LowerCaseTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+
   /** Converts char to lower case
    * {@link Character#toLowerCase(char)}.*/
   protected char normalize(char c) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Sun Aug 23 08:34:22 2009
@@ -77,6 +77,18 @@
     super(source);
   }
 
+  /** Construct a token stream processing the given input using the given AttributeSource. */
+  protected Tokenizer(AttributeSource source, Reader input) {
+    super(source);
+    this.input = CharReader.get(input);
+  }
+
+  /** Construct a token stream processing the given input using the given AttributeSource. */
+  protected Tokenizer(AttributeSource source, CharStream input) {
+    super(source);
+    this.input = input;
+  }
+
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
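What the AttributeFactory overloads buy you: control over how attribute instances are created. A hedged sketch, assuming the 2.9 AttributeSource.AttributeFactory contract; LoggingAttributeFactory is invented here for illustration:

  import org.apache.lucene.util.AttributeImpl;
  import org.apache.lucene.util.AttributeSource.AttributeFactory;

  // Hypothetical factory that logs each attribute class it instantiates,
  // delegating the actual creation to the default factory.
  public class LoggingAttributeFactory extends AttributeFactory {
    public AttributeImpl createAttributeInstance(Class attClass) {
      System.out.println("creating attribute: " + attClass.getName());
      return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(attClass);
    }
  }

Passing an instance of this factory to, say, new LowerCaseTokenizer(factory, reader) should print one line per attribute class the tokenizer registers.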
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java Sun Aug 23 08:34:22 2009
@@ -19,15 +19,28 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+
 /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
  * Adjacent sequences of non-Whitespace characters form tokens. */
 public class WhitespaceTokenizer extends CharTokenizer {
-  /** Construct a new WhitespaceTokenizer. */
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
   public WhitespaceTokenizer(Reader in) {
     super(in);
   }
 
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */
+  public WhitespaceTokenizer(AttributeSource source, Reader in) {
+    super(source, in);
+  }
+
+  /** Construct a new WhitespaceTokenizer using a given {@link AttributeFactory}. */
+  public WhitespaceTokenizer(AttributeFactory factory, Reader in) {
+    super(factory, in);
+  }
+
   /** Collects only characters which do not satisfy
    * {@link Character#isWhitespace(char)}.*/
   protected boolean isTokenChar(char c) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=806942&r1=806941&r2=806942&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Sun Aug 23 08:34:22 2009
@@ -27,6 +27,7 @@
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /** A grammar-based tokenizer constructed with JFlex
 *
@@ -126,15 +127,38 @@
    * See http://issues.apache.org/jira/browse/LUCENE-1068
    */
   public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
-    this.replaceInvalidAcronym = replaceInvalidAcronym;
-    setInput(input);
+    super();
+    this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  /**
+   * Creates a new StandardTokenizer with a given {@link AttributeSource}.
+   */
+  public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) {
+    super(source);
+    this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  /**
+   * Creates a new StandardTokenizer with a given {@link AttributeFactory}.
+   */
+  public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) {
+    super(factory);
     this.scanner = new StandardTokenizerImpl(input);
+    init(input, replaceInvalidAcronym);
+  }
+
+  private void init(Reader input, boolean replaceInvalidAcronym) {
+    this.replaceInvalidAcronym = replaceInvalidAcronym;
+    setInput(input);
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
-  
+
   // this tokenizer generates three attributes:
   // offset, positionIncrement and type
   private TermAttribute termAtt;
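Finally, a usage sketch for the grammar-based tokenizer above. The sample text is illustrative, and the comment on the boolean reflects my reading of LUCENE-1068, not a guarantee:

  import java.io.StringReader;

  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

  public class StandardExample {
    public static void main(String[] args) throws Exception {
      // true = use the corrected acronym handling (see LUCENE-1068).
      StandardTokenizer tok =
          new StandardTokenizer(new StringReader("IBM's x@y.com"), true);
      TermAttribute termAtt = (TermAttribute) tok.getAttribute(TermAttribute.class);
      TypeAttribute typeAtt = (TypeAttribute) tok.getAttribute(TypeAttribute.class);
      while (tok.incrementToken()) {
        System.out.println(termAtt.term() + " / " + typeAtt.type());
      }
      tok.close();
    }
  }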