Subject: svn commit: r1188604 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/src/test/ lucene/contrib/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/ lucene/contrib/analyzers/comm...
Date: Tue, 25 Oct 2011 11:12:14 -0000
To: commits@lucene.apache.org
From: uschindler@apache.org
Reply-To: dev@lucene.apache.org
Message-Id: <20111025111214.D08E82388900@eris.apache.org>

Author: uschindler
Date: Tue Oct 25 11:12:14 2011
New Revision: 1188604

URL: http://svn.apache.org/viewvc?rev=1188604&view=rev
Log:
LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be used with custom attributes.
All those attributes are preserved and set on all added decompounded tokens.

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/test/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Tue Oct 25 11:12:14 2011
@@ -10,6 +10,9 @@ Changes in backwards compatibility polic
  * LUCENE-3446: Removed BooleanFilter.finalResult() due to change to
    FixedBitSet.  (Uwe Schindler)
 
+ * LUCENE-3508: Changed some method signatures in decompounding TokenFilters
+   to make them no longer use the Token class.  (Uwe Schindler)
+
 New Features
 
  * LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
@@ -79,6 +82,10 @@ Bug Fixes
    Replaced with RandomSampler.  For previous behavior use RepeatableSampler.
    (Gilad Barkai, Shai Erera, Doron Cohen)
 
+ * LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be
+   used with custom attributes. All those attributes are preserved and set on all
+   added decompounded tokens.
+   (Spyros Kapnissis, Uwe Schindler)
+
 API Changes
 
  * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
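For context, a minimal sketch of what the fix enables (not part of the commit; the Version constant, dictionary contents, and sample text are illustrative assumptions):

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class DecompoundDemo {
  public static void main(String[] args) throws Exception {
    // An arbitrary Set: per this commit it is copied into a case-insensitive CharArraySet.
    Set<String> dict = new HashSet<String>(Arrays.asList("Rind", "Fleisch"));
    TokenStream ts = new DictionaryCompoundWordTokenFilter(
        Version.LUCENE_34, // assumed; any 3.1+ constant available on branch_3x works
        new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("Rindfleisch")),
        dict);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints the original compound first, then each matched subword,
      // stacked at the same position (position increment 0)
      System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}

Before this change, any attribute the base class did not know about (for example an application-specific Attribute added by an earlier filter) was cleared on the emitted subwords; after it, the subwords inherit all attributes of the compound token via captureState()/restoreState().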
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Tue Oct 25 11:12:14 2011
@@ -25,19 +25,16 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 /**
- * Base class for decomposition token filters.
+ * Base class for decomposition token filters.
 * <p>
 * You must specify the required {@link Version} compatibility when creating
 * CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@ import org.apache.lucene.util.Version;
 *   supplementary characters in strings and char arrays provided as compound word
 *   dictionaries.
 * </ul>
+ * <p>
+ * If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive, unless it contains only lowercased entries and you
+ * have a {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in
+ * your analysis chain. For optimal performance (as this filter does lots of lookups
+ * against the dictionary), you should use the latter analysis chain/CharArraySet.
+ * Be aware: if you supply arbitrary {@link Set Sets} to the ctors or
+ * {@code String[]} dictionaries, they will automatically be made case-insensitive!
 */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   /**
@@ -64,26 +68,24 @@ public abstract class CompoundWordTokenF
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
 
   protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
   protected final boolean onlyLongestMatch;
 
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
-  private final Token wrapper = new Token();
+  private AttributeSource.State current;
+
   /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
  }
 
  /**
@@ -91,7 +93,7 @@ public abstract class CompoundWordTokenF
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }
 
  /**
@@ -107,7 +109,7 @@ public abstract class CompoundWordTokenF
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
  }
 
  /**
@@ -127,11 +129,11 @@ public abstract class CompoundWordTokenF
  }
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
  }
 
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }
 
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, boolean onlyLongestMatch) {
@@ -139,7 +141,7 @@ public abstract class CompoundWordTokenF
  }
 
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
  }
 
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary) {
@@ -149,7 +151,7 @@ public abstract class CompoundWordTokenF
 
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);
-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
    this.minWordSize=minWordSize;
    this.minSubwordSize=minSubwordSize;
    this.maxSubwordSize=maxSubwordSize;
@@ -158,113 +160,77 @@ public abstract class CompoundWordTokenF
    if (dictionary==null || dictionary instanceof CharArraySet) {
      this.dictionary = (CharArraySet) dictionary;
    } else {
-      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
-      addAllLowerCase(this.dictionary, dictionary);
+      this.dictionary = new CharArraySet(matchVersion, dictionary, true);
    }
  }
-
-  /**
-   * Create a set of words from an array
-   * The resulting Set does case insensitive matching
-   * TODO We should look for a faster dictionary lookup approach.
-   * @param dictionary
-   * @return {@link Set} of lowercased terms
-   */
-  public static final Set makeDictionary(final String[] dictionary) {
-    return makeDictionary(Version.LUCENE_30, dictionary);
-  }
-
-  public static final Set makeDictionary(final Version matchVersion, final String[] dictionary) {
+  /** @deprecated Only available for backwards compatibility. */
+  @Deprecated
+  public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
    if (dictionary == null) {
      return null;
    }
-    // is the below really case insensitive?
-    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
-    addAllLowerCase(dict, Arrays.asList(dictionary));
-    return dict;
-  }
-
-  private final void setToken(final Token token) throws IOException {
-    clearAttributes();
-    termAtt.copyBuffer(token.buffer(), 0, token.length());
-    flagsAtt.setFlags(token.getFlags());
-    typeAtt.setType(token.type());
-    offsetAtt.setOffset(token.startOffset(), token.endOffset());
-    posIncAtt.setPositionIncrement(token.getPositionIncrement());
-    payloadAtt.setPayload(token.getPayload());
+    return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
  }
 
  @Override
  public final boolean incrementToken() throws IOException {
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    if (!tokens.isEmpty()) {
+      assert current != null;
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current); // keep all other attributes untouched
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
      return true;
    }
 
-    if (!input.incrementToken())
-      return false;
-
-    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-    wrapper.setStartOffset(offsetAtt.startOffset());
-    wrapper.setEndOffset(offsetAtt.endOffset());
-    wrapper.setFlags(flagsAtt.getFlags());
-    wrapper.setType(typeAtt.type());
-    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
-    wrapper.setPayload(payloadAtt.getPayload());
-
-    decompose(wrapper);
-
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words longer than minWordSize get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+      // return original token:
      return true;
    } else {
      return false;
    }
  }
-
-  protected static final void addAllLowerCase(CharArraySet target, Collection col) {
-    for (Object obj : col) {
-      String string = (String) obj;
-      target.add(string.toLowerCase(Locale.ENGLISH));
-    }
-  }
-
-  protected static char[] makeLowerCaseCopy(final char[] buffer) {
-    char[] result=new char[buffer.length];
-    System.arraycopy(buffer, 0, result, 0, buffer.length);
-
-    for (int i=0;i<buffer.length;) {
-      i+=Character.toChars(Character.toLowerCase(Character.codePointAt(buffer, i)), result, i);
-    }
-
-    return result;
-  }
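The new incrementToken() above is an instance of the generic captureState()/restoreState() idiom for emitting stacked tokens without touching attributes the filter does not know about. A self-contained sketch of the same idiom (hypothetical filter, not part of this commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/** Hypothetical example: emits an extra lowercased copy of each token,
 *  preserving all other attributes via captureState()/restoreState(). */
public final class DuplicateLowercaseFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private AttributeSource.State pending; // state captured after the original token

  public DuplicateLowercaseFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);  // brings back ALL attributes, including custom ones
      pending = null;
      final char[] buf = termAtt.buffer();
      for (int i = 0; i < termAtt.length(); i++) {
        buf[i] = Character.toLowerCase(buf[i]); // naive, not supplementary-aware
      }
      posIncAtt.setPositionIncrement(0); // stack the copy at the same position
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState(); // remember everything for the synthetic copy
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}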
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ *   <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ *     supplementary characters in strings and char arrays provided as compound word
+ *     dictionaries.
+ * </ul>
+ * <p>
+ * If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive, unless it contains only lowercased entries and you
+ * have a {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in
+ * your analysis chain. For optimal performance (as this filter does lots of lookups
+ * against the dictionary), you should use the latter analysis chain/CharArraySet.
+ * Be aware: if you supply arbitrary {@link Set Sets} to the ctors or
+ * {@code String[]} dictionaries, they will automatically be made case-insensitive!
 */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
 
  /**
-   * Creates a new {@link DictionaryCompoundWordTokenFilter}
-   *
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}.
   * @param input the {@link TokenStream} to process
   * @param dictionary the word dictionary to match against
   * @param minWordSize only words longer than this get processed
@@ -115,7 +127,9 @@ public class DictionaryCompoundWordToken
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -134,7 +148,9 @@ public class DictionaryCompoundWordToken
   *          the {@link TokenStream} to process
   * @param dictionary
   *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
    super(matchVersion, input, dictionary);
  }
@@ -150,12 +166,9 @@ public class DictionaryCompoundWordToken
   * @param input
   *          the {@link TokenStream} to process
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
    super(matchVersion, input, dictionary);
  }
 
@@ -170,10 +183,7 @@ public class DictionaryCompoundWordToken
   * @param input
   *          the {@link TokenStream} to process
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   * @param minWordSize
   *          only words longer than this get processed
   * @param minSubwordSize
@@ -183,37 +193,31 @@ public class DictionaryCompoundWordToken
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
   */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
 
  @Override
-  protected void decomposeInternal(final Token token) {
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-
-    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
-      Token longestMatchToken=null;
+  protected void decompose() {
+    final int len = termAtt.length();
+    for (int i=0;i<=len-this.minSubwordSize;++i) {
+      CompoundToken longestMatchToken=null;
      for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
-        if(i+j>token.length()) {
+        if(i+j>len) {
          break;
        }
-        if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+        if(dictionary.contains(termAtt.buffer(), i, j)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken!=null) {
-              if (longestMatchToken.length()<j) {
-                longestMatchToken=createToken(i,j,token);
+              if (longestMatchToken.txt.length()<j) {
+                longestMatchToken=new CompoundToken(i,j);
              }
            } else {
-              longestMatchToken=createToken(i,j,token);
+              longestMatchToken=new CompoundToken(i,j);
            }
          } else {
-            tokens.add(createToken(i,j,token));
+            tokens.add(new CompoundToken(i,j));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken!=null) {
        tokens.add(longestMatchToken);
      }
    }
  }
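The class javadoc above recommends lowercasing in front of the decompounder together with an ignoreCase=false dictionary, because a case-insensitive CharArraySet has to lowercase the probe on every lookup. Wired up, that fast path would look roughly like this (a sketch; the Version constant and dictionary entries are assumptions):

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.util.Version;

public class LowercaseChainDemo {
  public static TokenStream buildChain(String text) {
    final Version v = Version.LUCENE_34; // assumed 3.x constant
    // ignoreCase=false: every lookup skips per-character lowercasing,
    // which is safe only because all entries and all input are lowercase
    CharArraySet dict = new CharArraySet(v, Arrays.asList("rind", "fleisch"), false);
    TokenStream ts = new WhitespaceTokenizer(v, new StringReader(text));
    ts = new LowerCaseFilter(v, ts);
    return new DictionaryCompoundWordTokenFilter(v, ts, dict);
  }
}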
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ *   <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ *     supplementary characters in strings and char arrays provided as compound word
+ *     dictionaries.
+ * </ul>
+ * <p>
+ * If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive, unless it contains only lowercased entries and you
+ * have a {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in
+ * your analysis chain. For optimal performance (as this filter does lots of lookups
+ * against the dictionary), you should use the latter analysis chain/CharArraySet.
+ * Be aware: if you supply arbitrary {@link Set Sets} to the ctors or
+ * {@code String[]} dictionaries, they will automatically be made case-insensitive!
 */
 public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
@@ -63,11 +76,13 @@ public class HyphenationCompoundWordToke
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(input, hyphenator, makeDictionary(dictionary), minWordSize,
+    this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary), minWordSize,
        minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
 
@@ -85,10 +100,12 @@ public class HyphenationCompoundWordToke
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
   */
+  @Deprecated
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, String[] dictionary) {
-    this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30,dictionary), DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }
 
@@ -105,10 +122,7 @@ public class HyphenationCompoundWordToke
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   */
  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
      HyphenationTree hyphenator, Set dictionary) {
@@ -129,10 +143,7 @@ public class HyphenationCompoundWordToke
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
   * @param minWordSize
   *          only words longer than this get processed
   * @param minSubwordSize
@@ -196,7 +207,7 @@ public class HyphenationCompoundWordToke
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), minWordSize,
        minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
 
@@ -211,7 +222,7 @@ public class HyphenationCompoundWordToke
  @Deprecated
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary) {
-    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }
 
@@ -316,22 +327,20 @@ public class HyphenationCompoundWordToke
  }
 
  @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decompose() {
    // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
-        .length(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null) {
      return;
    }
 
    final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
-      Token longestMatchToken = null;
+      CompoundToken longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;
 
@@ -348,34 +357,33 @@ public class HyphenationCompoundWordToke
        }
 
        // check the dictionary
-        if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength) {
-                longestMatchToken = createToken(start, partLength, token);
+              if (longestMatchToken.txt.length() < partLength) {
+                longestMatchToken = new CompoundToken(start, partLength);
              }
            } else {
-              longestMatchToken = createToken(start, partLength, token);
+              longestMatchToken = new CompoundToken(start, partLength);
            }
          } else {
-            tokens.add(createToken(start, partLength, token));
+            tokens.add(new CompoundToken(start, partLength));
          }
-        } else if (dictionary.contains(lowerCaseTermBuffer, start,
-            partLength - 1)) {
+        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
          // check the dictionary again with a word that is one character
          // shorter
          // to avoid problems with genitive 's characters and other binding
          // characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength - 1) {
-                longestMatchToken = createToken(start, partLength - 1, token);
+              if (longestMatchToken.txt.length() < partLength - 1) {
+                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
-              longestMatchToken = createToken(start, partLength - 1, token);
+              longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
-            tokens.add(createToken(start, partLength - 1, token));
+            tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
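For comparison, typical wiring of the hyphenation-based variant (a sketch; "de_DR.xml" stands in for an externally obtained OFFO/FOP hyphenation grammar, which Lucene does not ship, and the Version constant and dictionary entries are assumptions):

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.util.Version;
import org.xml.sax.InputSource;

public class HyphenationDemo {
  public static TokenStream build(String text) throws Exception {
    final Version v = Version.LUCENE_34; // assumed 3.x constant
    // parse the hyphenation grammar once and reuse it across filter instances
    InputSource grammar = new InputSource("de_DR.xml"); // placeholder path
    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);
    Set<String> dict = new HashSet<String>(Arrays.asList("rind", "fleisch"));
    TokenStream ts = new WhitespaceTokenizer(v, new StringReader(text));
    // only hyphenation points that produce dictionary words become subword tokens
    return new HyphenationCompoundWordTokenFilter(v, ts, hyphenator, dict);
  }
}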
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -17,15 +17,20 @@ package org.apache.lucene.analysis.compo
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.StringReader;
 
-import org.xml.sax.InputSource;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@ public class TestCompoundWordTokenFilter
    String[] dict = {"ab", "cd", "ef"};
 
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader(
-          "abcdef")
-        ),
-      dict,
-      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+          new StringReader(
+            "abcdef")
+          ),
+        dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
    assertTokenStreamContents(tf,
-      new String[] { "abcdef", "ab", "cd", "ef" },
-      new int[] { 0, 0, 2, 4},
-      new int[] { 6, 2, 4, 6},
-      new int[] { 1, 0, 0, 0}
-      );
+        new String[] { "abcdef", "ab", "cd", "ef" },
+        new int[] { 0, 0, 2, 4},
+        new int[] { 6, 2, 4, 6},
+        new int[] { 1, 0, 0, 0}
+        );
  }
 
  public void testWordComponentWithLessThanMinimumLength() throws Exception {
    String[] dict = {"abc", "d", "efg"};
 
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-        new StringReader(
-          "abcdefg")
-        ),
-      dict,
-      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+          new StringReader(
+            "abcdefg")
+          ),
+        dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
    assertTokenStreamContents(tf,
-      new String[] { "abcdefg", "abc", "efg" },
-      new int[] { 0, 0, 4},
-      new int[] { 7, 3, 7},
-      new int[] { 1, 0, 0}
-      );
+        new String[] { "abcdefg", "abc", "efg" },
+        new int[] { 0, 0, 4},
+        new int[] { 7, 3, 7},
+        new int[] { 1, 0, 0}
+        );
  }
-  
+
  public void testReset() throws Exception {
    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
        "Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@ public class TestCompoundWordTokenFilter
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  }
 
+  public void testRetainMockAttribute() throws Exception {
+    String[] dict = { "abc", "d", "efg" };
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("abcdefg"));
+    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+    stream = new DictionaryCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()){
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }