Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 5801 invoked from network); 30 Nov 2009 21:49:59 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 30 Nov 2009 21:49:59 -0000 Received: (qmail 49809 invoked by uid 500); 30 Nov 2009 21:49:59 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 49753 invoked by uid 500); 30 Nov 2009 21:49:59 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 49744 invoked by uid 99); 30 Nov 2009 21:49:59 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 30 Nov 2009 21:49:59 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 30 Nov 2009 21:49:46 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id BC2E023889D1; Mon, 30 Nov 2009 21:49:23 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r885592 [2/2] - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/ co... 
Date: Mon, 30 Nov 2009 21:49:22 -0000 To: java-commits@lucene.apache.org From: uschindler@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091130214923.BC2E023889D1@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java (original) +++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java Mon Nov 30 21:49:21 2009 @@ -6,6 +6,9 @@ import java.util.Iterator; import java.util.Set; +import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.util.Version; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -32,45 +35,113 @@ * etc. It is designed to be quick to test if a char[] * is in the set without the necessity of converting it * to a String first. + *

You must specify the required {@link Version} + * compatibility when creating {@link CharArraySet}: + *

    + *
  • As of 3.1, supplementary characters are + * properly lowercased.
  • + *
+ * Before 3.1, supplementary characters could not be + * lowercased correctly due to the lack of Unicode 4 + * support in JDK 1.4. To use instances of + * {@link CharArraySet} with the behavior before Lucene + * 3.1, pass a {@link Version} < 3.1 to the constructors. *

* Please note: This class implements {@link java.util.Set Set} but * does not behave like it should in all cases. The generic type is * {@code Set}, because you can add any object to it, * that has a string representation. The add methods will use * {@link Object#toString} and store the result using a {@code char[]} - * buffer. The same behaviour have the {@code contains()} methods. + * buffer. The same behavior have the {@code contains()} methods. * The {@link #iterator()} returns an {@code Iterator}. * For type safety also {@link #stringIterator()} is provided. */ - public class CharArraySet extends AbstractSet { private final static int INIT_SIZE = 8; private char[][] entries; private int count; private final boolean ignoreCase; - public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false)); + public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, 0, false)); + + private final CharacterUtils charUtils; + private final Version matchVersion; - /** Create set with enough capacity to hold startSize - * terms */ - public CharArraySet(int startSize, boolean ignoreCase) { + /** + * Create set with enough capacity to hold startSize terms + * + * @param matchVersion + * compatibility match version see Version + * note above for details. + * @param startSize + * the initial capacity + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) { this.ignoreCase = ignoreCase; int size = INIT_SIZE; while(startSize + (startSize>>2) > size) size <<= 1; entries = new char[size][]; + this.charUtils = CharacterUtils.getInstance(matchVersion); + this.matchVersion = matchVersion; + } + + /** + * Creates a set from a Collection of objects. + * + * @param matchVersion + * compatibility match version see Version + * note above for details. 
+ * @param c + * a collection whose elements to be placed into the set + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + public CharArraySet(Version matchVersion, Collection c, boolean ignoreCase) { + this(matchVersion, c.size(), ignoreCase); + addAll(c); } - /** Create set from a Collection of char[] or String */ + /** + * Creates a set with enough capacity to hold startSize terms + * + * @param startSize + * the initial capacity + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + * @deprecated use {@link #CharArraySet(Version, int, boolean)} instead + */ + public CharArraySet(int startSize, boolean ignoreCase) { + this(Version.LUCENE_30, startSize, ignoreCase); + } + + /** + * Creates a set from a Collection of objects. + * + * @param c + * a collection whose elements to be placed into the set + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + * @deprecated use {@link #CharArraySet(Version, Collection, boolean)} instead + */ public CharArraySet(Collection c, boolean ignoreCase) { - this(c.size(), ignoreCase); + this(Version.LUCENE_30, c.size(), ignoreCase); addAll(c); } /** Create set from entries */ - private CharArraySet(char[][] entries, boolean ignoreCase, int count){ + private CharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count){ this.entries = entries; this.ignoreCase = ignoreCase; this.count = count; + this.charUtils = CharacterUtils.getInstance(matchVersion); + this.matchVersion = matchVersion; } /** true if the len chars of text starting at off @@ -131,8 +202,11 @@ */ public boolean add(char[] text) { if (ignoreCase) - for(int i=0;i set) { + return copy(Version.LUCENE_30, set); + } + + /** + * Returns a copy of the given set as a {@link CharArraySet}. If the given set + * is a {@link CharArraySet} the ignoreCase property will be preserved. 
+ * + * @param matchVersion + * compatibility match version see Version + * note above for details. + * @param set + * a set to copy + * @return a copy of the given set as a {@link CharArraySet}. If the given set + * is a {@link CharArraySet} the ignoreCase property will be + * preserved. + */ + public static CharArraySet copy(Version matchVersion, Set set) { if (set == null) throw new NullPointerException("Given set is null"); if(set == EMPTY_SET) return EMPTY_SET; final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase : false; - return new CharArraySet(set, ignoreCase); + return new CharArraySet(matchVersion, set, ignoreCase); } @@ -356,9 +459,9 @@ */ private static final class UnmodifiableCharArraySet extends CharArraySet { - private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase, + private UnmodifiableCharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count) { - super(entries, ignoreCase, count); + super(matchVersion, entries, ignoreCase, count); } @Override Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original) +++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java Mon Nov 30 21:49:21 2009 @@ -32,13 +32,15 @@ *

You must specify the required {@link Version} * compatibility when creating StopAnalyzer: *

    + *
  • As of 3.1, StopFilter correctly handles Unicode 4.0 + * supplementary characters in stopwords *
  • As of 2.9, position increments are preserved *
*/ public final class StopAnalyzer extends Analyzer { private final Set stopWords; - private final boolean enablePositionIncrements; + private final Version matchVersion; /** An unmodifiable set containing some common English words that are not usually useful for searching.*/ @@ -52,7 +54,8 @@ "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" ); - final CharArraySet stopSet = new CharArraySet(stopWords.size(), false); + final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, + stopWords.size(), false); stopSet.addAll(stopWords); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } @@ -63,7 +66,7 @@ */ public StopAnalyzer(Version matchVersion) { stopWords = ENGLISH_STOP_WORDS_SET; - enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); + this.matchVersion = matchVersion; } /** Builds an analyzer with the stop words from the given set. @@ -71,7 +74,7 @@ * @param stopWords Set of stop words */ public StopAnalyzer(Version matchVersion, Set stopWords) { this.stopWords = stopWords; - enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); + this.matchVersion = matchVersion; } /** Builds an analyzer with the stop words from the given file. @@ -80,7 +83,7 @@ * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { stopWords = WordlistLoader.getWordSet(stopwordsFile); - this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); + this.matchVersion = matchVersion; } /** Builds an analyzer with the stop words from the given reader. 
@@ -89,13 +92,14 @@ * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { stopWords = WordlistLoader.getWordSet(stopwords); - this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); + this.matchVersion = matchVersion; } /** Filters LowerCaseTokenizer with StopFilter. */ @Override public TokenStream tokenStream(String fieldName, Reader reader) { - return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords); + return new StopFilter(matchVersion, + new LowerCaseTokenizer(reader), stopWords); } /** Filters LowerCaseTokenizer with StopFilter. */ @@ -109,7 +113,8 @@ if (streams == null) { streams = new SavedStreams(); streams.source = new LowerCaseTokenizer(reader); - streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords); + streams.result = new StopFilter(matchVersion, + streams.source, stopWords); setPreviousTokenStream(streams); } else streams.source.reset(reader); Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original) +++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Mon Nov 30 21:49:21 2009 @@ -29,8 +29,16 @@ /** * Removes stop words from a token stream. + * + * + *

You must specify the required {@link Version} + * compatibility when creating StopFilter: + *

    + *
  • As of 3.1, StopFilter correctly handles Unicode 4.0 + * supplementary characters in stopwords; position + * increments are preserved as of 2.9 + *
*/ - public final class StopFilter extends TokenFilter { private final CharArraySet stopWords; @@ -54,16 +62,46 @@ * @param input Input TokenStream * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords * @param ignoreCase if true, all words are lower cased first + * @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead */ public StopFilter(boolean enablePositionIncrements, TokenStream input, Set stopWords, boolean ignoreCase) { + this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase); + } + + /** + * Construct a token stream filtering the given input. If + * stopWords is an instance of {@link CharArraySet} (true if + * makeStopSet() was used to construct the set) it will be + * directly used and ignoreCase will be ignored since + * CharArraySet directly controls case sensitivity. + *

+ * If stopWords is not an instance of {@link CharArraySet}, a new + * CharArraySet will be constructed and ignoreCase will be used + * to specify the case sensitivity of that set. + * + * @param matchVersion + * Lucene version to enable correct Unicode 4.0 behavior in the stop + * set if Version > 3.0. See above for details. + * @param input + * Input TokenStream + * @param stopWords + * A Set of Strings or char[] or any other toString()-able set + * representing the stopwords + * @param ignoreCase + * if true, all words are lower cased first + */ + public StopFilter(Version matchVersion, TokenStream input, Set stopWords, boolean ignoreCase) + { + this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase); + } + + /* + * convenience ctor to enable deprecated ctors to set posInc explicitly + */ + private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set stopWords, boolean ignoreCase){ super(input); - if (stopWords instanceof CharArraySet) { - this.stopWords = (CharArraySet)stopWords; - } else { - this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); - this.stopWords.addAll(stopWords); - } + this.stopWords = CharArraySet.unmodifiableSet(new CharArraySet(matchVersion, stopWords, ignoreCase)); this.enablePositionIncrements = enablePositionIncrements; termAtt = addAttribute(TermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); @@ -76,10 +114,29 @@ * @param enablePositionIncrements true if token positions should record the removed stop words * @param in Input stream * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords - * @see #makeStopSet(java.lang.String[]) + * @see #makeStopSet(Version, java.lang.String[]) + * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead */ public StopFilter(boolean enablePositionIncrements, TokenStream in, Set stopWords) { - this(enablePositionIncrements, in, 
stopWords, false); + this(Version.LUCENE_CURRENT, enablePositionIncrements, in, stopWords, false); + } + + /** + * Constructs a filter which removes words from the input TokenStream that are + * named in the Set. + * + * @param matchVersion + * Lucene version to enable correct Unicode 4.0 behavior in the stop + * set if Version > 3.0. See above for details. + * @param in + * Input stream + * @param stopWords + * A Set of Strings or char[] or any other toString()-able set + * representing the stopwords + * @see #makeStopSet(Version, java.lang.String[]) + */ + public StopFilter(Version matchVersion, TokenStream in, Set stopWords) { + this(matchVersion, in, stopWords, false); } /** @@ -88,10 +145,11 @@ * This permits this stopWords construction to be cached once when * an Analyzer is constructed. * - * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase + * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase + * @deprecated use {@link #makeStopSet(Version, String...)} instead */ public static final Set makeStopSet(String... stopWords) { - return makeStopSet(stopWords, false); + return makeStopSet(Version.LUCENE_30, stopWords, false); } /** @@ -99,34 +157,88 @@ * appropriate for passing into the StopFilter constructor. * This permits this stopWords construction to be cached once when * an Analyzer is constructed. + * + * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 + * @param stopWords An array of stopwords + * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase + */ + public static final Set makeStopSet(Version matchVersion, String... stopWords) { + return makeStopSet(matchVersion, stopWords, false); + } + + /** + * Builds a Set from an array of stop words, + * appropriate for passing into the StopFilter constructor. + * This permits this stopWords construction to be cached once when + * an Analyzer is constructed. 
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords * @return A Set ({@link CharArraySet}) containing the words - * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase + * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase + * @deprecated use {@link #makeStopSet(Version, List)} instead */ public static final Set makeStopSet(List stopWords) { - return makeStopSet(stopWords, false); + return makeStopSet(Version.LUCENE_30, stopWords, false); + } + + /** + * Builds a Set from an array of stop words, + * appropriate for passing into the StopFilter constructor. + * This permits this stopWords construction to be cached once when + * an Analyzer is constructed. + * + * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 + * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords + * @return A Set ({@link CharArraySet}) containing the words + * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase + */ + public static final Set makeStopSet(Version matchVersion, List stopWords) { + return makeStopSet(matchVersion, stopWords, false); } /** + * Creates a stopword set from the given stopword array. + * @param stopWords An array of stopwords + * @param ignoreCase If true, all words are lower cased first. + * @return a Set containing the words + * @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead; + */ + public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { + return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase); + } + /** + * Creates a stopword set from the given stopword array. 
* + * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 * @param stopWords An array of stopwords * @param ignoreCase If true, all words are lower cased first. * @return a Set containing the words */ - public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { - CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase); + public static final Set makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) { + CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase); stopSet.addAll(Arrays.asList(stopWords)); return stopSet; } - + /** - * + * Creates a stopword set from the given stopword list. * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords * @param ignoreCase if true, all words are lower cased first * @return A Set ({@link CharArraySet}) containing the words + * @deprecated use {@link #makeStopSet(Version, List, boolean)} instead */ public static final Set makeStopSet(List stopWords, boolean ignoreCase){ - CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase); + return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase); + } + + /** + * Creates a stopword set from the given stopword list. + * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 + * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords + * @param ignoreCase if true, all words are lower cased first + * @return A Set ({@link CharArraySet}) containing the words + */ + public static final Set makeStopSet(Version matchVersion, List stopWords, boolean ignoreCase){ + CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase); stopSet.addAll(stopWords); return stopSet; } @@ -157,13 +269,14 @@ * StopFilter use this method when creating the * StopFilter. 
Prior to 2.9, this returns false. On 2.9 * or later, it returns true. + * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead */ public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) { return matchVersion.onOrAfter(Version.LUCENE_29); } /** - * @see #setEnablePositionIncrements(boolean). + * @see #setEnablePositionIncrements(boolean) */ public boolean getEnablePositionIncrements() { return enablePositionIncrements; Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original) +++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Mon Nov 30 21:49:21 2009 @@ -34,6 +34,8 @@ *

You must specify the required {@link Version} * compatibility when creating StandardAnalyzer: *

    + *
  • As of 3.1, StopFilter correctly handles Unicode 4.0 + * supplementary characters in stopwords *
  • As of 2.9, StopFilter preserves position * increments *
  • As of 2.4, Tokens incorrectly identified as acronyms @@ -47,7 +49,7 @@ * Specifies whether deprecated acronyms should be replaced with HOST type. * See {@linkplain https://issues.apache.org/jira/browse/LUCENE-1068} */ - private final boolean replaceInvalidAcronym,enableStopPositionIncrements; + private final boolean replaceInvalidAcronym; /** An unmodifiable set containing some common English words that are usually not useful for searching. */ @@ -70,7 +72,6 @@ public StandardAnalyzer(Version matchVersion, Set stopWords) { stopSet = stopWords; setOverridesTokenStreamMethod(StandardAnalyzer.class); - enableStopPositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24); this.matchVersion = matchVersion; } @@ -101,7 +102,7 @@ tokenStream.setMaxTokenLength(maxTokenLength); TokenStream result = new StandardFilter(tokenStream); result = new LowerCaseFilter(matchVersion, result); - result = new StopFilter(enableStopPositionIncrements, result, stopSet); + result = new StopFilter(matchVersion, result, stopSet); return result; } @@ -148,8 +149,7 @@ streams.filteredTokenStream = new StandardFilter(streams.tokenStream); streams.filteredTokenStream = new LowerCaseFilter(matchVersion, streams.filteredTokenStream); - streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements, - streams.filteredTokenStream, stopSet); + streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopSet); } else { streams.tokenStream.reset(reader); } Modified: lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java (original) +++ 
lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java Mon Nov 30 21:49:21 2009 @@ -35,7 +35,7 @@ * @return a {@link CharacterUtils} implementation according to the given * {@link Version} instance. */ - public static CharacterUtils getInstance(Version matchVersion) { + public static CharacterUtils getInstance(final Version matchVersion) { return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4; } @@ -58,7 +58,7 @@ * - if the value offset is negative or not less than the length of * the char array. */ - public abstract int codePointAt(char[] chars, int offset); + public abstract int codePointAt(final char[] chars, final int offset); /** * Returns the code point at the given index of the {@link CharSequence}. @@ -79,21 +79,52 @@ * - if the value offset is negative or not less than the length of * the character sequence. */ - public abstract int codePointAt(CharSequence seq, int offset); + public abstract int codePointAt(final CharSequence seq, final int offset); + + /** + * Returns the code point at the given index of the char array where only elements + * with index less than the limit are used. + * Depending on the {@link Version} passed to + * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior + * of {@link Character#codePointAt(char[], int)} as it would have been + * available on a Java 1.4 JVM or on a later virtual machine version. + * + * @param chars + * a character array + * @param offset + * the offset to the char values in the chars array to be converted + * @param limit the index after the last element that should be used to calculate + * codepoint. + * + * @return the Unicode code point at the given index + * @throws NullPointerException + * - if the array is null. + * @throws IndexOutOfBoundsException + * - if the value offset is negative or not less than the length of + * the char array. 
+ */ + public abstract int codePointAt(final char[] chars, final int offset, final int limit); private static final class Java5CharacterUtils extends CharacterUtils { Java5CharacterUtils() { }; @Override - public final int codePointAt(char[] chars, int offset) { + public final int codePointAt(final char[] chars, final int offset) { return Character.codePointAt(chars, offset); } @Override - public int codePointAt(CharSequence seq, int offset) { + public int codePointAt(final CharSequence seq, final int offset) { return Character.codePointAt(seq, offset); } + + @Override + public int codePointAt(final char[] chars, final int offset, final int limit) { + return Character.codePointAt(chars, offset, limit); + } + + } private static final class Java4CharacterUtils extends CharacterUtils { @@ -101,14 +132,22 @@ }; @Override - public final int codePointAt(char[] chars, int offset) { + public final int codePointAt(final char[] chars, final int offset) { return chars[offset]; } @Override - public int codePointAt(CharSequence seq, int offset) { + public int codePointAt(final CharSequence seq, final int offset) { return seq.charAt(offset); } + + @Override + public int codePointAt(final char[] chars, final int offset, final int limit) { + if(offset >= limit) + throw new IndexOutOfBoundsException("offset must be less than limit"); + return chars[offset]; + } + } } Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java?rev=885592&r1=885591&r2=885592&view=diff ============================================================================== --- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java (original) +++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCharArraySet.java Mon Nov 30 21:49:21 2009 @@ -20,6 +20,7 @@ import java.util.Arrays; import org.apache.lucene.util.LuceneTestCase; +import 
org.apache.lucene.util.Version; public class TestCharArraySet extends LuceneTestCase { @@ -33,7 +34,7 @@ public void testRehash() throws Exception { - CharArraySet cas = new CharArraySet(0, true); + CharArraySet cas = new CharArraySet(Version.LUCENE_CURRENT, 0, true); for(int i=0;i stopWords = new HashSet(Arrays.asList("is", "the", "Time")); - TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, false); + TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, false); final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); assertTrue(stream.incrementToken()); assertEquals("Now", termAtt.term()); @@ -49,7 +50,7 @@ public void testIgnoreCase() throws IOException { StringReader reader = new StringReader("Now is The Time"); Set stopWords = new HashSet(Arrays.asList( "is", "the", "Time" )); - TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true); + TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, true); final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); assertTrue(stream.incrementToken()); assertEquals("Now", termAtt.term()); @@ -59,8 +60,8 @@ public void testStopFilt() throws IOException { StringReader reader = new StringReader("Now is The Time"); String[] stopWords = new String[] { "is", "the", "Time" }; - Set stopSet = StopFilter.makeStopSet(stopWords); - TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet); + Set stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords); + TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet); final TermAttribute termAtt = stream.getAttribute(TermAttribute.class); assertTrue(stream.incrementToken()); assertEquals("Now", termAtt.term()); @@ -83,14 +84,14 @@ log(sb.toString()); String stopWords[] = (String[]) a.toArray(new String[0]); 
for (int i=0; i payloads = pspans.getPayload(); sawZero |= pspans.start() == 0; - for (Iterator it = payloads.iterator(); it.hasNext();) { + for (@SuppressWarnings("unused") byte[] bytes : payloads) { count++; - it.next(); - //System.out.println(new String((byte[]) it.next())); + //System.out.println(new String(bytes)); + } } assertEquals(5, count); @@ -302,10 +302,10 @@ sawZero = false; PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader()); - Collection pls = psu.getPayloadsForQuery(snq); + Collection pls = psu.getPayloadsForQuery(snq); count = pls.size(); - for (Iterator it = pls.iterator(); it.hasNext();) { - String s = new String((byte[]) it.next()); + for (byte[] bytes : pls) { + String s = new String(bytes); //System.out.println(s); sawZero |= s.equals("pos: 0"); }