From: rmuir@apache.org
To: commits@lucene.apache.org
Subject: lucene-solr:master: LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters
Date: Tue, 5 Jun 2018 01:24:33 +0000 (UTC)

Repository: lucene-solr
Updated Branches:
  refs/heads/master 59087d148 -> 2c1ab31b4

LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters

Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2c1ab31b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2c1ab31b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2c1ab31b

Branch: refs/heads/master
Commit: 2c1ab31b4e5595595cf0f1549eb61b33c8555000
Parents: 59087d1
Author: Robert Muir
Authored: Mon Jun 4 21:24:20 2018 -0400
Committer: Robert Muir
Committed: Mon Jun 4 21:24:20 2018 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 +
 .../analysis/ngram/EdgeNGramFilterFactory.java  |   6 +-
 .../analysis/ngram/EdgeNGramTokenFilter.java    | 112 ++++++++++++----
 .../analysis/ngram/NGramFilterFactory.java      |   6 +-
 .../lucene/analysis/ngram/NGramTokenFilter.java | 130 +++++++++++++++----
 .../analysis/core/TestBugInSomething.java       |   2 +-
 .../ngram/EdgeNGramTokenFilterTest.java         | 122 +++++++++++------
 .../analysis/ngram/NGramTokenFilterTest.java    | 118 ++++++++++++-----
 .../lucene/analysis/ngram/TestNGramFilters.java |  18 ++-
 .../classification/BM25NBClassifierTest.java    |   2 +-
 .../CachingNaiveBayesClassifierTest.java        |   2 +-
 .../SimpleNaiveBayesClassifierTest.java         |   2 +-
 .../analyzing/AnalyzingInfixSuggester.java      |   2 +-
 13 files changed, 391 insertions(+), 134 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6644453..3466d77 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -202,6 +202,9 @@ New Features
  IndexFileDeleter already accounts for that for existing files which we can
  now use to also take pending deletes into account which ensures that all
  file generations per segment always go forward.
(Simon Willnauer) + +* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters. + (Ingomar Wesp, Shawn Heisey via Robert Muir) * LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked as such once it's introduced and can't be changed after the fact. http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java index 020b85b..bd7ca1f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/> + * <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class EdgeNGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new EdgeNGramFilterFactory */ public EdgeNGramFilterFactory(Map<String, String> args) { super(args); minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize); + return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal); } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 56efd89..154f075 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * supplementary characters. */ public final class EdgeNGramTokenFilter extends TokenFilter { + /** + * @deprecated since 7.4 - this value will be required. + */ + @Deprecated public static final int DEFAULT_MAX_GRAM_SIZE = 1; + /** + * @deprecated since 7.4 - this value will be required.
+ */ + @Deprecated public static final int DEFAULT_MIN_GRAM_SIZE = 1; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; private final int minGram; private final int maxGram; + private final boolean preserveOriginal; + private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; - private int savePosIncr; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * + * Creates an EdgeNGramTokenFilter that, for a given input term, produces all + * edge n-grams with lengths >= minGram and <= maxGram. Will + * optionally preserve the original term when its length is outside of the + * defined range. + * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is outside the min/max size range. */ - public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + public EdgeNGramTokenFilter( + TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { super(input); if (minGram < 1) { @@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter { this.minGram = minGram; this.maxGram = maxGram; + this.preserveOriginal = preserveOriginal; + } + + /** + * Creates an EdgeNGramTokenFilter that produces edge n-grams of the given + * size. + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the n-gram size to generate. + */ + public EdgeNGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); + } + + /** + * Creates an EdgeNGramTokenFilter that, for a given input term, produces all + * edge n-grams with lengths >= minGram and <= maxGram. + * + *
+ * <p>
+ * Behaves the same as + * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean) + * EdgeNGramTokenFilter(input, minGram, maxGram, false)} + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * + * @deprecated since 7.4. Use + * {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead. + */ + @Deprecated + public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); } @Override @@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - state = captureState(); - savePosIncr += posIncrAtt.getPositionIncrement(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength); + curPosIncr += posIncrAtt.getPositionIncrement(); + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit - if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams - // grab gramSize chars from front or back + + if (curGramSize <= curTermCodePointCount) { + if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram restoreState(state); // first ngram gets increment, others don't - if (curGramSize == minGram) { - posIncrAtt.setPositionIncrement(savePosIncr); - savePosIncr = 0; - } else { - posIncrAtt.setPositionIncrement(0); - } + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } + else if (preserveOriginal) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } } + // Done with this input token, get next token on the next iteration.
curTermBuffer = null; } } @@ -109,6 +173,6 @@ public void reset() throws IOException { super.reset(); curTermBuffer = null; - savePosIncr = 0; + curPosIncr = 0; } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 2064716..0a7e77d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> - * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/> + * <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/> * </analyzer> * </fieldType> */ public class NGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean preserveOriginal; /** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map<String, String> args) { super(args); minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory { @Override public TokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal); } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index a2e0aa7..8e1a7e4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization. */ public final class NGramTokenFilter extends TokenFilter { + /** + * @deprecated since 7.4 - this value will be required.
+ */ + @Deprecated public static final int DEFAULT_MIN_NGRAM_SIZE = 1; + + /** + * @deprecated since 7.4 - this value will be required. + */ + @Deprecated public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; - private final int minGram, maxGram; + private final int minGram; + private final int maxGram; + private final boolean preserveOriginal; private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; private int curPos; - private int curPosInc; + private int curPosIncr; private State state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt; + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); /** - * Creates NGramTokenFilter with given min and max n-grams. + * Creates an NGramTokenFilter that, for a given input term, produces all + * contained n-grams with lengths >= minGram and <= maxGram. Will + * optionally preserve the original term when its length is outside of the + * defined range. + * + * Note: Care must be taken when choosing minGram and maxGram; depending + * on the input token size, this filter potentially produces a huge number + * of terms. + * * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * @param preserveOriginal Whether or not to keep the original term when it + * is shorter than minGram or longer than maxGram */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE)); + public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) { + super(input); if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -72,51 +93,107 @@ public final class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; + this.preserveOriginal = preserveOriginal; + } + + /** + * Creates an NGramTokenFilter that produces n-grams of the indicated size. + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param gramSize the size of n-grams to generate. + */ + public NGramTokenFilter(TokenStream input, int gramSize) { + this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL); + } - posIncAtt = addAttribute(PositionIncrementAttribute.class); + /** + * Creates an NGramTokenFilter that, for a given input term, produces all + * contained n-grams with lengths >= minGram and <= maxGram. + * + *
+ * <p>
+ * Behaves the same as + * {@link #NGramTokenFilter(TokenStream, int, int, boolean) + * NGramTokenFilter(input, minGram, maxGram, false)} + * + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the minimum length of the generated n-grams + * @param maxGram the maximum length of the generated n-grams + * + * @deprecated since 7.4. Use + * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead. + */ + @Deprecated + public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL); } /** * Creates NGramTokenFilter with default min and max n-grams. + * + *
+ * <p>
+ * Behaves the same as + * {@link #NGramTokenFilter(TokenStream, int, int, boolean) + * NGramTokenFilter(input, 1, 2, false)} + * * @param input {@link TokenStream} holding the input to be tokenized + * @deprecated since 7.4. Use + * {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead. */ + @Deprecated public NGramTokenFilter(TokenStream input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL); } - /** Returns the next token in the stream, or null at EOS. */ @Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - curPos = 0; - curPosInc = posIncAtt.getPositionIncrement(); - state = captureState(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); + curPosIncr += posIncrAtt.getPositionIncrement(); + curPos = 0; + + if (preserveOriginal && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) { + if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) { ++curPos; curGramSize = minGram; } - if ((curPos + curGramSize) <= curCodePointCount) { + if ((curPos + curGramSize) <= curTermCodePointCount) { restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); termAtt.copyBuffer(curTermBuffer, start, end - start); - posIncAtt.setPositionIncrement(curPosInc); - curPosInc = 0; + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; curGramSize++; return true; } - curTermBuffer = null; + else if (preserveOriginal && curTermCodePointCount > maxGram) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } + + // Done with this input token, get next token on next iteration. 
+ curTermBuffer = null; } } @@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter { public void reset() throws IOException { super.reset(); curTermBuffer = null; + curPosIncr = 0; } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java index 1d17237..6cdff4b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java @@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase { //TokenStream stream = new SopTokenFilter(tokenizer); TokenStream stream = new ShingleFilter(tokenizer, 5); //stream = new SopTokenFilter(stream); - stream = new NGramTokenFilter(stream, 55, 83); + stream = new NGramTokenFilter(stream, 55, 83, false); //stream = new SopTokenFilter(stream); return new TokenStreamComponents(tokenizer, stream); } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index d7536e7..fd1949a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 0, 0); + new EdgeNGramTokenFilter(input, 0, 0, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, 2, 1); + new EdgeNGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput3() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new EdgeNGramTokenFilter(input, -1, 2); + new EdgeNGramTokenFilter(input, -1, 2, false); }); } public void testFrontUnigram() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5}); } public void testOversizedNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsPreserveOriginal() throws Exception { + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + + public void testPreserveOriginal() throws Exception { + final String inputString = "a bcd 
efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "ef", "efg", "jk" }, + new int[] { 2, 2, 6, 6, 12 }, + new int[] { 5, 5, 11, 11, 14 }, + new int[] { 2, 0, 1, 0, 1 }); + } + + { // preserveOriginal = true + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" }, + new int[] { 0, 2, 2, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 1, 0, 0, 1 }); + } + } + public void testFrontRangeOfNgrams() throws Exception { - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); } public void testFilterPositions() throws Exception { TokenStream ts = whitespaceMockTokenizer("abcde vwxyz"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false); assertTokenStreamContents(tokenizer, - new String[]{"a","ab","abc","v","vw","vwx"}, - new int[]{0,0,0,6,6,6}, - new int[]{5,5,5,11,11,11}, - null, - new int[]{1,0,0,1,0,0}, - null, - null, - false); + new String[] {"a","ab","abc","v","vw","vwx"}, + new int[] {0, 0, 0, 6, 6, 6}, + new int[] {5, 5, 5, 11, 11, 11}); } private static class PositionFilter extends TokenFilter { @@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testFirstTokenPositionIncrement() throws Exception { TokenStream ts = whitespaceMockTokenizer("a abc"); ts = new PositionFilter(ts); // All but first token will get 0 position increment - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false); // The first token "a" will not be output, since it's smaller than the mingram size of 2. // The second token on input to EdgeNGramTokenFilter will have position increment of 0, // which should be increased to 1, since this is the first output token in the stream. 
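// Editorial sketch, not part of the patch: under the assumptions above (whitespace
// tokenization, PositionFilter zeroing every increment after the first), the expected
// stream for "a abc" with minGram=2, maxGram=3 would be:
//   "ab"  startOffset=2, endOffset=5, posInc=1  (inherits the increment of the skipped "a")
//   "abc" startOffset=2, endOffset=5, posInc=0
// i.e. the increment accumulated in curPosIncr is spent on the first gram actually emitted.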
@@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3); + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false); assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); @@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, min, max)); + new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal)); } }; checkRandomData(random(), a, 100*RANDOM_MULTIPLIER); @@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, 2, 15)); + new EdgeNGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { TokenStream tk = new LetterTokenizer(); ((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q")); tk = new ShingleFilter(tk); - tk = new EdgeNGramTokenFilter(tk, 7, 10); + tk = new EdgeNGramTokenFilter(tk, 7, 10, false); assertTokenStreamContents(tk, new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6,11,11,14 }, @@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new EdgeNGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) { - assertTrue(tk.incrementToken()); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(s.length(), offsetAtt.endOffset()); - final int end = Character.offsetByCodePoints(s, 
0, i); - assertEquals(s.substring(0, end), termAtt.toString()); + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int end = Character.offsetByCodePoints(s, 0, j); + assertEquals(s.substring(0, end), termAtt.toString()); + } + + if (codePointCount > maxGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index d8591a9..2a47396 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new NGramTokenFilter(input, 2, 1); + new NGramTokenFilter(input, 2, 1, false); }); } public void testInvalidInput2() throws Exception { expectThrows(IllegalArgumentException.class, () -> { - new NGramTokenFilter(input, 0, 1); + new NGramTokenFilter(input, 0, 1, false); }); } public void testUnigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } public void testBigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2); + NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false); assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0}); } public void testNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); 
assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testNgramsNoIncrement() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false); assertTokenStreamContents(filter, new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, @@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testOversizedNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7); + NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false); assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsPreserveOriginal() throws Exception { + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + public void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false); + assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + } + + public void testSmallTokenInStreamPreserveOriginal() throws Exception { + input = whitespaceMockTokenizer("abc de fgh"); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true); + assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1}); + } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("abcde")); - NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1); + NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } + public void testKeepShortTermKeepLongTerm() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // preserveOriginal = false + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" }, + new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // preserveOriginal = true + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" }, + new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 }, + new 
int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }); + } + } + // LUCENE-3642 // EdgeNgram blindly adds term length to offset, but this can take things out of bounds // wrt original text if a previous filter increases the length of the word (in this case æ -> ae) @@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new NGramTokenFilter(filters, 2, 2); + filters = new NGramTokenFilter(filters, 2, 2, false); return new TokenStreamComponents(tokenizer, filters); } }; @@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, min, max)); + new NGramTokenFilter(tokenizer, min, max, preserveOriginal)); } }; checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20); @@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 15)); + new NGramTokenFilter(tokenizer, 2, 15, false)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); @@ -167,27 +205,47 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new NGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int start = 0; start < codePointCount; ++start) { - for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && preserveOriginal) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), 
offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + for (int start = 0; start < codePointCount; ++start) { + for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int startIndex = Character.offsetByCodePoints(s, 0, start); + final int endIndex = Character.offsetByCodePoints(s, 0, end); + assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + } + } + + if (codePointCount > maxGram && preserveOriginal) { assertTrue(tk.incrementToken()); assertEquals(0, offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); - final int startIndex = Character.offsetByCodePoints(s, 0, start); - final int endIndex = Character.offsetByCodePoints(s, 0, end); - assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + assertEquals(s, termAtt.toString()); } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } - } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index 5de532f..aa98f40 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test the NGramFilterFactory + * Test the NGramFilterFactory with old defaults */ public void testNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("NGram").create(stream); + stream = tokenFilterFactory("NGram", + "minGramSize", "1", + "maxGramSize", "2").create(stream); assertTokenStreamContents(stream, new String[] { "t", "te", "e", "es", "s", "st", "t" }); } @@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { } /** - * Test EdgeNGramFilterFactory + * Test EdgeNGramFilterFactory with old defaults */ public void testEdgeNGramFilter() throws Exception { Reader reader = new StringReader("test"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("EdgeNGram").create(stream); + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", + "maxGramSize", "1").create(stream); assertTokenStreamContents(stream, new String[] { "t" }); } @@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { - IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + IllegalArgumentException expected = null; + expected = expectThrows(IllegalArgumentException.class, () -> { tokenizerFactory("NGram", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); @@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { assertTrue(expected.getMessage().contains("Unknown parameters")); expected = 
expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("NGram", "bogusArg", "bogusValue"); + tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); expected = expectThrows(IllegalArgumentException.class, () -> { - tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue"); + tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue"); }); assertTrue(expected.getMessage().contains("Unknown parameters")); } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java ---------------------------------------------------------------------- diff --git a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java index 237c53f..050073c 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/BM25NBClassifierTest.java @@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase { @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20))); + return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false))); } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2c1ab31b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java ---------------------------------------------------------------------- diff --git a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java index 00fa4fe..8669df4 100644 --- a/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java +++ b/lucene/classification/src/test/org/apache/lucene/classification/CachingNaiveBayesClassifierTest.java @@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase
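
For readers who want to try the new option directly, here is a minimal usage sketch. It is an editorial addition, not part of the commit: the demo class name is invented for illustration, and it assumes a Lucene build that contains this change. It feeds the same input as testPreserveOriginal above through the new four-argument constructor:

  import java.io.IOException;
  import java.io.StringReader;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
  import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

  public class PreserveOriginalDemo {
    public static void main(String[] args) throws IOException {
      WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
      tokenizer.setReader(new StringReader("a bcd efghi jk"));
      // minGram=2, maxGram=3, preserveOriginal=true: tokens shorter than
      // minGram ("a") or longer than maxGram ("efghi") are emitted unchanged
      // alongside the edge n-grams, instead of being silently dropped.
      TokenStream ts = new EdgeNGramTokenFilter(tokenizer, 2, 3, true);
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // prints: a, bc, bcd, ef, efg, efghi, jk
      }
      ts.end();
      ts.close();
    }
  }

The printed terms match the expectations asserted in EdgeNGramTokenFilterTest#testPreserveOriginal; with preserveOriginal=false the stream would contain only bc, bcd, ef, efg and jk.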