Return-Path: X-Original-To: apmail-lucene-commits-archive@www.apache.org Delivered-To: apmail-lucene-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 08376F15D for ; Mon, 22 Apr 2013 13:26:26 +0000 (UTC) Received: (qmail 7482 invoked by uid 500); 22 Apr 2013 13:26:25 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 7463 invoked by uid 99); 22 Apr 2013 13:26:25 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 22 Apr 2013 13:26:25 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 22 Apr 2013 13:26:24 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id F1FA123888E7; Mon, 22 Apr 2013 13:26:03 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1470496 - in /lucene/dev/trunk/lucene/analysis: common/src/java/org/apache/lucene/analysis/ngram/ common/src/test/org/apache/lucene/analysis/ngram/ morfologik/src/test/org/apache/lucene/analysis/morfologik/ Date: Mon, 22 Apr 2013 13:26:03 -0000 To: commits@lucene.apache.org From: sarowe@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130422132603.F1FA123888E7@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: sarowe Date: Mon Apr 22 13:26:03 2013 New Revision: 1470496 URL: http://svn.apache.org/r1470496 Log: LUCENE-4810: first output token from EdgeNGramTokenFilter must be > 0 Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1470496&r1=1470495&r2=1470496&view=diff ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original) +++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Mon Apr 22 13:26:03 2013 @@ -75,6 +75,7 @@ public final class EdgeNGramTokenFilter private int tokEnd; // only used if the length changed before this filter private boolean hasIllegalOffsets; // only if the length changed before this filter private int savePosIncr; + private boolean isFirstToken = true; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -138,9 +139,8 @@ public final class EdgeNGramTokenFilter savePosIncr = posIncrAtt.getPositionIncrement(); } } - if (curGramSize <= maxGram) { - if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams - || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit + if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit + if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams // grab gramSize chars from front or back int start = side == Side.FRONT ? 0 : curTermLength - curGramSize; int end = start + curGramSize; @@ -152,12 +152,16 @@ public final class EdgeNGramTokenFilter } // first ngram gets increment, others don't if (curGramSize == minGram) { - posIncrAtt.setPositionIncrement(savePosIncr); + // Leave the first token position increment at the cleared-attribute value of 1 + if ( ! isFirstToken) { + posIncrAtt.setPositionIncrement(savePosIncr); + } } else { posIncrAtt.setPositionIncrement(0); } termAtt.copyBuffer(curTermBuffer, start, curGramSize); curGramSize++; + isFirstToken = false; return true; } } @@ -169,5 +173,6 @@ public final class EdgeNGramTokenFilter public void reset() throws IOException { super.reset(); curTermBuffer = null; + isFirstToken = true; } } Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1470496&r1=1470495&r2=1470496&view=diff ============================================================================== --- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original) +++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Mon Apr 22 13:26:03 2013 @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.Tokeni import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.position.PositionFilter; import java.io.Reader; import java.io.StringReader; @@ -120,6 +121,21 @@ public class EdgeNGramTokenFilterTest ex false); } + public void testFirstTokenPositionIncrement() throws Exception { + TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false); + ts = new PositionFilter(ts, 0); // All but first token will get 0 position increment + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3); + // The first token "a" will not be output, since it's smaller than the mingram size of 2. + // The second token on input to EdgeNGramTokenFilter will have position increment of 0, + // which should be increased to 1, since this is the first output token in the stream. + assertTokenStreamContents(filter, + new String[] { "ab", "abc" }, + new int[] { 2, 2 }, + new int[] { 4, 5 }, + new int[] { 1, 0 } + ); + } + public void testTokenizerPositions() throws Exception { EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3); assertTokenStreamContents(tokenizer, Modified: lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1470496&r1=1470495&r2=1470496&view=diff ============================================================================== --- lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original) +++ lucene/dev/trunk/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Mon Apr 22 13:26:03 2013 @@ -52,6 +52,14 @@ public class TestMorfologikAnalyzer exte new int[] { 0, 0, 0, 0, 7, 7, 7, 7 }, new int[] { 6, 6, 6, 6, 13, 13, 13, 13 }, new int[] { 1, 0, 0, 0, 1, 0, 0, 0 }); + + assertAnalyzesToReuse( + a, + "T. Gl\u00FCcksberg", + new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" }, + new int[] { 0, 0, 0, 3 }, + new int[] { 1, 1, 1, 13 }, + new int[] { 1, 0, 0, 1 }); } /** Test reuse of MorfologikFilter with leftover stems. */