Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 6F5F0200C22 for ; Tue, 21 Feb 2017 16:51:52 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 6E02B160B68; Tue, 21 Feb 2017 15:51:52 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 94733160B3E for ; Tue, 21 Feb 2017 16:51:51 +0100 (CET) Received: (qmail 83112 invoked by uid 500); 21 Feb 2017 15:51:50 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 83103 invoked by uid 99); 21 Feb 2017 15:51:50 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 21 Feb 2017 15:51:50 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 5B64DDFF36; Tue, 21 Feb 2017 15:51:50 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: mikemccand@apache.org To: commits@lucene.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: lucene-solr:master: LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input Date: Tue, 21 Feb 2017 15:51:50 +0000 (UTC) archived-at: Tue, 21 Feb 2017 15:51:52 -0000 Repository: lucene-solr Updated Branches: refs/heads/master ac38872a7 -> 2d03aa21a LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2d03aa21 Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2d03aa21 Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2d03aa21 Branch: refs/heads/master Commit: 2d03aa21a2b674d36e201f6309e646f37771b73b Parents: ac38872 Author: Mike McCandless Authored: Tue Feb 21 10:51:38 2017 -0500 Committer: Mike McCandless Committed: Tue Feb 21 10:51:38 2017 -0500 ---------------------------------------------------------------------- .../analysis/pattern/SimplePatternSplitTokenizer.java | 9 ++++----- .../lucene/analysis/pattern/SimplePatternTokenizer.java | 2 +- .../analysis/pattern/TestSimplePatternSplitTokenizer.java | 10 ++++++++++ .../analysis/pattern/TestSimplePatternTokenizer.java | 10 ++++++++++ 4 files changed, 25 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java index d2b10c1..a8a40b2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java @@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer { } while (state != -1); if (lastAcceptLength != -1) { - // strip the trailing separater we just matched from the token: - tokenUpto -= lastAcceptLength; - // we found a token separator + // we found a token separator; strip the trailing separator we just matched from the token: int extra = sepUpto - lastAcceptLength; if (extra != 0) { pushBack(extra); } + tokenUpto -= lastAcceptLength; if (tokenUpto > 0) { fillToken(offsetStart); return true; @@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer { tokenUpto -= count; assert tokenUpto >= 0; if (pendingLimit == 0) { - if (bufferNextRead >= count) { + if (bufferLimit != -1 && bufferNextRead >= count) { // optimize common case when the chars we are pushing back are still in the buffer bufferNextRead -= count; } else { if (count > pendingChars.length) { pendingChars = ArrayUtil.grow(pendingChars, count); } - System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count); + System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count); pendingLimit = count; } } else { http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java index 867b10a..ff882ef 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java @@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer { private void pushBack(int count) { if (pendingLimit == 0) { - if (bufferNextRead >= count) { + if (bufferLimit != -1 && bufferNextRead >= count) { // optimize common case when the chars we are pushing back are still in the buffer bufferNextRead -= count; } else { http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java index 5642c2b..b497a9a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java @@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase { checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); b.close(); } + + public void testEndLookahead() throws Exception { + Tokenizer t = new SimplePatternSplitTokenizer("(ab)+"); + t.setReader(new StringReader("aba")); + assertTokenStreamContents(t, + new String[] { "a" }, + new int[] { 2 }, + new int[] { 3 }, + 3); + } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java index b566713..51e8c43 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java @@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase { checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); b.close(); } + + public void testEndLookahead() throws Exception { + Tokenizer t = new SimplePatternTokenizer("(ab)+"); + t.setReader(new StringReader("aba")); + assertTokenStreamContents(t, + new String[] { "ab" }, + new int[] { 0 }, + new int[] { 2 }, + 3); + } }