Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id F0EE8200BF9 for ; Sun, 8 Jan 2017 12:27:14 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id EF5E4160B36; Sun, 8 Jan 2017 11:27:14 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 40F64160B2C for ; Sun, 8 Jan 2017 12:27:14 +0100 (CET) Received: (qmail 79316 invoked by uid 500); 8 Jan 2017 11:27:08 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 79307 invoked by uid 99); 8 Jan 2017 11:27:08 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 08 Jan 2017 11:27:08 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 3584FDFB86; Sun, 8 Jan 2017 11:27:08 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: mikemccand@apache.org To: commits@lucene.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: lucene-solr:branch_6x: TokenStreamToAutomaton failed to handle certain holes correctly Date: Sun, 8 Jan 2017 11:27:08 +0000 (UTC) archived-at: Sun, 08 Jan 2017 11:27:15 -0000 Repository: lucene-solr Updated Branches: refs/heads/branch_6x 373826a69 -> 2336152fb TokenStreamToAutomaton failed to handle certain holes correctly Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2336152f Tree: 
http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2336152f Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2336152f Branch: refs/heads/branch_6x Commit: 2336152fb4acf20bfc4936ad5e2cddde8efebaf1 Parents: 373826a Author: Mike McCandless Authored: Sun Jan 8 06:26:08 2017 -0500 Committer: Mike McCandless Committed: Sun Jan 8 06:26:27 2017 -0500 ---------------------------------------------------------------------- .../apache/lucene/analysis/TokenStreamToAutomaton.java | 11 ++++++++++- .../org/apache/lucene/analysis/TestGraphTokenizers.java | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2336152f/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java ---------------------------------------------------------------------- diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java index 071fa4a..64bac66 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java @@ -113,6 +113,7 @@ public class TokenStreamToAutomaton { final RollingBuffer<Position> positions = new Positions(); int pos = -1; + int freedPos = 0; Position posData = null; int maxOffset = 0; while (in.incrementToken()) { @@ -150,7 +151,15 @@ public class TokenStreamToAutomaton { addHoles(builder, positions, pos); } } - positions.freeBefore(pos); + while (freedPos <= pos) { + Position freePosData = positions.get(freedPos); + // don't free this position yet if we may still need to fill holes over it: + if (freePosData.arriving == -1 || freePosData.leaving == -1) { + break; + } + positions.freeBefore(freedPos); + freedPos++; + } } final int endPos = pos + posLengthAtt.getPositionLength(); 
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2336152f/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java ---------------------------------------------------------------------- diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java index 78fb127..8899dd1 100644 --- a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java +++ b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java @@ -585,4 +585,16 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase { Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES), Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES))); } + + public void testTokenStreamGraphWithHoles() throws Exception { + final TokenStream ts = new CannedTokenStream( + new Token[] { + token("abc", 1, 1), + token("xyz", 1, 8), + token("def", 1, 1), + token("ghi", 1, 1), + }); + assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")), + join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts); + } }