Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 1C0C3200C39 for ; Wed, 1 Mar 2017 10:27:17 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 1ACFC160B83; Wed, 1 Mar 2017 09:27:17 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 298D3160B5E for ; Wed, 1 Mar 2017 10:27:16 +0100 (CET) Received: (qmail 3229 invoked by uid 500); 1 Mar 2017 09:27:10 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 3111 invoked by uid 99); 1 Mar 2017 09:27:10 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 01 Mar 2017 09:27:10 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id D34DDDFF71; Wed, 1 Mar 2017 09:27:10 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: ab@apache.org To: commits@lucene.apache.org Date: Wed, 01 Mar 2017 09:27:31 -0000 Message-Id: <0558c394a046498ea93272d86930d1b2@git.apache.org> In-Reply-To: <12f66a0defca4d0cb9a65a45a99317ca@git.apache.org> References: <12f66a0defca4d0cb9a65a45a99317ca@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [23/50] [abbrv] lucene-solr:jira/solr-9858: LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false archived-at: Wed, 01 Mar 2017 09:27:17 -0000 LUCENE-7708: Fix position length attribute set by the ShingleFilter when outputUnigrams=false Project: 
http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/57a42e4e Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/57a42e4e Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/57a42e4e Branch: refs/heads/jira/solr-9858 Commit: 57a42e4ec54aebac40c1ef7dc93d933cd00dbe1e Parents: cab3aae Author: Jim Ferenczi Authored: Fri Feb 24 23:37:37 2017 +0100 Committer: Jim Ferenczi Committed: Fri Feb 24 23:37:37 2017 +0100 ---------------------------------------------------------------------- lucene/CHANGES.txt | 4 + .../lucene/analysis/shingle/ShingleFilter.java | 7 +- .../analysis/shingle/ShingleFilterTest.java | 94 +++++++++++++++++++- 3 files changed, 102 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/CHANGES.txt ---------------------------------------------------------------------- diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1d45ab8..c119eaa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -155,6 +155,10 @@ Bug Fixes token graph, messing up phrase queries when it was used during query parsing (Ere Maijala via Mike McCandless) +* LUCENE-7708: ShingleFilter without unigram was producing a disconnected + token graph, messing up queries when it was used during query + parsing (Jim Ferenczi) + Improvements * LUCENE-7055: Added Weight#scorerSupplier, which allows to estimate the cost http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 5d99291..e3fa803 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -343,7 +343,12 @@ public final class ShingleFilter extends TokenFilter { noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); - posLenAtt.setPositionLength(builtGramSize); + if (outputUnigrams) { + posLenAtt.setPositionLength(builtGramSize); + } else { + // position length for this token is the number of positions created by shingles of smaller size. + posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1)); + } isOutputHere = true; gramSize.advance(); tokenAvailable = true; http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/57a42e4e/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java ---------------------------------------------------------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java index 192de38..5645900 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java @@ -30,7 +30,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class ShingleFilterTest extends BaseTokenStreamTestCase { @@ -1239,7 +1239,6 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3); filter.setFillerToken(null); 
filter.setTokenSeparator(null); - assertTokenStreamContents(filter, new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"}, new int[] {0, 0, 0, 7, 7, 7}, @@ -1247,4 +1246,95 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase { new int[] {1, 0, 0, 1, 0, 0}, 20); } + + public void testPositionLength() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4); + filter.setOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + } + }; + assertTokenStreamContents(a.tokenStream("", "to be or not to be"), + new String[] {"to be or not", "be or not to", "or not to be"}, + new int[] {0, 3, 6}, + new int[] {12, 15, 18}, + null, + new int[] {1, 1, 1}, + new int[] {1, 1, 1}, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + + a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4); + filter.setOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + } + }; + assertTokenStreamContents(a.tokenStream("", "to be or not to be"), + new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to", + "or not to be", "not to", "not to be", "to be"}, + new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13}, + new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18}, + null, + new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1}, + new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1}, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple 
terms with different offsets + // finishing at the same position + false); + + a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4); + filter.setOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + assertTokenStreamContents(a.tokenStream("", "to be or not to be"), + new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to", + "or not to be", "not to be"}, + new int[] {0, 0, 3, 3, 6, 6, 9}, + new int[] {8, 12, 12, 15, 15, 18, 18}, + null, + new int[] {1, 0, 1, 0, 1, 0, 1, 0}, + new int[] {1, 2, 1, 2, 1, 2, 1, 2}, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5); + filter.setOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + } + }; + assertTokenStreamContents(a.tokenStream("", "to be or not to be"), + new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to", + "be or not to be", "or not to", "or not to be", "not to be"}, + new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9, 9}, + new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18}, + null, + new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1, 0}, + new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1}, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + } }