lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dsmi...@apache.org
Subject lucene-solr:master: LUCENE-8344: TokenStreamToAutomaton doesn't ignore trailing posInc when preservePositionIncrements=false
Date Thu, 14 Jun 2018 03:36:35 GMT
Repository: lucene-solr
Updated Branches:
  refs/heads/master eea4197a3 -> 228a84fd6


LUCENE-8344: TokenStreamToAutomaton doesn't ignore trailing posInc when preservePositionIncrements=false


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/228a84fd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/228a84fd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/228a84fd

Branch: refs/heads/master
Commit: 228a84fd6db3ef5fc1624d69e1c82a1f02c51352
Parents: eea4197
Author: David Smiley <dsmiley@apache.org>
Authored: Wed Jun 13 23:35:44 2018 -0400
Committer: David Smiley <dsmiley@apache.org>
Committed: Wed Jun 13 23:35:44 2018 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  11 ++
 .../lucene/analysis/TokenStreamToAutomaton.java |   8 +-
 .../analyzing/AnalyzingSuggesterTest.java       |  31 +++--
 .../document/TestPrefixCompletionQuery.java     | 132 +++++++++++++------
 4 files changed, 131 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/228a84fd/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 51c461f..ca8fa58 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -120,6 +120,13 @@ Bug Fixes:
 
 ======================= Lucene 7.4.0 =======================
 
+Upgrading
+
+* LUCENE-8344: If you are using the AnalyzingSuggester or FuzzySuggester subclass, and if you
+  explicitly use the preservePositionIncrements=false setting (not the default), then you ought
+  to rebuild your suggester index. If you don't, queries or indexed data with trailing position
+  gaps (e.g. stop words) may not work correctly. (David Smiley, Jim Ferenczi)
+
 API Changes
 
 * LUCENE-8242: IndexSearcher.createNormalizedWeight() has been deprecated.
@@ -280,6 +287,10 @@ Bug Fixes
 * LUCENE-8355: Prevent IW from opening an already dropped segment while DV updates
   are written. (Nhat Nguyen via Simon Willnauer)
 
+* LUCENE-8344: TokenStreamToAutomaton (used by some suggesters) was not ignoring a trailing
+  position increment when the preservePositionIncrements setting is false.
+  (David Smiley, Jim Ferenczi)
+
 Other
 
 * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/228a84fd/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
index 0675abe..0891930 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@@ -208,14 +208,14 @@ public class TokenStreamToAutomaton {
 
     in.end();
 
-    int endState = -1;
-
     int endPosInc = posIncAtt.getPositionIncrement();
-
     if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
       endPosInc = 1;
+    } else if (endPosInc > 0 && preservePositionIncrements==false) {
+      endPosInc = 0;
     }
-    
+
+    int endState;
     if (endPosInc > 0) {
       // there were hole(s) after the last token
       endState = builder.createState();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/228a84fd/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
index 7302d9e..7f3b8b4 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
@@ -220,34 +220,49 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
    * basic "standardanalyzer" test with stopword removal
    */
   public void testStandard() throws Exception {
+    final String input = "the ghost of christmas past the"; // trailing stopword there just to perturb possible bugs
     Input keys[] = new Input[] {
-        new Input("the ghost of christmas past", 50),
+        new Input(input, 50),
     };
-    
+
     Directory tempDir = getDirectory();
     Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
     AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", standard, standard,

         AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false);
 
     suggester.build(new InputArrayIterator(keys));
-    
-    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the
ghost of chris", random()), false, 1);
+    List<LookupResult> results;
+
+    // round-trip
+    results = suggester.lookup(TestUtil.stringToCharSequence(input, random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals(input, results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+
+    // prefix of input stopping part way through christmas
+    results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()),
false, 1);
     assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
     assertEquals(50, results.get(0).value, 0.01F);
 
     // omit the 'the' since it's a stopword, it's suggested anyway
     results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()),
false, 1);
     assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
     assertEquals(50, results.get(0).value, 0.01F);
 
     // omit the 'the' and 'of' since they are stopwords, it's suggested anyway
     results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false,
1);
     assertEquals(1, results.size());
-    assertEquals("the ghost of christmas past", results.get(0).key.toString());
+    assertEquals(input, results.get(0).key.toString());
     assertEquals(50, results.get(0).value, 0.01F);
-    
+
+    // trailing stopword "the"
+    results = suggester.lookup(TestUtil.stringToCharSequence("ghost christmas past the",
random()), false, 1);
+    assertEquals(1, results.size());
+    assertEquals(input, results.get(0).key.toString());
+    assertEquals(50, results.get(0).value, 0.01F);
+
     IOUtils.close(standard, tempDir);
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/228a84fd/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
index 5e941dd..40de8f4 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestPrefixCompletionQuery.java
@@ -253,71 +253,125 @@ public class TestPrefixCompletionQuery extends LuceneTestCase {
     iw.close();
   }
 
-  public void testAnalyzerWithoutPreservePosAndSep() throws Exception {
+  public void testAnalyzerDefaults() throws Exception {
     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
"suggest_field_no_p_sep_or_pos_inc"));
+    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
field));
     Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
     iw.addDocument(document);
 
     DirectoryReader reader = iw.getReader();
     SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc",
"fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new
Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep_or_pos_inc",
"foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // not the fo
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new
Entry("foobar", 7));
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field,
"fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with "fo*"
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("foo bar", 8), new
Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); //
becomes "_ fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar"));
// becomes "foo _ bar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11));
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the"));
// becomes "_ baz _"
+    suggest = indexSearcher.suggest(query, 4, false);
+    assertSuggestions(suggest);
     reader.close();
     iw.close();
   }
 
-  public void testAnalyzerWithSepAndNoPreservePos() throws Exception {
+  public void testAnalyzerWithoutSeparator() throws Exception {
     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
"suggest_field_no_p_pos_inc"));
+    //note: when we don't preserve separators, the choice of preservePosInc is irrelevant
+    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, random().nextBoolean());
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
field));
     Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
     iw.addDocument(document);
 
+    // note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
     DirectoryReader reader = iw.getReader();
     SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc",
"fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); //matches all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new
Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_pos_inc", "foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // only foobar
-    assertSuggestions(suggest, new Entry("foobar", 7));
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field,
"fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); //
becomes "fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar"));
// becomes "foobar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("foo bar", 8), new Entry("foobar", 7));
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 9, false); // no separators, thus match several
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("foo bar", 8), new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the"));
// becomes "baz*"
+    suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so
we match
+    assertSuggestions(suggest, new Entry("baz the", 12));
     reader.close();
     iw.close();
   }
 
-  public void testAnalyzerWithPreservePosAndNoSep() throws Exception {
+  public void testAnalyzerNoPreservePosInc() throws Exception {
     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
"suggest_field_no_p_sep"));
+    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
+    final String field = getTestName();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(completionAnalyzer,
field));
     Document document = new Document();
-    document.add(new SuggestField("suggest_field_no_p_sep", "foobar", 7));
-    document.add(new SuggestField("suggest_field_no_p_sep", "foo bar", 8));
-    document.add(new SuggestField("suggest_field_no_p_sep", "the fo", 9));
-    document.add(new SuggestField("suggest_field_no_p_sep", "the foo bar", 10));
+    document.add(new SuggestField(field, "foobar", 7));
+    document.add(new SuggestField(field, "foo bar", 8));
+    document.add(new SuggestField(field, "the fo", 9));
+    document.add(new SuggestField(field, "the foo bar", 10));
+    document.add(new SuggestField(field, "foo the bar", 11)); // middle stopword
+    document.add(new SuggestField(field, "baz the", 12)); // trailing stopword
+
     iw.addDocument(document);
 
+    // note we use the completionAnalyzer with the queries (instead of input analyzer) because of non-default settings
     DirectoryReader reader = iw.getReader();
     SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
-    CompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep",
"fo"));
-    TopSuggestDocs suggest = indexSearcher.suggest(query, 4, false); // matches all 4
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new
Entry("foo bar", 8), new Entry("foobar", 7));
-    query = new PrefixCompletionQuery(analyzer, new Term("suggest_field_no_p_sep", "foob"));
-    suggest = indexSearcher.suggest(query, 4, false); // except the fo
-    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new
Entry("foobar", 7));
+    CompletionQuery query = new PrefixCompletionQuery(completionAnalyzer, new Term(field,
"fo"));
+    TopSuggestDocs suggest = indexSearcher.suggest(query, 9, false); //matches all with fo
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with leading stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the fo")); //
becomes "fo*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
+    // with middle stopword
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foo the bar"));
// becomes "foo bar*"
+    suggest = indexSearcher.suggest(query, 9, false);
+    assertSuggestions(suggest, new Entry("foo the bar", 11), new Entry("the foo bar", 10),
new Entry("foo bar", 8)); // no foobar
+    // no space
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "foob"));
+    suggest = indexSearcher.suggest(query, 4, false); // separators, thus only match "foobar"
+    assertSuggestions(suggest, new Entry("foobar", 7));
+    // surrounding stopwords
+    query = new PrefixCompletionQuery(completionAnalyzer, new Term(field, "the baz the"));
// becomes "baz*"
+    suggest = indexSearcher.suggest(query, 4, false);// stopwords in query get removed so
we match
+    assertSuggestions(suggest, new Entry("baz the", 12));
     reader.close();
     iw.close();
   }


Mime
View raw message