lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From broust...@apache.org
Subject [lucene-solr] branch master updated: SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
Date Tue, 24 Dec 2019 14:07:56 GMT
This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 2784056  SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
2784056 is described below

commit 27840562a6f4419a1e8953789ada0f0d19551aa7
Author: Bruno Roustant <broustant@salesforce.com>
AuthorDate: Tue Dec 24 11:46:00 2019 +0100

    SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
    
    Closes #1113
---
 solr/CHANGES.txt                                   |  2 +
 .../solr/spelling/DirectSolrSpellChecker.java      | 10 ++++
 .../solr/spelling/DirectSolrSpellCheckerTest.java  | 57 +++++++++++++++++++---
 solr/solr-ref-guide/src/spell-checking.adoc        |  3 +-
 4 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9d17c52..2e1679d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -170,6 +170,8 @@ Improvements
   SOLR_IP_WHITELIST and SOLR_IP_BLACKLIST. These variables can restrict access to
   Solr based on IP addresses/networks. (rmuir)
 
+* SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker. (Andy Webb via Bruno Roustant)
+
 Optimizations
 ---------------------
 (No changes)
diff --git a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
index a29d80d..527a3da 100644
--- a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
@@ -54,6 +54,7 @@ import org.slf4j.LoggerFactory;
  *       can be specified as "freq".
  *   <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
  *   <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
+ *   <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
  *   <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
  * </ul>
  * @see DirectSpellChecker
@@ -86,6 +87,9 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
   public static final String MINQUERYLENGTH = "minQueryLength";
   public static final int DEFAULT_MINQUERYLENGTH = 4;
   
+  public static final String MAXQUERYLENGTH = "maxQueryLength";
+  public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;
+
   public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
   public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;
   
@@ -144,6 +148,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
     Integer queryLength = params.getInt(MINQUERYLENGTH);
     if (queryLength != null)
       minQueryLength = queryLength;
+
+    int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
+    Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
+    if (overriddenMaxQueryLength != null)
+      maxQueryLength = overriddenMaxQueryLength;
     
     float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
     Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
@@ -158,6 +167,7 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
     checker.setThresholdFrequency(minThreshold);
     checker.setMaxInspections(maxInspections);
     checker.setMinQueryLength(minQueryLength);
+    checker.setMaxQueryLength(maxQueryLength);
     checker.setMaxQueryFrequency(maxQueryFrequency);
     checker.setLowerCaseTerms(false);
     
diff --git a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
index 0e4cc9a..6106fb4 100644
--- a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
+++ b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
@@ -62,20 +62,25 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
     checker.init(spellchecker, core);
 
     h.getCore().withSearcher(searcher -> {
+
+      // check that 'fob' is corrected to 'foo'
       Collection<Token> tokens = queryConverter.convert("fob");
       SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
       SpellingResult result = checker.getSuggestions(spellOpts);
-      assertTrue("result is null and it shouldn't be", result != null);
+      assertNotNull("result shouldn't be null", result);
       Map<String, Integer> suggestions = result.get(tokens.iterator().next());
+      assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
       Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
-      assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true);
+      assertEquals("foo", entry.getKey());
       assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
 
+      // check that 'super' is *not* corrected
       spellOpts.tokens = queryConverter.convert("super");
       result = checker.getSuggestions(spellOpts);
-      assertTrue("result is null and it shouldn't be", result != null);
-      suggestions = result.get(tokens.iterator().next());
-      assertTrue("suggestions is not null and it should be", suggestions == null);
+      assertNotNull("result shouldn't be null", result);
+      suggestions = result.get(spellOpts.tokens.iterator().next());
+      assertNotNull("suggestions shouldn't be null", suggestions);
+      assertTrue("suggestions should be empty", suggestions.isEmpty());
       return null;
     });
   }
@@ -88,6 +93,46 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='fox']/arr[@name='suggestion']/lst/int[@name='freq']=2",
         "//lst[@name='spellcheck']/bool[@name='correctlySpelled']='true'"
     );
-  }  
+  }
+
+  @Test
+  public void testMaxQueryLength() throws Exception {
+    testMaxQueryLength(true);
+    testMaxQueryLength(false);
+  }
+
+  private void testMaxQueryLength(Boolean limitQueryLength) throws Exception {
+
+    DirectSolrSpellChecker checker = new DirectSolrSpellChecker();
+    NamedList<Object> spellchecker = new NamedList<>();
+    spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
+    spellchecker.add(SolrSpellChecker.FIELD, "teststop");
+    spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2);
+
+    // demonstrate that "anothar" is not corrected when maxQueryLength is set to a small number
+    if (limitQueryLength) spellchecker.add(DirectSolrSpellChecker.MAXQUERYLENGTH, 4);
+
+    SolrCore core = h.getCore();
+    checker.init(spellchecker, core);
+
+    h.getCore().withSearcher(searcher -> {
+      Collection<Token> tokens = queryConverter.convert("anothar");
+      SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
+      SpellingResult result = checker.getSuggestions(spellOpts);
+      assertNotNull("result shouldn't be null", result);
+      Map<String, Integer> suggestions = result.get(tokens.iterator().next());
+      assertNotNull("suggestions shouldn't be null", suggestions);
+
+      if (limitQueryLength) {
+        assertTrue("suggestions should be empty", suggestions.isEmpty());
+      } else {
+        assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
+        Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
+        assertEquals("another", entry.getKey());
+      }
+
+      return null;
+    });
+  }
   
 }
diff --git a/solr/solr-ref-guide/src/spell-checking.adoc b/solr/solr-ref-guide/src/spell-checking.adoc
index c883ed9..c480b34 100644
--- a/solr/solr-ref-guide/src/spell-checking.adoc
+++ b/solr/solr-ref-guide/src/spell-checking.adoc
@@ -69,6 +69,7 @@ The `DirectSolrSpellChecker` uses terms from the Solr index without building a p
     <int name="minPrefix">1</int>
     <int name="maxInspections">5</int>
     <int name="minQueryLength">4</int>
+    <int name="maxQueryLength">40</int>
     <float name="maxQueryFrequency">0.01</float>
     <float name="thresholdTokenFrequency">.01</float>
   </lst>
@@ -81,7 +82,7 @@ Many of the parameters relate to how this spell checker should query the index f
 
 Because this spell checker is querying the main index, you may want to limit how often it queries the index to be sure to avoid any performance conflicts with user queries. The `accuracy` setting defines the threshold for a valid suggestion, while `maxEdits` defines the number of changes to the term to allow. Since most spelling mistakes are only 1 letter off, setting this to 1 will reduce the number of possible suggestions (the default, however, is 2); the value can only be 1 or 2. `min [...]
 
-The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4.
+The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. `maxQueryLength` enables the spell checker to skip over very long query terms, which can avoid expensive operations or exceptions. There is no limit to term length by default.
 
 At first, spellchecker analyses incoming query words by looking up them in the index. Only query words, which are absent in index or too rare ones (below `maxQueryFrequency`) are considered as misspelled and used for finding suggestions. Words which are frequent than `maxQueryFrequency` bypass spellchecker unchanged. After suggestions for every misspelled word are found they are filtered for enough frequency with `thresholdTokenFrequency` as boundary value. These parameters (`maxQueryFre [...]
 


Mime
View raw message