lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From broust...@apache.org
Subject [lucene-solr] branch branch_8x updated: SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
Date Wed, 25 Dec 2019 21:02:53 GMT
This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 8f5f180  SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
8f5f180 is described below

commit 8f5f18041a9210dec7b9fad28487a05731dc9020
Author: Bruno Roustant <broustant@salesforce.com>
AuthorDate: Wed Dec 25 21:45:33 2019 +0100

    SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.
---
 solr/CHANGES.txt                                   |  2 +
 .../solr/spelling/DirectSolrSpellChecker.java      | 10 ++++
 .../solr/spelling/DirectSolrSpellCheckerTest.java  | 59 +++++++++++++++++++---
 solr/solr-ref-guide/src/spell-checking.adoc        |  3 +-
 4 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 98f17aa..686076d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -96,6 +96,8 @@ Improvements
 * SOLR-13984: Java's SecurityManager sandbox can be enabled via environment variable,
   SOLR_SECURITY_MANAGER_ENABLED=true. (rmuir)
 
+* SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker. (Andy Webb via Bruno Roustant)
+
 Optimizations
 ---------------------
 (No changes)
diff --git a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
index a29d80d..527a3da 100644
--- a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java
@@ -54,6 +54,7 @@ import org.slf4j.LoggerFactory;
  *       can be specified as "freq".
  *   <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
  *   <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
+ *   <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
  *   <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
  * </ul>
  * @see DirectSpellChecker
@@ -86,6 +87,9 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
   public static final String MINQUERYLENGTH = "minQueryLength";
   public static final int DEFAULT_MINQUERYLENGTH = 4;
   
+  public static final String MAXQUERYLENGTH = "maxQueryLength";
+  public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;
+
   public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
   public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;
   
@@ -144,6 +148,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
     Integer queryLength = params.getInt(MINQUERYLENGTH);
     if (queryLength != null)
       minQueryLength = queryLength;
+
+    int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
+    Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
+    if (overriddenMaxQueryLength != null)
+      maxQueryLength = overriddenMaxQueryLength;
     
     float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
     Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
@@ -158,6 +167,7 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
     checker.setThresholdFrequency(minThreshold);
     checker.setMaxInspections(maxInspections);
     checker.setMinQueryLength(minQueryLength);
+    checker.setMaxQueryLength(maxQueryLength);
     checker.setMaxQueryFrequency(maxQueryFrequency);
     checker.setLowerCaseTerms(false);
     
diff --git a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
index 0e4cc9a..631032f 100644
--- a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
+++ b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java
@@ -62,24 +62,29 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
     checker.init(spellchecker, core);
 
     h.getCore().withSearcher(searcher -> {
+
+      // check that 'fob' is corrected to 'foo'
       Collection<Token> tokens = queryConverter.convert("fob");
       SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
       SpellingResult result = checker.getSuggestions(spellOpts);
-      assertTrue("result is null and it shouldn't be", result != null);
+      assertNotNull("result shouldn't be null", result);
       Map<String, Integer> suggestions = result.get(tokens.iterator().next());
+      assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
       Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
-      assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true);
+      assertEquals("foo", entry.getKey());
       assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
 
+      // check that 'super' is *not* corrected
       spellOpts.tokens = queryConverter.convert("super");
       result = checker.getSuggestions(spellOpts);
-      assertTrue("result is null and it shouldn't be", result != null);
-      suggestions = result.get(tokens.iterator().next());
-      assertTrue("suggestions is not null and it should be", suggestions == null);
+      assertNotNull("result shouldn't be null", result);
+      suggestions = result.get(spellOpts.tokens.iterator().next());
+      assertNotNull("suggestions shouldn't be null", suggestions);
+      assertTrue("suggestions should be empty", suggestions.isEmpty());
       return null;
     });
   }
-  
+
   @Test
   public void testOnlyMorePopularWithExtendedResults() throws Exception {
     assertQ(req("q", "teststop:fox", "qt", "/spellCheckCompRH", SpellCheckComponent.COMPONENT_NAME, "true", SpellingParams.SPELLCHECK_DICT, "direct", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
@@ -88,6 +93,46 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='fox']/arr[@name='suggestion']/lst/int[@name='freq']=2",
         "//lst[@name='spellcheck']/bool[@name='correctlySpelled']='true'"
     );
-  }  
+  }
+
+  @Test
+  public void testMaxQueryLength() throws Exception {
+    testMaxQueryLength(true);
+    testMaxQueryLength(false);
+  }
+
+  private void testMaxQueryLength(Boolean limitQueryLength) throws Exception {
+
+    DirectSolrSpellChecker checker = new DirectSolrSpellChecker();
+    NamedList<Object> spellchecker = new NamedList<>();
+    spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
+    spellchecker.add(SolrSpellChecker.FIELD, "teststop");
+    spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2);
+
+    // demonstrate that "anothar" is not corrected when maxQueryLength is set to a small number
+    if (limitQueryLength) spellchecker.add(DirectSolrSpellChecker.MAXQUERYLENGTH, 4);
+
+    SolrCore core = h.getCore();
+    checker.init(spellchecker, core);
+
+    h.getCore().withSearcher(searcher -> {
+      Collection<Token> tokens = queryConverter.convert("anothar");
+      SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
+      SpellingResult result = checker.getSuggestions(spellOpts);
+      assertNotNull("result shouldn't be null", result);
+      Map<String, Integer> suggestions = result.get(tokens.iterator().next());
+      assertNotNull("suggestions shouldn't be null", suggestions);
+
+      if (limitQueryLength) {
+        assertTrue("suggestions should be empty", suggestions.isEmpty());
+      } else {
+        assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
+        Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
+        assertEquals("another", entry.getKey());
+      }
+
+      return null;
+    });
+  }
   
 }
diff --git a/solr/solr-ref-guide/src/spell-checking.adoc b/solr/solr-ref-guide/src/spell-checking.adoc
index c883ed9..c480b34 100644
--- a/solr/solr-ref-guide/src/spell-checking.adoc
+++ b/solr/solr-ref-guide/src/spell-checking.adoc
@@ -69,6 +69,7 @@ The `DirectSolrSpellChecker` uses terms from the Solr index without building
a p
     <int name="minPrefix">1</int>
     <int name="maxInspections">5</int>
     <int name="minQueryLength">4</int>
+    <int name="maxQueryLength">40</int>
     <float name="maxQueryFrequency">0.01</float>
     <float name="thresholdTokenFrequency">.01</float>
   </lst>
@@ -81,7 +82,7 @@ Many of the parameters relate to how this spell checker should query the
index f
 
 Because this spell checker is querying the main index, you may want to limit how often it
queries the index to be sure to avoid any performance conflicts with user queries. The `accuracy`
setting defines the threshold for a valid suggestion, while `maxEdits` defines the number
of changes to the term to allow. Since most spelling mistakes are only 1 letter off, setting
this to 1 will reduce the number of possible suggestions (the default, however, is 2); the
value can only be 1 or 2. `min [...]
 
-The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4.
+The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. `maxQueryLength` enables the spell checker to skip over very long query terms, which can avoid expensive operations or exceptions. There is no limit to term length by default.
 
 At first, spellchecker analyses incoming query words by looking up them in the index. Only
query words, which are absent in index or too rare ones (below `maxQueryFrequency`) are considered
as misspelled and used for finding suggestions. Words which are frequent than `maxQueryFrequency`
bypass spellchecker unchanged. After suggestions for every misspelled word are found they
are filtered for enough frequency with `thresholdTokenFrequency` as boundary value. These
parameters (`maxQueryFre [...]
 


Mime
View raw message