lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r995972 - in /lucene/dev/branches/branch_3x/solr: ./ src/java/org/apache/solr/handler/component/ src/java/org/apache/solr/spelling/ src/test/org/apache/solr/handler/component/
Date Fri, 10 Sep 2010 20:31:53 GMT
Author: gsingers
Date: Fri Sep 10 20:31:52 2010
New Revision: 995972

URL: http://svn.apache.org/viewvc?rev=995972&view=rev
Log:
SOLR-2083: fix issue with misreporting suggestions in distributed spell checking

Modified:
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
    lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=995972&r1=995971&r2=995972&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Fri Sep 10 20:31:52 2010
@@ -388,7 +388,9 @@ Bug Fixes
   to be removed before it was finished being copied.  This did not affect
   normal master/slave replication.  (Peter Sturge via yonik)
 
-* SOLR-2114: Fixed parsing error in hsin function.  The function signature has changed slightly. (gsingers)  
+* SOLR-2114: Fixed parsing error in hsin function.  The function signature has changed slightly. (gsingers)
+
+* SOLR-2083: SpellCheckComponent misreports suggestions when distributed (James Dyer via gsingers)  
 
 
 Other Changes

Modified: lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java?rev=995972&r1=995971&r2=995972&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java Fri Sep 10 20:31:52 2010
@@ -46,6 +46,7 @@ import org.apache.lucene.analysis.tokena
 import org.apache.lucene.index.IndexReader;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.SpellingParams;
 import org.apache.solr.common.util.NamedList;
@@ -123,6 +124,7 @@ public class SpellCheckComponent extends
     if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
       return;
     }
+    boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
     String q = params.get(SPELLCHECK_Q);
     SolrSpellChecker spellChecker = getSpellChecker(params);
     Collection<Token> tokens = null;
@@ -147,13 +149,12 @@ public class SpellCheckComponent extends
         IndexReader reader = rb.req.getSearcher().getReader();
         boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
         float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
-        SolrParams customParams = getCustomParams(getDictionaryName(params), params);
+        SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
         SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular,
extendedResults,
                 accuracy, customParams);
-
         SpellingResult spellingResult = spellChecker.getSuggestions(options);
         if (spellingResult != null) {
-          response.add("suggestions", toNamedList(spellingResult, q,
+          response.add("suggestions", toNamedList(shardRequest, spellingResult, q,
               extendedResults, collate));
           rb.rsp.add("spellcheck", response);
         }
@@ -171,7 +172,7 @@ public class SpellCheckComponent extends
    * @param params The original SolrParams
    * @return The new Params
    */
-  protected SolrParams getCustomParams(String dictionary, SolrParams params) {
+  protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest)
{
     ModifiableSolrParams result = new ModifiableSolrParams();
     Iterator<String> iter = params.getParameterNamesIterator();
     String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
@@ -181,6 +182,10 @@ public class SpellCheckComponent extends
         result.add(nxt.substring(prefix.length()), params.getParams(nxt));
       }
     }
+    if(shardRequest)
+    {
+    	result.add(ShardParams.IS_SHARD, "true");
+    }
     return result;
   }
 
@@ -243,17 +248,21 @@ public class SpellCheckComponent extends
     Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String,
SpellCheckResponse.Suggestion>();
     // original token string -> summed up frequency
     Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
+    // original token string -> # of shards reporting it as misspelled
+    Map<String, Integer> origVsShards = new HashMap<String, Integer>();
     // original token string -> set of alternatives
     // must preserve order because collation algorithm can only work in-order
     Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String,
HashSet<String>>();
     // alternative string -> corresponding SuggestWord object
     Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
-
+    
+    int totalNumberShardResponses = 0;
     for (ShardRequest sreq : rb.finished) {
       for (ShardResponse srsp : sreq.responses) {
         NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
         LOG.info(srsp.getShard() + " " + nl);
         if (nl != null) {
+        	totalNumberShardResponses++;
           SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
           for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions())
{
             origVsSuggestion.put(suggestion.getToken(), suggestion);
@@ -269,6 +278,14 @@ public class SpellCheckComponent extends
             if (o != null)  origFreq += o;
             origFreq += suggestion.getOriginalFrequency();
             origVsFreq.put(suggestion.getToken(), origFreq);
+            
+            //# shards reporting
+            Integer origShards = origVsShards.get(suggestion.getToken());
+            if(origShards==null) {
+            	origVsShards.put(suggestion.getToken(), 1);
+            } else {
+            	origVsShards.put(suggestion.getToken(), ++origShards);
+            }            
 
             // find best suggestions
             for (int i = 0; i < suggestion.getNumFound(); i++) {
@@ -296,6 +313,13 @@ public class SpellCheckComponent extends
     SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it
need tokens beforehand?
     for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet())
{
       String original = entry.getKey();
+      
+      //Only use this suggestion if all shards reported it as misspelled.
+      Integer numShards = origVsShards.get(original);
+      if(numShards<totalNumberShardResponses) {
+      	continue;
+      }
+      
       HashSet<String> suggested = entry.getValue();
       SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
       for (String suggestion : suggested) {
@@ -335,7 +359,7 @@ public class SpellCheckComponent extends
     }
     
     NamedList response = new SimpleOrderedMap();
-    response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
+    response.add("suggestions", toNamedList(false, result, origQuery, extendedResults, collate));
     rb.rsp.add("spellcheck", response);
   }
 
@@ -383,7 +407,7 @@ public class SpellCheckComponent extends
     return spellCheckers.get(name);
   }
 
-  protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean
extendedResults, boolean collate) {
+  protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String
origQuery, boolean extendedResults, boolean collate) {
     NamedList result = new NamedList();
     Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
     boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
@@ -393,15 +417,23 @@ public class SpellCheckComponent extends
       best = new LinkedHashMap<Token, String>(suggestions.size());
     }
     
+    int numSuggestions = 0;
+    for(LinkedHashMap<String, Integer> theSuggestion : suggestions.values())
+    {
+    	if(theSuggestion.size()>0)
+    	{
+    		numSuggestions++;
+    	}
+    }    
     // will be flipped to false if any of the suggestions are not in the index and hasFreqInfo
is true
-    if(suggestions.size() > 0) {
+    if(numSuggestions > 0) {
       isCorrectlySpelled = true;
     }
     
     for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet())
{
       Token inputToken = entry.getKey();
       Map<String, Integer> theSuggestions = entry.getValue();
-      if (theSuggestions != null && theSuggestions.size() > 0) {
+      if (theSuggestions != null && (theSuggestions.size()>0 || shardRequest))
{
         SimpleOrderedMap suggestionList = new SimpleOrderedMap();
         suggestionList.add("numFound", theSuggestions.size());
         suggestionList.add("startOffset", inputToken.startOffset());
@@ -430,7 +462,7 @@ public class SpellCheckComponent extends
           suggestionList.add("suggestion", theSuggestions.keySet());
         }
 
-        if (collate == true ){//set aside the best suggestion for this token
+        if (collate == true && theSuggestions.size()>0){//set aside the best suggestion
for this token
           best.put(inputToken, theSuggestions.keySet().iterator().next());
         }
         if (hasFreqInfo) {

Modified: lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java?rev=995972&r1=995971&r2=995972&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java Fri Sep 10 20:31:52 2010
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 
@@ -41,6 +42,8 @@ import org.apache.lucene.search.spell.Sp
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
@@ -169,6 +172,12 @@ public abstract class AbstractLuceneSpel
 
   @Override
   public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
+  	boolean shardRequest = false;
+  	SolrParams params = options.customParams;
+  	if(params!=null)
+  	{
+  		shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
+  	}
     SpellingResult result = new SpellingResult(options.tokens);
     IndexReader reader = determineReader(options.reader);
     Term term = field != null ? new Term(field, "") : null;
@@ -183,7 +192,7 @@ public abstract class AbstractLuceneSpel
             field,
             options.onlyMorePopular, theAccuracy);
       if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
-        //These are spelled the same, continue on
+      	//These are spelled the same, continue on
         continue;
       }
 
@@ -191,9 +200,15 @@ public abstract class AbstractLuceneSpel
         term = term.createTerm(tokenText);
         result.add(token, reader.docFreq(term));
         int countLimit = Math.min(options.count, suggestions.length);
-        for (int i = 0; i < countLimit; i++) {
-          term = term.createTerm(suggestions[i]);
-          result.add(token, suggestions[i], reader.docFreq(term));
+        if(countLimit>0)
+        {
+	        for (int i = 0; i < countLimit; i++) {
+	          term = term.createTerm(suggestions[i]);
+	          result.add(token, suggestions[i], reader.docFreq(term));
+	        }
+        } else if(shardRequest) {
+        	List<String> suggList = Collections.emptyList();
+        	result.add(token, suggList);
         }
       } else {
         if (suggestions.length > 0) {
@@ -202,6 +217,9 @@ public abstract class AbstractLuceneSpel
             suggList = suggList.subList(0, options.count);
           }
           result.add(token, suggList);
+        } else if(shardRequest) {
+        	List<String> suggList = Collections.emptyList();
+        	result.add(token, suggList);
         }
       }
     }

Modified: lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java?rev=995972&r1=995971&r2=995972&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java Fri Sep 10 20:31:52 2010
@@ -1,8 +1,11 @@
 package org.apache.solr.handler.component;
 
+import java.io.File;
+
 import org.apache.solr.BaseDistributedSearchTestCase;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.util.AbstractSolrTestCase;
 
 /**
  * Test for SpellCheckComponent's distributed querying
@@ -13,6 +16,12 @@ import org.apache.solr.common.params.Mod
  */
 public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
   
+	public DistributedSpellCheckComponentTest()
+	{
+		//fixShardCount=true;
+		//shardCount=2;
+	}
+	
   private String saveProp;
   @Override
   public void setUp() throws Exception {
@@ -49,6 +58,7 @@ public class DistributedSpellCheckCompon
   
   @Override
   public void doTest() throws Exception {
+  	del("*:*");
     index(id, "1", "lowerfilt", "toyota");
     index(id, "2", "lowerfilt", "chevrolet");
     index(id, "3", "lowerfilt", "suzuki");
@@ -60,6 +70,18 @@ public class DistributedSpellCheckCompon
     index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs.");
     index(id, "10", "lowerfilt", "blue");
     index(id, "12", "lowerfilt", "glue");
+    index(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "15", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "16", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "17", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "19", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "21", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "24", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
     commit();
 
     handle.clear();



Mime
View raw message