Return-Path: Delivered-To: apmail-lucene-solr-commits-archive@minotaur.apache.org Received: (qmail 97849 invoked from network); 9 Dec 2009 13:23:22 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 9 Dec 2009 13:23:22 -0000 Received: (qmail 61711 invoked by uid 500); 9 Dec 2009 13:23:22 -0000 Delivered-To: apmail-lucene-solr-commits-archive@lucene.apache.org Received: (qmail 61643 invoked by uid 500); 9 Dec 2009 13:23:22 -0000 Mailing-List: contact solr-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: solr-dev@lucene.apache.org Delivered-To: mailing list solr-commits@lucene.apache.org Received: (qmail 61627 invoked by uid 99); 9 Dec 2009 13:23:22 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 09 Dec 2009 13:23:22 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 09 Dec 2009 13:23:14 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 05F7E23888E9; Wed, 9 Dec 2009 13:22:53 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r888796 - in /lucene/solr/trunk: ./ src/java/org/apache/solr/handler/component/ src/java/org/apache/solr/spelling/ src/test/org/apache/solr/handler/component/ Date: Wed, 09 Dec 2009 13:22:52 -0000 To: solr-commits@lucene.apache.org From: shalin@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20091209132253.05F7E23888E9@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: shalin Date: Wed Dec 9 13:22:52 2009 New Revision: 888796 URL: http://svn.apache.org/viewvc?rev=888796&view=rev Log: SOLR-785 -- Distributed Search support for SpellCheckComponent Added: lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java (with props) Modified: lucene/solr/trunk/CHANGES.txt lucene/solr/trunk/src/java/org/apache/solr/handler/component/SpellCheckComponent.java lucene/solr/trunk/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java Modified: lucene/solr/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=888796&r1=888795&r2=888796&view=diff ============================================================================== --- lucene/solr/trunk/CHANGES.txt (original) +++ lucene/solr/trunk/CHANGES.txt Wed Dec 9 13:22:52 2009 @@ -53,6 +53,9 @@ * SOLR-1571: Added unicode collation support though Lucene's CollationKeyFilter (Robert Muir via shalin) +* SOLR-785: Distributed Search support for SpellCheckComponent + (Matthew Woytowitz, shalin) + Optimizations ---------------------- Modified: lucene/solr/trunk/src/java/org/apache/solr/handler/component/SpellCheckComponent.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/component/SpellCheckComponent.java?rev=888796&r1=888795&r2=888796&view=diff ============================================================================== --- lucene/solr/trunk/src/java/org/apache/solr/handler/component/SpellCheckComponent.java (original) +++ lucene/solr/trunk/src/java/org/apache/solr/handler/component/SpellCheckComponent.java Wed Dec 9 13:22:52 2009 @@ -19,14 +19,13 @@ import java.io.IOException; import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Collections; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; + +import org.apache.lucene.search.spell.LevensteinDistance; +import org.apache.lucene.search.spell.StringDistance; +import org.apache.lucene.util.PriorityQueue; +import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -152,6 +151,217 @@ } } + static class SuggestWordQueue extends PriorityQueue { + SuggestWordQueue(int size) { + initialize(size); + } + + @Override + protected boolean lessThan(Object a, Object b) { + SuggestWord wa = (SuggestWord) a; + SuggestWord wb = (SuggestWord) b; + int val = wa.compareTo(wb); + return val < 0; + } + } + + /** + * Borrowed from Lucene SpellChecker + */ + static class SuggestWord { + /** + * the score of the word + */ + public float score; + + /** + * The freq of the word + */ + public int freq; + + /** + * the suggested word + */ + public String string; + + public final int compareTo(SuggestWord a) { + // first criteria: the edit distance + if (score > a.score) { + return 1; + } + if (score < a.score) { + return -1; + } + + // second criteria (if first criteria is equal): the popularity + if (freq > a.freq) { + return 1; + } + + if (freq < a.freq) { + return -1; + } + return 0; + } + } + + @Override + public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { + SolrParams params = rb.req.getParams(); + // Turn on spellcheck only only when retrieving fields + if (!params.getBool(COMPONENT_NAME, false)) return; + if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) { + // fetch at least 5 suggestions from each shard + int count = sreq.params.getInt(SPELLCHECK_COUNT, 1); + if (count < 5) count = 5; + sreq.params.set(SPELLCHECK_COUNT, count); + sreq.params.set("spellcheck", "true"); + } else { + sreq.params.set("spellcheck", "false"); + } + } + + @Override + @SuppressWarnings({"unchecked", "deprecation"}) + public void finishStage(ResponseBuilder rb) { + SolrParams params = rb.req.getParams(); + if (!params.getBool(COMPONENT_NAME, false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) + return; + + boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false); + boolean collate = params.getBool(SPELLCHECK_COLLATE, false); + + String origQuery = params.get(SPELLCHECK_Q); + if (origQuery == null) { + origQuery = rb.getQueryString(); + if (origQuery == null) { + origQuery = params.get(CommonParams.Q); + } + } + + int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1); + float min = 0.5f; + StringDistance sd = null; + int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT); + SolrSpellChecker checker = getSpellChecker(rb.req.getParams()); + if (checker instanceof AbstractLuceneSpellChecker) { + AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker; + min = spellChecker.getAccuracy(); + sd = spellChecker.getStringDistance(); + } + if (sd == null) + sd = new LevensteinDistance(); + + Collection tokens = null; + try { + tokens = getTokens(origQuery, checker.getQueryAnalyzer()); + } catch (IOException e) { + LOG.error("Could not get tokens (this should never happen)", e); + } + + // original token -> corresponding Suggestion object (keep track of start,end) + Map origVsSuggestion = new HashMap(); + // original token string -> summed up frequency + Map origVsFreq = new HashMap(); + // original token string -> set of alternatives + // must preserve order because collation algorithm can only work in-order + Map> origVsSuggested = new LinkedHashMap>(); + // alternative string -> corresponding SuggestWord object + Map suggestedVsWord = new HashMap(); + + for (ShardRequest sreq : rb.finished) { + for (ShardResponse srsp : sreq.responses) { + NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck"); + LOG.info(srsp.getShard() + " " + nl); + if (nl != null) { + SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl); + for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) { + origVsSuggestion.put(suggestion.getToken(), suggestion); + HashSet suggested = origVsSuggested.get(suggestion.getToken()); + if (suggested == null) { + suggested = new HashSet(); + origVsSuggested.put(suggestion.getToken(), suggested); + } + + // sum up original frequency + int origFreq = 0; + Integer o = origVsFreq.get(suggestion.getToken()); + if (o != null) origFreq += o; + origFreq += suggestion.getOriginalFrequency(); + origVsFreq.put(suggestion.getToken(), origFreq); + + // find best suggestions + for (int i = 0; i < suggestion.getNumFound(); i++) { + String alternative = suggestion.getAlternatives().get(i); + suggested.add(alternative); + SuggestWord sug = suggestedVsWord.get(alternative); + if (sug == null) { + sug = new SuggestWord(); + suggestedVsWord.put(alternative, sug); + } + sug.string = alternative; + // alternative frequency is present only for extendedResults=true + if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) { + Integer freq = suggestion.getAlternativeFrequencies().get(i); + if (freq != null) sug.freq += freq; + } + } + } + } + } + } + + // all shard responses have been collected + // create token and get top suggestions + SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand? + for (Map.Entry> entry : origVsSuggested.entrySet()) { + String original = entry.getKey(); + HashSet suggested = entry.getValue(); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + for (String suggestion : suggested) { + SuggestWord sug = suggestedVsWord.get(suggestion); + sug.score = sd.getDistance(original, sug.string); + if (sug.score < min) continue; + sugQueue.insertWithOverflow(sug); + if (sugQueue.size() == numSug) { + // if queue full, maintain the minScore score + min = ((SuggestWord) sugQueue.top()).score; + } + } + + // create token + SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original); + Token token = new Token(); + token.setTermText(original); + token.setStartOffset(suggestion.getStartOffset()); + token.setEndOffset(suggestion.getEndOffset()); + + // get top 'count' suggestions out of 'sugQueue.size()' candidates + SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())]; + // skip the first sugQueue.size() - count elements + for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop(); + // now collect the top 'count' responses + for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) { + suggestions[k] = ((SuggestWord) sugQueue.pop()); + } + + if (extendedResults) { + Integer o = origVsFreq.get(original); + if (o != null) result.add(token, o); + for (SuggestWord word : suggestions) + result.add(token, word.string, word.freq); + } else { + List words = new ArrayList(sugQueue.size()); + for (SuggestWord word : suggestions) words.add(word.string); + result.add(token, words); + } + } + + NamedList response = new SimpleOrderedMap(); + response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate)); + rb.rsp.add("spellcheck", response); + } + private Collection getTokens(String q, Analyzer analyzer) throws IOException { Collection result = new ArrayList(); Token token = null; Modified: lucene/solr/trunk/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java?rev=888796&r1=888795&r2=888796&view=diff ============================================================================== --- lucene/solr/trunk/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java (original) +++ lucene/solr/trunk/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java Wed Dec 9 13:22:52 2009 @@ -77,6 +77,8 @@ protected float accuracy = 0.5f; public static final String FIELD = "field"; + protected StringDistance sd; + public String init(NamedList config, SolrCore core) { super.init(config, core); indexDir = (String) config.get(INDEX_DIR); @@ -90,7 +92,6 @@ sourceLocation = (String) config.get(LOCATION); field = (String) config.get(FIELD); String strDistanceName = (String)config.get(STRING_DISTANCE); - StringDistance sd = null; if (strDistanceName != null) { sd = (StringDistance) core.getResourceLoader().newInstance(strDistanceName); //TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils... @@ -226,4 +227,8 @@ public String getSourceLocation() { return sourceLocation; } + + public StringDistance getStringDistance() { + return sd; + } } Added: lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java?rev=888796&view=auto ============================================================================== --- lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java (added) +++ lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java Wed Dec 9 13:22:52 2009 @@ -0,0 +1,39 @@ +package org.apache.solr.handler.component; + +import org.apache.solr.BaseDistributedSearchTestCase; + +/** + * Test for SpellCheckComponent's distributed querying + * + * @since solr 1.5 + * @version $Id$ + * @see org.apache.solr.handler.component.SpellCheckComponent + */ +public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase { + + @Override + public void doTest() throws Exception { + index(id, "1", "lowerfilt", "toyota"); + index(id, "2", "lowerfilt", "chevrolet"); + index(id, "3", "lowerfilt", "suzuki"); + index(id, "4", "lowerfilt", "ford"); + index(id, "5", "lowerfilt", "ferrari"); + index(id, "6", "lowerfilt", "jaguar"); + index(id, "7", "lowerfilt", "mclaren"); + index(id, "8", "lowerfilt", "sonata"); + index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs."); + index(id, "10", "lowerfilt", "blue"); + index(id, "12", "lowerfilt", "glue"); + commit(); + + handle.clear(); + handle.put("QTime", SKIPVAL); + handle.put("timestamp", SKIPVAL); + handle.put("maxScore", SKIPVAL); + + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4"); + query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true"); + } +} Propchange: lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java ------------------------------------------------------------------------------ svn:keywords = Date Author Id Revision HeadURL