Return-Path: Delivered-To: apmail-lucene-commits-archive@www.apache.org Received: (qmail 50267 invoked from network); 5 Jan 2011 03:42:40 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 5 Jan 2011 03:42:40 -0000 Received: (qmail 34789 invoked by uid 500); 5 Jan 2011 03:42:40 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 34782 invoked by uid 99); 5 Jan 2011 03:42:39 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 05 Jan 2011 03:42:39 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 05 Jan 2011 03:42:37 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id EC9C623889BF; Wed, 5 Jan 2011 03:42:15 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1055289 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/CHANGES.txt lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java solr/ Date: Wed, 05 Jan 2011 03:42:15 -0000 To: commits@lucene.apache.org From: rmuir@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20110105034215.EC9C623889BF@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: rmuir Date: Wed Jan 5 03:42:15 2011 New Revision: 1055289 URL: http://svn.apache.org/viewvc?rev=1055289&view=rev Log: LUCENE-2391: improve Spellchecker indexing speed Modified: lucene/dev/branches/branch_3x/ (props changed) lucene/dev/branches/branch_3x/lucene/ (props changed) lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt 
lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java lucene/dev/branches/branch_3x/solr/ (props changed) Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1055289&r1=1055288&r2=1055289&view=diff ============================================================================== --- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original) +++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Wed Jan 5 03:42:15 2011 @@ -26,6 +26,14 @@ Changes in backwards compatibility polic * LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods are used to set pre/post tags and Encoder. (Koji Sekiguchi) + + * LUCENE-2391: Improved spellchecker (re)build time/ram usage by omitting + frequencies/positions/norms for single-valued fields, modifying the default + ramBufferMBSize to match IndexWriterConfig (16MB), making index optimization + an optional boolean parameter, and modifying the incremental update logic + to work well with unoptimized spellcheck indexes. The indexDictionary() methods + were made final to ensure a hard backwards break in case you were subclassing + Spellchecker. In general, subclassing Spellchecker is not recommended. 
(Robert Muir) Changes in runtime behavior Modified: lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java?rev=1055289&r1=1055288&r2=1055289&view=diff ============================================================================== --- lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java (original) +++ lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java Wed Jan 5 03:42:15 2011 @@ -18,8 +18,10 @@ package org.apache.lucene.search.spell; */ import java.io.IOException; +import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; +import java.util.List; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; @@ -38,7 +40,9 @@ import org.apache.lucene.search.ScoreDoc import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.Version; +import org.apache.lucene.util.VirtualMethod; /** *

@@ -492,35 +496,56 @@ public class SpellChecker implements jav * @param dict Dictionary to index * @param mergeFactor mergeFactor to use when indexing * @param ramMB the max amount or memory in MB to use + * @param optimize whether or not the spellcheck index should be optimized * @throws AlreadyClosedException if the Spellchecker is already closed * @throws IOException */ - public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException { + public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException { synchronized (modifyCurrentIndexLock) { ensureOpen(); final Directory dir = this.spellIndex; final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB)); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor); + IndexSearcher indexSearcher = obtainSearcher(); + final List<IndexReader> readers = new ArrayList<IndexReader>(); + + if (searcher.maxDoc() > 0) { + ReaderUtil.gatherSubReaders(readers, searcher.getIndexReader()); + } + + boolean isEmpty = readers.isEmpty(); + + try { + Iterator<String> iter = dict.getWordsIterator(); + + terms: while (iter.hasNext()) { + String word = iter.next(); + + int len = word.length(); + if (len < 3) { + continue; // too short we bail but "too long" is fine... + } - Iterator<String> iter = dict.getWordsIterator(); - while (iter.hasNext()) { - String word = iter.next(); - - int len = word.length(); - if (len < 3) { - continue; // too short we bail but "too long" is fine... 
- } + if (!isEmpty) { + // we have a non-empty index, check if the term exists + Term term = F_WORD_TERM.createTerm(word); + for (IndexReader ir : readers) { + if (ir.docFreq(term) > 0) { + continue terms; + } + } + } - if (this.exist(word)) { // if the word already exist in the gramindex - continue; + // ok index the word + Document doc = createDocument(word, getMin(len), getMax(len)); + writer.addDocument(doc); } - - // ok index the word - Document doc = createDocument(word, getMin(len), getMax(len)); - writer.addDocument(doc); + } finally { + releaseSearcher(indexSearcher); } // close writer - writer.optimize(); + if (optimize) + writer.optimize(); writer.close(); // also re-open the spell index to see our own changes when the next suggestion // is fetched: @@ -531,10 +556,21 @@ public class SpellChecker implements jav /** * Indexes the data from the given {@link Dictionary}. * @param dict the dictionary to index + * @param mergeFactor mergeFactor to use when indexing + * @param ramMB the max amount or memory in MB to use + * @throws IOException + */ + public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException { + indexDictionary(dict, mergeFactor, ramMB, true); + } + + /** + * Indexes the data from the given {@link Dictionary}. + * @param dict the dictionary to index * @throws IOException */ - public void indexDictionary(Dictionary dict) throws IOException { - indexDictionary(dict, 300, 10); + public final void indexDictionary(Dictionary dict) throws IOException { + indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); } private static int getMin(int l) { @@ -559,7 +595,12 @@ public class SpellChecker implements jav private static Document createDocument(String text, int ng1, int ng2) { Document doc = new Document(); - doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term + // the word field is never queried on... 
its indexed so it can be quickly + // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos + Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED); + f.setOmitTermFreqAndPositions(true); + f.setOmitNorms(true); + doc.add(f); // orig term addGram(text, doc, ng1, ng2); return doc; } @@ -573,12 +614,20 @@ public class SpellChecker implements jav String gram = text.substring(i, i + ng); doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); if (i == 0) { - doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); + // only one term possible in the startXXField, TF/pos and norms aren't needed. + Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED); + startField.setOmitTermFreqAndPositions(true); + startField.setOmitNorms(true); + doc.add(startField); } end = gram; } if (end != null) { // may not be present if len==ng1 - doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED)); + // only one term possible in the endXXField, TF/pos and norms aren't needed. + Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED); + endField.setOmitTermFreqAndPositions(true); + endField.setOmitNorms(true); + doc.add(endField); } } }