Date: Wed, 15 Jan 2020 15:47:34 +0000
From: romseygeek@apache.org
To: "commits@lucene.apache.org"
Reply-To: dev@lucene.apache.org
Subject: [lucene-solr] branch branch_8x updated: LUCENE-9068: Build FuzzyQuery automata up-front (#1042)
Message-ID: <157910325449.4275.11672866318328880986@gitbox.apache.org>
X-Git-Repo: lucene-solr
X-Git-Refname: refs/heads/branch_8x
X-Git-Reftype: branch
X-Git-Oldrev: f56f51fc3ebc58bc69d8c381af9d6ef6671619b5
X-Git-Newrev: 32af73511f9c9ddc591fd3471da66e854f4bd7f2
Auto-Submitted: auto-generated

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 32af735  LUCENE-9068: Build FuzzyQuery automata up-front (#1042)
32af735 is described below

commit 32af73511f9c9ddc591fd3471da66e854f4bd7f2
Author: Alan Woodward
AuthorDate: Wed Jan 15 14:58:11 2020 +0000

    LUCENE-9068: Build FuzzyQuery automata up-front (#1042)

    FuzzyTermsEnum can now either take an array of compiled automata and an
    AttributeSource, to be used across multiple segments (e.g. during
    FuzzyQuery rewrite); or it can take a term, edit distance, prefix length
    and transpositions flag and build the automata itself when it is only
    used once (e.g. for fuzzy nearest-neighbour calculations). Rather than
    interacting via attribute sources and specialized attributes, users of
    FuzzyTermsEnum can now read the boost and set the maximum
    non-competitive boost directly on the enum.
---
 lucene/CHANGES.txt                                  |   2 +
 .../classification/utils/NearestFuzzyQuery.java    |  14 +-
 .../java/org/apache/lucene/search/FuzzyQuery.java  |  50 +++--
 .../org/apache/lucene/search/FuzzyTermsEnum.java   | 203 ++++++++-------------
 .../apache/lucene/util/TestRamUsageEstimator.java  |  22 ++-
 .../lucene/sandbox/queries/FuzzyLikeThisQuery.java |   9 +-
 .../lucene/search/spell/DirectSpellChecker.java    |  32 ++--
 7 files changed, 154 insertions(+), 178 deletions(-)
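[Editor's note] To illustrate the enum-side API change described in the commit message above, here is a minimal usage sketch. It is not part of the commit; the field name "body" and the probe term "lucene" are invented for illustration, and only the single-use FuzzyTermsEnum constructor, getBoost() and setMaxNonCompetitiveBoost() signatures visible in the diff below are assumed.

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiTerms;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.FuzzyTermsEnum;
    import org.apache.lucene.util.BytesRef;

    public class FuzzyTermsEnumSketch {

      /** Returns the best-scoring term within 2 edits of "lucene" in a hypothetical "body" field. */
      static BytesRef bestFuzzyMatch(IndexReader reader) throws IOException {
        Terms terms = MultiTerms.getTerms(reader, "body");
        if (terms == null) {
          return null;
        }
        // Single-use constructor added by this change: the enum builds its own
        // automata, so no AttributeSource has to be threaded through by the caller.
        FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, new Term("body", "lucene"), 2, 0, true);
        BytesRef best = null;
        float bestBoost = Float.NEGATIVE_INFINITY;
        BytesRef candidate;
        while ((candidate = fe.next()) != null) {
          float boost = fe.getBoost();   // boost is now read straight off the enum
          if (boost > bestBoost) {
            bestBoost = boost;
            best = BytesRef.deepCopyOf(candidate);
            // Publish the new floor so the enum may switch to a cheaper
            // (lower max-edit) automaton while iterating.
            fe.setMaxNonCompetitiveBoost(bestBoost);
          }
        }
        return best;
      }
    }

The same pattern, with a real priority queue of candidates, is what NearestFuzzyQuery, FuzzyLikeThisQuery and DirectSpellChecker switch to in the diff below.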
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c2fda6a..16b9f90 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -59,6 +59,8 @@ Other
 * LUCENE-9096: Simplification of CompressingTermVectorsWriter#flushOffsets.
   (kkewwei via Adrien Grand)
 
+* LUCENE-9068: FuzzyQuery builds its Automaton up-front (Alan Woodward, Mike Drob)
+
 ======================= Lucene 8.4.1 =======================
 
 Bug Fixes
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
index 24c8227..088af3b 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
@@ -34,14 +34,11 @@ import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BoostAttribute;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.FuzzyTermsEnum;
-import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -158,27 +155,22 @@ public class NearestFuzzyQuery extends Query {
       ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
       float minScore = 0;
       Term startTerm = new Term(f.fieldName, term);
-      AttributeSource atts = new AttributeSource();
-      MaxNonCompetitiveBoostAttribute maxBoostAtt =
-          atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
-      FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);
+      FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
       //store the df so all variants use same idf
       int df = reader.docFreq(startTerm);
       int numVariants = 0;
       int totalVariantDocFreqs = 0;
       BytesRef possibleMatch;
-      BoostAttribute boostAtt =
-          fe.attributes().addAttribute(BoostAttribute.class);
       while ((possibleMatch = fe.next()) != null) {
         numVariants++;
         totalVariantDocFreqs += fe.docFreq();
-        float score = boostAtt.getBoost();
+        float score = fe.getBoost();
         if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
           ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
           variantsQ.insertWithOverflow(st);
           minScore = variantsQ.top().score; // maintain minScore
         }
-        maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
+        fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ?
minScore : Float.NEGATIVE_INFINITY); } if (numVariants > 0) { diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index 01eac22..f19aa3b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -23,11 +23,11 @@ import org.apache.lucene.index.SingleTermsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.Accountable; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; -import org.apache.lucene.util.automaton.Operations; /** Implements the fuzzy search query. The similarity measurement * is based on the Damerau-Levenshtein (optimal string alignment) algorithm, @@ -53,7 +53,9 @@ import org.apache.lucene.util.automaton.Operations; * not match an indexed term "ab", and FuzzyQuery on term "a" with maxEdits=2 will not * match an indexed term "abc". */ -public class FuzzyQuery extends MultiTermQuery { +public class FuzzyQuery extends MultiTermQuery implements Accountable { + + private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(AutomatonQuery.class); public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int defaultPrefixLength = 0; @@ -65,6 +67,10 @@ public class FuzzyQuery extends MultiTermQuery { private final boolean transpositions; private final int prefixLength; private final Term term; + private final int termLength; + private final CompiledAutomaton[] automata; + + private final long ramBytesUsed; /** * Create a new FuzzyQuery that will match terms with an edit distance @@ -100,7 +106,22 @@ public class FuzzyQuery extends MultiTermQuery { this.prefixLength = prefixLength; this.transpositions = transpositions; this.maxExpansions = maxExpansions; + int[] codePoints = FuzzyTermsEnum.stringToUTF32(term.text()); + this.termLength = codePoints.length; + this.automata = FuzzyTermsEnum.buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits); setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions)); + this.ramBytesUsed = calculateRamBytesUsed(term, this.automata); + } + + private static long calculateRamBytesUsed(Term term, CompiledAutomaton[] automata) { + long bytes = BASE_RAM_BYTES + term.ramBytesUsed(); + for (CompiledAutomaton a : automata) { + bytes += a.ramBytesUsed(); + } + bytes += 4 * Integer.BYTES; + bytes += Long.BYTES; + bytes += 1; + return bytes; } /** @@ -150,10 +171,10 @@ public class FuzzyQuery extends MultiTermQuery { } /** - * Expert: Constructs an equivalent Automaton accepting terms matched by this query + * Returns the compiled automata used to match terms */ - public Automaton toAutomaton() { - return FuzzyTermsEnum.buildAutomaton(term.text(), prefixLength, transpositions, maxEdits); + public CompiledAutomaton[] getAutomata() { + return automata; } @Override @@ -162,9 +183,7 @@ public class FuzzyQuery extends MultiTermQuery { if (maxEdits == 0 || prefixLength >= term.text().length()) { visitor.consumeTerms(this, term); } else { - // Note: we're rebuilding the automaton here, so this can be 
expensive - visitor.consumeTermsMatching(this, field, - new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES)); + automata[automata.length - 1].visit(visitor, this, field); } } } @@ -174,7 +193,7 @@ public class FuzzyQuery extends MultiTermQuery { if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(), term.bytes()); } - return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); + return new FuzzyTermsEnum(terms, atts, getTerm(), termLength, maxEdits, automata); } /** @@ -193,7 +212,7 @@ public class FuzzyQuery extends MultiTermQuery { } buffer.append(term.text()); buffer.append('~'); - buffer.append(Integer.toString(maxEdits)); + buffer.append(maxEdits); return buffer.toString(); } @@ -218,6 +237,8 @@ public class FuzzyQuery extends MultiTermQuery { if (getClass() != obj.getClass()) return false; FuzzyQuery other = (FuzzyQuery) obj; + // Note that we don't need to compare termLength or automata because they + // are entirely determined by the other fields if (maxEdits != other.maxEdits) return false; if (prefixLength != other.prefixLength) @@ -260,4 +281,9 @@ public class FuzzyQuery extends MultiTermQuery { LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); } } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 4759220..91a44d5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -18,18 +18,13 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.Arrays; -import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.Attribute; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.AttributeReflector; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -37,6 +32,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; /** Subclass of TermsEnum for enumerating all terms that are similar * to the specified filter term. @@ -45,10 +41,12 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata; * {@link BytesRef#compareTo}. Each term in the enumeration is * greater than all that precede it.

*/ -public final class FuzzyTermsEnum extends BaseTermsEnum { +public final class FuzzyTermsEnum extends TermsEnum { // NOTE: we can't subclass FilteredTermsEnum here because we need to sometimes change actualEnum: private TermsEnum actualEnum; + + private final AttributeSource atts; // We use this to communicate the score (boost) of the current matched term we are on back to // MultiTermQuery.TopTermsBlendedFreqScoringRewrite that is collecting the best (default 50) matched terms: @@ -58,30 +56,46 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { // which we use to know when we can reduce the automaton from ed=2 to ed=1, or ed=0 if only single top term is collected: private final MaxNonCompetitiveBoostAttribute maxBoostAtt; - // We use this to share the pre-built (once for the query) Levenshtein automata across segments: - private final LevenshteinAutomataAttribute dfaAtt; + private final CompiledAutomaton[] automata; private float bottom; private BytesRef bottomTerm; - private final CompiledAutomaton automata[]; private BytesRef queuedBottom; - final int termLength; + private final int termLength; // Maximum number of edits we will accept. This is either 2 or 1 (or, degenerately, 0) passed by the user originally, // but as we collect terms, we can lower this (e.g. from 2 to 1) if we detect that the term queue is full, and all // collected terms are ed=1: private int maxEdits; - final Terms terms; - final Term term; - final int termText[]; - final int realPrefixLength; + private final Terms terms; + private final Term term; + + /** + * Constructor for enumeration of all terms from specified reader which share a prefix of + * length prefixLength with term and which have at most {@code maxEdits} edits. + *

+ * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * + * @param terms Delivers terms. + * @param term Pattern term. + * @param maxEdits Maximum edit distance. + * @param prefixLength the length of the required common prefix + * @param transpositions whether transpositions should count as a single edit + * @throws IOException if there is a low-level IO error + */ + public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException { + this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions); + } + + private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException { + this(terms, new AttributeSource(), term, codePoints.length, maxEdits, + buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits)); + } - // True (the default, in FuzzyQuery) if a transposition should count as a single edit: - final boolean transpositions; - /** * Constructor for enumeration of all terms from specified reader which share a prefix of * length prefixLength with term and which have at most {@code maxEdits} edits. @@ -91,72 +105,62 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { * * @param terms Delivers terms. * @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery} - * thats contains information about competitive boosts during rewrite. It is also used - * to cache DFAs between segment transitions. + * that contains information about competitive boosts during rewrite * @param term Pattern term. * @param maxEdits Maximum edit distance. - * @param prefixLength Length of required common prefix. Default value is 0. + * @param automata An array of levenshtein automata to match against terms, + * see {@link #buildAutomata(String, int[], int, boolean, int)} * @throws IOException if there is a low-level IO error */ - public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, - final int maxEdits, final int prefixLength, boolean transpositions) throws IOException { - if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { - throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits); - } - if (prefixLength < 0) { - throw new IllegalArgumentException("prefixLength cannot be less than 0"); - } + public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength, + final int maxEdits, CompiledAutomaton[] automata) throws IOException { + this.maxEdits = maxEdits; this.terms = terms; this.term = term; - - // convert the string into a utf32 int[] representation for fast comparisons - this.termText = stringToUTF32(term.text()); - this.termLength = termText.length; + this.atts = atts; + this.termLength = termLength; - this.dfaAtt = atts.addAttribute(LevenshteinAutomataAttribute.class); this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); + this.boostAtt = atts.addAttribute(BoostAttribute.class); - // NOTE: boostAtt must pulled from attributes() not from atts! This is because TopTermsRewrite looks for boostAtt from this TermsEnum's - // private attributes() and not the global atts passed to us from MultiTermQuery: - this.boostAtt = attributes().addAttribute(BoostAttribute.class); - - //The prefix could be longer than the word. - //It's kind of silly though. 
It means we must match the entire word. - this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength; - this.transpositions = transpositions; - - CompiledAutomaton[] prevAutomata = dfaAtt.automata(); - if (prevAutomata == null) { - prevAutomata = new CompiledAutomaton[maxEdits+1]; - Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits); - for (int i = 0; i <= maxEdits; i++) { - prevAutomata[i] = new CompiledAutomaton(automata[i], true, false); - } - // first segment computes the automata, and we share with subsequent segments via this Attribute: - dfaAtt.setAutomata(prevAutomata); - } + this.automata = automata; - this.automata = prevAutomata; bottom = maxBoostAtt.getMaxNonCompetitiveBoost(); bottomTerm = maxBoostAtt.getCompetitiveTerm(); bottomChanged(null); } /** - * Builds a binary Automaton to match a fuzzy term - * @param text the term to match - * @param prefixLength length of a required common prefix - * @param transpositions {@code true} if transpositions should count as a single edit - * @param maxEdits the maximum edit distance of matching terms + * Sets the maximum non-competitive boost, which may allow switching to a + * lower max-edit automaton at run time */ - public static Automaton buildAutomaton(String text, int prefixLength, boolean transpositions, int maxEdits) { - int[] termText = stringToUTF32(text); + public void setMaxNonCompetitiveBoost(float boost) { + this.maxBoostAtt.setMaxNonCompetitiveBoost(boost); + } + + /** + * Gets the boost of the current term + */ + public float getBoost() { + return boostAtt.getBoost(); + } + + static CompiledAutomaton[] buildAutomata(String text, int[] termText, int prefixLength, boolean transpositions, int maxEdits) { + CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1]; Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits); - return automata[automata.length - 1]; + for (int i = 0; i <= maxEdits; i++) { + try { + compiled[i] = new CompiledAutomaton(automata[i], true, false); + } + catch (TooComplexToDeterminizeException e) { + throw new FuzzyTermsException(text, e); + } + } + return compiled; } - private static int[] stringToUTF32(String text) { + static int[] stringToUTF32(String text) { int[] termText = new int[text.codePointCount(0, text.length())]; for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) { termText[j++] = cp = text.codePointAt(i); @@ -323,7 +327,12 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { public long ord() throws IOException { return actualEnum.ord(); } - + + @Override + public AttributeSource attributes() { + return atts; + } + @Override public boolean seekExact(BytesRef text) throws IOException { return actualEnum.seekExact(text); @@ -345,66 +354,14 @@ public final class FuzzyTermsEnum extends BaseTermsEnum { } /** - * reuses compiled automata across different segments, - * because they are independent of the index - * @lucene.internal */ - public static interface LevenshteinAutomataAttribute extends Attribute { - public CompiledAutomaton[] automata(); - public void setAutomata(CompiledAutomaton[] automata); - } - - /** - * Stores compiled automata as a list (indexed by edit distance) - * @lucene.internal */ - public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute { - private CompiledAutomaton[] automata; - - @Override - public CompiledAutomaton[] automata() { - return automata; - } - - @Override - public void 
setAutomata(CompiledAutomaton[] automata) { - this.automata = automata; - } - - @Override - public void clear() { - automata = null; - } - - @Override - public int hashCode() { - if (automata == null) { - return 0; - } else { - return automata.hashCode(); - } - } - - @Override - public boolean equals(Object other) { - if (this == other) - return true; - if (!(other instanceof LevenshteinAutomataAttributeImpl)) - return false; - return Arrays.equals(automata, ((LevenshteinAutomataAttributeImpl) other).automata); - } - - @Override - public void copyTo(AttributeImpl _target) { - LevenshteinAutomataAttribute target = (LevenshteinAutomataAttribute) _target; - if (automata == null) { - target.setAutomata(null); - } else { - target.setAutomata(automata); - } - } - - @Override - public void reflectWith(AttributeReflector reflector) { - reflector.reflect(LevenshteinAutomataAttribute.class, "automata", automata); + * Thrown to indicate that there was an issue creating a fuzzy query for a given term. + * Typically occurs with terms longer than 220 UTF-8 characters, + * but also possible with shorter terms consisting of UTF-32 code points. + */ + public static class FuzzyTermsException extends RuntimeException { + FuzzyTermsException(String term, Throwable cause) { + super("Term too complex: " + term, cause); } } + } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java b/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java index 8610afc..de993b9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java @@ -17,9 +17,6 @@ package org.apache.lucene.util; -import static org.apache.lucene.util.RamUsageEstimator.*; -import static org.apache.lucene.util.RamUsageTester.sizeOf; - import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -31,9 +28,24 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DisjunctionMaxQuery; -import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; +import static org.apache.lucene.util.RamUsageEstimator.COMPRESSED_REFS_ENABLED; +import static org.apache.lucene.util.RamUsageEstimator.HOTSPOT_BEAN_CLASS; +import static org.apache.lucene.util.RamUsageEstimator.JVM_IS_HOTSPOT_64BIT; +import static org.apache.lucene.util.RamUsageEstimator.LONG_CACHE_MAX_VALUE; +import static org.apache.lucene.util.RamUsageEstimator.LONG_CACHE_MIN_VALUE; +import static org.apache.lucene.util.RamUsageEstimator.LONG_SIZE; +import static org.apache.lucene.util.RamUsageEstimator.MANAGEMENT_FACTORY_CLASS; +import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_ARRAY_HEADER; +import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_ALIGNMENT; +import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_HEADER; +import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; +import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOf; +import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; +import static org.apache.lucene.util.RamUsageTester.sizeOf; + public class TestRamUsageEstimator extends LuceneTestCase { static final String[] strings = new String[] { @@ -161,7 +173,7 @@ public class TestRamUsageEstimator extends LuceneTestCase { Arrays.asList(new TermQuery(new 
Term("foo1", "bar1")), new TermQuery(new Term("baz1", "bam1"))), 1.0f); BooleanQuery bq = new BooleanQuery.Builder() .add(new TermQuery(new Term("foo2", "bar2")), BooleanClause.Occur.SHOULD) - .add(new FuzzyQuery(new Term("foo3", "baz3")), BooleanClause.Occur.MUST_NOT) + .add(new PhraseQuery.Builder().add(new Term("foo3", "baz3")).build(), BooleanClause.Occur.MUST_NOT) .add(dismax, BooleanClause.Occur.MUST) .build(); long actual = sizeOf(bq); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index a0e1b5d..b73bea5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -39,13 +39,11 @@ import org.apache.lucene.search.BoostAttribute; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.FuzzyTermsEnum; -import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.automaton.LevenshteinAutomata; @@ -206,10 +204,7 @@ public class FuzzyLikeThisQuery extends Query ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = new Term(f.fieldName, term); - AttributeSource atts = new AttributeSource(); - MaxNonCompetitiveBoostAttribute maxBoostAtt = - atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); - FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true); + FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true); //store the df so all variants use same idf int df = reader.docFreq(startTerm); int numVariants = 0; @@ -226,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query variantsQ.insertWithOverflow(st); minScore = variantsQ.top().score; // maintain minScore } - maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); + fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? 
minScore : Float.NEGATIVE_INFINITY); } if (numVariants > 0) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index f21d7a1..3800596 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -16,27 +16,24 @@ */ package org.apache.lucene.search.spell; +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Locale; +import java.util.PriorityQueue; + import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; -import org.apache.lucene.search.BoostAttribute; import org.apache.lucene.search.FuzzyTermsEnum; -import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.automaton.LevenshteinAutomata; -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.Locale; -import java.util.PriorityQueue; - /** * Simple automaton-based spellchecker. *

@@ -420,25 +417,20 @@ public class DirectSpellChecker { */ protected Collection suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException { - - AttributeSource atts = new AttributeSource(); - MaxNonCompetitiveBoostAttribute maxBoostAtt = - atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); + Terms terms = MultiTerms.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } - FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); + FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true); final PriorityQueue stQueue = new PriorityQueue<>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); - BoostAttribute boostAtt = - e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { // For FuzzyQuery, boost is the score: - float score = boostAtt.getBoost(); + float score = e.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && score <= stQueue.peek().boost) { continue; @@ -479,7 +471,7 @@ public class DirectSpellChecker { stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); - maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue;
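[Editor's note] The diff above also has FuzzyQuery compile and cache its CompiledAutomaton array in the constructor, implement Accountable, and report automaton-construction failures as FuzzyTermsEnum.FuzzyTermsException. The sketch below is a hedged illustration of how that surface might be exercised from caller code, not part of the commit; the field and term values are invented, and the FuzzyQuery(Term, int) convenience constructor is assumed to be unchanged by this commit.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.FuzzyQuery;
    import org.apache.lucene.search.FuzzyTermsEnum;
    import org.apache.lucene.util.automaton.CompiledAutomaton;

    public class FuzzyQueryUpFrontSketch {

      public static void main(String[] args) {
        try {
          // The Levenshtein automata are now compiled here, in the constructor,
          // rather than lazily during rewrite on the first segment.
          FuzzyQuery query = new FuzzyQuery(new Term("title", "lucene"), 2);

          // One compiled automaton per edit distance (0..maxEdits).
          CompiledAutomaton[] automata = query.getAutomata();
          System.out.println("automata: " + automata.length);

          // FuzzyQuery now implements Accountable, so its automaton-heavy
          // footprint can be reported, e.g. by a query cache.
          System.out.println("ramBytesUsed: " + query.ramBytesUsed());
        } catch (FuzzyTermsEnum.FuzzyTermsException e) {
          // Thrown when no deterministic automaton can be built for the term,
          // typically for very long terms.
          System.err.println(e.getMessage());
        }
      }
    }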