From commits-return-112733-archive-asf-public=cust-asf.ponee.io@lucene.apache.org Wed Jan 15 15:47:36 2020
Return-Path:
reader
which share a prefix of
+ * length prefixLength
with term
and which have at most {@code maxEdits} edits.
+ *
+ * After calling the constructor the enumeration is already pointing to the first
+ * valid term if such a term exists.
+ *
+ * @param terms Delivers terms.
+ * @param term Pattern term.
+ * @param maxEdits Maximum edit distance.
+ * @param prefixLength the length of the required common prefix
+ * @param transpositions whether transpositions should count as a single edit
+ * @throws IOException if there is a low-level IO error
+ */
+ public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
+ this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions);
+ }
+
+ private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
+ this(terms, new AttributeSource(), term, codePoints.length, maxEdits,
+ buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits));
+ }
- // True (the default, in FuzzyQuery) if a transposition should count as a single edit:
- final boolean transpositions;
-
/**
* Constructor for enumeration of all terms from specified reader
which share a prefix of
* length prefixLength
with term
and which have at most {@code maxEdits} edits.
@@ -91,72 +105,62 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
*
* @param terms Delivers terms.
* @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery}
- * thats contains information about competitive boosts during rewrite. It is also used
- * to cache DFAs between segment transitions.
+ * that contains information about competitive boosts during rewrite
* @param term Pattern term.
* @param maxEdits Maximum edit distance.
- * @param prefixLength Length of required common prefix. Default value is 0.
+ * @param automata An array of levenshtein automata to match against terms,
+ * see {@link #buildAutomata(String, int[], int, boolean, int)}
* @throws IOException if there is a low-level IO error
*/
- public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
- final int maxEdits, final int prefixLength, boolean transpositions) throws IOException {
- if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
- throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
- }
- if (prefixLength < 0) {
- throw new IllegalArgumentException("prefixLength cannot be less than 0");
- }
+ public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength,
+ final int maxEdits, CompiledAutomaton[] automata) throws IOException {
+
this.maxEdits = maxEdits;
this.terms = terms;
this.term = term;
-
- // convert the string into a utf32 int[] representation for fast comparisons
- this.termText = stringToUTF32(term.text());
- this.termLength = termText.length;
+ this.atts = atts;
+ this.termLength = termLength;
- this.dfaAtt = atts.addAttribute(LevenshteinAutomataAttribute.class);
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
+ this.boostAtt = atts.addAttribute(BoostAttribute.class);
- // NOTE: boostAtt must pulled from attributes() not from atts! This is because TopTermsRewrite looks for boostAtt from this TermsEnum's
- // private attributes() and not the global atts passed to us from MultiTermQuery:
- this.boostAtt = attributes().addAttribute(BoostAttribute.class);
-
- //The prefix could be longer than the word.
- //It's kind of silly though. It means we must match the entire word.
- this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength;
- this.transpositions = transpositions;
-
- CompiledAutomaton[] prevAutomata = dfaAtt.automata();
- if (prevAutomata == null) {
- prevAutomata = new CompiledAutomaton[maxEdits+1];
- Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
- for (int i = 0; i <= maxEdits; i++) {
- prevAutomata[i] = new CompiledAutomaton(automata[i], true, false);
- }
- // first segment computes the automata, and we share with subsequent segments via this Attribute:
- dfaAtt.setAutomata(prevAutomata);
- }
+ this.automata = automata;
- this.automata = prevAutomata;
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
bottomTerm = maxBoostAtt.getCompetitiveTerm();
bottomChanged(null);
}
/**
- * Builds a binary Automaton to match a fuzzy term
- * @param text the term to match
- * @param prefixLength length of a required common prefix
- * @param transpositions {@code true} if transpositions should count as a single edit
- * @param maxEdits the maximum edit distance of matching terms
+ * Sets the maximum non-competitive boost, which may allow switching to a
+ * lower max-edit automaton at run time
*/
- public static Automaton buildAutomaton(String text, int prefixLength, boolean transpositions, int maxEdits) {
- int[] termText = stringToUTF32(text);
+ public void setMaxNonCompetitiveBoost(float boost) {
+ this.maxBoostAtt.setMaxNonCompetitiveBoost(boost);
+ }
+
+ /**
+ * Gets the boost of the current term
+ */
+ public float getBoost() {
+ return boostAtt.getBoost();
+ }
+
+ static CompiledAutomaton[] buildAutomata(String text, int[] termText, int prefixLength, boolean transpositions, int maxEdits) {
+ CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1];
Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
- return automata[automata.length - 1];
+ for (int i = 0; i <= maxEdits; i++) {
+ try {
+ compiled[i] = new CompiledAutomaton(automata[i], true, false);
+ }
+ catch (TooComplexToDeterminizeException e) {
+ throw new FuzzyTermsException(text, e);
+ }
+ }
+ return compiled;
}
- private static int[] stringToUTF32(String text) {
+ static int[] stringToUTF32(String text) {
int[] termText = new int[text.codePointCount(0, text.length())];
for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
termText[j++] = cp = text.codePointAt(i);
@@ -323,7 +327,12 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
public long ord() throws IOException {
return actualEnum.ord();
}
-
+
+ @Override
+ public AttributeSource attributes() {
+ return atts;
+ }
+
@Override
public boolean seekExact(BytesRef text) throws IOException {
return actualEnum.seekExact(text);
@@ -345,66 +354,14 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
}
/**
- * reuses compiled automata across different segments,
- * because they are independent of the index
- * @lucene.internal */
- public static interface LevenshteinAutomataAttribute extends Attribute {
- public CompiledAutomaton[] automata();
- public void setAutomata(CompiledAutomaton[] automata);
- }
-
- /**
- * Stores compiled automata as a list (indexed by edit distance)
- * @lucene.internal */
- public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute {
- private CompiledAutomaton[] automata;
-
- @Override
- public CompiledAutomaton[] automata() {
- return automata;
- }
-
- @Override
- public void setAutomata(CompiledAutomaton[] automata) {
- this.automata = automata;
- }
-
- @Override
- public void clear() {
- automata = null;
- }
-
- @Override
- public int hashCode() {
- if (automata == null) {
- return 0;
- } else {
- return automata.hashCode();
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other)
- return true;
- if (!(other instanceof LevenshteinAutomataAttributeImpl))
- return false;
- return Arrays.equals(automata, ((LevenshteinAutomataAttributeImpl) other).automata);
- }
-
- @Override
- public void copyTo(AttributeImpl _target) {
- LevenshteinAutomataAttribute target = (LevenshteinAutomataAttribute) _target;
- if (automata == null) {
- target.setAutomata(null);
- } else {
- target.setAutomata(automata);
- }
- }
-
- @Override
- public void reflectWith(AttributeReflector reflector) {
- reflector.reflect(LevenshteinAutomataAttribute.class, "automata", automata);
+ * Thrown to indicate that there was an issue creating a fuzzy query for a given term.
+ * Typically occurs with terms longer than 220 UTF-8 characters,
+ * but also possible with shorter terms consisting of UTF-32 code points.
+ */
+ public static class FuzzyTermsException extends RuntimeException {
+ FuzzyTermsException(String term, Throwable cause) {
+ super("Term too complex: " + term, cause);
}
}
+
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java b/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java
index 8610afc..de993b9 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestRamUsageEstimator.java
@@ -17,9 +17,6 @@
package org.apache.lucene.util;
-import static org.apache.lucene.util.RamUsageEstimator.*;
-import static org.apache.lucene.util.RamUsageTester.sizeOf;
-
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -31,9 +28,24 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
-import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
+import static org.apache.lucene.util.RamUsageEstimator.COMPRESSED_REFS_ENABLED;
+import static org.apache.lucene.util.RamUsageEstimator.HOTSPOT_BEAN_CLASS;
+import static org.apache.lucene.util.RamUsageEstimator.JVM_IS_HOTSPOT_64BIT;
+import static org.apache.lucene.util.RamUsageEstimator.LONG_CACHE_MAX_VALUE;
+import static org.apache.lucene.util.RamUsageEstimator.LONG_CACHE_MIN_VALUE;
+import static org.apache.lucene.util.RamUsageEstimator.LONG_SIZE;
+import static org.apache.lucene.util.RamUsageEstimator.MANAGEMENT_FACTORY_CLASS;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_ALIGNMENT;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_HEADER;
+import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOf;
+import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
+import static org.apache.lucene.util.RamUsageTester.sizeOf;
+
public class TestRamUsageEstimator extends LuceneTestCase {
static final String[] strings = new String[] {
@@ -161,7 +173,7 @@ public class TestRamUsageEstimator extends LuceneTestCase {
Arrays.asList(new TermQuery(new Term("foo1", "bar1")), new TermQuery(new Term("baz1", "bam1"))), 1.0f);
BooleanQuery bq = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo2", "bar2")), BooleanClause.Occur.SHOULD)
- .add(new FuzzyQuery(new Term("foo3", "baz3")), BooleanClause.Occur.MUST_NOT)
+ .add(new PhraseQuery.Builder().add(new Term("foo3", "baz3")).build(), BooleanClause.Occur.MUST_NOT)
.add(dismax, BooleanClause.Occur.MUST)
.build();
long actual = sizeOf(bq);
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
index a0e1b5d..b73bea5 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
@@ -39,13 +39,11 @@ import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
-import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
-import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -206,10 +204,7 @@ public class FuzzyLikeThisQuery extends Query
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
- AttributeSource atts = new AttributeSource();
- MaxNonCompetitiveBoostAttribute maxBoostAtt =
- atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
- FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);
+ FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants = 0;
@@ -226,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
- maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
+ fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
if (numVariants > 0) {
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
index f21d7a1..3800596 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
@@ -16,27 +16,24 @@
*/
package org.apache.lucene.search.spell;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.PriorityQueue;
+
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
-import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
-import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.PriorityQueue;
-
/**
* Simple automaton-based spellchecker.
*
@@ -420,25 +417,20 @@ public class DirectSpellChecker {
*/
protected Collection