lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From broust...@apache.org
Subject [lucene-solr] branch master updated: LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase.
Date Wed, 27 Nov 2019 09:57:22 GMT
This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 8485b5a  LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase.
8485b5a is described below

commit 8485b5a939c5ffc4982dd338d59cdf090c5e1e58
Author: Bruno Roustant <bruno.roustant@gmail.com>
AuthorDate: Wed Sep 18 17:43:53 2019 +0200

    LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase.
---
 lucene/CHANGES.txt                                 |    2 +
 .../org/apache/lucene/search/MultiTermQuery.java   |   11 +-
 .../java/org/apache/lucene/search/PhraseQuery.java |   16 +
 .../apache/lucene/search/PhraseWildcardQuery.java  | 1045 ++++++++++++++++++++
 .../lucene/search/TestPhraseWildcardQuery.java     |  570 +++++++++++
 5 files changed, 1640 insertions(+), 4 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f23952c..08d5d34 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -143,6 +143,8 @@ Other
 
 * LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim)
 
+* LUCENE-8983: Add sandbox PhraseWildcardQuery to control multi-terms expansions in a phrase. (Bruno Roustant)
+
 Build
 
 * Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17.  (Uwe Schindler)
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
index 8c96020..327227a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
@@ -292,11 +292,14 @@ public abstract class MultiTermQuery extends Query {
    */
   protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
 
-  /** Convenience method, if no attributes are needed:
-   * This simply passes empty attributes and is equal to:
-   * <code>getTermsEnum(terms, new AttributeSource())</code>
+  /**
+   * Constructs an enumeration that expands the pattern term.
+   * This method should only be called if the field exists (ie,
+   * implementations can assume the field does exist).
+   * This method never returns null.
+   * The returned TermsEnum is positioned to the first matching term.
    */
-  protected final TermsEnum getTermsEnum(Terms terms) throws IOException {
+  public final TermsEnum getTermsEnum(Terms terms) throws IOException {
     return getTermsEnum(terms, new AttributeSource());
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 7df01ca..f07a24a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -330,6 +330,22 @@ public class PhraseQuery extends Query {
       }
     }
 
+    /**
+     * Creates a {@code PostingsAndFreq} from a {@link List} of terms.
+     * The list is copied into an array, which is sorted when it holds more than
+     * one term; a null or empty list yields {@code terms == null}.
+     */
+    public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
+      this.postings = postings;
+      this.impacts = impacts;
+      this.position = position;
+      nTerms = terms == null ? 0 : terms.size();
+      if (nTerms > 0) {
+        Term[] terms2 = terms.toArray(new Term[0]);
+        if (nTerms > 1) {
+          // Sorted for a deterministic terms order (presumably relied on by
+          // compareTo/equals — NOTE(review): confirm against the array-based constructor).
+          Arrays.sort(terms2);
+        }
+        this.terms = terms2;
+      } else {
+        this.terms = null;
+      }
+    }
+
     @Override
     public int compareTo(PostingsAndFreq other) {
       if (position != other.position) {
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java
new file mode 100644
index 0000000..16c601a
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java
@@ -0,0 +1,1045 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SlowImpactsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermStates;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.mutable.MutableValueBool;
+
+/**
+ * A generalized version of {@link PhraseQuery}, built with one or more {@link MultiTermQuery}
+ * that provide term expansions for multi-terms (one of the expanded terms must match).
+ * <p>
+ * Its main advantage is to control the total number of expansions across all {@link MultiTermQuery}
+ * and across all segments.
+ * <p>
+ * Use the {@link Builder} to build a {@link PhraseWildcardQuery}.
+ * <p>
+ * This query is similar to {@link MultiPhraseQuery}, but it handles, controls and optimizes the
+ * multi-term expansions.
+ * <p>
+ * This query is equivalent to building an ordered {@link org.apache.lucene.search.spans.SpanNearQuery}
+ * with a list of {@link org.apache.lucene.search.spans.SpanTermQuery} and
+ * {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper}.
+ * But it optimizes the multi-term expansions and the segment accesses.
+ * It first resolves the single terms to stop early if any of them does not match. Then
+ * it expands each multi-term sequentially, stopping immediately if one does not
+ * match. It detects the segments that do not match to skip them for the next
+ * expansions. This often avoids expanding the other multi-terms on some or
+ * even all segments. And finally it controls the total number of expansions.
+ * <p>
+ * Immutable.
+ * @lucene.experimental
+ */
+public class PhraseWildcardQuery extends Query {
+
+  protected static final Query NO_MATCH_QUERY = new MatchNoDocsQuery("Empty " + PhraseWildcardQuery.class.getSimpleName());
+
+  protected final String field;
+  protected final List<PhraseTerm> phraseTerms;
+  protected final int slop;
+  protected final int maxMultiTermExpansions;
+  protected final boolean segmentOptimizationEnabled;
+
+  /**
+   * Protected constructor; use the {@link Builder} to create a {@link PhraseWildcardQuery}.
+   */
+  protected PhraseWildcardQuery(
+      String field,
+      List<PhraseTerm> phraseTerms,
+      int slop,
+      int maxMultiTermExpansions,
+      boolean segmentOptimizationEnabled) {
+    this.field = field;
+    this.phraseTerms = phraseTerms;
+    this.slop = slop;
+    this.maxMultiTermExpansions = maxMultiTermExpansions;
+    this.segmentOptimizationEnabled = segmentOptimizationEnabled;
+  }
+
+  /** Returns the single field this phrase query applies to. */
+  public String getField() {
+    return field;
+  }
+
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    // No phrase term: the phrase can never match.
+    if (phraseTerms.isEmpty()) {
+      return NO_MATCH_QUERY;
+    }
+    // A single phrase term is equivalent to the term's own query
+    // (e.g. a TermQuery for a SingleTerm).
+    if (phraseTerms.size() == 1) {
+      return phraseTerms.get(0).getQuery();
+    }
+    return super.rewrite(reader);
+  }
+
+  @Override
+  public void visit(QueryVisitor visitor) {
+    if (!visitor.acceptField(field)) {
+      return;
+    }
+    // Every phrase term is required for the phrase to match, hence the MUST sub-visitor.
+    QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
+    for (PhraseTerm phraseTerm : phraseTerms) {
+      phraseTerm.getQuery().visit(v);
+    }
+  }
+
+  /**
+   * Builds the weight in two passes: first collects the {@link TermState}s of the
+   * single terms (cheap, allows early stop), then expands the multi-terms under the
+   * global {@code maxMultiTermExpansions} budget, smallest segments first.
+   */
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
+    IndexReader reader = searcher.getIndexReader();
+
+    // Build a list of segments ordered by terms size (number of terms).
+    // The first segments to be searched are the smaller ones, which are by
+    // design containing the most recent documents. Any segment in this list
+    // may also be removed in the PhraseTerm.collectTermData() calls below
+    // if one of the phrase term does not match in the segment. This allows
+    // to early stop expanding multi-terms on removed segments.
+    // Additionally there is a global multi-term expansion limit across all multi-terms
+    // and all segments. So this is important to first start with the smallest
+    // segments to give back non-used expansion credits to the next multi-terms,
+    // as this is more probable with the small segments.
+    List<LeafReaderContext> sizeSortedSegments =
+        new SegmentTermsSizeComparator().createTermsSizeSortedCopyOf(reader.leaves());
+
+    // TermsData will contain the collected TermState and TermStatistics for all the terms
+    // of the phrase. It is filled during PhraseTerm.collectTermData() calls below.
+    TermsData termsData = createTermsData(sizeSortedSegments.size());
+
+    // Iterate the phrase terms, and collect the TermState for single-terms.
+    // - Early stop if a single term does not match.
+    int numMultiTerms = 0;
+    for (PhraseTerm phraseTerm : phraseTerms) {
+      if (phraseTerm.hasExpansions()) {
+        // Multi-terms are counted here and processed in the second pass below.
+        numMultiTerms++;
+      } else {
+        assert TestCounters.get().incSingleTermAnalysisCount();
+        int numMatches = phraseTerm.collectTermData(this, searcher, sizeSortedSegments, termsData);
+        if (numMatches == 0) {
+          // Early stop here because the single term does not match in any segment.
+          // So the whole phrase query cannot match.
+          return earlyStopWeight();
+        }
+      }
+    }
+
+    // Iterate the phrase terms and collect the TermState for multi-terms.
+    // - Early stop if a multi-term does not match.
+    // - Expand the multi-terms only when required.
+    int remainingExpansions = maxMultiTermExpansions;
+    int remainingMultiTerms = numMultiTerms;
+    for (PhraseTerm phraseTerm : phraseTerms) {
+      if (phraseTerm.hasExpansions()) {
+        assert TestCounters.get().incMultiTermAnalysisCount();
+        assert remainingExpansions >= 0 && remainingExpansions <= maxMultiTermExpansions;
+        assert remainingMultiTerms > 0;
+        // Consider the remaining expansions allowed for all remaining multi-terms.
+        // Divide it evenly to get the expansion limit for the current multi-term.
+        int maxExpansionsForTerm = remainingExpansions / remainingMultiTerms;
+        int numExpansions = phraseTerm.collectTermData(this, searcher, sizeSortedSegments, remainingMultiTerms, maxExpansionsForTerm, termsData);
+        assert numExpansions >= 0 && numExpansions <= maxExpansionsForTerm;
+        if (numExpansions == 0) {
+          // Early stop here because the multi-term does not match in any segment.
+          // So the whole phrase query cannot match.
+          return earlyStopWeight();
+        }
+        // Deduct the effectively used expansions. This may give more expansion
+        // credits to the next multi-terms.
+        remainingExpansions -= numExpansions;
+        remainingMultiTerms--;
+      }
+    }
+    assert remainingMultiTerms == 0;
+    assert remainingExpansions >= 0;
+
+//    TestCounters.get().printTestCounters(termsData);
+
+    return termsData.areAllTermsMatching() ?
+        createPhraseWeight(searcher, scoreMode, boost, termsData)
+        : noMatchWeight();
+  }
+
+  /**
+   * Creates new {@link TermsData}.
+   * Protected extension point so subclasses can provide a specialized {@link TermsData}.
+   */
+  protected TermsData createTermsData(int numSegments) {
+    return new TermsData(phraseTerms.size(), numSegments);
+  }
+
+  /**
+   * Returns a no-match weight, recording the early stop in the test counters
+   * (counter updates only run when assertions are enabled).
+   */
+  protected Weight earlyStopWeight() {
+    assert TestCounters.get().incQueryEarlyStopCount();
+    return noMatchWeight();
+  }
+
+  /**
+   * Returns a constant-score weight whose scorer is always null, i.e. a weight
+   * that matches no document. It is cacheable on any segment.
+   */
+  protected Weight noMatchWeight() {
+    return new ConstantScoreWeight(this, 0) {
+      @Override
+      public Scorer scorer(LeafReaderContext leafReaderContext) {
+        // Null scorer means no match in the segment.
+        return null;
+      }
+
+      @Override
+      public boolean isCacheable(LeafReaderContext ctx) {
+        return true;
+      }
+    };
+  }
+
+  /**
+   * Creates the {@link PhraseWeight}. For each segment it seeks the collected
+   * {@link TermState}s, builds the union of the expanded postings per phrase position,
+   * and delegates matching to {@link ExactPhraseMatcher} (slop 0) or
+   * {@link SloppyPhraseMatcher} otherwise.
+   */
+  PhraseWeight createPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode,
+                                            float boost, TermsData termsData) throws IOException {
+    return new PhraseWeight(this, field, searcher, scoreMode) {
+
+      @Override
+      protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
+        if (termsData.termStatsList.isEmpty()) {
+          return null;
+        }
+        return searcher.getSimilarity().scorer(
+            boost,
+            searcher.collectionStatistics(field),
+            termsData.termStatsList.toArray(new TermStatistics[0]));
+      }
+
+      @Override
+      protected PhraseMatcher getPhraseMatcher(LeafReaderContext leafReaderContext, Similarity.SimScorer scorer, boolean exposeOffsets) throws IOException {
+        Terms fieldTerms = leafReaderContext.reader().terms(field);
+        if (fieldTerms == null) {
+          return null;
+        }
+        TermsEnum termsEnum = fieldTerms.iterator();
+        float totalMatchCost = 0;
+
+        PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[phraseTerms.size()];
+        for (int termPosition = 0; termPosition < postingsFreqs.length; termPosition++) {
+          TermData termData = termsData.getTermData(termPosition);
+          assert termData != null;
+          List<TermBytesTermState> termStates = termData.getTermStatesForSegment(leafReaderContext);
+          if (termStates == null) {
+            // If the current phrase term does not match in the segment, then the phrase cannot match on the segment.
+            // So early stop by returning a null scorer.
+            return null;
+          }
+          assert !termStates.isEmpty();
+
+          // Re-seek each collected TermState (no term dictionary lookup) and gather the postings.
+          List<PostingsEnum> postingsEnums = new ArrayList<>(termStates.size());
+          for (TermBytesTermState termBytesTermState : termStates) {
+            termsEnum.seekExact(termBytesTermState.termBytes, termBytesTermState.termState);
+            postingsEnums.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
+            totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
+          }
+          PostingsEnum unionPostingsEnum;
+          if (postingsEnums.size() == 1) {
+            unionPostingsEnum = postingsEnums.get(0);
+          } else {
+            unionPostingsEnum = exposeOffsets ? new MultiPhraseQuery.UnionFullPostingsEnum(postingsEnums) : new MultiPhraseQuery.UnionPostingsEnum(postingsEnums);
+          }
+          postingsFreqs[termPosition] = new PhraseQuery.PostingsAndFreq(unionPostingsEnum, new SlowImpactsEnum(unionPostingsEnum), termPosition, termData.terms);
+        }
+
+        if (slop == 0) {
+          // Sort by increasing docFreq order.
+          ArrayUtil.timSort(postingsFreqs);
+          return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
+        } else {
+          return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
+        }
+      }
+    };
+  }
+
+  /**
+   * Equality is based on the slop and the phrase terms only.
+   * NOTE(review): {@code maxMultiTermExpansions} and {@code segmentOptimizationEnabled}
+   * are excluded, so two queries differing only in those knobs compare equal (relevant
+   * for the query cache since scoring can differ) — confirm this is intended.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof PhraseWildcardQuery)) {
+      return false;
+    }
+    PhraseWildcardQuery pwq = (PhraseWildcardQuery) o;
+    return slop == pwq.slop && phraseTerms.equals(pwq.phraseTerms);
+  }
+
+  /** Consistent with {@link #equals}: combines the class hash, the slop and the phrase terms. */
+  @Override
+  public int hashCode() {
+    return classHash() ^ slop ^ phraseTerms.hashCode();
+  }
+
+  /**
+   * Prints {@code phraseWildcard(field:"t1 t2 ...")}, appending {@code ~slop}
+   * inside the parentheses when the slop is not 0.
+   */
+  @Override
+  public final String toString(String omittedField) {
+    StringBuilder builder = new StringBuilder();
+    builder.append("phraseWildcard(");
+
+    // Print the field name only when it differs from the omitted one.
+    if (field == null || !field.equals(omittedField)) {
+      builder.append(field).append(':');
+    }
+
+    builder.append('\"');
+    for (int i = 0; i < phraseTerms.size(); i++) {
+      if (i != 0) {
+        builder.append(' ');
+      }
+      phraseTerms.get(i).toString(builder);
+    }
+    builder.append('\"');
+
+    if (slop != 0) {
+      builder.append('~');
+      builder.append(slop);
+    }
+
+    builder.append(")");
+    return builder.toString();
+  }
+
+  /**
+   * Collects the {@link TermState} and {@link TermStatistics} for a single-term
+   * without expansion.
+   *
+   * @param termsData receives the collected data.
+   * @return The number of segments in which the term matches.
+   */
+  protected int collectSingleTermData(
+      SingleTerm singleTerm,
+      IndexSearcher searcher,
+      List<LeafReaderContext> segments,
+      TermsData termsData) throws IOException {
+    TermData termData = termsData.getOrCreateTermData(singleTerm.termPosition);
+    Term term = singleTerm.term;
+    termData.terms.add(term);
+    // needsStats=true so that docFreq()/totalTermFreq() below reflect all segments.
+    TermStates termStates = TermStates.build(searcher.getIndexReader().getContext(), term, true);
+
+    // Collect TermState per segment.
+    int numMatches = 0;
+    Iterator<LeafReaderContext> segmentIterator = segments.iterator();
+    while (segmentIterator.hasNext()) {
+      LeafReaderContext leafReaderContext = segmentIterator.next();
+      assert TestCounters.get().incSegmentUseCount();
+      boolean termMatchesInSegment = false;
+      Terms terms = leafReaderContext.reader().terms(term.field());
+      if (terms != null) {
+        checkTermsHavePositions(terms);
+        TermState termState = termStates.get(leafReaderContext);
+        if (termState != null) {
+          termMatchesInSegment = true;
+          numMatches++;
+          termData.setTermStatesForSegment(leafReaderContext, Collections.singletonList(new TermBytesTermState(term.bytes(), termState)));
+        }
+      }
+      if (!termMatchesInSegment && shouldOptimizeSegments()) {
+        // Remove this segment from the list because the phrase cannot match on it.
+        // Later phrase terms will then skip it entirely.
+        segmentIterator.remove();
+        assert TestCounters.get().incSegmentSkipCount();
+      }
+    }
+    // Collect the term stats across all segments.
+    if (termStates.docFreq() > 0) {
+      termsData.termStatsList.add(searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq()));
+    }
+    return numMatches;
+  }
+
+  /**
+   * Collects the {@link TermState} and {@link TermStatistics} for a multi-term
+   * with expansion.
+   *
+   * @param remainingMultiTerms the number of remaining multi-terms to process,
+   *                            including the current one, excluding the multi-terms already processed.
+   * @param termsData           receives the collected data.
+   * @return The total number of expansions across the iterated segments.
+   */
+  protected int collectMultiTermData(
+      MultiTerm multiTerm,
+      IndexSearcher searcher,
+      List<LeafReaderContext> segments,
+      int remainingMultiTerms, // Unused here but leveraged by extending classes.
+      int maxExpansionsForTerm,
+      TermsData termsData) throws IOException {
+    TermData termData = termsData.getOrCreateTermData(multiTerm.termPosition);
+    Map<BytesRef, TermStats> termStatsMap = createTermStatsMap(multiTerm);
+    int numExpansions = 0;
+    Iterator<LeafReaderContext> segmentIterator = segments.iterator();
+    // Set to true by collectMultiTermDataForSegment() when the expansion budget is exhausted.
+    MutableValueBool shouldStopSegmentIteration = new MutableValueBool();
+
+    while (segmentIterator.hasNext() && !shouldStopSegmentIteration.value) {
+      LeafReaderContext leafReaderContext = segmentIterator.next();
+      int remainingExpansions = maxExpansionsForTerm - numExpansions;
+      assert remainingExpansions >= 0;
+      List<TermBytesTermState> termStates = collectMultiTermDataForSegment(
+          multiTerm, leafReaderContext, remainingExpansions, shouldStopSegmentIteration, termStatsMap);
+
+      if (!termStates.isEmpty()) {
+        assert termStates.size() <= remainingExpansions;
+        numExpansions += termStates.size();
+        assert numExpansions <= maxExpansionsForTerm;
+        termData.setTermStatesForSegment(leafReaderContext, termStates);
+
+      } else if (shouldOptimizeSegments()) {
+        // Remove this segment from the list because the phrase cannot match on it.
+        segmentIterator.remove();
+        assert TestCounters.get().incSegmentSkipCount();
+      }
+    }
+
+    // Collect the term stats across all segments.
+    collectMultiTermStats(searcher, termStatsMap, termsData, termData);
+    return numExpansions;
+  }
+
+  /**
+   * Whether non-matching segments are removed from the segment list during term
+   * collection (see {@link Builder#Builder(String, int, boolean)}).
+   */
+  protected boolean shouldOptimizeSegments() {
+    return segmentOptimizationEnabled;
+  }
+
+  /**
+   * Creates a {@link TermStats} map for a {@link MultiTerm}.
+   * The map ordering is not significant: entries are only iterated later to merge
+   * the per-term stats across segments.
+   */
+  protected Map<BytesRef, TermStats> createTermStatsMap(MultiTerm multiTerm) { // multiTerm param can be used by sub-classes.
+    return new HashMap<>();
+  }
+
+  /**
+   * Collects the {@link TermState} list and {@link TermStatistics} for a multi-term
+   * on a specific index segment.
+   *
+   * @param remainingExpansions        the number of remaining expansions allowed
+   *                                   for the segment.
+   * @param shouldStopSegmentIteration to be set to true to stop the segment
+   *                                   iteration calling this method repeatedly.
+   * @param termStatsMap               receives the collected {@link TermStats} across all segments.
+   */
+  protected List<TermBytesTermState> collectMultiTermDataForSegment(
+      MultiTerm multiTerm,
+      LeafReaderContext leafReaderContext,
+      int remainingExpansions,
+      MutableValueBool shouldStopSegmentIteration,
+      Map<BytesRef, TermStats> termStatsMap) throws IOException {
+    TermsEnum termsEnum = createTermsEnum(multiTerm, leafReaderContext);
+    if (termsEnum == null) {
+      return Collections.emptyList();
+    }
+    assert TestCounters.get().incSegmentUseCount();
+    List<TermBytesTermState> termStates = new ArrayList<>();
+    // Note: next() is evaluated before the budget check, so once the budget reaches 0
+    // one extra term may be fetched and discarded; shouldStopSegmentIteration is then
+    // set below, which halts the whole segment iteration.
+    while (termsEnum.next() != null && remainingExpansions > 0) {
+      // Collect term stats for the segment.
+      TermStats termStats = termStatsMap.get(termsEnum.term());
+      if (termStats == null) {
+        // Deep copy: the enum's BytesRef is reused on each next() call.
+        BytesRef termBytes = BytesRef.deepCopyOf(termsEnum.term());
+        termStats = new TermStats(termBytes);
+        termStatsMap.put(termBytes, termStats);
+      }
+      // Accumulate stats the same way TermStates.accumulateStatistics() does.
+      // Sum the stats per term for all segments the same way TermStates.build() does.
+      termStats.addStats(termsEnum.docFreq(), termsEnum.totalTermFreq());
+
+      // Collect TermState per segment.
+      termStates.add(new TermBytesTermState(termStats.termBytes, termsEnum.termState()));
+      remainingExpansions--;
+      assert TestCounters.get().incExpansionCount();
+    }
+    assert remainingExpansions >= 0;
+    shouldStopSegmentIteration.value = remainingExpansions == 0;
+    return termStates;
+  }
+
+  /**
+   * Creates the {@link TermsEnum} for the given {@link MultiTerm} and segment.
+   * The field-existence check below is required because
+   * {@link MultiTermQuery#getTermsEnum(Terms)} must only be called when the field exists.
+   *
+   * @return null if there is no term for this query field in the segment.
+   */
+  protected TermsEnum createTermsEnum(MultiTerm multiTerm, LeafReaderContext leafReaderContext) throws IOException {
+    Terms terms = leafReaderContext.reader().terms(field);
+    if (terms == null) {
+      return null;
+    }
+    checkTermsHavePositions(terms);
+    TermsEnum termsEnum = multiTerm.query.getTermsEnum(terms);
+    assert termsEnum != null;
+    return termsEnum;
+  }
+
+  /**
+   * Collect the term stats across all segments.
+   *
+   * @param termStatsMap input map of already collected {@link TermStats}.
+   * @param termsData    receives the {@link TermStatistics} computed for all {@link TermStats}.
+   * @param termData     receives all the collected {@link Term}.
+   */
+  protected void collectMultiTermStats(
+      IndexSearcher searcher,
+      Map<BytesRef, TermStats> termStatsMap,
+      TermsData termsData,
+      TermData termData) throws IOException {
+    // Collect term stats across all segments.
+    // Collect stats the same way MultiPhraseQuery.MultiPhraseWeight constructor does, for all terms and all segments.
+    for (Map.Entry<BytesRef, TermStats> termStatsEntry : termStatsMap.entrySet()) {
+      Term term = new Term(field, termStatsEntry.getKey());
+      termData.terms.add(term);
+      TermStats termStats = termStatsEntry.getValue();
+      // Same docFreq > 0 guard as the single-term path.
+      if (termStats.docFreq > 0) {
+        termsData.termStatsList.add(searcher.termStatistics(term, termStats.docFreq, termStats.totalTermFreq));
+      }
+    }
+  }
+
+  /**
+   * Verifies the field was indexed with position data, which phrase matching requires.
+   *
+   * @throws IllegalStateException if the field has no positions.
+   */
+  protected void checkTermsHavePositions(Terms terms) {
+    if (!terms.hasPositions()) {
+      throw new IllegalStateException("field \"" + field + "\" was indexed without position data;" +
+          " cannot run " + PhraseWildcardQuery.class.getSimpleName());
+    }
+  }
+
+  /**
+   * Builds a {@link PhraseWildcardQuery}.
+   */
+  public static class Builder {
+
+    protected final String field;
+    protected final List<PhraseTerm> phraseTerms;
+    protected int slop;
+    protected final int maxMultiTermExpansions;
+    protected final boolean segmentOptimizationEnabled;
+
+    /**
+     * @param field                  The query field.
+     * @param maxMultiTermExpansions The maximum number of expansions across all multi-terms and across all segments.
+     *                               It counts expansions for each segment individually, which allows optimizations per
+     *                               segment and unused expansions are credited to next segments. This is different from
+     *                               {@link MultiPhraseQuery} and {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper}
+     *                               which have an expansion limit per multi-term.
+     */
+    public Builder(String field, int maxMultiTermExpansions) {
+      // Segment optimization is enabled by default.
+      this(field, maxMultiTermExpansions, true);
+    }
+
+    /**
+     * @param field                      The query field.
+     * @param maxMultiTermExpansions     The maximum number of expansions across all multi-terms and across all segments.
+     *                                   It counts expansions for each segment individually, which allows optimizations per
+     *                                   segment and unused expansions are credited to next segments. This is different from
+     *                                   {@link MultiPhraseQuery} and {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper}
+     *                                   which have an expansion limit per multi-term.
+     * @param segmentOptimizationEnabled Whether to enable the segment optimization which consists in ignoring a segment
+     *                                   for further analysis as soon as a term is not present inside it. This optimizes
+     *                                   the query execution performance but changes the scoring. The result ranking is
+     *                                   preserved.
+     */
+    public Builder(String field, int maxMultiTermExpansions, boolean segmentOptimizationEnabled) {
+      this.field = field;
+      this.maxMultiTermExpansions = maxMultiTermExpansions;
+      this.segmentOptimizationEnabled = segmentOptimizationEnabled;
+      phraseTerms = new ArrayList<>();
+    }
+
+    /**
+     * Adds a single term at the next position in the phrase.
+     */
+    public Builder addTerm(BytesRef termBytes) {
+      return addTerm(new Term(field, termBytes));
+    }
+
+    /**
+     * Adds a single term at the next position in the phrase.
+     *
+     * @throws IllegalArgumentException if the term field differs from the query field.
+     */
+    public Builder addTerm(Term term) {
+      if (!term.field().equals(field)) {
+        throw new IllegalArgumentException(term.getClass().getSimpleName()
+            + " field \"" + term.field() + "\" cannot be different from the "
+            + PhraseWildcardQuery.class.getSimpleName() + " field \"" + field + "\"");
+      }
+      // The term position in the phrase is its insertion index.
+      phraseTerms.add(new SingleTerm(term, phraseTerms.size()));
+      return this;
+    }
+
+    /**
+     * Adds a multi-term at the next position in the phrase.
+     * Any of the terms returned by the provided {@link MultiTermQuery} enumeration
+     * may match (expansion as a disjunction).
+     *
+     * @throws IllegalArgumentException if the query field differs from this builder's field.
+     */
+    public Builder addMultiTerm(MultiTermQuery multiTermQuery) {
+      if (!multiTermQuery.getField().equals(field)) {
+        throw new IllegalArgumentException(multiTermQuery.getClass().getSimpleName()
+            + " field \"" + multiTermQuery.getField() + "\" cannot be different from the "
+            + PhraseWildcardQuery.class.getSimpleName() + " field \"" + field + "\"");
+      }
+      phraseTerms.add(new MultiTerm(multiTermQuery, phraseTerms.size()));
+      return this;
+    }
+
+    /**
+     * Sets the phrase slop.
+     *
+     * @throws IllegalArgumentException if the slop is negative.
+     */
+    public Builder setSlop(int slop) {
+      if (slop < 0) {
+        throw new IllegalArgumentException("slop value cannot be negative");
+      }
+      this.slop = slop;
+      return this;
+    }
+
+    /**
+     * Builds a {@link PhraseWildcardQuery}.
+     */
+    public PhraseWildcardQuery build() {
+      return new PhraseWildcardQuery(field, phraseTerms, slop, maxMultiTermExpansions, segmentOptimizationEnabled);
+    }
+  }
+
+  /**
+   * All {@link PhraseTerm} are light and immutable. They do not hold query
+   * processing data such as {@link TermsData}. That way, the {@link PhraseWildcardQuery}
+   * is immutable and light itself and can be used safely as a key of the query cache.
+   */
+  protected abstract static class PhraseTerm {
+
+    // 0-based position of this term in the phrase.
+    protected final int termPosition;
+
+    protected PhraseTerm(int termPosition) {
+      this.termPosition = termPosition;
+    }
+
+    /** Whether this term expands to multiple terms (i.e. is a multi-term). */
+    protected abstract boolean hasExpansions();
+
+    /** Returns the equivalent stand-alone query for this phrase term. */
+    protected abstract Query getQuery();
+
+    /**
+     * Collects {@link TermState} and {@link TermStatistics} for the term without expansion.
+     * It must be called only if {@link #hasExpansions()} returns false.
+     * Simplified version of {@code #collectTermData(PhraseWildcardQuery, IndexSearcher, List, int, int, TermsData)}
+     * with fewer arguments. This method throws {@link UnsupportedOperationException} if not overridden.
+     */
+    protected int collectTermData(
+        PhraseWildcardQuery query,
+        IndexSearcher searcher,
+        List<LeafReaderContext> segments,
+        TermsData termsData) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Collects {@link TermState} and {@link TermStatistics} for the term (potentially expanded).
+     *
+     * @param termsData {@link TermsData} to update with the collected terms and stats.
+     * @return The number of expansions or matches in all segments; or 0 if this term
+     * does not match in any segment, in this case the phrase query can immediately stop.
+     */
+    protected abstract int collectTermData(
+        PhraseWildcardQuery query,
+        IndexSearcher searcher,
+        List<LeafReaderContext> segments,
+        int remainingMultiTerms,
+        int maxExpansionsForTerm,
+        TermsData termsData) throws IOException;
+
+    /** Appends this term's text representation to the builder (used by the query toString). */
+    protected abstract void toString(StringBuilder builder);
+
+    @Override
+    public abstract boolean equals(Object o);
+
+    @Override
+    public abstract int hashCode();
+  }
+
+  /**
+   * Phrase term with no expansion: wraps a single exact {@link Term}.
+   */
+  protected static class SingleTerm extends PhraseTerm {
+
+    protected final Term term;
+
+    protected SingleTerm(Term term, int termPosition) {
+      super(termPosition);
+      this.term = term;
+    }
+
+    @Override
+    protected boolean hasExpansions() {
+      return false;
+    }
+
+    @Override
+    protected Query getQuery() {
+      return new TermQuery(term);
+    }
+
+    // Simplified form: delegates to the full method with zeroed expansion arguments,
+    // which are unused for a single (non-expanding) term.
+    @Override
+    protected int collectTermData(
+        PhraseWildcardQuery query,
+        IndexSearcher searcher,
+        List<LeafReaderContext> segments,
+        TermsData termsData) throws IOException {
+      return collectTermData(query, searcher, segments, 0, 0, termsData);
+    }
+
+    @Override
+    protected int collectTermData(
+        PhraseWildcardQuery query,
+        IndexSearcher searcher,
+        List<LeafReaderContext> segments,
+        int remainingMultiTerms,
+        int maxExpansionsForTerm,
+        TermsData termsData) throws IOException {
+      return query.collectSingleTermData(this, searcher, segments, termsData);
+    }
+
+    @Override
+    protected void toString(StringBuilder builder) {
+      builder.append(term.text());
+    }
+
+    // Equality is based on the wrapped term only (termPosition intentionally excluded,
+    // consistent with hashCode()).
+    @Override
+    public boolean equals(Object o) {
+      if (!(o instanceof SingleTerm)) {
+        return false;
+      }
+      SingleTerm singleTerm = (SingleTerm) o;
+      return term.equals(singleTerm.term);
+    }
+
+    @Override
+    public int hashCode() {
+      return term.hashCode();
+    }
+  }
+
+  /**
+   * Phrase term with expansions: wraps a {@link MultiTermQuery} (e.g. a wildcard)
+   * that may expand to many terms per segment.
+   */
+  protected static class MultiTerm extends PhraseTerm {
+
+    protected final MultiTermQuery query;
+
+    protected MultiTerm(MultiTermQuery query, int termPosition) {
+      super(termPosition);
+      this.query = query;
+    }
+
+    @Override
+    protected boolean hasExpansions() {
+      return true;
+    }
+
+    @Override
+    protected Query getQuery() {
+      return query;
+    }
+
+    @Override
+    protected int collectTermData(
+        PhraseWildcardQuery query,
+        IndexSearcher searcher,
+        List<LeafReaderContext> segments,
+        int remainingMultiTerms,
+        int maxExpansionsForTerm,
+        TermsData termsData) throws IOException {
+      return query.collectMultiTermData(this, searcher, segments, remainingMultiTerms, maxExpansionsForTerm, termsData);
+    }
+
+    @Override
+    protected void toString(StringBuilder builder) {
+      builder.append(query.toString(query.field));
+    }
+
+    // Equality is based on the wrapped multi-term query only (termPosition intentionally
+    // excluded, consistent with hashCode()).
+    @Override
+    public boolean equals(Object o) {
+      if (!(o instanceof MultiTerm)) {
+        return false;
+      }
+      MultiTerm multiTerm = (MultiTerm) o;
+      return query.equals(multiTerm.query);
+    }
+
+    @Override
+    public int hashCode() {
+      return query.hashCode();
+    }
+  }
+
+  /**
+   * Holds the {@link TermState} and {@link TermStatistics} for all the matched
+   * and collected {@link Term}, for all phrase terms, for all segments.
+   */
+  protected static class TermsData {
+
+    // Number of phrase term positions in the query.
+    protected final int numTerms;
+    // Number of segments (leaves) in the index being searched.
+    protected final int numSegments;
+    protected final List<TermStatistics> termStatsList;
+    // One TermData per phrase term position; entries are created lazily.
+    protected final TermData[] termDataPerPosition;
+    // Number of phrase term positions that match in at least one segment
+    // (incremented by TermData the first time term states are recorded).
+    protected int numTermsMatching;
+
+    protected TermsData(int numTerms, int numSegments) {
+      this.numTerms = numTerms;
+      this.numSegments = numSegments;
+      termStatsList = new ArrayList<>();
+      termDataPerPosition = new TermData[numTerms];
+    }
+
+    // Lazily creates the TermData for the given phrase term position.
+    protected TermData getOrCreateTermData(int termPosition) {
+      TermData termData = termDataPerPosition[termPosition];
+      if (termData == null) {
+        termData = new TermData(numSegments, this);
+        termDataPerPosition[termPosition] = termData;
+      }
+      return termData;
+    }
+
+    protected TermData getTermData(int termPosition) {
+      return termDataPerPosition[termPosition];
+    }
+
+    // True when every phrase term position matches somewhere; required for the
+    // phrase query to produce any hit.
+    protected boolean areAllTermsMatching() {
+      assert numTermsMatching <= numTerms;
+      return numTermsMatching == numTerms;
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      builder.append("TermsData(");
+      builder.append("numSegments=").append(numSegments);
+      builder.append(", termDataPerPosition=").append(Arrays.asList(termDataPerPosition));
+      builder.append(", termsStatsList=[");
+      for (TermStatistics termStatistics : termStatsList) {
+        builder.append("{")
+            .append(termStatistics.term().utf8ToString())
+            .append(", ").append(termStatistics.docFreq())
+            .append(", ").append(termStatistics.totalTermFreq())
+            .append("}");
+      }
+      builder.append("]");
+      builder.append(")");
+      return builder.toString();
+    }
+  }
+
+  /**
+   * Holds the {@link TermState} for all the collected {@link Term},
+   * for a specific phrase term, for all segments.
+   */
+  protected static class TermData {
+
+    protected final int numSegments;
+    // Back-reference to the owning TermsData, used to count matching terms.
+    protected final TermsData termsData;
+    // Indexed by segment ord; null array means this term matched in no segment yet.
+    protected List<TermBytesTermState>[] termStatesPerSegment;
+    protected final List<Term> terms;
+
+    protected TermData(int numSegments, TermsData termsData) {
+      this.numSegments = numSegments;
+      this.termsData = termsData;
+      terms = new ArrayList<>();
+    }
+
+    /**
+     * Sets the collected list of {@link TermBytesTermState} for the given segment.
+     */
+    @SuppressWarnings("unchecked")
+    protected void setTermStatesForSegment(LeafReaderContext leafReaderContext, List<TermBytesTermState> termStates) {
+      if (termStatesPerSegment == null) {
+        // First segment match for this phrase term: allocate the per-segment array
+        // and count this term as matching in the owning TermsData.
+        termStatesPerSegment = (List<TermBytesTermState>[]) new List[numSegments];
+        termsData.numTermsMatching++;
+      }
+      termStatesPerSegment[leafReaderContext.ord] = termStates;
+    }
+
+    /**
+     * @return The collected list of {@link TermBytesTermState} for the given segment;
+     * or null if this phrase term does not match in the given segment.
+     */
+    protected List<TermBytesTermState> getTermStatesForSegment(LeafReaderContext leafReaderContext) {
+      assert termStatesPerSegment != null : "No TermState for any segment; the query should have been stopped before";
+      return termStatesPerSegment[leafReaderContext.ord];
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder builder = new StringBuilder();
+      builder.append("TermData(");
+      builder.append("termStates=");
+      if (termStatesPerSegment == null) {
+        builder.append("null");
+      } else {
+        builder.append(Arrays.asList(termStatesPerSegment));
+      }
+      builder.append(", terms=").append(terms);
+      builder.append(")");
+      return builder.toString();
+    }
+  }
+
+  /**
+   * Holds a pair of term bytes - term state.
+   * Immutable value holder associating a term's bytes with its {@link TermState}
+   * in a specific segment.
+   */
+  public static class TermBytesTermState {
+
+    protected final BytesRef termBytes;
+    protected final TermState termState;
+
+    public TermBytesTermState(BytesRef termBytes, TermState termState) {
+      this.termBytes = termBytes;
+      this.termState = termState;
+    }
+
+    @Override
+    public String toString() {
+      return "\"" + termBytes.utf8ToString() + "\"->" + termState;
+    }
+  }
+
+  /**
+   * Accumulates the doc freq and total term freq.
+   */
+  public static class TermStats {
+
+    protected final BytesRef termBytes;
+    protected int docFreq;
+    protected long totalTermFreq;
+
+    protected TermStats(BytesRef termBytes) {
+      this.termBytes = termBytes;
+    }
+
+    public BytesRef getTermBytes() {
+      return termBytes;
+    }
+
+    // Adds per-segment stats. A negative totalTermFreq (presumably the Lucene
+    // "stat not available" convention — confirm) is sticky: once either side is
+    // negative, the accumulated total is pinned to -1.
+    protected void addStats(int docFreq, long totalTermFreq) {
+      this.docFreq += docFreq;
+      if (this.totalTermFreq >= 0 && totalTermFreq >= 0) {
+        this.totalTermFreq += totalTermFreq;
+      } else {
+        this.totalTermFreq = -1;
+      }
+    }
+  }
+
+  /**
+   * Compares segments based on the number of terms they contain.
+   * <p>
+   * This is used to sort segments by increasing number of terms. This
+   * way the first segment to search is the smallest, so a term has the lowest
+   * probability to match in this segment. And if the term does not match,
+   * we credit unused expansions when searching the next segments.
+   */
+  protected class SegmentTermsSizeComparator implements Comparator<LeafReaderContext> {
+
+    private static final String COMPARISON_ERROR_MESSAGE = "Segment comparison error";
+
+    // Comparator.compare cannot throw IOException, so it is tunneled inside a
+    // RuntimeException and unwrapped in createTermsSizeSortedCopyOf().
+    @Override
+    public int compare(LeafReaderContext leafReaderContext1, LeafReaderContext leafReaderContext2) {
+      try {
+        return Long.compare(getTermsSize(leafReaderContext1), getTermsSize(leafReaderContext2));
+      } catch (IOException e) {
+        throw new RuntimeException(COMPARISON_ERROR_MESSAGE, e);
+      }
+    }
+
+    // Returns a copy of the segment list sorted by terms size, rethrowing the
+    // tunneled IOException. NOTE(review): the unwrap relies on matching the
+    // exception message string, which is fragile if the message ever changes.
+    protected List<LeafReaderContext> createTermsSizeSortedCopyOf(List<LeafReaderContext> segments) throws IOException {
+      List<LeafReaderContext> copy = new ArrayList<>(segments);
+      try {
+        copy.sort(this);
+      } catch (RuntimeException e) {
+        if (COMPARISON_ERROR_MESSAGE.equals(e.getMessage())) {
+          throw (IOException) e.getCause();
+        }
+        throw e;
+      }
+      return copy;
+    }
+
+    // Number of terms of the query field in the given segment; 0 when the field
+    // has no terms in that segment.
+    private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException {
+      Terms terms = leafReaderContext.reader().terms(field);
+      return terms == null ? 0 : terms.size();
+    }
+  }
+
+  /**
+   * Test counters incremented when assertions are enabled. Used only when testing.
+   * The inc*() methods return true so that calls can be placed inside
+   * {@code assert} statements and thus compile to no-ops when assertions are disabled.
+   */
+  protected static class TestCounters {
+
+    private static final TestCounters SINGLETON = new TestCounters();
+
+    protected long singleTermAnalysisCount;
+    protected long multiTermAnalysisCount;
+    protected long expansionCount;
+    protected long segmentUseCount;
+    protected long segmentSkipCount;
+    protected long queryEarlyStopCount;
+
+    protected static TestCounters get() {
+      return SINGLETON;
+    }
+
+    protected boolean incSingleTermAnalysisCount() {
+      singleTermAnalysisCount++;
+      return true;
+    }
+
+    protected boolean incMultiTermAnalysisCount() {
+      multiTermAnalysisCount++;
+      return true;
+    }
+
+    protected boolean incExpansionCount() {
+      expansionCount++;
+      return true;
+    }
+
+    protected boolean incSegmentUseCount() {
+      segmentUseCount++;
+      return true;
+    }
+
+    protected boolean incSegmentSkipCount() {
+      segmentSkipCount++;
+      return true;
+    }
+
+    protected boolean incQueryEarlyStopCount() {
+      queryEarlyStopCount++;
+      return true;
+    }
+
+    // Resets all counters; called by tests before measuring a query execution.
+    protected void clear() {
+      singleTermAnalysisCount = 0;
+      multiTermAnalysisCount = 0;
+      expansionCount = 0;
+      segmentUseCount = 0;
+      segmentSkipCount = 0;
+      queryEarlyStopCount = 0;
+    }
+
+//    protected void printTestCounters(TermsData termsData) {
+//      System.out.println("singleTermAnalysisCount=" + singleTermAnalysisCount);
+//      System.out.println("multiTermAnalysisCount=" + multiTermAnalysisCount);
+//      System.out.println("expansionCount=" + expansionCount);
+//      System.out.println("segmentUseCount=" + segmentUseCount);
+//      System.out.println("segmentSkipCount=" + segmentSkipCount);
+//      System.out.println("queryEarlyStopCount=" + queryEarlyStopCount);
+//      System.out.println(termsData);
+//    }
+  }
+}
\ No newline at end of file
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java
new file mode 100644
index 0000000..6d641ac
--- /dev/null
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java
@@ -0,0 +1,570 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+import static org.apache.lucene.search.PhraseWildcardQuery.TestCounters;
+
+/**
+ * Tests {@link PhraseWildcardQuery}.
+ * <p>
+ * The main goal of this class is to verify that {@link PhraseWildcardQuery}
+ * has the same ranking and same scoring as both {@link MultiPhraseQuery}
+ * and {@link SpanNearQuery}.
+ * <p>
+ * Note that the ranking and scoring are equal if the segment optimization
+ * is disabled, otherwise it may change the score, but the ranking is most
+ * often the same.
+ */
+public class TestPhraseWildcardQuery extends LuceneTestCase {
+
+  protected static final int MAX_DOCS = 1000;
+  protected static final String[] FIELDS = {"title", "author", "category", "other"};
+
+  protected Directory directory;
+  protected IndexReader reader;
+  protected IndexSearcher searcher;
+  protected boolean differentScoreExpectedForSpanNearQuery;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    // Build a multi-segment index so the per-segment optimizations are exercised.
+    directory = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
+    iw.setDoRandomForceMerge(false); // Keep the segments separated.
+    addSegments(iw);
+    reader = iw.getReader();
+    iw.close();
+    searcher = newSearcher(reader);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    // Release the index resources opened in setUp().
+    reader.close();
+    directory.close();
+    super.tearDown();
+  }
+
+  // One single term followed by one wildcard term; verifies result parity and
+  // the analysis/segment-access counters.
+  public void testOneMultiTerm() throws Exception {
+    searchAndCheckResults(field(1), 100, "eric", "br*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(1, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(0, TestCounters.get().segmentSkipCount);
+  }
+
+  // Two wildcard terms; the second case uses overlapping expansions ("tim*" is a
+  // subset of "t*"), for which SpanNearQuery is expected to score differently.
+  public void testTwoMultiTerms() throws Exception {
+    searchAndCheckResults(field(1), 100, "e*", "b*");
+    assertEquals(0, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(2, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(0, TestCounters.get().segmentSkipCount);
+
+    expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
+      searchAndCheckResults(field(2), 100, "tim*", "t*");
+      assertEquals(0, TestCounters.get().singleTermAnalysisCount);
+      assertEquals(2, TestCounters.get().multiTermAnalysisCount);
+      assertEquals(2, TestCounters.get().segmentUseCount);
+      assertEquals(1, TestCounters.get().segmentSkipCount);
+    });
+  }
+
+  // Three wildcard terms, including a case with two identical wildcards ("b*" "b*")
+  // where SpanNearQuery is expected to score differently.
+  public void testThreeMultiTerms() throws Exception {
+    searchAndCheckResults(field(0), 100, "t*", "ut?pi?", "e*");
+    assertEquals(0, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(3, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t?e", "u*", "e*");
+    assertEquals(0, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(3, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+
+    expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
+      searchAndCheckResults(field(0), 100, "t?e", "b*", "b*");
+      assertEquals(0, TestCounters.get().singleTermAnalysisCount);
+      assertEquals(3, TestCounters.get().multiTermAnalysisCount);
+      assertEquals(4, TestCounters.get().segmentUseCount);
+      assertEquals(1, TestCounters.get().segmentSkipCount);
+    });
+  }
+
+  // Mixed phrases with one exact term between wildcards; the last case has a
+  // non-matching wildcard ("t?a") which reduces analysis and segment accesses.
+  public void testOneSingleTermTwoMultiTerms() throws Exception {
+    searchAndCheckResults(field(0), 100, "t*", "utopia", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(2, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t?e", "utopia", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(2, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t?a", "utopia", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(1, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(3, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+  }
+
+  // A non-matching single term should stop the query early, before any
+  // multi-term expansion is attempted.
+  public void testTermDoesNotMatch() throws Exception {
+    searchAndCheckResults(field(0), 100, "nomatch", "e*");
+    // We expect that createWeight() is not called because the first term does
+    // not match so the query is early stopped without multi-term expansion.
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(0, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(2, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t*", "nomatch", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(0, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(2, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+  }
+
+  // Phrases made only of exact terms (no wildcard) must still rank and score
+  // the same as the reference queries.
+  public void testNoMultiTerm() throws Exception {
+    searchAndCheckResults(field(0), 100, "the", "utopia");
+    searchAndCheckResults(field(0), 100, "utopia", "the");
+    searchAndCheckResults(field(0), 100, "the", "experiment");
+  }
+
+  public void testMaxExpansions() throws Exception {
+    // The limit on the number of expansions is different with PhraseWildcardQuery
+    // because it applies to each segments individually, and not globally unlike
+    // MultiPhraseQuery and SpanMultiTermQueryWrapper.
+    // Here we verify the total number of expansions directly from test stats
+    // inside PhraseWildcardQuery.
+
+    clearTestCounters();
+    searcher.search(phraseWildcardQuery(field(1), 3, 0, true, "e*", "b*"), MAX_DOCS);
+    // We expect 3 expansions even if both multi-terms have potentially more expansions.
+    assertEquals(3, TestCounters.get().expansionCount);
+
+    clearTestCounters();
+    searcher.search(phraseWildcardQuery(field(0), 4, 0, true, "t?e", "utopia", "e*"), MAX_DOCS);
+    // We expect 2 expansions since the "utopia" term matches only in the
+    // first segment, so there is no expansion for the second segment.
+    assertEquals(2, TestCounters.get().expansionCount);
+  }
+
+  // Verifies segment access/skip/early-stop counters with the segment
+  // optimization enabled.
+  public void testSegmentOptimizationSingleField() throws Exception {
+    searchAndCheckResults(field(0), 100, 0, true, "b*", "e*");
+    // Both multi-terms are present in both segments.
+    // So expecting 4 segment accesses.
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(0, TestCounters.get().segmentSkipCount);
+    assertEquals(0, TestCounters.get().queryEarlyStopCount);
+
+    searchAndCheckResults(field(0), 100, 0, true, "t?e", "b*", "e*");
+    // "t?e" matches only in the first segment. This term adds 2 segment accesses and 1 segment skip.
+    // The other multi-terms match in the first segment. Each one adds 1 segment access.
+    // So expecting 4 segment accesses and 1 segment skip.
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+    assertEquals(0, TestCounters.get().queryEarlyStopCount);
+
+    searchAndCheckResults(field(0), 100, 0, true, "t?e", "blind", "e*");
+    assertEquals(3, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+    assertEquals(1, TestCounters.get().queryEarlyStopCount);
+
+    expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
+      searchAndCheckResults(field(2), 100, 0, true, "tim*", "t*");
+      assertEquals(2, TestCounters.get().segmentUseCount);
+      assertEquals(1, TestCounters.get().segmentSkipCount);
+      assertEquals(0, TestCounters.get().queryEarlyStopCount);
+    });
+  }
+
+  // Combines several PhraseWildcardQuery instances inside boolean/dismax queries
+  // over multiple fields and verifies parity with the reference queries.
+  public void testMultiplePhraseWildcards() throws Exception {
+    searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+        new String[]{"e*", "b*"},
+        new String[]{"t?e", "utopia"}
+    });
+    searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+        new String[]{"e*", "b*"},
+        new String[]{"d*", "b*"}
+    });
+    searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+        new String[]{"e*", "b*"},
+        new String[]{"t?e", "utopia"},
+        new String[]{"d*", "b*"}
+    });
+    expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
+        searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+            new String[]{"e*", "b*"},
+            new String[]{"b*", "b*"}
+        }));
+    expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
+        searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+            new String[]{"e*", "b*"},
+            new String[]{"b*", "b*"},
+            new String[]{"t?e", "utopia"}
+        }));
+    searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+        new String[]{"e*", "b*"},
+        new String[]{"e*", "b*"}
+    });
+    searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
+        new String[]{"e*", "b*"},
+        new String[]{"t?e", "utopia"},
+        new String[]{"e*", "b*"}
+    });
+  }
+
+  // Verifies both toString() (field-qualified) and toString(field) (field elided,
+  // slop suffix shown).
+  public void testToString() {
+    Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "e*");
+    assertEquals("phraseWildcard(title:\"t?e b* e*\")", testQuery.toString());
+
+    testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "utopia", "e*");
+    assertEquals("phraseWildcard(\"t?e utopia e*\"~1)", testQuery.toString(field(0)));
+
+    testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "b*", "b*");
+    assertEquals("phraseWildcard(\"t?e b* b*\"~1)", testQuery.toString(field(0)));
+  }
+
+  public void testExplain() throws IOException {
+    Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "b*");
+
+    // Verify the standard way to get the query explanation.
+    for (ScoreDoc scoreDoc : searcher.search(testQuery, MAX_DOCS).scoreDocs) {
+      Explanation explanation = searcher.explain(testQuery, scoreDoc.doc);
+      assertTrue(explanation.getValue().doubleValue() > 0);
+      assertEquals("weight(phraseWildcard(title:\"t?e b* b*\") in 1) [AssertingSimilarity], result of:", explanation.getDescription());
+    }
+
+    // Verify that if we call PhraseWildcardQuery.PhraseWildcardWeight.scorer() twice,
+    // the scoring is correct (even if it is not the standard path expected by the scorer() method).
+    int resultCount = 0;
+    Weight weight = testQuery.createWeight(searcher, ScoreMode.TOP_SCORES, 1);
+    for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
+      Scorer scorer = weight.scorer(leafReaderContext);
+      if (scorer != null) {
+        // Count matching docs by exhausting the scorer's iterator.
+        DocIdSetIterator iterator = scorer.iterator();
+        while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+          resultCount++;
+        }
+      }
+    }
+    assertEquals(1, resultCount);
+
+    // Explaining doc 1 per leaf: exactly one leaf should yield a positive score.
+    int explanationWithNonNullScoreCount = 0;
+    for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
+      Explanation explanation = weight.explain(leafReaderContext, 1);
+      if (explanation.getValue().doubleValue() > 0) {
+        explanationWithNonNullScoreCount++;
+      }
+    }
+    assertEquals(1, explanationWithNonNullScoreCount);
+  }
+
+  /**
+   * With two similar multi-terms whose expansions are subsets (e.g. "tim*" and "t*"),
+   * we expect {@link PhraseWildcardQuery} and {@link MultiPhraseQuery} to
+   * have the same scores, but {@link SpanNearQuery} scores are different.
+   * The flag is always reset in the finally block, even if the runnable throws.
+   */
+  protected void expectDifferentScoreForSpanNearQueryWithMultiTermSubset(RunnableWithIOException runnable) throws IOException {
+    try {
+      differentScoreExpectedForSpanNearQuery = true;
+      runnable.run();
+    } finally {
+      differentScoreExpectedForSpanNearQuery = false;
+    }
+  }
+
+  /**
+   * Compares {@link PhraseWildcardQuery} to both {@link MultiPhraseQuery}
+   * and {@link SpanNearQuery}.
+   * Runs the comparison for slop 0 and 1, with and without the segment optimization.
+   */
+  protected void searchAndCheckResults(String field, int maxExpansions, String... terms) throws IOException {
+    for (int slop = 0; slop <= 1; slop++) {
+      searchAndCheckResults(field, maxExpansions, slop, false, terms);
+      searchAndCheckResults(field, maxExpansions, slop, true, terms);
+    }
+  }
+
+  // Builds the three equivalent queries and delegates the result comparison.
+  protected void searchAndCheckResults(String field, int maxExpansions, int slop,
+                                       boolean segmentOptimizationEnabled, String... terms) throws IOException {
+    searchAndCheckSameResults(
+        phraseWildcardQuery(field, maxExpansions, slop, segmentOptimizationEnabled, terms),
+        multiPhraseQuery(field, maxExpansions, slop, terms),
+        spanNearQuery(field, slop, terms),
+        segmentOptimizationEnabled);
+  }
+
+  // Runs the multi-phrase comparison with the segment optimization both off and on.
+  protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions,
+                                                              int slop, String[][] multiPhraseTerms) throws IOException {
+    searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, false, multiPhraseTerms);
+    searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, true, multiPhraseTerms);
+  }
+
+  // Wraps each phrase in a per-field dismax, combines phrases in a boolean query
+  // (random MUST/SHOULD occur), and compares the three query families.
+  protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions, int slop,
+                                                              boolean segmentOptimizationEnabled, String[][] multiPhraseTerms) throws IOException {
+    BooleanQuery.Builder phraseWildcardQueryBuilder = new BooleanQuery.Builder();
+    BooleanQuery.Builder multiPhraseQueryBuilder = new BooleanQuery.Builder();
+    BooleanQuery.Builder spanNearQueryBuilder = new BooleanQuery.Builder();
+    for (String[] terms : multiPhraseTerms) {
+      // Same random occur is applied to the three equivalent clauses to keep them comparable.
+      BooleanClause.Occur occur = random().nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
+      phraseWildcardQueryBuilder.add(disMaxQuery(phraseWildcardQueries(fields, maxExpansions, slop, segmentOptimizationEnabled, terms)), occur);
+      multiPhraseQueryBuilder.add(disMaxQuery(multiPhraseQueries(fields, maxExpansions, slop, terms)), occur);
+      spanNearQueryBuilder.add(disMaxQuery(spanNearQueries(fields, slop, terms)), occur);
+    }
+    searchAndCheckSameResults(
+        phraseWildcardQueryBuilder.build(),
+        multiPhraseQueryBuilder.build(),
+        spanNearQueryBuilder.build(),
+        segmentOptimizationEnabled
+    );
+  }
+
+  // Combines per-field queries in a DisjunctionMaxQuery with a small tie-breaker.
+  protected Query disMaxQuery(Query... disjuncts) {
+    return new DisjunctionMaxQuery(Arrays.asList(disjuncts), 0.1f);
+  }
+
+  // Builds one PhraseWildcardQuery per field for the same term list.
+  protected Query[] phraseWildcardQueries(String[] fields, int maxExpansions, int slop, boolean segmentOptimizationEnabled, String... terms) {
+    Query[] queries = new Query[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      queries[i] = phraseWildcardQuery(fields[i], maxExpansions, slop, segmentOptimizationEnabled, terms);
+    }
+    return queries;
+  }
+
+  // Builds one reference MultiPhraseQuery per field for the same term list.
+  protected Query[] multiPhraseQueries(String[] fields, int maxExpansions, int slop, String... terms) throws IOException {
+    Query[] queries = new Query[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      queries[i] = multiPhraseQuery(fields[i], maxExpansions, slop, terms);
+    }
+    return queries;
+  }
+
+  // Builds one reference SpanNearQuery per field for the same term list.
+  protected Query[] spanNearQueries(String[] fields, int slop, String... terms) {
+    Query[] queries = new Query[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      queries[i] = spanNearQuery(fields[i], slop, terms);
+    }
+    return queries;
+  }
+
+  protected void searchAndCheckSameResults(Query testQuery, Query multiPhraseQuery, Query spanNearQuery, boolean segmentOptimizationEnabled) throws IOException {
+    // Search and compare results with MultiPhraseQuery.
+    // Do not compare the scores if the segment optimization is enabled because
+    // it changes the score (but not the result ranking).
+    boolean sameScoreExpected = !segmentOptimizationEnabled;
+    searchAndCheckSameResults(testQuery, multiPhraseQuery, sameScoreExpected);
+
+    // Clear the test stats to verify them only with the last test query execution.
+    clearTestCounters();
+    // Search and compare results with SpanNearQuery.
+    sameScoreExpected = !segmentOptimizationEnabled && !differentScoreExpectedForSpanNearQuery;
+    searchAndCheckSameResults(testQuery, spanNearQuery, sameScoreExpected);
+  }
+
+  // Resets the shared TestCounters singleton before measuring a query execution.
+  protected void clearTestCounters() {
+    TestCounters.get().clear();
+  }
+
+  // Compares the test query's top docs to a reference query's: either exact
+  // (doc id + score) ordering, or doc-id set equality when scores may differ.
+  protected void searchAndCheckSameResults(Query testQuery, Query referenceQuery,
+                                           boolean compareScores) throws IOException {
+    ScoreDoc[] testResults = searcher.search(testQuery, MAX_DOCS).scoreDocs;
+    ScoreDoc[] referenceResults = searcher.search(referenceQuery, MAX_DOCS).scoreDocs;
+    assertEquals("Number of results differ when comparing to " + referenceQuery.getClass().getSimpleName(),
+        referenceResults.length, testResults.length);
+    if (compareScores) {
+      for (int i = 0; i < testResults.length; i++) {
+        ScoreDoc testResult = testResults[i];
+        ScoreDoc referenceResult = referenceResults[i];
+        assertTrue("Result " + i + " differ when comparing to " + referenceQuery.getClass().getSimpleName()
+                + "\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
+            equals(testResult, referenceResult));
+      }
+    } else {
+      // Scores may legitimately differ: compare only the sets of matching doc ids.
+      Set<Integer> testResultDocIds = Arrays.stream(testResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet());
+      Set<Integer> referenceResultDocIds = Arrays.stream(referenceResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet());
+      assertEquals("Results differ when comparing to " + referenceQuery.getClass().getSimpleName()
+              + " ignoring score\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
+          referenceResultDocIds, testResultDocIds);
+    }
+  }
+
+  /**
+   * Builds a {@link PhraseWildcardQuery} from the given terms; any term containing
+   * {@code *} or {@code ?} is added as a wildcard multi-term, others as plain terms.
+   */
+  protected PhraseWildcardQuery phraseWildcardQuery(String field, int maxExpansions,
+                                                    int slop, boolean segmentOptimizationEnabled, String... terms) {
+    PhraseWildcardQuery.Builder builder =
+        createPhraseWildcardQueryBuilder(field, maxExpansions, segmentOptimizationEnabled);
+    builder.setSlop(slop);
+    for (String termText : terms) {
+      boolean isWildcard = termText.contains("*") || termText.contains("?");
+      if (isWildcard) {
+        builder.addMultiTerm(new WildcardQuery(new Term(field, termText)));
+      } else {
+        builder.addTerm(new BytesRef(termText));
+      }
+    }
+    return builder.build();
+  }
+
+  /**
+   * Creates the {@link PhraseWildcardQuery.Builder} under test; subclasses may override
+   * to customize the builder construction.
+   */
+  protected PhraseWildcardQuery.Builder createPhraseWildcardQueryBuilder(
+      String field, int maxExpansions, boolean segmentOptimizationEnabled) {
+    return new PhraseWildcardQuery.Builder(field, maxExpansions, segmentOptimizationEnabled);
+  }
+
+  /**
+   * Builds the reference in-order {@link SpanNearQuery} from the given terms; any term
+   * containing {@code *} or {@code ?} is wrapped as a span wildcard clause.
+   */
+  protected SpanNearQuery spanNearQuery(String field, int slop, String... terms) {
+    SpanQuery[] clauses = new SpanQuery[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      Term term = new Term(field, terms[i]);
+      if (terms[i].contains("*") || terms[i].contains("?")) {
+        clauses[i] = new SpanMultiTermQueryWrapper<>(new WildcardQuery(term));
+      } else {
+        clauses[i] = new SpanTermQuery(term);
+      }
+    }
+    return new SpanNearQuery(clauses, slop, true);
+  }
+
+  /**
+   * Builds the reference {@link MultiPhraseQuery} by expanding each wildcard term
+   * up to {@code maxExpansions} terms across the index.
+   */
+  protected MultiPhraseQuery multiPhraseQuery(String field, int maxExpansions, int slop, String... terms) throws IOException {
+    MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder();
+    builder.setSlop(slop);
+    for (String termText : terms) {
+      boolean isWildcard = termText.contains("*") || termText.contains("?");
+      if (!isWildcard) {
+        builder.add(new Term(field, termText));
+        continue;
+      }
+      Term[] expansions = expandMultiTerm(field, termText, maxExpansions);
+      if (expansions.length == 0) {
+        // A wildcard that expands to nothing must make the whole phrase match nothing.
+        builder.add(new Term(field, "non-matching-term"));
+      } else {
+        builder.add(expansions);
+      }
+    }
+    return builder.build();
+  }
+
+  /**
+   * Expands a wildcard pattern against every segment of {@code reader}, collecting at most
+   * {@code maxExpansions} distinct terms. Returns an empty array when {@code maxExpansions} is 0.
+   */
+  protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException {
+    if (maxExpansions == 0) {
+      return new Term[0];
+    }
+    Set<Term> expansions = new HashSet<>();
+    WildcardQuery wildcardQuery = new WildcardQuery(new Term(field, term));
+    for (final LeafReaderContext ctx : reader.leaves()) {
+      Terms terms = ctx.reader().terms(field);
+      if (terms == null) {
+        continue;
+      }
+      TermsEnum termsEnum = wildcardQuery.getTermsEnum(terms);
+      while (termsEnum.next() != null) {
+        expansions.add(new Term(field, termsEnum.term()));
+        if (expansions.size() >= maxExpansions) {
+          // Expansion cap reached; stop scanning the remaining segments.
+          return expansions.toArray(new Term[0]);
+        }
+      }
+    }
+    return expansions.toArray(new Term[0]);
+  }
+
+  /**
+   * Returns whether two results match on doc id and (approximately) on score.
+   * Due to randomness, the value of the score comparison epsilon varies much;
+   * we take 1E-1 epsilon to ensure the test does not flap.
+   */
+  protected static boolean equals(ScoreDoc result1, ScoreDoc result2) {
+    if (result1.doc != result2.doc) {
+      return false;
+    }
+    return Math.abs(result1.score - result2.score) < 1E-1;
+  }
+
+  /**
+   * Indexes the test fixture as two committed segments: the first segment has three docs
+   * using fields 0-3; the second has three docs and deliberately omits field(2), so
+   * per-segment term expansion differs between segments.
+   */
+  protected void addSegments(RandomIndexWriter iw) throws IOException {
+    // First segment.
+    addDocs(iw,
+        doc(
+            field(field(0), "time conversion"),
+            field(field(1), "eric hawk"),
+            field(field(2), "time travel")
+        ),
+        doc(
+            field(field(0), "the blinking books"),
+            field(field(1), "donald ever"),
+            field(field(2), "time travel")
+        ),
+        doc(
+            field(field(0), "the utopia experiment"),
+            field(field(1), "dylan brief"),
+            field(field(2), "utopia"),
+            field(field(3), "travelling to utopiapolis")
+        )
+    );
+    iw.commit();
+
+    // Second segment.
+    // No field(2).
+    addDocs(iw,
+        doc(
+            field(field(0), "serene evasion"),
+            field(field(1), "eric brown")
+        ),
+        doc(
+            field(field(0), "my blind experiment"),
+            field(field(1), "eric bright")
+        ),
+        doc(
+            field(field(3), "two times travel")
+        )
+    );
+    iw.commit();
+  }
+
+  /** Returns the field name at the given index in the {@code FIELDS} array. */
+  protected String field(int index) {
+    return FIELDS[index];
+  }
+
+  /** Adds all the given documents to the index writer in a single call. */
+  protected static void addDocs(RandomIndexWriter iw, Document... docs) throws IOException {
+    iw.addDocuments(Arrays.asList(docs));
+  }
+
+  /** Builds a {@link Document} containing all the given fields. */
+  protected static Document doc(Field... fields) {
+    Document document = new Document();
+    Arrays.stream(fields).forEach(document::add);
+    return document;
+  }
+
+  /** Creates an unstored, randomized text field (via {@code LuceneTestCase.newTextField}). */
+  protected static Field field(String field, String fieldValue) {
+    return newTextField(field, fieldValue, Field.Store.NO);
+  }
+
+  /** A {@link Runnable} equivalent whose {@code run} method is allowed to throw {@link IOException}. */
+  @FunctionalInterface
+  private interface RunnableWithIOException {
+
+    void run() throws IOException;
+  }
+}


Mime
View raw message