lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jpou...@apache.org
Subject [2/2] lucene-solr:branch_7x: LUCENE-7939: Speed up MinShouldMatchSumScorer in conjunctions.
Date Wed, 23 Aug 2017 17:20:26 GMT
LUCENE-7939: Speed up MinShouldMatchSumScorer in conjunctions.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/586270a7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/586270a7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/586270a7

Branch: refs/heads/branch_7x
Commit: 586270a76990f6462c688e79ab85cf16df28b8a8
Parents: fb0a9e6
Author: Adrien Grand <jpountz@gmail.com>
Authored: Wed Aug 23 19:08:41 2017 +0200
Committer: Adrien Grand <jpountz@gmail.com>
Committed: Wed Aug 23 19:11:22 2017 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../lucene/search/MinShouldMatchSumScorer.java  | 51 +++++++++++++++++++-
 2 files changed, 52 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/586270a7/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a24c87b..188af63 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ Optimizations
 * LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their
   boosts. (Adrien Grand)
 
+* LUCENE-7939: MinShouldMatchSumScorer now leverages two-phase iteration in
+  order to be faster when used in conjunctions. (Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/586270a7/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java
index b6edac4..6927371 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java
@@ -128,7 +128,12 @@ final class MinShouldMatchSumScorer extends Scorer {
 
   @Override
   public DocIdSetIterator iterator() {
-    return new DocIdSetIterator() {
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation = new DocIdSetIterator() {
 
       @Override
       public int docID() {
@@ -154,6 +159,12 @@ final class MinShouldMatchSumScorer extends Scorer {
         }
 
         setDocAndFreq();
+        // It would be correct to return doNextCandidate() at this point but if you
+        // call nextDoc as opposed to advance, it probably means that you really
+        // need the next match. Returning 'doc' here would lead to a similar
+        // iteration over sub postings overall except that the decision making would
+        // happen at a higher level where more abstractions are involved and
+        // benchmarks suggested it causes a significant performance hit.
         return doNext();
       }
 
@@ -181,7 +192,7 @@ final class MinShouldMatchSumScorer extends Scorer {
         }
 
         setDocAndFreq();
-        return doNext();
+        return doNextCandidate();
       }
 
       @Override
@@ -189,6 +200,30 @@ final class MinShouldMatchSumScorer extends Scorer {
         return cost;
       }
     };
+    return new TwoPhaseIterator(approximation) {
+
+      @Override
+      public boolean matches() throws IOException {
+        while (freq < minShouldMatch) {
+          assert freq > 0;
+          if (freq + tailSize >= minShouldMatch) {
+            // a match on doc is still possible, try to
+            // advance scorers from the tail
+            advanceTail();
+          } else {
+            return false;
+          }
+        }
+        return true;
+      }
+
+      @Override
+      public float matchCost() {
+        // maximum number of scorer that matches() might advance
+        return tail.length;
+      }
+
+    };
   }
 
   private void addLead(DisiWrapper lead) {
@@ -250,6 +285,18 @@ final class MinShouldMatchSumScorer extends Scorer {
     return doc;
   }
 
+  /** Move iterators to the tail until the cumulated size of lead+tail is
+   *  greater than or equal to minShouldMath */
+  private int doNextCandidate() throws IOException {
+    while (freq + tailSize < minShouldMatch) {
+      // no match on doc is possible, move to the next potential match
+      pushBackLeads();
+      setDocAndFreq();
+    }
+
+    return doc;
+  }
+
   /** Advance all entries from the tail to know about all matches on the
    *  current doc. */
   private void updateFreq() throws IOException {


Mime
View raw message