lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Peter Keegan" <peterlkee...@gmail.com>
Subject BoostingTermQuery performance
Date Tue, 02 Oct 2007 22:44:46 GMT
I have been experimenting with payloads and BoostingTermQuery, which I think
are excellent additions to Lucene core. Currently, BoostingTermQuery extends
SpanQuery. I would suggest changing this class to extend TermQuery and
refactor the current version to something like 'BoostingSpanQuery'.

The reason is rooted in performance. In my testing, I compared query
throughput using TermQuery against 2 versions of BoostingTermQuery - the
current one that extends SpanQuery and one that extends TermQuery (which
I've included, below). Here are the results (qps = queries per second):

TermQuery:    200 qps
BoostingTermQuery (extends SpanQuery): 97 qps
BoostingTermQuery (extends TermQuery): 130 qps

Here is a version of BoostingTermQuery that extends TermQuery. I had to
modify TermQuery and TermScorer to make them public. A code review would be
in order, and I would appreciate your comments on this suggestion.

Peter

-----------------------------------------

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.*;


import java.io.IOException;

/**
 * Copyright 2004 The Apache Software Foundation
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * The BoostingTermQuery is very similar to the {@link
org.apache.lucene.search.spans.SpanTermQuery} except
 * that it factors in the value of the payload located at each of the
positions where the
 * {@link org.apache.lucene.index.Term} occurs.
 * <p>
 * In order to take advantage of this, you must override {@link
org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
 * which returns 1 by default.
 * <p>
 * Payload scores are averaged across term occurrences in the document.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the <b>Payloads</b> feature is experimental.
 * The APIs introduced here might change in the future and will not be
 * supported anymore in such a case.</font>
 *
 * @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
 */
public class BoostingTermQuery extends TermQuery{
    Term term;
    Similarity similarity;

  public BoostingTermQuery(Term term) {
    super(term);
    this.term = term;

  }


  protected Weight createWeight(Searcher searcher) throws IOException {
    this.similarity = getSimilarity(searcher);
    return new BoostingTermWeight(this, searcher);
  }

  protected class BoostingTermWeight extends TermWeight implements Weight {


    public BoostingTermWeight(BoostingTermQuery query, Searcher searcher)
throws IOException {
      super(searcher);
    }




    public Scorer scorer(IndexReader reader) throws IOException {
      return new BoostingTermScorer(reader.termDocs(term),
reader.termPositions(term), this, similarity,
             reader.norms(term.field()));
    }

    class BoostingTermScorer extends TermScorer {

      //TODO: is this the best way to allocate this?
      byte[] payload = new byte[256];
      private TermPositions positions;
      protected float payloadScore;
      private int payloadsSeen;

      public BoostingTermScorer(TermDocs termDocs, TermPositions
termPositions, Weight weight,
                                Similarity similarity, byte[] norms) throws
IOException {
          super(weight, termDocs, similarity, norms);
          positions = termPositions;

      }

      /**
       * Go to the next document
       *
       */
      public boolean next() throws IOException {

        boolean result = super.next();
        //set the payload.  super.next() properly increments the term
positions
        if (result) {
          if (positions.skipTo(super.doc())) {
              positions.nextPosition();
              processPayload(similarity);
          }
        }

        return result;
      }

      public boolean skipTo(int target) throws IOException {
        boolean result = super.skipTo(target);

        if (result) {
            if (positions.skipTo(target)) {
                positions.nextPosition();
              processPayload(similarity);
          }
        }

        return result;
      }

//      protected boolean setFreqCurrentDoc() throws IOException {
//        if (!more) {
//          return false;
//        }
//        doc = spans.doc();
//        freq = 0.0f;
//        payloadScore = 0;
//        payloadsSeen = 0;
//        Similarity similarity1 = getSimilarity();
//        while (more && doc == spans.doc()) {
//          int matchLength = spans.end() - spans.start();
//
//          freq += similarity1.sloppyFreq(matchLength);
//          processPayload(similarity1);
//
//          more = spans.next();//this moves positions to the next match in
this document
//        }
//        return more || (freq != 0);
//      }


      protected void processPayload(Similarity similarity) throws
IOException {
        if (positions.isPayloadAvailable()) {
          payload = positions.getPayload(payload, 0);
          payloadScore += similarity.scorePayload(payload, 0,
positions.getPayloadLength());
          payloadsSeen++;

        } else {
          //zero out the payload?
        }

      }

      public float score() {

        return super.score() * (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
      }


      public Explanation explain(final int doc) throws IOException {
        Explanation result = new Explanation();
        Explanation nonPayloadExpl = super.explain(doc);
        result.addDetail(nonPayloadExpl);
        //QUESTION: Is there a wau to avoid this skipTo call?  We need to
know whether to load the payload or not

        Explanation payloadBoost = new Explanation();
        result.addDetail(payloadBoost);
/*
        if (skipTo(doc) == true) {
          processPayload();
        }
*/

        float avgPayloadScore =  (payloadsSeen > 0 ? (payloadScore /
payloadsSeen) : 1);
        payloadBoost.setValue(avgPayloadScore);
        //GSI: I suppose we could toString the payload, but I don't think
that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
        result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
        result.setDescription("btq, product of:");
        return result;
      }
    }

  }


  public boolean equals(Object o) {
    if (!(o instanceof BoostingTermQuery))
      return false;
    BoostingTermQuery other = (BoostingTermQuery) o;
    return (this.getBoost() == other.getBoost())
            && this.term.equals(other.term);
  }
}


Diffs for TermQuery, TermScorer:

Index: src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- src/java/org/apache/lucene/search/TermQuery.java    (revision 581018)
+++ src/java/org/apache/lucene/search/TermQuery.java    (working copy)
@@ -31,7 +31,7 @@
 public class TermQuery extends Query {
   private Term term;

-  private class TermWeight implements Weight {
+  public class TermWeight implements Weight {
     private Similarity similarity;
     private float value;
     private float idf;
Index: src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- src/java/org/apache/lucene/search/TermScorer.java    (revision 581018)
+++ src/java/org/apache/lucene/search/TermScorer.java    (working copy)
@@ -23,7 +23,7 @@

 /** Expert: A <code>Scorer</code> for documents matching a
<code>Term</code>.
  */
-final class TermScorer extends Scorer {
+public class TermScorer extends Scorer {
   private Weight weight;
   private TermDocs termDocs;
   private byte[] norms;
@@ -44,7 +44,7 @@
    * @param similarity The </code>Similarity</code> implementation to be
used for score computations.
    * @param norms The field norms of the document fields for the
<code>Term</code>.
    */
-  TermScorer(Weight weight, TermDocs td, Similarity similarity,
+  public TermScorer(Weight weight, TermDocs td, Similarity similarity,
              byte[] norms) {
     super(similarity);
     this.weight = weight;


Peter

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message