lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r345089 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/search/DisjunctionMaxQuery.java src/java/org/apache/lucene/search/DisjunctionMaxScorer.java src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
Date Wed, 16 Nov 2005 18:47:42 GMT
Author: yonik
Date: Wed Nov 16 10:47:37 2005
New Revision: 345089

URL: http://svn.apache.org/viewcvs?rev=345089&view=rev
Log:
new DisjunctionMaxQuery/DisjunctionMaxScorer from Chuck Williams: LUCENE-323

Added:
    lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java
    lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
Modified:
    lucene/java/trunk/CHANGES.txt

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/CHANGES.txt?rev=345089&r1=345088&r2=345089&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Nov 16 10:47:37 2005
@@ -183,6 +183,10 @@
     must match in a BooleanQuery.  See BooleanQuery.setMinimumNumberShouldMatch().
     (Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395)
 
+27. Added DisjunctionMaxQuery which provides the maximum score across it's clauses.
+    It's very useful for searching across multiple fields.
+    (Chuck Williams via Yonik Seeley, LUCENE-323)
+
 API Changes
 
  1. Several methods and fields have been deprecated. The API documentation

Added: lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java?rev=345089&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java Wed Nov 16
10:47:37 2005
@@ -0,0 +1,246 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+/**
+ * A query that generates the union of the documents produced by its subqueries, and that
scores each document as the maximum
+ * score for that document produced by any subquery plus a tie breaking increment for any
additional matching subqueries.
+ * This is useful to search for a word in multiple fields with different boost factors (so
that the fields cannot be
+ * combined equivalently into a single search field).  We want the primary score to be the
one associated with the highest boost,
+ * not the sum of the field scores (as BooleanQuery would give).
+ * If the query is "albino elephant" this ensures that "albino" matching one field and "elephant"
matching
+ * another gets a higher score than "albino" matching both fields.
+ * To get this result, use both BooleanQuery and DisjunctionMaxQuery:  for each term a DisjunctionMaxQuery
searches for it in
+ * each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery.
+ * The tie breaker capability allows results that include the same term in multiple fields
to be judged better than results that
+ * include this term in only the best of those multiple fields, without confusing this with
the better case of two different terms
+ * in the multiple fields.
+ * @author Chuck Williams
+ */
+public class DisjunctionMaxQuery extends Query implements Iterable {
+
+  /* The subqueries */
+  private ArrayList disjuncts = new ArrayList();
+
+  /* Multiple of the non-max disjunct scores added into our final score.  Non-zero values
support tie-breaking. */
+  private float tieBreakerMultiplier = 0.0f;
+
+  /** Creates a new empty DisjunctionMaxQuery.  Use add() to add the subqueries.
+   * @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is
multiplied by this weight
+   *        and added into the final score.  If non-zero, the value should be small, on the
order of 0.1, which says that
+   *        10 occurrences of word in a lower-scored field that is also in a higher scored
field is just as good as a unique
+   *        word in the lower scored field (i.e., one that is not in any higher scored field.
+   */
+  public DisjunctionMaxQuery(float tieBreakerMultiplier) {
+    this.tieBreakerMultiplier = tieBreakerMultiplier;
+  }
+
+  /**
+   * Creates a new DisjunctionMaxQuery
+   * @param disjuncts an Iterable<Query> of all the disjuncts to add
+   * @param tieBreakerMultiplier   the weight to give to each matching non-maximum disjunct
+   */
+  public DisjunctionMaxQuery(Iterable disjuncts, float tieBreakerMultiplier) {
+    this.tieBreakerMultiplier = tieBreakerMultiplier;
+    add(disjuncts);
+  }
+
+  /** Add a subquery to this disjunction
+   * @param query the disjunct added
+   */
+  public void add(Query query) {
+    disjuncts.add(query);
+  }
+
+  /** Add a collection of disjuncts to this disjunction
+   * via Iterable<Query>
+   */
+  public void add(Iterable disjuncts) {
+    Iterator i = disjuncts.iterator();
+    while (i.hasNext()) add((Query)i.next());
+  }
+
+  /** An Iterator<Query> over the disjuncts */
+  public Iterator iterator() {
+    return disjuncts.iterator();
+  }
+
+  /* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries
*/
+  private class DisjunctionMaxWeight implements Weight {
+
+    private Searcher searcher;       // The searcher with which we are associated.
+    private ArrayList weights = new ArrayList();  // The Weight's for our subqueries, in
1-1 correspondence with disjuncts
+
+    /* Construct the Weight for this Query searched by searcher.  Recursively construct subquery
weights. */
+    public DisjunctionMaxWeight(Searcher searcher) throws IOException {
+      this.searcher = searcher;
+      for (int i = 0; i < disjuncts.size(); i++)
+        weights.add(((Query) disjuncts.get(i)).createWeight(searcher));
+    }
+
+    /* Return our associated DisjunctionMaxQuery */
+    public Query getQuery() { return DisjunctionMaxQuery.this; }
+
+    /* Return our boost */
+    public float getValue() { return getBoost(); }
+
+    /* Compute the sub of squared weights of us applied to our subqueries.  Used for normalization.
*/
+    public float sumOfSquaredWeights() throws IOException {
+      float max = 0.0f, sum = 0.0f;
+      for (int i = 0; i < weights.size(); i++) {
+        float sub = ((Weight) weights.get(i)).sumOfSquaredWeights();
+        sum += sub;
+        max = Math.max(max, sub);
+      }
+      return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * getBoost()
* getBoost();
+    }
+
+    /* Apply the computed normalization factor to our subqueries */
+    public void normalize(float norm) {
+      norm *= getBoost();  // Incorporate our boost
+      for (int i = 0 ; i < weights.size(); i++)
+        ((Weight) weights.get(i)).normalize(norm);
+    }
+
+    /* Create the scorer used to score our associated DisjunctionMaxQuery */
+    public Scorer scorer(IndexReader reader) throws IOException {
+      DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, getSimilarity(searcher));
+      for (int i = 0 ; i < weights.size(); i++) {
+        Weight w = (Weight) weights.get(i);
+        Scorer subScorer = w.scorer(reader);
+        if (subScorer == null) return null;
+        result.add(subScorer);
+      }
+      return result;
+    }
+
+    /* Explain the score we computed for doc */
+    public Explanation explain(IndexReader reader, int doc) throws IOException {
+      if ( disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc);
+      Explanation result = new Explanation();
+      float max = 0.0f, sum = 0.0f;
+      result.setDescription(tieBreakerMultiplier == 0.0f ? "max of:" : "max plus " + tieBreakerMultiplier
+ " times others of:");
+      for (int i = 0 ; i < weights.size(); i++) {
+        Explanation e = ((Weight) weights.get(i)).explain(reader, doc);
+        if (e.getValue() > 0) {
+          result.addDetail(e);
+          sum += e.getValue();
+          max = Math.max(max, e.getValue());
+        }
+      }
+      result.setValue(max + (sum - max)*tieBreakerMultiplier);
+      return result;
+    }
+
+  }  // end of DisjunctionMaxWeight inner class
+
+  /* Create the Weight used to score us */
+  protected Weight createWeight(Searcher searcher) throws IOException {
+    return new DisjunctionMaxWeight(searcher);
+  }
+
+  /** Optimize our representation and our subqueries representations
+   * @param reader the IndexReader we query
+   * @return an optimized copy of us (which may not be a copy if there is nothing to optimize)
*/
+  public Query rewrite(IndexReader reader) throws IOException {
+    if (disjuncts.size() == 1) {
+      Query singleton = (Query) disjuncts.get(0);
+      Query result = singleton.rewrite(reader);
+      if (getBoost() != 1.0f) {
+        if (result == singleton) result = (Query)result.clone();
+        result.setBoost(getBoost() * result.getBoost());
+      }
+      return result;
+    }
+    DisjunctionMaxQuery clone = null;
+    for (int i = 0 ; i < disjuncts.size(); i++) {
+      Query clause = (Query) disjuncts.get(i);
+      Query rewrite = clause.rewrite(reader);
+      if (rewrite != clause) {
+        if (clone == null) clone = (DisjunctionMaxQuery)this.clone();
+        clone.disjuncts.set(i, rewrite);
+      }
+    }
+    if (clone != null) return clone;
+    else return this;
+  }
+
+  /** Create a shallow copy of us -- used in rewriting if necessary
+   * @return a copy of us (but reuse, don't copy, our subqueries) */
+  public Object clone() {
+    DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone();
+    clone.disjuncts = (ArrayList)this.disjuncts.clone();
+    return clone;
+  }
+
+  /** Prettyprint us.
+   * @param field the field to which we are applied
+   * @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... |
disjunctn)^boost"
+   */
+  public String toString(String field) {
+    StringBuffer buffer = new StringBuffer();
+    buffer.append("(");
+    for (int i = 0 ; i < disjuncts.size(); i++) {
+      Query subquery = (Query) disjuncts.get(i);
+      if (subquery instanceof BooleanQuery) {   // wrap sub-bools in parens
+        buffer.append("(");
+        buffer.append(subquery.toString(field));
+        buffer.append(")");
+      }
+      else buffer.append(subquery.toString(field));
+      if (i != disjuncts.size()-1) buffer.append(" | ");
+    }
+    buffer.append(")");
+    if (tieBreakerMultiplier != 0.0f) {
+      buffer.append("~");
+      buffer.append(tieBreakerMultiplier);
+    }
+    if (getBoost() != 1.0) {
+      buffer.append("^");
+      buffer.append(getBoost());
+    }
+    return buffer.toString();
+  }
+
+  /** Return true iff we represent the same query as o
+   * @param o another object
+   * @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries,
in the same order, as us
+   */
+  public boolean equals(Object o) {
+    if (! (o instanceof DisjunctionMaxQuery) ) return false;
+    DisjunctionMaxQuery other = (DisjunctionMaxQuery)o;
+    return this.getBoost() == other.getBoost()
+            && this.tieBreakerMultiplier == other.tieBreakerMultiplier
+            && this.disjuncts.equals(other.disjuncts);
+  }
+
+  /** Compute a hash code for hashing us
+   * @return the hash code
+   */
+  public int hashCode() {
+    return Float.floatToIntBits(getBoost())
+            + Float.floatToIntBits(tieBreakerMultiplier)
+            + disjuncts.hashCode();
+  }
+
+}

Added: lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java?rev=345089&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java Wed Nov
16 10:47:37 2005
@@ -0,0 +1,214 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * The Scorer for DisjunctionMaxQuery's.  The union of all documents generated by the the
subquery scorers
+ * is generated in document number order.  The score for each document is the maximum of
the scores computed
+ * by the subquery scorers that generate that document, plus tieBreakerMultiplier times the
sum of the scores
+ * for the other subqueries that generate the document.
+ * @author Chuck Williams
+ */
+class DisjunctionMaxScorer extends Scorer {
+
+    /* The scorers for subqueries that have remaining docs, kept sorted by number of next
doc. */
+    private ArrayList subScorers = new ArrayList();
+
+    /* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed
into the result. */
+    private float tieBreakerMultiplier;
+
+    private boolean more = false;          // True iff there is a next document
+    private boolean firstTime = true;      // True iff next() has not yet been called
+
+    /* Comparator to sort subScorers according to the document number of next document */
+    private static class DisjunctionMaxClauseComparator implements Comparator {
+
+        /* Scorers have all been positioned at their next document already */
+        public int compare(Object o1, Object o2) {
+            if (o1 instanceof Scorer && o2 instanceof Scorer) {
+                Scorer s1 = (Scorer) o1;
+                Scorer s2 = (Scorer) o2;
+
+                return s1.doc() - s2.doc();
+            }
+            else {
+                throw new ClassCastException("Objects not of the type 'Scorer'");
+            }
+        }
+
+        /* Compatible equality */
+        public boolean equals(Scorer s1, Scorer s2) {
+            return s1.doc() == s2.doc();
+        }
+
+     }
+
+    /* Fixed instance of the comparator to reuse */
+    private static DisjunctionMaxClauseComparator subScorerComparator = new DisjunctionMaxClauseComparator();
+
+    /** Creates a new instance of DisjunctionMaxScorer
+     * @param similarity -- not used since our definition involves neither coord nor terms
directly */
+    public DisjunctionMaxScorer(float tieBreakerMultiplier, Similarity similarity) {
+        super(similarity);
+        this.tieBreakerMultiplier = tieBreakerMultiplier;
+    }
+
+    /** Add the scorer for a subquery
+     * @param scorer the scorer of a subquery of our associated DisjunctionMaxQuery
+     */
+    public void add(Scorer scorer) throws IOException {
+        if ( scorer.next() ) {       // Initialize and retain only if it produces docs
+            subScorers.add(scorer);
+            more = true;
+        }
+    }
+
+    /* First time initialization.  Sort subScorers. */
+    private void init() {
+        sortSubScorers();
+        firstTime = false;
+    }
+
+    /* Sort subScorers in order of document number of next document to be generated */
+    private void sortSubScorers() {
+        Scorer[] sorted = (Scorer[]) subScorers.toArray(new Scorer[subScorers.size()]);
+        Arrays.sort(sorted, subScorerComparator);
+        for (int i=0; i<sorted.length; i++) subScorers.set(i, sorted[i]);
+    }
+
+    /** Generate the next document matching our associated DisjunctionMaxQuery.
+     * @return true iff there is a next document
+     */
+    public boolean next() throws IOException {
+        if ( !more ) return false;
+        if ( firstTime ) {
+            init();
+            return true;   // more would have been false if no subScorers had any docs
+        }
+        // Increment all generators that generated the last doc and incrementally re-sort.
+        int lastdoc = ((Scorer) subScorers.get(0)).doc();
+        do {
+            if ( ((Scorer) subScorers.get(0)).next() ) {
+                Scorer s = (Scorer) subScorers.get(0);
+                int snextdoc = s.doc(), i=1;
+                for (; i<subScorers.size() && snextdoc > ((Scorer) subScorers.get(i)).doc();
i++)
+                    subScorers.set(i-1, subScorers.get(i));
+                if ( i!=1 ) subScorers.set(i-1, s);
+            } else {
+                subScorers.remove(0);
+                if ( subScorers.isEmpty() ) return (more = false);
+            }
+        } while ( ((Scorer) subScorers.get(0)).doc()==lastdoc );
+        return true;
+    }
+
+    /** Determine the current document number.  Initially invalid, until {@link #next()}
is called the first time.
+     * @return the document number of the currently generated document
+     */
+    public int doc() {
+        return ((Scorer) subScorers.get(0)).doc();
+    }
+
+    /** Determine the current document score.  Initially invalid, until {@link #next()} is
called the first time.
+     * @return the score of the current generated document
+     */
+    public float score() throws IOException {
+        float max = ((Scorer) subScorers.get(0)).score(), sum = max;
+        for (int i = 1, doc = ((Scorer) subScorers.get(0)).doc(); i < subScorers.size()
&& ((Scorer) subScorers.get(i)).doc() == doc; i++) {
+            float sub = ((Scorer) subScorers.get(i)).score();
+            sum += sub;
+            max = Math.max(max, sub);
+        }
+        return max + (sum - max)*tieBreakerMultiplier;
+    }
+
+    /** Advance to the first document beyond the current whose number is greater than or
equal to target.
+     * @param target the minimum number of the next desired document
+     * @return true iff there is a document to be generated whose number is at least target
+     */
+    public boolean skipTo(int target) throws IOException {
+        int i=0;
+        while ( i<subScorers.size() ) {
+            if ( ((Scorer) subScorers.get(i)).doc() < target ) {
+                if ( ((Scorer) subScorers.get(i)).skipTo(target) ) i++;
+                else subScorers.remove(i);
+            } else i++;
+        }
+        if ( i == 0 ) return false;
+        sortSubScorers();
+        return true;
+    }
+
+    /** Explain a score that we computed.  UNSUPPORTED -- see explanation capability in DisjunctionMaxQuery.
+     * @param doc the number of a document we scored
+     * @return the Explanation for our score
+     */
+    public Explanation explain(int doc) throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+}
+
+/***************************************************************************
+ Implementation notes from http://issues.apache.org/jira/browse/LUCENE-323
+
+
+ There is an issue with the MaxDisjunctionScorer in the .zip attachment, I'm
+ sorry I did not see this earlier when I posted on java-dev about this.
+
+ The problem is that MaxDisjunctionScorer uses bubble sort to keep the subscorer
+ sorted over the documents in the next() method (line 103), and this does not scale nicely
+ when the number of subscorers increases.
+ Supposing the number of subscores that match the document is N,
+ the amount of work to be done is proportional to (N*N) per document.
+ In DisjunctionSumScorer a priority queue is used, and there the amount of work is
+ proportional to (N log(N)) per document.
+ So I would recommend to rewrite MaxDisjunctionScorer to inherit from a new common
+ super class with DisjunctionSumScorer, sharing everything except the
+ advanceAfterCurrent() method (which could be abstract in the new superclass).
+ It's possible to be more aggressive in refactoring by initializing and adapting
+ the score per index document using different methods, but this would take N
+ extra method calls per document.
+
+ At the same time the name could be changed to DisjunctionMaxScorer
+ for consistency in the org.lucene.search package.
+
+ Regards,
+ Paul Elschot
+
+ Comment by Chuck Williams [14/Nov/05 11:55 PM]
+ The code only uses bubble sort for the incremental resorting of an already-sorted list.
The initial sort is done with Arrays.sort() which is O(n*logn). The incremental resort is
O(k*n) where k is the number of clauses that match the document last generated. Even if n
is large, k will usually be small. Theoretically this is O(n^2) because k could be as high
as n, but this is extremely unlikely especially when n is large. More likely is that k is
bounded by a small constant, in which case the algorithm is O(n). It's like Quicksort in that
regard -- there are outlier cases where it won't perform well, but it will perform better
than most alternatives for the vast majority of cases.
+
+ Resorting the whole list every time would perform worse. The best algorithm would probably
be to use the standard insert and delete operations on a heap (as in heap sort):
+
+   while top element generated last doc
+       heap remove it
+       generate it
+       heap insert it
+
+ This would yield total time O(k*logn), as with a PriorityQueue.
+
+ I don't think this is much of an issue to worry about, but the algorithm could be revised
to use the heap sort operations if others think it is important.
+
+ Chuck
+
+*********************************************************************/
\ No newline at end of file

Added: lucene/java/trunk/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java?rev=345089&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java Wed Nov
16 10:47:37 2005
@@ -0,0 +1,455 @@
+package org.apache.lucene.search;
+
+
+/**
+ * Copyright 2005 Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.DefaultSimilarity;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+import junit.framework.TestCase;
+
+import java.text.DecimalFormat;
+
+/**
+ * Test of the DisjunctionMaxQuery.
+ *
+ */
+public class TestDisjunctionMaxQuery extends TestCase{
+
+    /** threshold for comparing floats */
+    public static final float SCORE_COMP_THRESH = 0.0000f;
+
+    /**
+     * Similarity to eliminate tf, idf and lengthNorm effects to
+     * isolate test case.
+     *
+     * <p>
+     * same as TestRankingSimilarity in TestRanking.zip from
+     * http://issues.apache.org/jira/browse/LUCENE-323
+     * </p>
+     * @author Williams
+     */
+    private static class TestSimilarity extends DefaultSimilarity {
+
+        public TestSimilarity() {
+        }
+        public float tf(float freq) {
+            if (freq > 0.0f) return 1.0f;
+            else return 0.0f;
+        }
+        public float lengthNorm(String fieldName, int numTerms) {
+            return 1.0f;
+        }
+        public float idf(int docFreq, int numDocs) {
+            return 1.0f;
+        }
+    }
+
+    public Similarity sim = new TestSimilarity();
+    public Directory index;
+    public IndexReader r;
+    public IndexSearcher s;
+
+    public void setUp() throws Exception {
+
+        index = new RAMDirectory();
+        IndexWriter writer = new IndexWriter(index,
+                                             new WhitespaceAnalyzer(),
+                                             true);
+        writer.setSimilarity(sim);
+
+        // hed is the most important field, dek is secondary
+
+        // d1 is an "ok" match for:  albino elephant
+        {
+            Document d1 = new Document();
+            d1.add(Field.Keyword("id", "d1"));
+            d1.add(Field.Text("hed", "elephant"));
+            d1.add(Field.Text("dek", "elephant"));
+            writer.addDocument(d1);
+        }
+
+        // d2 is a "good" match for:  albino elephant
+        {
+            Document d2 = new Document();
+            d2.add(Field.Keyword("id", "d2"));
+            d2.add(Field.Text("hed", "elephant"));
+            d2.add(Field.Text("dek", "albino"));
+            d2.add(Field.Text("dek", "elephant"));
+            writer.addDocument(d2);
+        }
+
+        // d3 is a "better" match for:  albino elephant
+        {
+            Document d3 = new Document();
+            d3.add(Field.Keyword("id", "d3"));
+            d3.add(Field.Text("hed", "albino"));
+            d3.add(Field.Text("hed", "elephant"));
+            writer.addDocument(d3);
+        }
+
+        // d4 is the "best" match for:  albino elephant
+        {
+            Document d4 = new Document();
+            d4.add(Field.Keyword("id", "d4"));
+            d4.add(Field.Text("hed", "albino"));
+            d4.add(Field.Text("hed", "elephant"));
+            d4.add(Field.Text("dek", "albino"));
+            writer.addDocument(d4);
+        }
+
+        writer.close();
+
+        r = IndexReader.open(index);
+        s = new IndexSearcher(r);
+        s.setSimilarity(sim);
+    }
+
+
+    public void testSimpleEqualScores1() throws Exception {
+
+        DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
+        q.add(tq("hed","albino"));
+        q.add(tq("hed","elephant"));
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("all docs should match " + q.toString(),
+                         4, h.length());
+
+            float score = h.score(0);
+            for (int i = 1; i < h.length(); i++) {
+                assertEquals("score #" + i + " is not the same",
+                             score, h.score(i), SCORE_COMP_THRESH);
+            }
+        } catch (Error e) {
+            printHits("testSimpleEqualScores1",h);
+            throw e;
+        }
+
+
+    }
+
+    public void testSimpleEqualScores2() throws Exception {
+
+        DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
+        q.add(tq("dek","albino"));
+        q.add(tq("dek","elephant"));
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("3 docs should match " + q.toString(),
+                         3, h.length());
+            float score = h.score(0);
+            for (int i = 1; i < h.length(); i++) {
+                assertEquals("score #" + i + " is not the same",
+                             score, h.score(i), SCORE_COMP_THRESH);
+            }
+        } catch (Error e) {
+            printHits("testSimpleEqualScores2",h);
+            throw e;
+        }
+
+    }
+
+    public void testSimpleEqualScores3() throws Exception {
+
+        DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
+        q.add(tq("hed","albino"));
+        q.add(tq("hed","elephant"));
+        q.add(tq("dek","albino"));
+        q.add(tq("dek","elephant"));
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("all docs should match " + q.toString(),
+                         4, h.length());
+            float score = h.score(0);
+            for (int i = 1; i < h.length(); i++) {
+                assertEquals("score #" + i + " is not the same",
+                             score, h.score(i), SCORE_COMP_THRESH);
+            }
+        } catch (Error e) {
+            printHits("testSimpleEqualScores3",h);
+            throw e;
+        }
+
+    }
+
+    public void testSimpleTiebreaker() throws Exception {
+
+        DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.01f);
+        q.add(tq("dek","albino"));
+        q.add(tq("dek","elephant"));
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("3 docs should match " + q.toString(),
+                         3, h.length());
+            assertEquals("wrong first",  "d2", h.doc(0).get("id"));
+            float score0 = h.score(0);
+            float score1 = h.score(1);
+            float score2 = h.score(2);
+            assertTrue("d2 does not have better score then others: " +
+                       score0 + " >? " + score1,
+                       score0 > score1);
+            assertEquals("d4 and d1 don't have equal scores",
+                         score1, score2, SCORE_COMP_THRESH);
+        } catch (Error e) {
+            printHits("testSimpleTiebreaker",h);
+            throw e;
+        }
+    }
+
+    public void testBooleanRequiredEqualScores() throws Exception {
+
+        BooleanQuery q = new BooleanQuery();
+        {
+            DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
+            q1.add(tq("hed","albino"));
+            q1.add(tq("dek","albino"));
+            q.add(q1,true,false);
+        }
+        {
+            DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
+            q2.add(tq("hed","elephant"));
+            q2.add(tq("dek","elephant"));
+            q.add(q2,true,false);
+        }
+
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("3 docs should match " + q.toString(),
+                         3, h.length());
+            float score = h.score(0);
+            for (int i = 1; i < h.length(); i++) {
+                assertEquals("score #" + i + " is not the same",
+                             score, h.score(i), SCORE_COMP_THRESH);
+            }
+        } catch (Error e) {
+            printHits("testBooleanRequiredEqualScores1",h);
+            throw e;
+        }
+    }
+
+
+    public void testBooleanOptionalNoTiebreaker() throws Exception {
+
+        BooleanQuery q = new BooleanQuery();
+        {
+            DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
+            q1.add(tq("hed","albino"));
+            q1.add(tq("dek","albino"));
+            q.add(q1,false,false);
+        }
+        {
+            DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
+            q2.add(tq("hed","elephant"));
+            q2.add(tq("dek","elephant"));
+            q.add(q2,false,false);
+        }
+
+
+        Hits h = s.search(q);
+
+        try {
+            assertEquals("4 docs should match " + q.toString(),
+                         4, h.length());
+            float score = h.score(0);
+            for (int i = 1; i < h.length()-1; i++) { /* note: -1 */
+                assertEquals("score #" + i + " is not the same",
+                             score, h.score(i), SCORE_COMP_THRESH);
+            }
+            assertEquals("wrong last", "d1", h.doc(h.length()-1).get("id"));
+            float score1 = h.score(h.length()-1);
+            assertTrue("d1 does not have worse score then others: " +
+                       score + " >? " + score1,
+                       score > score1);
+        } catch (Error e) {
+            printHits("testBooleanOptionalNoTiebreaker",h);
+            throw e;
+        }
+    }
+
+
+    public void testBooleanOptionalWithTiebreaker() throws Exception {
+
+        BooleanQuery q = new BooleanQuery();
+        {
+            DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
+            q1.add(tq("hed","albino"));
+            q1.add(tq("dek","albino"));
+            q.add(q1,false,false);
+        }
+        {
+            DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
+            q2.add(tq("hed","elephant"));
+            q2.add(tq("dek","elephant"));
+            q.add(q2,false,false);
+        }
+
+
+        Hits h = s.search(q);
+
+        try {
+
+            assertEquals("4 docs should match " + q.toString(),
+                         4, h.length());
+
+            float score0 = h.score(0);
+            float score1 = h.score(1);
+            float score2 = h.score(2);
+            float score3 = h.score(3);
+
+            String doc0 = h.doc(0).get("id");
+            String doc1 = h.doc(1).get("id");
+            String doc2 = h.doc(2).get("id");
+            String doc3 = h.doc(3).get("id");
+
+            assertTrue("doc0 should be d2 or d4: " + doc0,
+                       doc0.equals("d2") || doc0.equals("d4"));
+            assertTrue("doc1 should be d2 or d4: " + doc0,
+                       doc1.equals("d2") || doc1.equals("d4"));
+            assertEquals("score0 and score1 should match",
+                         score0, score1, SCORE_COMP_THRESH);
+            assertEquals("wrong third", "d3", doc2);
+            assertTrue("d3 does not have worse score then d2 and d4: " +
+                       score1 + " >? " + score2,
+                       score1 > score2);
+
+            assertEquals("wrong fourth", "d1", doc3);
+            assertTrue("d1 does not have worse score then d3: " +
+                       score2 + " >? " + score3,
+                       score2 > score3);
+
+        } catch (Error e) {
+            printHits("testBooleanOptionalWithTiebreaker",h);
+            throw e;
+        }
+
+    }
+
+
+    public void testBooleanOptionalWithTiebreakerAndBoost() throws Exception {
+
+        BooleanQuery q = new BooleanQuery();
+        {
+            DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
+            q1.add(tq("hed","albino", 1.5f));
+            q1.add(tq("dek","albino"));
+            q.add(q1,false,false);
+        }
+        {
+            DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
+            q2.add(tq("hed","elephant", 1.5f));
+            q2.add(tq("dek","elephant"));
+            q.add(q2,false,false);
+        }
+
+
+        Hits h = s.search(q);
+
+        try {
+
+            assertEquals("4 docs should match " + q.toString(),
+                         4, h.length());
+
+            float score0 = h.score(0);
+            float score1 = h.score(1);
+            float score2 = h.score(2);
+            float score3 = h.score(3);
+
+            String doc0 = h.doc(0).get("id");
+            String doc1 = h.doc(1).get("id");
+            String doc2 = h.doc(2).get("id");
+            String doc3 = h.doc(3).get("id");
+
+            assertEquals("doc0 should be d4: ", "d4", doc0);
+            assertEquals("doc1 should be d3: ", "d3", doc1);
+            assertEquals("doc2 should be d2: ", "d2", doc2);
+            assertEquals("doc3 should be d1: ", "d1", doc3);
+
+            assertTrue("d4 does not have a better score then d3: " +
+                       score0 + " >? " + score1,
+                       score0 > score1);
+            assertTrue("d3 does not have a better score then d2: " +
+                       score1 + " >? " + score2,
+                       score1 > score2);
+            assertTrue("d3 does not have a better score then d1: " +
+                       score2 + " >? " + score3,
+                       score2 > score3);
+
+        } catch (Error e) {
+            printHits("testBooleanOptionalWithTiebreakerAndBoost",h);
+            throw e;
+        }
+    }
+
+
+
+
+
+
+
+    /** macro */
+    protected Query tq(String f, String t) {
+        return new TermQuery(new Term(f, t));
+    }
+    /** macro */
+    protected Query tq(String f, String t, float b) {
+        Query q = tq(f,t);
+        q.setBoost(b);
+        return q;
+    }
+
+
+    protected void printHits(String test, Hits h) throws Exception {
+
+        System.err.println("------- " + test + " -------");
+
+        DecimalFormat f = new DecimalFormat("0.000000000");
+
+        for (int i = 0; i < h.length(); i++) {
+            Document d = h.doc(i);
+            float score = h.score(i);
+            System.err.println("#" + i + ": " + f.format(score) + " - " +
+                               d.get("id"));
+        }
+    }
+
+}



Mime
View raw message