lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r807595 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/search/ src/java/org/apache/lucene/search/spans/ src/test/org/apache/lucene/search/
Date Tue, 25 Aug 2009 12:50:06 GMT
Author: markrmiller
Date: Tue Aug 25 12:50:06 2009
New Revision: 807595

URL: http://svn.apache.org/viewvc?rev=807595&view=rev
Log:
LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain
methods - these stats should be corpus wide.

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/search/Explanation.java
    lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQuery.java
    lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
    lucene/java/trunk/src/java/org/apache/lucene/search/TermQuery.java
    lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanWeight.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestSimpleExplanations.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Aug 25 12:50:06 2009
@@ -411,6 +411,10 @@
 37. LUCENE-1826: Add constructors that take AttributeSource and
     AttributeFactory to all Tokenizer implementations.
     (Michael Busch)
+    
+38. LUCENE-1847: Similarity#idf for both a Term and Term Collection have
+    been deprecated. New versions that return an IDFExplanation have been
+    added.  (Yasoja Seneviratne, Mike McCandless, Mark Miller)
 
 Bug fixes
 
@@ -516,6 +520,10 @@
     new LocalizedTestCase as base class for localization junit tests.
     (Robert Muir, Uwe Schindler via Michael Busch)
 
+26. LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats 
+    in their Weight#explain methods - these stats should be corpus wide.
+    (Yasoja Seneviratne, Mike McCandless, Mark Miller)
+
 New features
 
  1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
@@ -729,11 +737,6 @@
     ValueSource, but takes care when composite (multi-segment) are
     passed to not double RAM usage in the FieldCache.  (Chris
     Hostetter, Mark Miller, Mike McCandless)
-
-37. LUCENE-1798: Added FieldCache.set/getInfoStream, which uses
-    FieldCacheSanityChecker to detect when a new cache entry has
-    caused additional insanity, printing the details at the time that
-    it happens.  (Chris Hostetter, Mike McCandless)
    
 Optimizations
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/Explanation.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/Explanation.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/Explanation.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/Explanation.java Tue Aug 25 12:50:06 2009
@@ -17,6 +17,7 @@
  * limitations under the License.
  */
 
+import java.io.Serializable;
 import java.util.ArrayList;
 
 /** Expert: Describes the score computation for document and query. */
@@ -124,4 +125,25 @@
 
     return buffer.toString();
   }
+  
+  /**
+   * Small Util class used to pass both an idf factor as well as an
+   * explanation for that factor.
+   * 
+   * This class will likely be held on a {@link Weight}, so be aware 
+   * before storing any large or un-serializable fields.
+   *
+   */
+  public static abstract class IDFExplanation implements Serializable {
+    /**
+     * @return the idf factor
+     */
+    public abstract float getIdf();
+    /**
+     * This should be calculated lazily if possible.
+     * 
+     * @return the explanation for the idf factor.
+     */
+    public abstract String explain();
+  }
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQuery.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/PhraseQuery.java Tue Aug 25 12:50:06 2009
@@ -24,6 +24,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.util.ToStringUtils;
 
 /** A Query that matches documents containing a particular sequence of terms.
@@ -112,12 +113,14 @@
     private float idf;
     private float queryNorm;
     private float queryWeight;
+    private IDFExplanation idfExp;
 
     public PhraseWeight(Searcher searcher)
       throws IOException {
       this.similarity = getSimilarity(searcher);
 
-      idf = similarity.idf(terms, searcher);
+      idfExp = similarity.idfExplain(terms, searcher);
+      idf = idfExp.getIdf();
     }
 
     public String toString() { return "weight(" + PhraseQuery.this + ")"; }
@@ -167,24 +170,20 @@
       StringBuffer docFreqs = new StringBuffer();
       StringBuffer query = new StringBuffer();
       query.append('\"');
+      docFreqs.append(idfExp.explain());
       for (int i = 0; i < terms.size(); i++) {
         if (i != 0) {
-          docFreqs.append(" ");
           query.append(" ");
         }
 
         Term term = (Term)terms.get(i);
 
-        docFreqs.append(term.text());
-        docFreqs.append("=");
-        docFreqs.append(reader.docFreq(term));
-
         query.append(term.text());
       }
       query.append('\"');
 
       Explanation idfExpl =
-        new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
+        new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
 
       // explain query weight
       Explanation queryExpl = new Explanation();

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/Similarity.java Tue Aug 25 12:50:06 2009
@@ -17,13 +17,16 @@
  * limitations under the License.
  */
 
+
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.util.SmallFloat;
 
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.Collection;
+import java.util.IdentityHashMap;
 import java.util.Iterator;
 
 /** Expert: Scoring API.
@@ -287,8 +290,6 @@
  * @see Searcher#setSimilarity(Similarity)
  */
 public abstract class Similarity implements Serializable {
-  /** The Similarity implementation used by default. */
-  private static Similarity defaultImpl = new DefaultSimilarity();
 
   public static final int NO_DOC_ID_PROVIDED = -1;
 
@@ -478,10 +479,62 @@
    * @param term the term in question
    * @param searcher the document collection being searched
    * @return a score factor for the term
+   * @deprecated see {@link #idfExplain(Term, Searcher)}
    */
   public float idf(Term term, Searcher searcher) throws IOException {
     return idf(searcher.docFreq(term), searcher.maxDoc());
   }
+  
+  /**
+   * Computes a score factor for a simple term and returns an explanation
+   * for that score factor.
+   * 
+   * <p>
+   * The default implementation uses:
+   * 
+   * <pre>
+   * idf(searcher.docFreq(term), searcher.maxDoc());
+   * </pre>
+   * 
+   * Note that {@link Searcher#maxDoc()} is used instead of
+   * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is
+   * proportional to {@link Searcher#docFreq(Term)} , i.e., when one is
+   * inaccurate, so is the other, and in the same direction.
+   * 
+   * @param term the term in question
+   * @param searcher the document collection being searched
+   * @return an IDFExplain object that includes both an idf score factor 
+             and an explanation for the term.
+   * @throws IOException
+   */
+  public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException {
+    if(supportedMethods.overridesTermIDF) {
+      final float idf = idf(term, searcher);
+      return new IDFExplanation() {
+        //@Override
+        public float getIdf() {
+          return idf;
+        }
+        //@Override
+        public String explain() {
+          return "Inexplicable";
+        }
+      };
+    }
+    final int df = searcher.docFreq(term);
+    final int max = searcher.maxDoc();
+    final float idf = idf(df, max);
+    return new IDFExplanation() {
+        //@Override
+        public String explain() {
+          return "idf(docFreq=" + df +
+          ", maxDocs=" + max + ")";
+        }
+        //@Override
+        public float getIdf() {
+          return idf;
+        }};
+   }
 
   /** Computes a score factor for a phrase.
    *
@@ -490,7 +543,8 @@
    *
    * @param terms the terms in the phrase
    * @param searcher the document collection being searched
-   * @return a score factor for the phrase
+   * @return  
+   * @deprecated see {@link #idfExplain(Collection, Searcher)}
    */
   public float idf(Collection terms, Searcher searcher) throws IOException {
     float idf = 0.0f;
@@ -500,6 +554,60 @@
     }
     return idf;
   }
+  
+  /**
+   * Computes a score factor for a phrase.
+   * 
+   * <p>
+   * The default implementation sums the idf factor for
+   * each term in the phrase.
+   * 
+   * @param terms the terms in the phrase
+   * @param searcher the document collection being searched
+   * @return an IDFExplain object that includes both an idf 
+   *         score factor for the phrase and an explanation 
+   *         for each term.
+   * @throws IOException
+   */
+  public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException {
+    if(supportedMethods.overridesCollectionIDF) {
+      final float idf = idf(terms, searcher);
+      return new IDFExplanation() {
+        //@Override
+        public float getIdf() {
+          return idf;
+        }
+        //@Override
+        public String explain() {
+          return "Inexplicable";
+        }
+      };
+    }
+    final int max = searcher.maxDoc();
+    float idf = 0.0f;
+    final StringBuffer exp = new StringBuffer();
+    Iterator i = terms.iterator();
+    while (i.hasNext()) {
+      Term term = (Term)i.next();
+      final int df = searcher.docFreq(term);
+      idf += idf(df, max);
+      exp.append(" ");
+      exp.append(term.text());
+      exp.append("=");
+      exp.append(df);
+    }
+    final float fIdf = idf;
+    return new IDFExplanation() {
+      //@Override
+      public float getIdf() {
+        return fIdf;
+      }
+      //@Override
+      public String explain() {
+        return exp.toString();
+      }
+    };
+  }
 
   /** Computes a score factor based on a term's document frequency (the number
    * of documents which contain the term).  This value is multiplied by the
@@ -577,5 +685,52 @@
     //TODO: When removing the deprecated scorePayload above, set this to return 1
     return scorePayload(fieldName, payload, offset, length);
   }
+  
+  /** @deprecated Remove this when old API is removed! */
+  private final MethodSupport supportedMethods = getSupportedMethods(this.getClass());
+  
+    /** @deprecated Remove this when old API is removed! */
+  private static final class MethodSupport implements Serializable {
+    final boolean overridesCollectionIDF, overridesTermIDF;
+
+    MethodSupport(Class clazz) {
+      overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS);
+      overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS);
+    }
+    
+    private static boolean isMethodOverridden(Class clazz, String name, Class[] params) {
+      try {
+        return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class;
+      } catch (NoSuchMethodException e) {
+        // should not happen
+        throw new RuntimeException(e);
+      }
+    }
+    /** @deprecated Remove this when old API is removed! */
+    private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class};
+    
+    /** @deprecated Remove this when old API is removed! */
+    private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class};
+  }
+  
+  /** @deprecated Remove this when old API is removed! */
+  private static final IdentityHashMap/*<Class<? extends Similarity>,MethodSupport>*/ knownMethodSupport = new IdentityHashMap();
+  
+  /** @deprecated Remove this when old API is removed! */
+  private static MethodSupport getSupportedMethods(Class clazz) {
+    MethodSupport supportedMethods;
+    synchronized(knownMethodSupport) {
+      supportedMethods = (MethodSupport) knownMethodSupport.get(clazz);
+      if (supportedMethods == null) {
+        knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz));
+      }
+    }
+    return supportedMethods;
+  }
+  
+  /** The Similarity implementation used by default. 
+   *  TODO: move back to top when old API is removed! 
+   **/
+  private static Similarity defaultImpl = new DefaultSimilarity();
 
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/TermQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/TermQuery.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/TermQuery.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/TermQuery.java Tue Aug 25 12:50:06 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation.IDFExplanation;
 import org.apache.lucene.util.ToStringUtils;
 
 /** A Query that matches documents containing a term.
@@ -37,11 +38,13 @@
     private float idf;
     private float queryNorm;
     private float queryWeight;
+    private IDFExplanation idfExp;
 
     public TermWeight(Searcher searcher)
       throws IOException {
       this.similarity = getSimilarity(searcher);
-      idf = similarity.idf(term, searcher); // compute idf
+      idfExp = similarity.idfExplain(term, searcher);
+      idf = idfExp.getIdf();
     }
 
     public String toString() { return "weight(" + TermQuery.this + ")"; }
@@ -75,8 +78,7 @@
       ComplexExplanation result = new ComplexExplanation();
       result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
 
-      Explanation expl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
-            ", maxDocs=" + reader.maxDoc() + ")");
+      Explanation expl = new Explanation(idf, idfExp.explain());
 
       // explain query weight
       Explanation queryExpl = new Explanation();

Modified: lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanWeight.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanWeight.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanWeight.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/spans/SpanWeight.java Tue Aug 25 12:50:06 2009
@@ -18,12 +18,11 @@
  */
 
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.search.*;
+import org.apache.lucene.search.Explanation.IDFExplanation;
 
 import java.io.IOException;
 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.Set;
 
 /**
@@ -38,6 +37,7 @@
 
   protected Set terms;
   protected SpanQuery query;
+  private IDFExplanation idfExp;
 
   public SpanWeight(SpanQuery query, Searcher searcher)
     throws IOException {
@@ -45,8 +45,8 @@
     this.query = query;
     terms=new HashSet();
     query.extractTerms(terms);
-
-    idf = this.query.getSimilarity(searcher).idf(terms, searcher);
+    idfExp = similarity.idfExplain(terms, searcher);
+    idf = idfExp.getIdf();
   }
 
   public Query getQuery() { return query; }
@@ -75,21 +75,8 @@
     result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
     String field = ((SpanQuery)getQuery()).getField();
 
-    StringBuffer docFreqs = new StringBuffer();
-    Iterator i = terms.iterator();
-    while (i.hasNext()) {
-      Term term = (Term)i.next();
-      docFreqs.append(term.text());
-      docFreqs.append("=");
-      docFreqs.append(reader.docFreq(term));
-
-      if (i.hasNext()) {
-        docFreqs.append(" ");
-      }
-    }
-
     Explanation idfExpl =
-      new Explanation(idf, "idf(" + field + ": " + docFreqs + ")");
+      new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
 
     // explain query weight
     Explanation queryExpl = new Explanation();

Modified: lucene/java/trunk/src/test/org/apache/lucene/search/TestSimpleExplanations.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestSimpleExplanations.java?rev=807595&r1=807594&r2=807595&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestSimpleExplanations.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestSimpleExplanations.java Tue Aug 25 12:50:06 2009
@@ -17,6 +17,19 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockRAMDirectory;
+
+
 /**
  * TestExplanations subclass focusing on basic query types
  */
@@ -291,4 +304,67 @@
   }
   
   
+  public void testTermQueryMultiSearcherExplain() throws Exception {
+    // creating two directories for indices
+    Directory indexStoreA = new MockRAMDirectory();
+    Directory indexStoreB = new MockRAMDirectory();
+
+    Document lDoc = new Document();
+    lDoc.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
+    Document lDoc2 = new Document();
+    lDoc2.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
+    Document lDoc3 = new Document();
+    lDoc3.add(new Field("handle", "1 2", Field.Store.YES, Field.Index.ANALYZED));
+
+    IndexWriter writerA = new IndexWriter(indexStoreA, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    IndexWriter writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+
+    writerA.addDocument(lDoc);
+    writerA.addDocument(lDoc2);
+    writerA.optimize();
+    writerA.close();
+
+    writerB.addDocument(lDoc3);
+    writerB.close();
+
+    QueryParser parser = new QueryParser("fulltext", new StandardAnalyzer());
+    Query query = parser.parse("handle:1");
+
+    Searcher[] searchers = new Searcher[2];
+    searchers[0] = new IndexSearcher(indexStoreB);
+    searchers[1] = new IndexSearcher(indexStoreA);
+    Searcher mSearcher = new MultiSearcher(searchers);
+    ScoreDoc[] hits = mSearcher.search(query, null, 1000).scoreDocs;
+
+    assertEquals(3, hits.length);
+
+    Explanation explain = mSearcher.explain(query, hits[0].doc);
+    String exp = explain.toString(0);
+    assertTrue(exp, exp.indexOf("maxDocs=3") > -1);
+    assertTrue(exp, exp.indexOf("docFreq=3") > -1);
+    
+    query = parser.parse("handle:\"1 2\"");
+    hits = mSearcher.search(query, null, 1000).scoreDocs;
+
+    assertEquals(3, hits.length);
+
+    explain = mSearcher.explain(query, hits[0].doc);
+    exp = explain.toString(0);
+    assertTrue(exp, exp.indexOf("1=3") > -1);
+    assertTrue(exp, exp.indexOf("2=3") > -1);
+    
+    query = new SpanNearQuery(new SpanQuery[] {
+        new SpanTermQuery(new Term("handle", "1")),
+        new SpanTermQuery(new Term("handle", "2")) }, 0, true);
+    hits = mSearcher.search(query, null, 1000).scoreDocs;
+
+    assertEquals(3, hits.length);
+
+    explain = mSearcher.explain(query, hits[0].doc);
+    exp = explain.toString(0);
+    assertTrue(exp, exp.indexOf("1=3") > -1);
+    assertTrue(exp, exp.indexOf("2=3") > -1);
+    mSearcher.close();
+  }
+  
 }



Mime
View raw message