lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r687052 - in /lucene/java/trunk/contrib/highlighter/src: java/org/apache/lucene/search/highlight/ test/org/apache/lucene/search/highlight/
Date Tue, 19 Aug 2008 13:16:42 GMT
Author: markrmiller
Date: Tue Aug 19 06:16:41 2008
New Revision: 687052

URL: http://svn.apache.org/viewvc?rev=687052&view=rev
Log:
LUCENE-1355: highlighter can incorrectly produce negative idf when index has deletes

Modified:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java?rev=687052&r1=687051&r2=687052&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
(original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
Tue Aug 19 06:16:41 2008
@@ -66,6 +66,10 @@
 	        try
             {
                 int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
+                // docFreq still counts deleted docs, so it can exceed totalNumDocs
+                if(totalNumDocs < docFreq) {
+                  docFreq = totalNumDocs;
+                }
                 //IDF algorithm taken from DefaultSimilarity class
                 float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
                 terms[i].weight*=idf;

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=687052&r1=687051&r2=687052&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
(original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
Tue Aug 19 06:16:41 2008
@@ -409,7 +409,10 @@
       while (it.hasNext()) {
         WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
         int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
-
+        // docFreq still counts deleted docs, so it can exceed totalNumDocs
+        if(totalNumDocs < docFreq) {
+          docFreq = totalNumDocs;
+        }
         // IDF algorithm taken from DefaultSimilarity class
         float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
         weightedSpanTerm.weight *= idf;

Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=687052&r1=687051&r2=687052&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
(original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
Tue Aug 19 06:16:41 2008
@@ -42,9 +42,14 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FilteredQuery;
@@ -57,12 +62,14 @@
 import org.apache.lucene.search.RangeFilter;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanNotQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
@@ -1245,6 +1252,60 @@
 
     helper.start();
   }
+  
+  private Directory dir = new RAMDirectory();
+  private Analyzer a = new WhitespaceAnalyzer();
+  
+  public void testWeightedTermsWithDeletes() throws IOException, ParseException {
+    makeIndex();
+    deleteDocument();
+    searchIndex();
+  }
+  
+  private Document doc( String f, String v ){
+    Document doc = new Document();
+    doc.add( new Field( f, v, Store.YES, Index.TOKENIZED ) );
+    return doc;
+  }
+  
+  private void makeIndex() throws IOException {
+    IndexWriter writer = new IndexWriter( dir, a, MaxFieldLength.LIMITED );
+    writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
+    writer.addDocument( doc( "t_text1", "more random words for second field del" ) );
+    writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
+    writer.addDocument( doc( "t_text1", "more random words for second field" ) );
+    writer.optimize();
+    writer.close();
+  }
+  
+  private void deleteDocument() throws IOException {
+    IndexWriter writer = new IndexWriter( dir, a, false, MaxFieldLength.LIMITED );
+    writer.deleteDocuments( new Term( "t_text1", "del" ) );
+    // To see negative idf, keep the following line commented out
+    //writer.optimize();
+    writer.close();
+  }
+  
+  private void searchIndex() throws IOException, ParseException {
+    String q = "t_text1:random";
+    QueryParser parser = new QueryParser( "t_text1", a );
+    Query query = parser.parse( q );
+    IndexSearcher searcher = new IndexSearcher( dir );
+    // This scorer can return negative idf -> null fragment
+    Scorer scorer = new QueryScorer( query, searcher.getIndexReader(), "t_text1" );
+    // This scorer doesn't use idf (patch version)
+    //Scorer scorer = new QueryScorer( query, "t_text1" );
+    Highlighter h = new Highlighter( scorer );
+
+    TopDocs hits = searcher.search(query, null, 10);
+    for( int i = 0; i < hits.totalHits; i++ ){
+      Document doc = searcher.doc( hits.scoreDocs[i].doc );
+      String result = h.getBestFragment( a, "t_text1", doc.get( "t_text1" ));
+      System.out.println("result:" +  result);
+      assertEquals("more <B>random</B> words for second field", result);
+    }
+    searcher.close();
+  }
 
   /*
    * 



Mime
View raw message