lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1456599 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/ solr/ solr/core/ so...
Date Thu, 14 Mar 2013 18:29:43 GMT
Author: rmuir
Date: Thu Mar 14 18:29:43 2013
New Revision: 1456599

URL: http://svn.apache.org/r1456599
Log:
LUCENE-4816: expose bm25 parameters in PassageScorer

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1456599&r1=1456598&r2=1456599&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Mar 14 18:29:43 2013
@@ -58,6 +58,9 @@ New Features
   now highlights the entire content as a single Passage.  (Robert
   Muir, Mike McCandless)
 
+* LUCENE-4816: Add additional ctor to PostingsHighlighter PassageScorer
+  to provide bm25 k1,b,avgdl parameters. (Robert Muir)
+
 Optimizations
 
 * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java?rev=1456599&r1=1456598&r2=1456599&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
Thu Mar 14 18:29:43 2013
@@ -30,15 +30,37 @@ public class PassageScorer {
   // TODO: this formula is completely made up. It might not provide relevant snippets!
   
   /** BM25 k1 parameter, controls term frequency normalization */
-  public static final float k1 = 1.2f;
+  final float k1;
   /** BM25 b parameter, controls length normalization. */
-  public static final float b = 0.75f;
+  final float b;
+  /** A pivot used for length normalization. */
+  final float pivot;
   
   /**
-   * A pivot used for length normalization.
-   * The default value is the typical average English sentence length.
+   * Creates PassageScorer with these default values:
+   * <ul>
+   *   <li>{@code k1 = 1.2},
+   *   <li>{@code b = 0.75}.
+   *   <li>{@code pivot = 87}
+   * </ul>
    */
-  public static final float pivot = 87f;
+  public PassageScorer() {
+    // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
+    // 87 is typical average english sentence length.
+    this(1.2f, 0.75f, 87f);
+  }
+  
+  /**
+   * Creates PassageScorer with specified scoring parameters
+   * @param k1 Controls non-linear term frequency normalization (saturation).
+   * @param b Controls to what degree passage length normalizes tf values.
+   * @param pivot Pivot value for length normalization (some rough idea of average sentence
length in characters).
+   */
+  public PassageScorer(float k1, float b, float pivot) {
+    this.k1 = k1;
+    this.b = b;
+    this.pivot = pivot;
+  }
     
   /**
    * Computes term importance, given its in-document statistics.

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java?rev=1456599&r1=1456598&r2=1456599&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
Thu Mar 14 18:29:43 2013
@@ -31,6 +31,7 @@ import org.apache.lucene.document.FieldT
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.Term;
@@ -38,6 +39,7 @@ import org.apache.lucene.search.BooleanC
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
@@ -237,4 +239,79 @@ public class TestPostingsHighlighterRank
       return "Pair [start=" + start + ", end=" + end + "]";
     }
   }
+  
+  /** sets b=0 to disable passage length normalization */
+  public void testCustomB() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test.  This test is a better test but the sentence is
excruiatingly long, " + 
+                        "you have no idea how painful it was for me to type this long sentence
into my IDE.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, 
+                                             BreakIterator.getSentenceInstance(Locale.ROOT),

+                                             new PassageScorer(1.2f, 0, 87), 
+                                             new PassageFormatter());
+    Query query = new TermQuery(new Term("body", "test"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>"));
+    
+    ir.close();
+    dir.close();
+  }
+  
+  /** sets k1=0 for simple coordinate-level match (# of query terms present) */
+  public void testCustomK1() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This has only foo foo. " + 
+                        "On the other hand this sentence contains both foo and bar. " + 
+                        "This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter(10000, 
+                                             BreakIterator.getSentenceInstance(Locale.ROOT),

+                                             new PassageScorer(0, 0.75f, 87), 
+                                             new PassageFormatter());
+    BooleanQuery query = new BooleanQuery();
+    query.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
+    query.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 1);
+    assertEquals(1, snippets.length);
+    assertTrue(snippets[0].startsWith("On the other hand"));
+    
+    ir.close();
+    dir.close();
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1456599&r1=1456598&r2=1456599&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
(original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
Thu Mar 14 18:29:43 2013
@@ -55,6 +55,9 @@ import org.apache.solr.util.plugin.Plugi
  *                      preTag="&amp;lt;em&amp;gt;"
  *                      postTag="&amp;lt;/em&amp;gt;"
  *                      ellipsis="... "
+ *                      k1="1.2"
+ *                      b="0.75"
+ *                      pivot="87"
  *                      maxLength=10000/&gt;
  *   &lt;/searchComponent&gt;
  * </pre>
@@ -78,7 +81,23 @@ public class PostingsSolrHighlighter ext
   public void init(PluginInfo info) {
     Map<String,String> attributes = info.attributes;
     BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
-    PassageScorer scorer = new PassageScorer();
+    
+    // scorer parameters: k1/b/pivot
+    String k1 = attributes.get("k1");
+    if (k1 == null) {
+      k1 = "1.2";
+    }
+    
+    String b = attributes.get("b");
+    if (b == null) {
+      b = "0.75";
+    }
+    
+    String pivot = attributes.get("pivot");
+    if (pivot == null) {
+      pivot = "87";
+    }
+    PassageScorer scorer = new PassageScorer(Float.parseFloat(k1), Float.parseFloat(b), Float.parseFloat(pivot));
     
     // formatter parameters: preTag/postTag/ellipsis
     String preTag = attributes.get("preTag");



Mime
View raw message