lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From broust...@apache.org
Subject [lucene-solr] branch master updated: LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor
Date Tue, 19 Jan 2021 08:51:41 GMT
This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 227256d  LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor
227256d is described below

commit 227256d951accc1b5c9bb554715ee145bce7cf90
Author: Patrick Marty <marty.patrick@gmail.com>
AuthorDate: Tue Jan 19 09:24:45 2021 +0100

    LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor
---
 lucene/CHANGES.txt                                 |  2 +
 lucene/MIGRATE.md                                  |  5 ++
 .../lucene/search/similarities/BM25Similarity.java | 54 +++++++++++++++-------
 .../search/similarities/TestBooleanSimilarity.java |  1 -
 .../search/similarities/TestClassicSimilarity.java |  1 -
 .../search/similarities/TestSimilarityBase.java    |  5 +-
 .../lucene/luke/models/search/SearchImpl.java      |  4 +-
 .../search/similarity/LegacyBM25Similarity.java    | 25 ++++++----
 .../search/similarities/BM25SimilarityFactory.java | 24 ++++------
 .../similarities/LegacyBM25SimilarityFactory.java  | 24 ++++------
 10 files changed, 82 insertions(+), 63 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9dec819..71630e7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -82,6 +82,8 @@ API Changes
 * LUCENE-9317 LUCENE-9318 LUCENE-9319 LUCENE-9558 LUCENE-9600 : Clean up package name conflicts
   between modules. See MIGRATE.md for details. (David Ryan, Tomoko Uchida, Uwe Schindler,
Dawid Weiss)
 
+* LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via
Bruno Roustant)
+
 Improvements
 
 * LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming
of all-caps words
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
index 09cce55..c531735 100644
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@@ -7,6 +7,11 @@ NativeUnixDirectory in the misc module was therefore removed and replaced
 by DirectIODirectory. To use it, you need a JVM and operating system that
 supports Direct IO.
 
+## BM25Similarity.setDiscountOverlaps and LegacyBM25Similarity.setDiscountOverlaps methods
removed (LUCENE-9646)
+
+The discount discountOverlaps parameter for both BM25Similarity and LegacyBM25Similarity
+is now set by the constructor of those classes.
+
 ## Packages in misc module are renamed (LUCENE-9600)
 
 Following package names in misc module are renamed.
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index 1aa37de..4d48004 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -34,16 +34,19 @@ import org.apache.lucene.util.SmallFloat;
 public class BM25Similarity extends Similarity {
   private final float k1;
   private final float b;
+  private final boolean discountOverlaps;
 
   /**
    * BM25 with the supplied parameter values.
    *
    * @param k1 Controls non-linear term frequency normalization (saturation).
    * @param b Controls to what degree document length normalizes tf values.
+   * @param discountOverlaps True if overlap tokens (tokens with a position of increment
of zero)
+   *     are discounted from the document's length.
    * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code
b} is not
    *     within the range {@code [0..1]}
    */
-  public BM25Similarity(float k1, float b) {
+  public BM25Similarity(float k1, float b, boolean discountOverlaps) {
     if (Float.isFinite(k1) == false || k1 < 0) {
       throw new IllegalArgumentException(
           "illegal k1 value: " + k1 + ", must be a non-negative finite value");
@@ -53,6 +56,19 @@ public class BM25Similarity extends Similarity {
     }
     this.k1 = k1;
     this.b = b;
+    this.discountOverlaps = discountOverlaps;
+  }
+
+  /**
+   * BM25 with the supplied parameter values.
+   *
+   * @param k1 Controls non-linear term frequency normalization (saturation).
+   * @param b Controls to what degree document length normalizes tf values.
+   * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code
b} is not
+   *     within the range {@code [0..1]}
+   */
+  public BM25Similarity(float k1, float b) {
+    this(k1, b, true);
   }
 
   /**
@@ -62,9 +78,27 @@ public class BM25Similarity extends Similarity {
    *   <li>{@code k1 = 1.2}
    *   <li>{@code b = 0.75}
    * </ul>
+   *
+   * and the supplied parameter value:
+   *
+   * @param discountOverlaps True if overlap tokens (tokens with a position of increment
of zero)
+   *     are discounted from the document's length.
+   */
+  public BM25Similarity(boolean discountOverlaps) {
+    this(1.2f, 0.75f, discountOverlaps);
+  }
+
+  /**
+   * BM25 with these default values:
+   *
+   * <ul>
+   *   <li>{@code k1 = 1.2}
+   *   <li>{@code b = 0.75}
+   *   <li>{@code discountOverlaps = true}
+   * </ul>
    */
   public BM25Similarity() {
-    this(1.2f, 0.75f);
+    this(1.2f, 0.75f, true);
   }
 
   /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>.
*/
@@ -83,23 +117,9 @@ public class BM25Similarity extends Similarity {
   }
 
   /**
-   * True if overlap tokens (tokens with a position of increment of zero) are discounted
from the
-   * document's length.
-   */
-  protected boolean discountOverlaps = true;
-
-  /**
-   * Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing
norm.
-   * By default this is true, meaning overlap tokens do not count when computing norms.
-   */
-  public void setDiscountOverlaps(boolean v) {
-    discountOverlaps = v;
-  }
-
-  /**
    * Returns true if overlap tokens are discounted from the document's length.
    *
-   * @see #setDiscountOverlaps
+   * @see #BM25Similarity(float, float, boolean)
    */
   public boolean getDiscountOverlaps() {
     return discountOverlaps;
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
index 3b9a394..16316c7 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
@@ -102,7 +102,6 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase {
   public void testSameNormsAsBM25() {
     BooleanSimilarity sim1 = new BooleanSimilarity();
     BM25Similarity sim2 = new BM25Similarity();
-    sim2.setDiscountOverlaps(true);
     for (int iter = 0; iter < 100; ++iter) {
       final int length = TestUtil.nextInt(random(), 1, 100);
       final int position = random().nextInt(length);
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
index 1020a51..f37a3fe 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
@@ -172,7 +172,6 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
   public void testSameNormsAsBM25() {
     ClassicSimilarity sim1 = new ClassicSimilarity();
     BM25Similarity sim2 = new BM25Similarity();
-    sim2.setDiscountOverlaps(true);
     for (int iter = 0; iter < 100; ++iter) {
       final int length = TestUtil.nextInt(random(), 1, 1000);
       final int position = random().nextInt(length);
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
index 8d9a855..77788f3 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@@ -512,17 +512,16 @@ public class TestSimilarityBase extends LuceneTestCase {
 
   // LUCENE-5221
   public void testDiscountOverlapsBoost() throws IOException {
-    BM25Similarity expected = new BM25Similarity();
+    BM25Similarity expected = new BM25Similarity(false);
     SimilarityBase actual =
         new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
-    expected.setDiscountOverlaps(false);
     actual.setDiscountOverlaps(false);
     FieldInvertState state =
         new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
     state.setLength(5);
     state.setNumOverlap(2);
     assertEquals(expected.computeNorm(state), actual.computeNorm(state));
-    expected.setDiscountOverlaps(true);
+    expected = new BM25Similarity();
     actual.setDiscountOverlaps(true);
     assertEquals(expected.computeNorm(state), actual.computeNorm(state));
   }
diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
index 86c67aa..bbd038f 100644
--- a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
+++ b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
@@ -398,8 +398,8 @@ public final class SearchImpl extends LukeModel implements Search {
       tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
       similarity = tfidf;
     } else {
-      BM25Similarity bm25 = new BM25Similarity(config.getK1(), config.getB());
-      bm25.setDiscountOverlaps(config.isDiscountOverlaps());
+      BM25Similarity bm25 =
+          new BM25Similarity(config.getK1(), config.getB(), config.isDiscountOverlaps());
       similarity = bm25;
     }
 
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java
b/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java
index 13387ea..fe3ea1c 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java
@@ -40,6 +40,7 @@ public final class LegacyBM25Similarity extends Similarity {
    * <ul>
    *   <li>{@code k1 = 1.2}
    *   <li>{@code b = 0.75}
+   *   <li>{@code discountOverlaps = true}
    * </ul>
    */
   public LegacyBM25Similarity() {
@@ -58,6 +59,20 @@ public final class LegacyBM25Similarity extends Similarity {
     this.bm25Similarity = new BM25Similarity(k1, b);
   }
 
+  /**
+   * BM25 with the supplied parameter values.
+   *
+   * @param k1 Controls non-linear term frequency normalization (saturation).
+   * @param b Controls to what degree document length normalizes tf values.
+   * @param discountOverlaps True if overlap tokens (tokens with a position of increment
of zero)
+   *     are discounted from the document's length.
+   * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code
b} is not
+   *     within the range {@code [0..1]}
+   */
+  public LegacyBM25Similarity(float k1, float b, boolean discountOverlaps) {
+    this.bm25Similarity = new BM25Similarity(k1, b, discountOverlaps);
+  }
+
   @Override
   public long computeNorm(FieldInvertState state) {
     return bm25Similarity.computeNorm(state);
@@ -88,17 +103,9 @@ public final class LegacyBM25Similarity extends Similarity {
   }
 
   /**
-   * Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing
norm.
-   * By default this is true, meaning overlap tokens do not count when computing norms.
-   */
-  public void setDiscountOverlaps(boolean v) {
-    bm25Similarity.setDiscountOverlaps(v);
-  }
-
-  /**
    * Returns true if overlap tokens are discounted from the document's length.
    *
-   * @see #setDiscountOverlaps
+   * @see #LegacyBM25Similarity(float, float, boolean)
    */
   public boolean getDiscountOverlaps() {
     return bm25Similarity.getDiscountOverlaps();
diff --git a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java
b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java
index fefe893..2ba51fa 100644
--- a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java
+++ b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java
@@ -32,33 +32,27 @@ import org.apache.solr.schema.SimilarityFactory;
  *                   The default is <code>1.2</code>
  *   <li>b (float): Controls to what degree document length normalizes tf values.
  *                  The default is <code>0.75</code>
- * </ul>
- * <p>
- * Optional settings:
- * <ul>
- *   <li>discountOverlaps (bool): Sets
- *       {@link BM25Similarity#setDiscountOverlaps(boolean)}</li>
+ *   <li>discountOverlaps (bool): True if overlap tokens (tokens with a position of
increment of zero) are
+ *                                discounted from the document's length.
+ *                                The default is <code>true</code>
  * </ul>
  * @lucene.experimental
  * @since 8.0.0
  */
 public class BM25SimilarityFactory extends SimilarityFactory {
-  private boolean discountOverlaps;
-  private float k1;
-  private float b;
+  private BM25Similarity similarity;
 
   @Override
   public void init(SolrParams params) {
     super.init(params);
-    discountOverlaps = params.getBool("discountOverlaps", true);
-    k1 = params.getFloat("k1", 1.2f);
-    b = params.getFloat("b", 0.75f);
+    boolean discountOverlaps = params.getBool("discountOverlaps", true);
+    float k1 = params.getFloat("k1", 1.2f);
+    float b = params.getFloat("b", 0.75f);
+    similarity = new BM25Similarity(k1, b, discountOverlaps);
   }
 
   @Override
   public Similarity getSimilarity() {
-    BM25Similarity sim = new BM25Similarity(k1, b);
-    sim.setDiscountOverlaps(discountOverlaps);
-    return sim;
+    return similarity;
   }
 }
diff --git a/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java
b/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java
index 66dd802..170e67b 100644
--- a/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java
+++ b/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java
@@ -32,33 +32,27 @@ import org.apache.solr.schema.SimilarityFactory;
  *                   The default is <code>1.2</code>
  *   <li>b (float): Controls to what degree document length normalizes tf values.
  *                  The default is <code>0.75</code>
- * </ul>
- * <p>
- * Optional settings:
- * <ul>
- *   <li>discountOverlaps (bool): Sets
- *       {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}</li>
+ *   <li>discountOverlaps (bool): True if overlap tokens (tokens with a position of
increment of zero) are
+ *                                discounted from the document's length.
+ *                                The default is <code>true</code>
  * </ul>
  * @lucene.experimental
  * @since 8.0.0
  */
 public class LegacyBM25SimilarityFactory extends SimilarityFactory {
-  private boolean discountOverlaps;
-  private float k1;
-  private float b;
+    private LegacyBM25Similarity similarity;
 
   @Override
   public void init(SolrParams params) {
     super.init(params);
-    discountOverlaps = params.getBool("discountOverlaps", true);
-    k1 = params.getFloat("k1", 1.2f);
-    b = params.getFloat("b", 0.75f);
+    boolean discountOverlaps = params.getBool("discountOverlaps", true);
+    float k1 = params.getFloat("k1", 1.2f);
+    float b = params.getFloat("b", 0.75f);
+    similarity = new LegacyBM25Similarity(k1, b, discountOverlaps);
   }
 
   @Override
   public Similarity getSimilarity() {
-    LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b);
-    sim.setDiscountOverlaps(discountOverlaps);
-    return sim;
+    return similarity;
   }
 }


Mime
View raw message