From commits-return-120537-archive-asf-public=cust-asf.ponee.io@lucene.apache.org Tue Jan 19 08:51:44 2021 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mxout1-he-de.apache.org (mxout1-he-de.apache.org [95.216.194.37]) by mx-eu-01.ponee.io (Postfix) with ESMTPS id 7F09D180663 for ; Tue, 19 Jan 2021 09:51:44 +0100 (CET) Received: from mail.apache.org (mailroute1-lw-us.apache.org [207.244.88.153]) by mxout1-he-de.apache.org (ASF Mail Server at mxout1-he-de.apache.org) with SMTP id 769FC64A8B for ; Tue, 19 Jan 2021 08:51:43 +0000 (UTC) Received: (qmail 3353 invoked by uid 500); 19 Jan 2021 08:51:42 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 3344 invoked by uid 99); 19 Jan 2021 08:51:42 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 19 Jan 2021 08:51:42 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 7A29F81FE6; Tue, 19 Jan 2021 08:51:42 +0000 (UTC) Date: Tue, 19 Jan 2021 08:51:41 +0000 To: "commits@lucene.apache.org" Subject: [lucene-solr] branch master updated: LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <161104630079.10587.17093902025957748103@gitbox.apache.org> From: broustant@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: lucene-solr X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 9f5bdf43b713e7555ea03c7698b67e3240039511 X-Git-Newrev: 227256d951accc1b5c9bb554715ee145bce7cf90 X-Git-Rev: 227256d951accc1b5c9bb554715ee145bce7cf90 X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. broustant pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucene-solr.git The following commit(s) were added to refs/heads/master by this push: new 227256d LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor 227256d is described below commit 227256d951accc1b5c9bb554715ee145bce7cf90 Author: Patrick Marty AuthorDate: Tue Jan 19 09:24:45 2021 +0100 LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor --- lucene/CHANGES.txt | 2 + lucene/MIGRATE.md | 5 ++ .../lucene/search/similarities/BM25Similarity.java | 54 +++++++++++++++------- .../search/similarities/TestBooleanSimilarity.java | 1 - .../search/similarities/TestClassicSimilarity.java | 1 - .../search/similarities/TestSimilarityBase.java | 5 +- .../lucene/luke/models/search/SearchImpl.java | 4 +- .../search/similarity/LegacyBM25Similarity.java | 25 ++++++---- .../search/similarities/BM25SimilarityFactory.java | 24 ++++------ .../similarities/LegacyBM25SimilarityFactory.java | 24 ++++------ 10 files changed, 82 insertions(+), 63 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9dec819..71630e7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -82,6 +82,8 @@ API Changes * LUCENE-9317 LUCENE-9318 LUCENE-9319 LUCENE-9558 LUCENE-9600 : Clean up package name conflicts between modules. See MIGRATE.md for details. (David Ryan, Tomoko Uchida, Uwe Schindler, Dawid Weiss) +* LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant) + Improvements * LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 09cce55..c531735 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -7,6 +7,11 @@ NativeUnixDirectory in the misc module was therefore removed and replaced by DirectIODirectory. To use it, you need a JVM and operating system that supports Direct IO. +## BM25Similarity.setDiscountOverlaps and LegacyBM25Similarity.setDiscountOverlaps methods removed (LUCENE-9646) + +The discount discountOverlaps parameter for both BM25Similarity and LegacyBM25Similarity +is now set by the constructor of those classes. + ## Packages in misc module are renamed (LUCENE-9600) Following package names in misc module are renamed. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 1aa37de..4d48004 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -34,16 +34,19 @@ import org.apache.lucene.util.SmallFloat; public class BM25Similarity extends Similarity { private final float k1; private final float b; + private final boolean discountOverlaps; /** * BM25 with the supplied parameter values. * * @param k1 Controls non-linear term frequency normalization (saturation). * @param b Controls to what degree document length normalizes tf values. + * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero) + * are discounted from the document's length. * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not * within the range {@code [0..1]} */ - public BM25Similarity(float k1, float b) { + public BM25Similarity(float k1, float b, boolean discountOverlaps) { if (Float.isFinite(k1) == false || k1 < 0) { throw new IllegalArgumentException( "illegal k1 value: " + k1 + ", must be a non-negative finite value"); @@ -53,6 +56,19 @@ public class BM25Similarity extends Similarity { } this.k1 = k1; this.b = b; + this.discountOverlaps = discountOverlaps; + } + + /** + * BM25 with the supplied parameter values. + * + * @param k1 Controls non-linear term frequency normalization (saturation). + * @param b Controls to what degree document length normalizes tf values. + * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not + * within the range {@code [0..1]} + */ + public BM25Similarity(float k1, float b) { + this(k1, b, true); } /** @@ -62,9 +78,27 @@ public class BM25Similarity extends Similarity { *
  • {@code k1 = 1.2} *
  • {@code b = 0.75} * + * + * and the supplied parameter value: + * + * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero) + * are discounted from the document's length. + */ + public BM25Similarity(boolean discountOverlaps) { + this(1.2f, 0.75f, discountOverlaps); + } + + /** + * BM25 with these default values: + * + *
      + *
    • {@code k1 = 1.2} + *
    • {@code b = 0.75} + *
    • {@code discountOverlaps = true} + *
    */ public BM25Similarity() { - this(1.2f, 0.75f); + this(1.2f, 0.75f, true); } /** Implemented as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)). */ @@ -83,23 +117,9 @@ public class BM25Similarity extends Similarity { } /** - * True if overlap tokens (tokens with a position of increment of zero) are discounted from the - * document's length. - */ - protected boolean discountOverlaps = true; - - /** - * Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing norm. - * By default this is true, meaning overlap tokens do not count when computing norms. - */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; - } - - /** * Returns true if overlap tokens are discounted from the document's length. * - * @see #setDiscountOverlaps + * @see #BM25Similarity(float, float, boolean) */ public boolean getDiscountOverlaps() { return discountOverlaps; diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java index 3b9a394..16316c7 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java @@ -102,7 +102,6 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase { public void testSameNormsAsBM25() { BooleanSimilarity sim1 = new BooleanSimilarity(); BM25Similarity sim2 = new BM25Similarity(); - sim2.setDiscountOverlaps(true); for (int iter = 0; iter < 100; ++iter) { final int length = TestUtil.nextInt(random(), 1, 100); final int position = random().nextInt(length); diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java index 1020a51..f37a3fe 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java @@ -172,7 +172,6 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase { public void testSameNormsAsBM25() { ClassicSimilarity sim1 = new ClassicSimilarity(); BM25Similarity sim2 = new BM25Similarity(); - sim2.setDiscountOverlaps(true); for (int iter = 0; iter < 100; ++iter) { final int length = TestUtil.nextInt(random(), 1, 1000); final int position = random().nextInt(length); diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 8d9a855..77788f3 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -512,17 +512,16 @@ public class TestSimilarityBase extends LuceneTestCase { // LUCENE-5221 public void testDiscountOverlapsBoost() throws IOException { - BM25Similarity expected = new BM25Similarity(); + BM25Similarity expected = new BM25Similarity(false); SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); - expected.setDiscountOverlaps(false); actual.setDiscountOverlaps(false); FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS); state.setLength(5); state.setNumOverlap(2); assertEquals(expected.computeNorm(state), actual.computeNorm(state)); - expected.setDiscountOverlaps(true); + expected = new BM25Similarity(); actual.setDiscountOverlaps(true); assertEquals(expected.computeNorm(state), actual.computeNorm(state)); } diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java index 86c67aa..bbd038f 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java @@ -398,8 +398,8 @@ public final class SearchImpl extends LukeModel implements Search { tfidf.setDiscountOverlaps(config.isDiscountOverlaps()); similarity = tfidf; } else { - BM25Similarity bm25 = new BM25Similarity(config.getK1(), config.getB()); - bm25.setDiscountOverlaps(config.isDiscountOverlaps()); + BM25Similarity bm25 = + new BM25Similarity(config.getK1(), config.getB(), config.isDiscountOverlaps()); similarity = bm25; } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java b/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java index 13387ea..fe3ea1c 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/search/similarity/LegacyBM25Similarity.java @@ -40,6 +40,7 @@ public final class LegacyBM25Similarity extends Similarity { *
      *
    • {@code k1 = 1.2} *
    • {@code b = 0.75} + *
    • {@code discountOverlaps = true} *
    */ public LegacyBM25Similarity() { @@ -58,6 +59,20 @@ public final class LegacyBM25Similarity extends Similarity { this.bm25Similarity = new BM25Similarity(k1, b); } + /** + * BM25 with the supplied parameter values. + * + * @param k1 Controls non-linear term frequency normalization (saturation). + * @param b Controls to what degree document length normalizes tf values. + * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero) + * are discounted from the document's length. + * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not + * within the range {@code [0..1]} + */ + public LegacyBM25Similarity(float k1, float b, boolean discountOverlaps) { + this.bm25Similarity = new BM25Similarity(k1, b, discountOverlaps); + } + @Override public long computeNorm(FieldInvertState state) { return bm25Similarity.computeNorm(state); @@ -88,17 +103,9 @@ public final class LegacyBM25Similarity extends Similarity { } /** - * Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing norm. - * By default this is true, meaning overlap tokens do not count when computing norms. - */ - public void setDiscountOverlaps(boolean v) { - bm25Similarity.setDiscountOverlaps(v); - } - - /** * Returns true if overlap tokens are discounted from the document's length. * - * @see #setDiscountOverlaps + * @see #LegacyBM25Similarity(float, float, boolean) */ public boolean getDiscountOverlaps() { return bm25Similarity.getDiscountOverlaps(); diff --git a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java index fefe893..2ba51fa 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java @@ -32,33 +32,27 @@ import org.apache.solr.schema.SimilarityFactory; * The default is 1.2 *
  • b (float): Controls to what degree document length normalizes tf values. * The default is 0.75 - * - *

    - * Optional settings: - *

      - *
    • discountOverlaps (bool): Sets - * {@link BM25Similarity#setDiscountOverlaps(boolean)}
    • + *
    • discountOverlaps (bool): True if overlap tokens (tokens with a position of increment of zero) are + * discounted from the document's length. + * The default is true *
    * @lucene.experimental * @since 8.0.0 */ public class BM25SimilarityFactory extends SimilarityFactory { - private boolean discountOverlaps; - private float k1; - private float b; + private BM25Similarity similarity; @Override public void init(SolrParams params) { super.init(params); - discountOverlaps = params.getBool("discountOverlaps", true); - k1 = params.getFloat("k1", 1.2f); - b = params.getFloat("b", 0.75f); + boolean discountOverlaps = params.getBool("discountOverlaps", true); + float k1 = params.getFloat("k1", 1.2f); + float b = params.getFloat("b", 0.75f); + similarity = new BM25Similarity(k1, b, discountOverlaps); } @Override public Similarity getSimilarity() { - BM25Similarity sim = new BM25Similarity(k1, b); - sim.setDiscountOverlaps(discountOverlaps); - return sim; + return similarity; } } diff --git a/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java index 66dd802..170e67b 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/LegacyBM25SimilarityFactory.java @@ -32,33 +32,27 @@ import org.apache.solr.schema.SimilarityFactory; * The default is 1.2 *
  • b (float): Controls to what degree document length normalizes tf values. * The default is 0.75 - * - *

    - * Optional settings: - *

      - *
    • discountOverlaps (bool): Sets - * {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}
    • + *
    • discountOverlaps (bool): True if overlap tokens (tokens with a position of increment of zero) are + * discounted from the document's length. + * The default is true *
    * @lucene.experimental * @since 8.0.0 */ public class LegacyBM25SimilarityFactory extends SimilarityFactory { - private boolean discountOverlaps; - private float k1; - private float b; + private LegacyBM25Similarity similarity; @Override public void init(SolrParams params) { super.init(params); - discountOverlaps = params.getBool("discountOverlaps", true); - k1 = params.getFloat("k1", 1.2f); - b = params.getFloat("b", 0.75f); + boolean discountOverlaps = params.getBool("discountOverlaps", true); + float k1 = params.getFloat("k1", 1.2f); + float b = params.getFloat("b", 0.75f); + similarity = new LegacyBM25Similarity(k1, b, discountOverlaps); } @Override public Similarity getSimilarity() { - LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b); - sim.setDiscountOverlaps(discountOverlaps); - return sim; + return similarity; } }