I'm sorry about this -- I saw this failure locally and thought I had fixed it before pushing, but obviously I had not.  I blame git ;)

Thank you to Christine for stepping in and fixing it!



On Tue, Apr 30, 2019 at 3:33 PM Gus Heck <gus.heck@gmail.com> wrote:
https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/

On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <krisden@apache.org> wrote:
It might be https://issues.apache.org/jira/browse/LUCENE-8756

Kevin Risden


On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <gus.heck@gmail.com> wrote:
I'm seeing precommit failures on master that appear to be from this commit. Also it's not clear from the commit message which issue this belongs to...

[forbidden-apis] Loading classes to check...
[forbidden-apis] Scanning classes for violations...
[forbidden-apis] Forbidden method invocation: java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default locale]
[forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis (TestMoreLikeThis.java:497)
[forbidden-apis] Scanned 239 class file(s) for forbidden API invocations (in 0.08s), 1 error(s).


On Tue, Apr 30, 2019 at 12:16 PM <mikemccand@apache.org> wrote:
This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
Author: Olli Kuonanoja <olli.kuonanoja@nosto.com>
AuthorDate: Mon Apr 8 16:44:30 2019 +0300

    Fix MLT like text with custom frequencies

    When an analyzer with custom term frequencies is used with MLT like
    texts, the custom term frequencies are incorrectly omitted and a fixed
    frequency of 1 is used instead.

    This commit fixes the issue by using `TermFrequencyAttribute` to get
    the term frequencies instead of using fixed 1. Also adds test cases
    for the mentioned issue.
---
 .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
 .../lucene/queries/mlt/TestMoreLikeThis.java       | 70 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
index 61ebe93..7c077e5 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -28,6 +28,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
@@ -824,6 +825,7 @@ public final class MoreLikeThis {
       int tokenCount = 0;
       // for every token
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+      TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);
       ts.reset();
       while (ts.incrementToken()) {
         String word = termAtt.toString();
@@ -838,9 +840,9 @@ public final class MoreLikeThis {
         // increment frequency
         Int cnt = termFreqMap.get(word);
         if (cnt == null) {
-          termFreqMap.put(word, new Int());
+          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
         } else {
-          cnt.x++;
+          cnt.x += tfAtt.getTermFrequency();
         }
       }
       ts.end();
@@ -982,7 +984,11 @@ public final class MoreLikeThis {
     int x;

     Int() {
-      x = 1;
+      this(1);
+    }
+
+    Int(int initialValue) {
+      x = initialValue;
     }
   }
 }
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
index 4a60015..aeec534 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -27,7 +27,12 @@ import java.util.Map;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.LuceneTestCase;

 import static org.hamcrest.core.Is.is;
@@ -427,5 +433,69 @@ public class TestMoreLikeThis extends LuceneTestCase {
       analyzer.close();
     }
   }
+
+  public void testCustomFrequecy() throws IOException {
+    // define an analyzer with delimited term frequency, e.g. "foo|2 bar|3"
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
+        MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
+        return new TokenStreamComponents(tokenizer, addCustomTokenFilter(filt));
+      }
+
+      TokenStream addCustomTokenFilter(TokenStream input) {
+        return new TokenFilter(input) {
+          final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+          final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);
+
+          @Override
+          public boolean incrementToken() throws IOException {
+            if (input.incrementToken()) {
+              final char[] buffer = termAtt.buffer();
+              final int length = termAtt.length();
+              for (int i = 0; i < length; i++) {
+                if (buffer[i] == '|') {
+                  termAtt.setLength(i);
+                  i++;
+                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
+                  return true;
+                }
+              }
+              return true;
+            }
+            return false;
+          }
+        };
+      }
+    };
+
+    mlt.setAnalyzer(analyzer);
+    mlt.setFieldNames(new String[] {"text"});
+    mlt.setBoost(true);
+
+    final double boost10 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|10 release|1")))
+        .clauses()
+        .stream()
+        .map(BooleanClause::getQuery)
+        .map(BoostQuery.class::cast)
+        .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
+        .mapToDouble(BoostQuery::getBoost)
+        .sum();
+
+    final double boost1 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|1 release|1")))
+        .clauses()
+        .stream()
+        .map(BooleanClause::getQuery)
+        .map(BoostQuery.class::cast)
+        .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
+        .mapToDouble(BoostQuery::getBoost)
+        .sum();
+
+    // mlt should use the custom frequencies provided by the analyzer so "lucene|10" should be boosted more than "lucene|1"
+    assertTrue(String.format("%s should be grater than %s", boost10, boost1), boost10 > boost1);
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }



--


--