lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From da...@apache.org
Subject [04/36] lucene-solr:jira/http2: SOLR-12581: the JSON Facet 'relatedness()' aggregate function now supports a 'min_popularity' option using the extended type:func syntax
Date Tue, 31 Jul 2018 02:32:20 GMT
SOLR-12581: the JSON Facet 'relatedness()' aggregate function now supports a 'min_popularity'
option using the extended type:func syntax


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/71c0bddd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/71c0bddd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/71c0bddd

Branch: refs/heads/jira/http2
Commit: 71c0bddd149b7c0364fbba8d31494dcd9f57f1ef
Parents: 528e8bc
Author: Chris Hostetter <hossman@apache.org>
Authored: Wed Jul 25 10:06:30 2018 -0700
Committer: Chris Hostetter <hossman@apache.org>
Committed: Wed Jul 25 10:06:30 2018 -0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   3 +
 .../apache/solr/search/ValueSourceParser.java   |   4 +-
 .../solr/search/facet/RelatednessAgg.java       | 109 ++++++++++++++-----
 .../search/facet/TestJsonFacetRefinement.java   |  36 +++++-
 .../solr/search/facet/TestJsonFacets.java       |  54 +++++++++
 solr/solr-ref-guide/src/json-facet-api.adoc     |  20 +++-
 6 files changed, 191 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index efcb8c4..b35ac22 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -117,6 +117,9 @@ New Features
 * SOLR-12567: JSON Facet "functions" now support an extended "type:func" syntax, similar
to other types
   of facets.  This also allows additional local params to be specified for if the aggregation
function
   can take advantage of them.  (hossman)
+  
+* SOLR-12581: the JSON Facet 'relatedness()' aggregate function now supports a 'min_popularity'
option
+  using the extended type:func syntax (hossman)
 
 Bug Fixes
 ----------------------

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
index b7c6815..8d84642 100644
--- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
@@ -1047,7 +1047,9 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin
{
         // if only one arg, assume it's the foreground
         // (background is the one that will most commonly just be "*:*")
         // see notes in RelatednessAgg constructor about why we don't do this yet
-        return new RelatednessAgg(fp.parseNestedQuery(), fp.parseNestedQuery());
+        RelatednessAgg agg = new RelatednessAgg(fp.parseNestedQuery(), fp.parseNestedQuery());
+        agg.setOpts(fp);
+        return agg;
       }
     });
     

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
index d5a8c74..662496f 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java
@@ -30,9 +30,12 @@ import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.search.Query;
 
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.search.DocSet;
+import org.apache.solr.search.QParser;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -62,6 +65,7 @@ public class RelatednessAgg extends AggValueSource {
   
   final protected Query fgQ;
   final protected Query bgQ;
+  protected double min_pop = 0.0D;
   
   public static final String NAME = RELATEDNESS;
   public RelatednessAgg(Query fgQ, Query bgQ) {
@@ -80,10 +84,20 @@ public class RelatednessAgg extends AggValueSource {
     }
   }
 
+  public void setOpts(QParser parser) {
+    final boolean isShard = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
+    SolrParams opts = parser.getLocalParams();
+    if (null != opts) {
+      if (!isShard) { // ignore min_pop if this is a shard request
+        this.min_pop = opts.getDouble("min_popularity", 0.0D);
+      }
+    }
+  }
+  
   @Override
   public String description() {
     // TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
-    return name +"(" + fgQ + "," + bgQ + ")";
+    return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop+")";
   }
   
   @Override
@@ -92,12 +106,14 @@ public class RelatednessAgg extends AggValueSource {
       return false;
     }
     RelatednessAgg that = (RelatednessAgg) o;
-    return Objects.equals(fgQ, that.fgQ) && Objects.equals(bgQ, that.bgQ);
+    return Objects.equals(fgQ, that.fgQ)
+      && Objects.equals(bgQ, that.bgQ)
+      && Objects.equals(min_pop, that.min_pop);
   }
   
   @Override
   public int hashCode() {
-    return Objects.hash(getClass(), fgQ, bgQ);
+    return Objects.hash(getClass(), fgQ, bgQ, min_pop);
   }
 
   @Override
@@ -139,23 +155,25 @@ public class RelatednessAgg extends AggValueSource {
     
     DocSet fgSet = fcontext.searcher.getDocSet(fgFilters);
     DocSet bgSet = fcontext.searcher.getDocSet(bgQ);
-    return new SKGSlotAcc(fcontext, numSlots, fgSet, bgSet);
+    return new SKGSlotAcc(this, fcontext, numSlots, fgSet, bgSet);
   }
 
   @Override
   public FacetMerger createFacetMerger(Object prototype) {
-    return new Merger();
+    return new Merger(this);
   }
   
   private static final class SKGSlotAcc extends SlotAcc {
+    private final RelatednessAgg agg;
     private BucketData[] slotvalues;
     private final DocSet fgSet;
     private final DocSet bgSet;
     private final long fgSize;
     private final long bgSize;
-    public SKGSlotAcc(final FacetContext fcontext, final int numSlots,
+    public SKGSlotAcc(final RelatednessAgg agg, final FacetContext fcontext, final int numSlots,
                       final DocSet fgSet, final DocSet bgSet) throws IOException {
       super(fcontext);
+      this.agg = agg;
       this.fgSet = fgSet;
       this.bgSet = bgSet;
       // cache the set sizes for frequent re-use on every slot
@@ -164,7 +182,6 @@ public class RelatednessAgg extends AggValueSource {
       this.slotvalues = new BucketData[numSlots];
       reset();
     }
-
     private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws
IOException {
       
       assert null != slotContext;
@@ -181,7 +198,7 @@ public class RelatednessAgg extends AggValueSource {
       // ...and in which case we should just use the current base
       final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ);
 
-      final BucketData slotVal = new BucketData();
+      final BucketData slotVal = new BucketData(agg);
       slotVal.incSizes(fgSize, bgSize);
       slotVal.incCounts(fgSet.intersectionSize(slotSet),
                         bgSet.intersectionSize(slotSet));
@@ -232,7 +249,7 @@ public class RelatednessAgg extends AggValueSource {
       if (null == slotVal) {
         // since we haven't been told about any docs for this slot, use a slot w/no counts,
         // just the known fg/bg sizes. (this is most likely a refinement request for a bucket
we dont have)
-        slotVal = new BucketData();
+        slotVal = new BucketData(agg);
         slotVal.incSizes(fgSize, bgSize);
       }
 
@@ -263,15 +280,31 @@ public class RelatednessAgg extends AggValueSource {
    * @see Merger
    */
   private static final class BucketData implements Comparable<BucketData> {
-    
+    private RelatednessAgg agg;
     private long fg_size = 0;
     private long bg_size = 0;
     private long fg_count = 0;
     private long bg_count = 0;
+    
+    /** 
+     * NaN indicates that <b>all</b> derived values need to be (re)-computed
+     * @see #computeDerivedValues
+     * @see #getRelatedness
+     */
     private double relatedness = Double.NaN;
+    /** 
+     * @see #computeDerivedValues 
+     * @see #getForegroundPopularity
+     */
+    private double fg_pop;
+    /** 
+     * @see #computeDerivedValues
+     * @see #getBackgroundPopularity
+     */
+    private double bg_pop;
     
-    public BucketData() {
-      /* No-Op */
+    public BucketData(final RelatednessAgg agg) {
+      this.agg = agg;
     }
 
     /** 
@@ -295,7 +328,7 @@ public class RelatednessAgg extends AggValueSource {
     
     @Override
     public int hashCode() {
-      return Objects.hash(this.getClass(), fg_count, bg_count, fg_size, bg_size);
+      return Objects.hash(this.getClass(), fg_count, bg_count, fg_size, bg_size, agg);
     }
     
     @Override
@@ -308,24 +341,45 @@ public class RelatednessAgg extends AggValueSource {
       return Objects.equals(this.fg_count, that.fg_count)
         && Objects.equals(this.bg_count, that.bg_count)
         && Objects.equals(this.fg_size, that.fg_size)
-        && Objects.equals(this.bg_size, that.bg_size);
+        && Objects.equals(this.bg_size, that.bg_size)
+        && Objects.equals(this.agg, that.agg);
     }
 
     /**
-     * Computes (and caches) the derived relatedness score for this bucket
+     * Computes (and caches) the derived relatedness &amp; popularity scores for this
bucket if needed
      */
-    private double getRelatedness() {
-      if (Double.isNaN(this.relatedness)) {
-        this.relatedness = computeRelatedness(this.fg_count, this.fg_size,
-                                              this.bg_count, this.bg_size);
-        // TODO: add support for a "min_pop" option...
-        //
+    private void computeDerivedValues() {
+      if (! Double.isNaN(this.relatedness)) {
+        return; // values already computed;
+      }
+
+      this.fg_pop = roundTo5Digits((double) fg_count / bg_size); // yes, BACKGROUND size
is intentional
+      this.bg_pop = roundTo5Digits((double) bg_count / bg_size);
+      
+      if (0.0D < agg.min_pop) {
         // if min_pop is configured, and either (fg|bg) popularity is lower then that value
-        // then "this.relatedness=-Infinity" so it sorts at the bottom
-        // this logic be ignored on isShard requests -- similar to how shards ignore 'mincount'
+        // then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
+        if (fg_pop < agg.min_pop || bg_pop < agg.min_pop) {
+          this.relatedness = Double.NEGATIVE_INFINITY;
+          return;
+        }
       }
+      
+      this.relatedness = computeRelatedness(this.fg_count, this.fg_size,
+                                            this.bg_count, this.bg_size);
+    }
+    private double getRelatedness() {
+      computeDerivedValues();
       return this.relatedness;
     }
+    private double getForegroundPopularity() {
+      computeDerivedValues();
+      return this.fg_pop;
+    }
+    private double getBackgroundPopularity() {
+      computeDerivedValues();
+      return this.bg_pop;
+    }
     
     @Override
     public int compareTo(BucketData that) {
@@ -364,8 +418,8 @@ public class RelatednessAgg extends AggValueSource {
         // there's no need to bother computing these when returning results *to* a shard
coordinator
         // only useful to external clients 
         result.add(RELATEDNESS, this.getRelatedness());
-        result.add(FG_POP, roundTo5Digits((double) fg_count / bg_size)); // yes, BACKGROUND
size is intentional
-        result.add(BG_POP, roundTo5Digits((double) bg_count / bg_size));
+        result.add(FG_POP, this.getForegroundPopularity());
+        result.add(BG_POP, this.getBackgroundPopularity());
       }
       
       return result;
@@ -376,7 +430,10 @@ public class RelatednessAgg extends AggValueSource {
    * Merges in the per shard {@link BucketData} output into a unified {@link BucketData}
    */
   private static final class Merger extends FacetSortableMerger {
-    private final BucketData mergedData = new BucketData();
+    private final BucketData mergedData;
+    public Merger(final RelatednessAgg agg) {
+      this.mergedData = new BucketData(agg);
+    }
     
     @Override
     public void merge(Object facetResult, Context mcontext) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
index ed4b8bf..0a931bf 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java
@@ -879,10 +879,10 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
     initServers();
     Client client = servers.getClient(random().nextInt());
     client.queryDefaults().set("shards", servers.getShards(), "debugQuery", Boolean.toString(random().nextBoolean()));
-
+    
     List<SolrClient> clients = client.getClientProvider().all();
     assertTrue(clients.size() >= 3);
-
+    
     client.deleteByQuery("*:*", null);
 
     String cat_s = p.get("cat_s");
@@ -926,7 +926,7 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
       } else {
         p.set("terms", "method:" + method+", ");
       }
-
+      
 
       client.testJQ(params(p, "q", "*:*",
           "json.facet", "{" +
@@ -1044,14 +1044,38 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
       );
 
       // test that SKG stat reflects merged refinement
+      // results shouldn't care if we use the short or long syntax, or if we have a low min_pop
+      for (String s : Arrays.asList("'relatedness($fore,$back)'",
+                                    "{ type:func, func:'relatedness($fore,$back)' }",
+                                    "{ type:func, func:'relatedness($fore,$back)', min_popularity:0.2
}")) {
+        client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0
TO 100]",
+                             "json.facet", "{"
+                             + "   cat0:{ ${terms} type:terms, field: ${cat_s}, "
+                             + "          sort:'count desc', limit:1, overrequest:0, refine:true,
"
+                             + "          facet:{ s:"+s+"} } }")
+                      , "facets=={ count:8, cat0:{ buckets:[ "
+                      + "   { val:A, count:4, "
+                      + "     s : { relatedness: 0.00496, "
+                      //+ "           foreground_count: 3, "
+                      //+ "           foreground_size: 5, "
+                      //+ "           background_count: 2, "
+                      //+ "           background_size: 4, "
+                      + "           foreground_popularity: 0.75, "
+                      + "           background_popularity: 0.5, "
+                      + "         } } ] }" +
+                      "}"
+                      );
+      }
+      // same query with a high min_pop should result in a -Infinity relatedness score
       client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0
TO 100]",
                            "json.facet", "{"
                            + "   cat0:{ ${terms} type:terms, field: ${cat_s}, "
                            + "          sort:'count desc', limit:1, overrequest:0, refine:true,
"
-                           + "          facet:{ s:'relatedness($fore,$back)'} } }")
+                           + "          facet:{ s:{ type:func, func:'relatedness($fore,$back)',
"
+                           + "                      min_popularity:0.6 } } } }")
                     , "facets=={ count:8, cat0:{ buckets:[ "
                     + "   { val:A, count:4, "
-                    + "     s : { relatedness: 0.00496, "
+                    + "     s : { relatedness: '-Infinity', "
                     //+ "           foreground_count: 3, "
                     //+ "           foreground_size: 5, "
                     //+ "           background_count: 2, "
@@ -1061,7 +1085,7 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
                     + "         } } ] }" +
                     "}"
                     );
-      
+
       // SKG under nested facet where some terms only exist on one shard
       { 
         // sub-bucket order should change as sort direction changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index fa6a04e..0035b16 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -519,7 +519,61 @@ public class TestJsonFacets extends SolrTestCaseHS {
              + "                       background_size: 6, "
              + "            } }, "
              + "   ] } } ] } } ");
+
     
+    // SKG w/min_pop (NOTE: incredibly contrived and not-useful fore/back for testing min_pop
w/shard sorting)
+    //
+    // we'll re-use these params in 2 requests, one will simulate a shard request
+    final SolrParams minPopSKG = params
+      ("q", "cat_s:[* TO *]", "rows", "0", "fore", "num_i:[0 TO 1000]", "back", "cat_s:B",
"json.facet"
+       , "{x: { type: terms, field: 'cat_s', sort: 'skg desc', "
+       + "      facet: { skg: { type:func, func:'relatedness($fore,$back)', "
+       + "                      min_popularity: 0.001 }" 
+       + "             } } }");
+
+    // plain old request
+    assertJQ(req(minPopSKG)
+             , "facets=={count:5, x:{ buckets:["
+             + "   { val:'B', count:3, "
+             + "     skg : { relatedness: -1.0, "
+             //+ "             foreground_count: 1, "
+             //+ "             foreground_size: 3, "
+             //+ "             background_count: 3, "
+             //+ "             background_size: 3, "
+             + "             foreground_popularity: 0.33333," 
+             + "             background_popularity: 1.0," 
+             + "   } }, "
+             + "   { val:'A', count:2, "
+             + "     skg : { relatedness:'-Infinity', " // bg_pop is below min_pop (otherwise
1.0)
+             //+ "             foreground_count: 2, "
+             //+ "             foreground_size: 3, "
+             //+ "             background_count: 0, "
+             //+ "             background_size: 3, "
+             + "             foreground_popularity: 0.66667,"
+             + "             background_popularity: 0.0,"
+             + "   } } ] } } ");
+
+    // same request, but with whitebox params testing isShard
+    // to verify the raw counts/sizes and that per-shard sorting doesn't pre-emptively sort
"A" to the bottom
+    assertJQ(req(minPopSKG,
+                 // fake an initial shard request
+                 "distrib", "false", "isShard", "true", "_facet_", "{}",
+                 "shards.purpose", ""+FacetModule.PURPOSE_GET_JSON_FACETS)
+             , "facets=={count:5, x:{ buckets:["
+             + "   { val:'A', count:2, "
+             + "     skg : { " 
+             + "             foreground_count: 2, "
+             + "             foreground_size: 3, "
+             + "             background_count: 0, "
+             + "             background_size: 3, "
+             + "   } }, "
+             + "   { val:'B', count:3, "
+             + "     skg : { "
+             + "             foreground_count: 1, "
+             + "             foreground_size: 3, "
+             + "             background_count: 3, "
+             + "             background_size: 3, "
+             + "   } } ] } }");
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71c0bddd/solr/solr-ref-guide/src/json-facet-api.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/json-facet-api.adoc b/solr/solr-ref-guide/src/json-facet-api.adoc
index b9a8a09..008b1a2 100644
--- a/solr/solr-ref-guide/src/json-facet-api.adoc
+++ b/solr/solr-ref-guide/src/json-facet-api.adoc
@@ -387,7 +387,7 @@ The most common way of requesting an aggregation function is as a simple
contain
 }
 ----
 
-An expanded form allows for <<local-parameters-in-queries.adoc#local-parameters-in-queries,Local
Parameters>> to be specified.  These may be used explicitly by some custom aggregations,
but can more commonly be used as parameter references to make aggregation expressions more
readable, with out needing to use (global) request parameters:
+An expanded form allows for <<local-parameters-in-queries.adoc#local-parameters-in-queries,Local
Parameters>> to be specified.  These may be used explicitly by some specialized aggregations
such as `<<json-facet-api.adoc#relatedness-options,relatedness()>>`, but can also
be used as parameter references to make aggregation expressions more readable, without needing
to use (global) request parameters:
 
 [source,javascript]
 ----
@@ -739,6 +739,23 @@ The `relatedness(...)` function is used to "score" these relationships,
relative
 
 Unlike most aggregation functions, the `relatedness(...)` function is aware of whether and
how it's used in <<nested-facets,Nested Facets>>.  It evaluates the query defining
the current bucket _independently_ from it's parent/ancestor buckets, and intersects those
documents with a "Foreground Set" defined by the foreground query _combined with the ancestor
buckets_.  The result is then compared to a similar intersection done against the "Background
Set" (defined exclusively by background query) to see if there is a positive, or negative,
correlation between the current bucket and the Foreground Set, relative to the Background
Set.
 
+NOTE: While it's very common to define the Background Set as `\*:*`, or some other super-set
of the Foreground Query, it is not strictly required.  The `relatedness(...)` function can
be used to compare the statistical relatedness of sets of documents to orthogonal foreground/background
queries.
+
+[[relatedness-options]]
+=== `relatedness()` Options
+
+When using the extended `type:func` syntax for specifying a `relatedness()` aggregation,
an optional `min_popularity` (float) option can be used to specify a lower bound on the `foreground_popularity`
and `background_popularity` values, that must be met in order for the `relatedness` score
to be valid -- If this `min_popularity` is not met, then the `relatedness` score will be `-Infinity`.
+
+[source,javascript]
+----
+{ "type": "func",
+  "func": "relatedness($fore,$back)",
+  "min_popularity": 0.001,
+}
+----
+
+This can be particularly useful when using a descending sort on `relatedness()` with foreground
and background queries that are disjoint, to ensure the "top buckets" are all relevant to
both sets.
+
 === Semantic Knowledge Graph Example
 
 .Sample Documents
@@ -843,7 +860,6 @@ curl -sS -X POST http://localhost:8983/solr/gettingstarted/query -d 'rows=0&q=*:
 <6> The number documents matching `age:[35 TO *]` _and_ `hobbies:golf` _and_ `state:AZ`
is 18.75% of the total number of documents in the Background Set
 <7> 50% of the documents in the Background Set match `state:AZ`
 
-NOTE: While it's very common to define the Background Set as `\*:*`, or some other super-set
of the Foreground Query, it is not strictly required.  The `relatedness(...)` function can
be used to compare the statistical relatedness of sets of documents to orthogonal foreground/background
queries.
 
 [[References]]
 == References


Mime
View raw message