lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cpoersc...@apache.org
Subject [8/8] lucene-solr:master: SOLR-8542: Adds Solr Learning to Rank (LTR) plugin for reranking results with machine learning models. (Michael Nilsson, Diego Ceccarelli, Joshua Pantony, Jon Dorando, Naveen Santhapuri, Alessandro Benedetti, David Grohmann, Chr
Date Tue, 01 Nov 2016 19:38:50 GMT
SOLR-8542: Adds Solr Learning to Rank (LTR) plugin for reranking results with machine learning
models. (Michael Nilsson, Diego Ceccarelli, Joshua Pantony, Jon Dorando, Naveen Santhapuri,
Alessandro Benedetti, David Grohmann, Christine Poerschke)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/5a66b3bc
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5a66b3bc
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5a66b3bc

Branch: refs/heads/master
Commit: 5a66b3bc089e4b3e73b1c41c4cdcd89b183b85e7
Parents: b6ff3fd
Author: Christine Poerschke <cpoerschke@apache.org>
Authored: Tue Nov 1 17:50:14 2016 +0000
Committer: Christine Poerschke <cpoerschke@apache.org>
Committed: Tue Nov 1 17:50:14 2016 +0000

----------------------------------------------------------------------
 dev-tools/idea/.idea/modules.xml                |    1 +
 dev-tools/idea/solr/contrib/ltr/ltr.iml         |   37 +
 solr/CHANGES.txt                                |    3 +
 solr/contrib/ltr/README.md                      |  406 +++++
 solr/contrib/ltr/README.txt                     |    1 +
 solr/contrib/ltr/build.xml                      |   30 +
 solr/contrib/ltr/example/config.json            |   14 +
 solr/contrib/ltr/example/libsvm_formatter.py    |  124 ++
 solr/contrib/ltr/example/solrconfig.xml         | 1722 ++++++++++++++++++
 .../ltr/example/techproducts-features.json      |   26 +
 .../contrib/ltr/example/techproducts-model.json |   18 +
 .../ltr/example/train_and_upload_demo_model.py  |  163 ++
 solr/contrib/ltr/example/user_queries.txt       |    8 +
 solr/contrib/ltr/ivy.xml                        |   32 +
 .../src/java/org/apache/solr/ltr/DocInfo.java   |   42 +
 .../java/org/apache/solr/ltr/FeatureLogger.java |  193 ++
 .../java/org/apache/solr/ltr/LTRRescorer.java   |  249 +++
 .../org/apache/solr/ltr/LTRScoringQuery.java    |  738 ++++++++
 .../org/apache/solr/ltr/LTRThreadModule.java    |  163 ++
 .../solr/ltr/SolrQueryRequestContextUtils.java  |   83 +
 .../org/apache/solr/ltr/feature/Feature.java    |  335 ++++
 .../solr/ltr/feature/FeatureException.java      |   31 +
 .../solr/ltr/feature/FieldLengthFeature.java    |  152 ++
 .../solr/ltr/feature/FieldValueFeature.java     |  141 ++
 .../solr/ltr/feature/OriginalScoreFeature.java  |  118 ++
 .../apache/solr/ltr/feature/SolrFeature.java    |  320 ++++
 .../apache/solr/ltr/feature/ValueFeature.java   |  148 ++
 .../apache/solr/ltr/feature/package-info.java   |   21 +
 .../apache/solr/ltr/model/LTRScoringModel.java  |  298 +++
 .../org/apache/solr/ltr/model/LinearModel.java  |  147 ++
 .../apache/solr/ltr/model/ModelException.java   |   31 +
 .../ltr/model/MultipleAdditiveTreesModel.java   |  377 ++++
 .../org/apache/solr/ltr/model/package-info.java |   21 +
 .../solr/ltr/norm/IdentityNormalizer.java       |   53 +
 .../apache/solr/ltr/norm/MinMaxNormalizer.java  |  107 ++
 .../org/apache/solr/ltr/norm/Normalizer.java    |   64 +
 .../solr/ltr/norm/NormalizerException.java      |   31 +
 .../solr/ltr/norm/StandardNormalizer.java       |   99 +
 .../org/apache/solr/ltr/norm/package-info.java  |   23 +
 .../java/org/apache/solr/ltr/package-info.java  |   45 +
 .../org/apache/solr/ltr/store/FeatureStore.java |   67 +
 .../org/apache/solr/ltr/store/ModelStore.java   |   74 +
 .../org/apache/solr/ltr/store/package-info.java |   21 +
 .../ltr/store/rest/ManagedFeatureStore.java     |  215 +++
 .../solr/ltr/store/rest/ManagedModelStore.java  |  319 ++++
 .../solr/ltr/store/rest/package-info.java       |   22 +
 .../LTRFeatureLoggerTransformerFactory.java     |  254 +++
 .../solr/response/transform/package-info.java   |   23 +
 .../apache/solr/search/LTRQParserPlugin.java    |  233 +++
 .../org/apache/solr/search/package-info.java    |   23 +
 solr/contrib/ltr/src/java/overview.html         |   91 +
 .../featureExamples/comp_features.json          |   37 +
 .../featureExamples/external_features.json      |   51 +
 ...external_features_for_sparse_processing.json |   18 +
 .../featureExamples/features-linear-efi.json    |   17 +
 .../featureExamples/features-linear.json        |   51 +
 .../features-store-test-model.json              |   51 +
 .../test-files/featureExamples/fq_features.json |   16 +
 .../multipleadditivetreesmodel_features.json    |   16 +
 .../contrib/ltr/src/test-files/log4j.properties |   32 +
 .../modelExamples/external_model.json           |   12 +
 .../modelExamples/external_model_store.json     |   13 +
 .../src/test-files/modelExamples/fq-model.json  |   20 +
 .../modelExamples/linear-model-efi.json         |   14 +
 .../test-files/modelExamples/linear-model.json  |   30 +
 .../multipleadditivetreesmodel.json             |   38 +
 ...tivetreesmodel_external_binary_features.json |   38 +
 .../multipleadditivetreesmodel_no_feature.json  |   24 +
 .../multipleadditivetreesmodel_no_features.json |   14 +
 .../multipleadditivetreesmodel_no_left.json     |   22 +
 .../multipleadditivetreesmodel_no_params.json   |    8 +
 .../multipleadditivetreesmodel_no_right.json    |   22 +
 ...multipleadditivetreesmodel_no_threshold.json |   24 +
 .../multipleadditivetreesmodel_no_tree.json     |   15 +
 .../multipleadditivetreesmodel_no_trees.json    |   10 +
 .../multipleadditivetreesmodel_no_weight.json   |   24 +
 .../test-files/solr/collection1/conf/schema.xml |   88 +
 .../solr/collection1/conf/solrconfig-ltr.xml    |   65 +
 .../collection1/conf/solrconfig-ltr_Th10_10.xml |   69 +
 .../collection1/conf/solrconfig-multiseg.xml    |   62 +
 .../solr/collection1/conf/stopwords.txt         |   16 +
 .../solr/collection1/conf/synonyms.txt          |   28 +
 solr/contrib/ltr/src/test-files/solr/solr.xml   |   42 +
 .../org/apache/solr/ltr/TestLTROnSolrCloud.java |  211 +++
 .../apache/solr/ltr/TestLTRQParserExplain.java  |  152 ++
 .../apache/solr/ltr/TestLTRQParserPlugin.java   |  114 ++
 .../solr/ltr/TestLTRReRankingPipeline.java      |  300 +++
 .../apache/solr/ltr/TestLTRScoringQuery.java    |  319 ++++
 .../org/apache/solr/ltr/TestLTRWithFacet.java   |  103 ++
 .../org/apache/solr/ltr/TestLTRWithSort.java    |  102 ++
 .../solr/ltr/TestParallelWeightCreation.java    |   77 +
 .../org/apache/solr/ltr/TestRerankBase.java     |  429 +++++
 .../solr/ltr/TestSelectiveWeightCreation.java   |  251 +++
 .../ltr/feature/TestEdisMaxSolrFeature.java     |   76 +
 .../solr/ltr/feature/TestExternalFeatures.java  |  157 ++
 .../ltr/feature/TestExternalValueFeatures.java  |   86 +
 ...stFeatureExtractionFromMultipleSegments.java |  105 ++
 .../solr/ltr/feature/TestFeatureLogging.java    |  254 +++
 .../ltr/feature/TestFeatureLtrScoringModel.java |   71 +
 .../solr/ltr/feature/TestFeatureStore.java      |  106 ++
 .../ltr/feature/TestFieldLengthFeature.java     |  156 ++
 .../solr/ltr/feature/TestFieldValueFeature.java |  173 ++
 .../solr/ltr/feature/TestFilterSolrFeature.java |  105 ++
 .../ltr/feature/TestNoMatchSolrFeature.java     |  192 ++
 .../ltr/feature/TestOriginalScoreFeature.java   |  148 ++
 .../solr/ltr/feature/TestRankingFeature.java    |  123 ++
 .../ltr/feature/TestUserTermScoreWithQ.java     |   74 +
 .../ltr/feature/TestUserTermScorerQuery.java    |   74 +
 .../ltr/feature/TestUserTermScorereQDF.java     |   75 +
 .../solr/ltr/feature/TestValueFeature.java      |  165 ++
 .../apache/solr/ltr/model/TestLinearModel.java  |  207 +++
 .../model/TestMultipleAdditiveTreesModel.java   |  246 +++
 .../solr/ltr/norm/TestMinMaxNormalizer.java     |  120 ++
 .../solr/ltr/norm/TestStandardNormalizer.java   |  132 ++
 .../ltr/store/rest/TestManagedFeatureStore.java |   36 +
 .../solr/ltr/store/rest/TestModelManager.java   |  163 ++
 .../store/rest/TestModelManagerPersistence.java |  121 ++
 117 files changed, 14167 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/dev-tools/idea/.idea/modules.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml
index 6fbe496..5d2d106 100644
--- a/dev-tools/idea/.idea/modules.xml
+++ b/dev-tools/idea/.idea/modules.xml
@@ -60,6 +60,7 @@
       <module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/uima/uima.iml"
/>
       <module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/velocity/velocity.iml"
/>
       <module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/analytics/analytics.iml"
/>
+      <module group="Solr/Contrib" filepath="$PROJECT_DIR$/solr/contrib/ltr/ltr.iml" />
     </modules>
   </component>
 </project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/dev-tools/idea/solr/contrib/ltr/ltr.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/solr/contrib/ltr/ltr.iml b/dev-tools/idea/solr/contrib/ltr/ltr.iml
new file mode 100644
index 0000000..efc505d
--- /dev/null
+++ b/dev-tools/idea/solr/contrib/ltr/ltr.iml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="false">
+    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/ltr/classes/java"
/>
+    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/ltr/classes/test"
/>
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource"
/>
+      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+    <orderEntry type="library" name="Solr core library" level="project" />
+    <orderEntry type="library" name="Solrj library" level="project" />
+    <orderEntry type="module-library">
+      <library>
+        <CLASSES>
+          <root url="file://$MODULE_DIR$/lib" />
+        </CLASSES>
+        <JAVADOC />
+        <SOURCES />
+        <jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
+      </library>
+    </orderEntry>
+    <orderEntry type="library" scope="TEST" name="Solr example library" level="project"
/>
+    <orderEntry type="library" scope="TEST" name="Solr core test library" level="project"
/>
+    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
+    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
+    <orderEntry type="module" module-name="solr-core" />
+    <orderEntry type="module" module-name="solrj" />
+    <orderEntry type="module" module-name="lucene-core" />
+    <orderEntry type="module" module-name="analysis-common" />
+  </component>
+</module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index fd4d2af..16cae8c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -93,6 +93,9 @@ New Features
   SOLR_HOME on every node. Editing config through API is supported but affects only that
one node. 
   (janhoy)
 
+* SOLR-8542: Adds Solr Learning to Rank (LTR) plugin for reranking results with machine learning
models.
+  (Michael Nilsson, Diego Ceccarelli, Joshua Pantony, Jon Dorando, Naveen Santhapuri, Alessandro
Benedetti, David Grohmann, Christine Poerschke)
+
 Optimizations
 ----------------------
 * SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/contrib/ltr/README.md
----------------------------------------------------------------------
diff --git a/solr/contrib/ltr/README.md b/solr/contrib/ltr/README.md
new file mode 100644
index 0000000..5fe0087
--- /dev/null
+++ b/solr/contrib/ltr/README.md
@@ -0,0 +1,406 @@
+Apache Solr Learning to Rank
+========
+
+This is the main [learning to rank integrated into solr](http://www.slideshare.net/lucidworks/learning-to-rank-in-solr-presented-by-michael-nilsson-diego-ceccarelli-bloomberg-lp)
+repository.
+[Read up on learning to rank](https://en.wikipedia.org/wiki/Learning_to_rank)
+
+Apache Solr Learning to Rank (LTR) provides a way for you to extract features
+directly inside Solr for use in training a machine learned model.  You can then
+deploy that model to Solr and use it to rerank your top X search results.
+
+# Test the plugin with solr/example/techproducts in a few easy steps!
+
+Solr provides some simple example indices. In order to test the plugin with
+the techproducts example please follow these steps.
+
+1. Compile solr and the examples
+
+    `cd solr`
+    `ant dist`
+    `ant server`
+
+2. Run the example to setup the index
+
+   `./bin/solr -e techproducts`
+
+3. Stop solr and install the plugin:
+     1. Stop solr
+
+        `./bin/solr stop`
+     2. Create the lib folder
+
+        `mkdir example/techproducts/solr/techproducts/lib`
+     3. Install the plugin in the lib folder
+
+        `cp build/contrib/ltr/solr-ltr-7.0.0-SNAPSHOT.jar example/techproducts/solr/techproducts/lib/`
+     4. Replace the original solrconfig with one importing all the ltr components
+
+        `cp contrib/ltr/example/solrconfig.xml example/techproducts/solr/techproducts/conf/`
+
+4. Run the example again
+
+   `./bin/solr -e techproducts`
+
+   Note you could also have just restarted your collection using the admin page.
+   You can find more detailed instructions [here](https://wiki.apache.org/solr/SolrPlugins).
+
+5. Deploy features and a model
+
+      `curl -XPUT 'http://localhost:8983/solr/techproducts/schema/feature-store'  --data-binary
"@./contrib/ltr/example/techproducts-features.json"  -H 'Content-type:application/json'`
+
+      `curl -XPUT 'http://localhost:8983/solr/techproducts/schema/model-store'  --data-binary
"@./contrib/ltr/example/techproducts-model.json"  -H 'Content-type:application/json'`
+
+6. Have fun !
+
+     * Access to the default feature store
+
+       http://localhost:8983/solr/techproducts/schema/feature-store/\_DEFAULT\_
+     * Access to the model store
+
+       http://localhost:8983/solr/techproducts/schema/model-store
+     * Perform a reranking query using the model, and retrieve the features
+
+       http://localhost:8983/solr/techproducts/query?indent=on&q=test&wt=json&rq={!ltr%20model=linear%20reRankDocs=25%20efi.user_query=%27test%27}&fl=[features],price,score,name
+
+
+BONUS: Train an actual machine learning model
+
+1. Download and install [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/)
+
+2. Change `contrib/ltr/example/config.json` "trainingLibraryLocation" to point to the train
directory where you installed liblinear.
+
+3. Extract features, train a reranking model, and deploy it to Solr.
+
+  `cd  contrib/ltr/example`
+
+  `python  train_and_upload_demo_model.py -c config.json`
+
+   This script deploys your features from `config.json` "featuresFile" to Solr.  Then it
takes the relevance judged query
+   document pairs of "userQueriesFile" and merges it with the features extracted from Solr
into a training
+   file.  That file is used to train a linear model, which is then deployed to Solr for you
to rerank results.
+
+4. Search and rerank the results using the trained model
+
+   http://localhost:8983/solr/techproducts/query?indent=on&q=test&wt=json&rq={!ltr%20model=ExampleModel%20reRankDocs=25%20efi.user_query=%27test%27}&fl=price,score,name
+
+# Changes to solrconfig.xml
+```xml
+<config>
+  ...
+
+  <!-- Query parser used to rerank top docs with a provided model -->
+  <queryParser name="ltr" class="org.apache.solr.search.LTRQParserPlugin" />
+
+  <!--  Transformer that will encode the document features in the response.
+  For each document the transformer will add the features as an extra field
+  in the response. The name of the field will be the name of the
+  transformer enclosed between brackets (in this case [features]).
+  In order to get the feature vector you will have to
+  specify that you want the field (e.g., fl="*,[features]")  -->
+
+  <transformer name="features" class="org.apache.solr.response.transform.LTRFeatureLoggerTransformerFactory"
/>
+
+  <query>
+    ...
+
+    <!-- Cache for storing and fetching feature vectors -->
+    <cache name="QUERY_DOC_FV"
+      class="solr.search.LRUCache"
+      size="4096"
+      initialSize="2048"
+      autowarmCount="4096"
+      regenerator="solr.search.NoOpRegenerator" />
+  </query>
+
+</config>
+
+```
+
+# Defining Features
+In the learning to rank plugin, you can define features in a feature space
+using standard Solr queries. As an example:
+
+###### features.json
+```json
+[
+{ "name": "isBook",
+  "class": "org.apache.solr.ltr.feature.SolrFeature",
+  "params":{ "fq": ["{!terms f=category}book"] }
+},
+{
+  "name":  "documentRecency",
+  "class": "org.apache.solr.ltr.feature.SolrFeature",
+  "params": {
+      "q": "{!func}recip( ms(NOW,publish_date), 3.16e-11, 1, 1)"
+  }
+},
+{
+  "name":"originalScore",
+  "class":"org.apache.solr.ltr.feature.OriginalScoreFeature",
+  "params":{}
+},
+{
+  "name" : "userTextTitleMatch",
+  "class" : "org.apache.solr.ltr.feature.SolrFeature",
+  "params" : { "q" : "{!field f=title}${user_text}" }
+},
+ {
+   "name" : "userFromMobile",
+   "class" : "org.apache.solr.ltr.feature.ValueFeature",
+   "params" : { "value" : "${userFromMobile}", "required":true }
+ }
+]
+```
+
+Defines five features. Anything that is a valid Solr query can be used to define
+a feature.
+
+### Filter Query Features
+The first feature isBook fires if the term 'book' matches the category field
+for the given examined document. Since in this feature q was not specified,
+either the score 1 (in case of a match) or the score 0 (in case of no match)
+will be returned.
+
+### Query Features
+In the second feature (documentRecency) q was specified using a function query.
+In this case the score for the feature on a given document is whatever the query
+returns (1 for docs dated now, 1/2 for docs dated 1 year ago, 1/3 for docs dated
+2 years ago, etc..) . If both an fq and q is used, documents that don't match
+the fq will receive a score of 0 for the documentRecency feature, all other
+documents will receive the score specified by the query for this feature.
+
+### Original Score Feature
+The third feature (originalScore) has no parameters, and uses the
+OriginalScoreFeature class instead of the SolrFeature class.  Its purpose is
+to simply return the score for the original search request against the current
+matching document.
+
+### External Features
+Users can specify external information that can be passed in as
+part of the query to the ltr ranking framework. In this case, the
+fourth feature (userTextTitleMatch) will be looking for an external field
+called 'user_text' passed in through the request, and will fire if there is
+a term match for the document field 'title' from the value of the external
+field 'user_text'.  You can provide default values for external features as
+well by specifying ${myField:myDefault}, similar to how you would in a Solr config.
+In this case, the fifth feature (userFromMobile) will be looking for an external parameter
+called 'userFromMobile' passed in through the request, if the ValueFeature is :
+required=true, it will throw an exception if the external feature is not passed
+required=false, it will silently ignore the feature and avoid the scoring ( at Document scoring
time, the model will consider 0 as feature value)
+The advantage in defining a feature as not required, where possible, is to avoid wasting
caching space and time in calculating the featureScore.
+See the [Run a Rerank Query](#run-a-rerank-query) section for how to pass in external information.
+
+### Custom Features
+Custom features can be created by extending from
+org.apache.solr.ltr.feature.Feature, however this is generally not recommended.
+The majority of features should be possible to create using the methods described
+above.
+
+# Defining Models
+Currently the Learning to Rank plugin supports 2 generalized forms of
+models: 1. Linear Model i.e. [RankSVM](http://www.cs.cornell.edu/people/tj/publications/joachims_02c.pdf),
[Pranking](https://papers.nips.cc/paper/2023-pranking-with-ranking.pdf)
+and 2. Multiple Additive Trees i.e. [LambdaMART](http://research.microsoft.com/pubs/132652/MSR-TR-2010-82.pdf),
[Gradient Boosted Regression Trees (GBRT)](https://papers.nips.cc/paper/3305-a-general-boosting-method-and-its-application-to-learning-ranking-functions-for-web-search.pdf)
+
+### Linear
+If you'd like to introduce a bias set a constant feature
+to the bias value you'd like and make a weight of 1.0 for that feature.
+
+###### model.json
+```json
+{
+    "class":"org.apache.solr.ltr.model.LinearModel",
+    "name":"myModelName",
+    "features":[
+        { "name": "userTextTitleMatch"},
+        { "name": "originalScore"},
+        { "name": "isBook"}
+    ],
+    "params":{
+        "weights": {
+            "userTextTitleMatch": 1.0,
+            "originalScore": 0.5,
+            "isBook": 0.1
+        }
+
+    }
+}
+```
+
+This is an example of a toy Linear model. Class specifies the class to be
+used to interpret the model. Name is the model identifier you will use 
+when making requests to the ltr framework. Features specifies the feature 
+space that you want extracted when using this model. All features that 
+appear in the model params will be used for scoring and must appear in 
+the features list.  You can add extra features to the features list that 
+will be computed but not used in the model for scoring, which can be useful 
+for logging. Params are the Linear parameters.
+
+Good libraries for training an SVM, an example of a Linear model, are 
+(https://www.csie.ntu.edu.tw/~cjlin/liblinear/ , https://www.csie.ntu.edu.tw/~cjlin/libsvm/)
. 
+You will need to convert the libSVM model format to the format specified above.
+
+### Multiple Additive Trees
+
+###### model2.json
+```json
+{
+    "class":"org.apache.solr.ltr.model.MultipleAdditiveTreesModel",
+    "name":"multipleadditivetreesmodel",
+    "features":[
+        { "name": "userTextTitleMatch"},
+        { "name": "originalScore"}
+    ],
+    "params":{
+        "trees": [
+            {
+                "weight" : 1,
+                "root": {
+                    "feature": "userTextTitleMatch",
+                    "threshold": 0.5,
+                    "left" : {
+                        "value" : -100
+                    },
+                    "right": {
+                        "feature" : "originalScore",
+                        "threshold": 10.0,
+                        "left" : {
+                            "value" : 50
+                        },
+                        "right" : {
+                            "value" : 75
+                        }
+                    }
+                }
+            },
+            {
+                "weight" : 2,
+                "root": {
+                    "value" : -10
+                }
+            }
+        ]
+    }
+}
+```
+This is an example of a toy Multiple Additive Trees model. Class specifies the class to be used
to
+interpret the model. Name is the
+model identifier you will use when making requests to the ltr framework.
+Features specifies the feature space that you want extracted when using this
+model. All features that appear in the model params will be used for scoring and
+must appear in the features list.  You can add extra features to the features
+list that will be computed but not used in the model for scoring, which can
+be useful for logging. Params are the Multiple Additive Trees specific parameters. In this
+case we have 2 trees, one with 3 leaf nodes and one with 1 leaf node.
+
+A good library for training LambdaMART, an example of Multiple Additive Trees, is ( http://sourceforge.net/p/lemur/wiki/RankLib/
).
+You will need to convert the RankLib model format to the format specified above.
+
+# Deploy Models and Features
+To send features run
+
+`curl -XPUT 'http://localhost:8983/solr/collection1/schema/feature-store' --data-binary @/path/features.json
-H 'Content-type:application/json'`
+
+To send models run
+
+`curl -XPUT 'http://localhost:8983/solr/collection1/schema/model-store' --data-binary @/path/model.json
-H 'Content-type:application/json'`
+
+
+# View Models and Features
+`curl -XGET 'http://localhost:8983/solr/collection1/schema/feature-store'`
+
+`curl -XGET 'http://localhost:8983/solr/collection1/schema/model-store'`
+
+# Run a Rerank Query
+Add to your original solr query
+`rq={!ltr model=myModelName reRankDocs=25}`
+
+The model name is the name of the model you sent to solr earlier.
+The number of documents you want reranked, which can be larger than the
+number you display, is reRankDocs.
+
+### Pass in external information for external features
+Add to your original solr query
+`rq={!ltr reRankDocs=3 model=externalmodel efi.field1='text1' efi.field2='text2'}`
+
+Where "field1" specifies the name of the customized field to be used by one
+or more of your features, and text1 is the information to be passed in. As an
+example that matches the earlier shown userTextTitleMatch feature one could do:
+
+`rq={!ltr reRankDocs=3 model=externalmodel efi.user_text='Casablanca' efi.user_intent='movie'}`
+
+# Extract features
+To extract features you need to use the feature vector transformer `features`
+
+`fl=*,score,[features]&rq={!ltr model=yourModel reRankDocs=25}`
+
+If you use `[features]` together with your reranking model, it will return
+the array of features used by your model. Otherwise you can just ask solr to
+produce the features without doing the reranking:
+
+`fl=*,score,[features store=yourFeatureStore format=[dense|sparse] ]`
+
+This will return the values of the features in the given store. The format of the 
+extracted features will be based on the format parameter. The default is sparse.
+
+# Assemble training data
+In order to train a learning to rank model you need training data. Training data is
+what "teaches" the model what the appropriate weight for each feature is. In general
+training data is a collection of queries with associated documents and what their ranking/score
+should be. As an example:
+```
+secretary of state|John Kerry|0.66|CROWDSOURCE
+secretary of state|Cesar A. Perales|0.33|CROWDSOURCE
+secretary of state|New York State|0.0|CROWDSOURCE
+secretary of state|Colorado State University Secretary|0.0|CROWDSOURCE
+
+microsoft ceo|Satya Nadella|1.0|CLICK_LOG
+microsoft ceo|Microsoft|0.0|CLICK_LOG
+microsoft ceo|State|0.0|CLICK_LOG
+microsoft ceo|Secretary|0.0|CLICK_LOG
+```
+In this example the first column indicates the query, the second column indicates a unique
id for that doc,
+the third column indicates the relative importance or relevance of that doc, and the fourth
column indicates the source.
+There are 2 primary ways you might collect data for use with your machine learning algorithm.
The first
+is to collect the clicks of your users given a specific query. There are many ways of preparing
this data
+to train a model (http://www.cs.cornell.edu/people/tj/publications/joachims_etal_05a.pdf).
The general idea
+is that if a user sees multiple documents and clicks the one lower down, that document should
be scored higher
+than the one above it. The second way is explicitly through a crowdsourcing platform like
Mechanical Turk or
+CrowdFlower. These platforms allow you to show human workers documents associated with a
query and have them
+tell you what the correct ranking should be.
+
+At this point you'll need to collect feature vectors for each query document pair. You can
use the information
+from the Extract features section above to do this. An example script has been included in
example/train_and_upload_demo_model.py.
+
+# Explanation of the core reranking logic
+An LTR model is plugged into the ranking through the [LTRQParserPlugin](/solr/contrib/ltr/src/java/org/apache/solr/search/LTRQParserPlugin.java).
The plugin will
+read from the request the model, an instance of [LTRScoringModel](/solr/contrib/ltr/src/java/org/apache/solr/ltr/model/LTRScoringModel.java),
+plus other parameters. The plugin will generate an LTRQuery, a particular [ReRankQuery](/solr/core/src/java/org/apache/solr/search/AbstractReRankQuery.java).
+It wraps the original solr query for the first pass ranking, and uses the provided model
in an
+[LTRScoringQuery](/solr/contrib/ltr/src/java/org/apache/solr/ltr/LTRScoringQuery.java) to
+rescore and rerank the top documents.  The LTRScoringQuery will take care of computing the
values of all the
+[features](/solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/Feature.java) and then
will delegate the final score
+generation to the LTRScoringModel.
+
+# Speeding up the weight creation with threads
+About half the time for ranking is spent in the creation of weights for each feature used
in ranking. If the number of features is significantly high (say, 500 or more), this increases
the ranking overhead proportionally. To alleviate this problem, parallel weight creation is
provided as a configurable option. In order to use this feature, the following lines need
to be added to the solrconfig.xml
+```xml
+
+<config>
+  <!-- Query parser used to rerank top docs with a provided model -->
+  <queryParser name="ltr" class="org.apache.solr.search.LTRQParserPlugin">
+     <int name="threadModule.totalPoolThreads">10</int> <!-- Maximum threads
to share for all requests -->
+     <int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads
to use for a single request -->
+  </queryParser>
+  
+  <!-- Transformer for extracting features -->
+  <transformer name="features" class="org.apache.solr.response.transform.LTRFeatureLoggerTransformerFactory">
+     <int name="threadModule.totalPoolThreads">10</int> <!-- Maximum threads
to share for all requests -->
+     <int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads
to use for a single request -->
+  </transformer>
+</config>
+
+```
+  
+The threadModule.totalPoolThreads option limits the total number of threads to be used across
all query instances at any given time. threadModule.numThreadsPerRequest limits the number
of threads used to process a single query. In the above example, 10 threads will be used to
service all queries and a maximum of 5 threads to service a single query. If the solr instance
is expected to receive no more than one query at a time, it is best to set both these numbers
to the same value. If multiple queries need to be serviced simultaneously, the numbers can be
adjusted based on the expected response times. If the value of threadModule.numThreadsPerRequest
is higher, the response time for a single query will be improved up to a point. If multiple
queries are serviced simultaneously, the threadModule.totalPoolThreads imposes a contention
between the queries if (threadModule.numThreadsPerRequest*total parallel queries > threadModule.totalPoolThreads).

+

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/contrib/ltr/README.txt
----------------------------------------------------------------------
diff --git a/solr/contrib/ltr/README.txt b/solr/contrib/ltr/README.txt
new file mode 120000
index 0000000..42061c0
--- /dev/null
+++ b/solr/contrib/ltr/README.txt
@@ -0,0 +1 @@
+README.md
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/contrib/ltr/build.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/ltr/build.xml b/solr/contrib/ltr/build.xml
new file mode 100644
index 0000000..bbd5cf3
--- /dev/null
+++ b/solr/contrib/ltr/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="solr-ltr" default="default">
+
+  <description>
+    Learning to Rank Package
+  </description>
+
+  <import file="../contrib-build.xml"/>
+
+  <target name="compile-core" depends=" solr-contrib-build.compile-core"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/contrib/ltr/example/config.json
----------------------------------------------------------------------
diff --git a/solr/contrib/ltr/example/config.json b/solr/contrib/ltr/example/config.json
new file mode 100644
index 0000000..483fe69
--- /dev/null
+++ b/solr/contrib/ltr/example/config.json
@@ -0,0 +1,14 @@
+{
+  "host":                     "localhost",
+  "port":                     8983,
+  "collection":               "techproducts",
+  "requestHandler":           "query",
+  "q":                        "*:*",
+  "otherParams":              "fl=id,score,[features efi.user_query='$USERQUERY']",
+  "userQueriesFile":          "user_queries.txt",
+  "trainingFile":             "ClickData",
+  "featuresFile":             "techproducts-features.json",
+  "trainingLibraryLocation":  "liblinear/train",
+  "solrModelFile":            "solrModel.json",
+  "solrModelName":            "ExampleModel"
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5a66b3bc/solr/contrib/ltr/example/libsvm_formatter.py
----------------------------------------------------------------------
diff --git a/solr/contrib/ltr/example/libsvm_formatter.py b/solr/contrib/ltr/example/libsvm_formatter.py
new file mode 100644
index 0000000..25cf10b
--- /dev/null
+++ b/solr/contrib/ltr/example/libsvm_formatter.py
@@ -0,0 +1,124 @@
+from subprocess import call
+import os
+
+PAIRWISE_THRESHOLD = 1.e-1
+FEATURE_DIFF_THRESHOLD = 1.e-6
+
+class LibSvmFormatter:
+    def processQueryDocFeatureVector(self,docClickInfo,trainingFile):
+        '''Expects as input a sorted by queries list or generator that provides the context

+        for each query in a tuple composed of: (query , docId , relevance , source , featureVector).
+        The list of documents that are part of the same query will generate comparisons
+        against each other for training. '''
+        curQueryAndSource = "";
+        with open(trainingFile,"w") as output:
+            self.featureNameToId  = {}
+            self.featureIdToName = {}
+            self.curFeatIndex = 1;
+            curListOfFv = []
+            for query,docId,relevance,source,featureVector in docClickInfo:
+                if curQueryAndSource != query + source:
+                    #Time to flush out all the pairs
+                    _writeRankSVMPairs(curListOfFv,output);
+                    curListOfFv = []
+                    curQueryAndSource = query + source
+                curListOfFv.append((relevance,self._makeFeaturesMap(featureVector)))
+            _writeRankSVMPairs(curListOfFv,output); #This catches the last list of comparisons
+
+    def _makeFeaturesMap(self,featureVector):
+        '''expects a list of strings with "feature name":"feature value" pairs. Outputs a
map of map[key] = value.
+        Where key is now an integer. libSVM requires the key to be an integer but not all
libraries have
+        this requirement.'''
+        features = {}
+        for keyValuePairStr in featureVector:
+            featName,featValue = keyValuePairStr.split(":");
+            features[self._getFeatureId(featName)] = float(featValue);
+        return features
+
+    def _getFeatureId(self,key):
+        if key not in self.featureNameToId:
+                self.featureNameToId[key] = self.curFeatIndex;
+                self.featureIdToName[self.curFeatIndex] = key;
+                self.curFeatIndex += 1;
+        return self.featureNameToId[key];
+
+    def convertLibSvmModelToLtrModel(self,libSvmModelLocation, outputFile, modelName):
+        with open(libSvmModelLocation, 'r') as inFile:
+            with open(outputFile,'w') as convertedOutFile:
+                convertedOutFile.write('{\n\t"class":"org.apache.solr.ltr.model.LinearModel",\n')
+                convertedOutFile.write('\t"name": "' + str(modelName) + '",\n')
+                convertedOutFile.write('\t"features": [\n')
+                isFirst = True;
+                for featKey in self.featureNameToId.keys():
+                    convertedOutFile.write('\t\t{ "name":"' + featKey  + '"}' if isFirst
else ',\n\t\t{ "name":"' + featKey  + '"}' );
+                    isFirst = False;
+                convertedOutFile.write("\n\t],\n");
+                convertedOutFile.write('\t"params": {\n\t\t"weights": {\n');
+
+                startReading = False
+                isFirst = True
+                counter = 1
+                for line in inFile:
+                    if startReading:
+                        newParamVal = float(line.strip())
+                        if not isFirst:
+                            convertedOutFile.write(',\n\t\t\t"' + self.featureIdToName[counter]
+ '":' + str(newParamVal))
+                        else:
+                            convertedOutFile.write('\t\t\t"' + self.featureIdToName[counter]
+ '":' + str(newParamVal))
+                            isFirst = False
+                        counter += 1
+                    elif line.strip() == 'w':
+                        startReading = True
+                convertedOutFile.write('\n\t\t}\n\t}\n}')
+
+def _writeRankSVMPairs(listOfFeatures,output):
+    '''Given a list of (relevance, {Features Map}) where the list represents
+    a set of documents to be compared, this calculates all pairs and
+    writes the Feature Vectors in a format compatible with libSVM.
+    Ex: listOfFeatures = [
+      #(relevance, {feature1:value, featureN:value})
+      (4, {1:0.9, 2:0.9, 3:0.1})
+      (3, {1:0.7, 2:0.9, 3:0.2})
+      (1, {1:0.1, 2:0.9, 6:0.1})
+    ]    
+    '''
+    for d1 in range(0,len(listOfFeatures)):
+        for d2 in range(d1+1,len(listOfFeatures)):
+            doc1,doc2 = listOfFeatures[d1], listOfFeatures[d2]
+            fv1,fv2 = doc1[1],doc2[1]
+            d1Relevance, d2Relevance = float(doc1[0]),float(doc2[0])
+            if  d1Relevance - d2Relevance > PAIRWISE_THRESHOLD:#d1Relevance > d2Relevance
+                outputLibSvmLine("+1",subtractFvMap(fv1,fv2),output);
+                outputLibSvmLine("-1",subtractFvMap(fv2,fv1),output);
+            elif d1Relevance - d2Relevance < -PAIRWISE_THRESHOLD: #d1Relevance < d2Relevance:
+                outputLibSvmLine("+1",subtractFvMap(fv2,fv1),output);
+                outputLibSvmLine("-1",subtractFvMap(fv1,fv2),output);
+            else: #Must be approximately equal relevance, in which case this is a useless
signal and we should skip
+                continue;
+
+def subtractFvMap(fv1,fv2):
+    '''returns the fv from fv1 - fv2'''
+    retFv = fv1.copy();
+    for featInd in fv2.keys():
+        subVal = 0.0;
+        if featInd in fv1:
+            subVal = fv1[featInd] - fv2[featInd]
+        else:
+            subVal = -fv2[featInd]
+        if abs(subVal) > FEATURE_DIFF_THRESHOLD: #This ensures everything is in sparse
format, and removes useless signals
+            retFv[featInd] = subVal;
+        else:
+            retFv.pop(featInd, None)
+    return retFv;
+
+def outputLibSvmLine(sign,fvMap,outputFile):
+    outputFile.write(sign)
+    for feat in fvMap.keys():
+        outputFile.write(" " + str(feat) + ":" + str(fvMap[feat]));
+    outputFile.write("\n")
+
+def trainLibSvm(libraryLocation,trainingFileName):
+    if os.path.isfile(libraryLocation):
+        call([libraryLocation, trainingFileName])
+    else:
+        raise Exception("NO LIBRARY FOUND: " + libraryLocation);


Mime
View raw message