Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 26490 invoked from network); 27 Jul 2007 20:25:18 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 27 Jul 2007 20:25:18 -0000 Received: (qmail 15959 invoked by uid 500); 27 Jul 2007 20:25:18 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 15922 invoked by uid 500); 27 Jul 2007 20:25:18 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 15908 invoked by uid 99); 27 Jul 2007 20:25:18 -0000 Received: from Unknown (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 27 Jul 2007 13:25:18 -0700 X-ASF-Spam-Status: No, hits=-100.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 27 Jul 2007 20:25:16 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id A833C1A981A; Fri, 27 Jul 2007 13:24:55 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r560372 [1/2] - in /lucene/java/trunk: ./ contrib/benchmark/ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/ contrib/benchmark/src/java/org/apache/lucene/b... 
Date: Fri, 27 Jul 2007 20:24:53 -0000 To: java-commits@lucene.apache.org From: doronc@apache.org X-Mailer: svnmailer-1.1.0 Message-Id: <20070727202455.A833C1A981A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: doronc Date: Fri Jul 27 13:24:52 2007 New Revision: 560372 URL: http://svn.apache.org/viewvc?view=rev&rev=560372 Log: LUCENE-836: Add support for search quality benchmarking. Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/package.html (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (with props) 
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java (with props) lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html (with props) lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (with props) lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt (with props) lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt (with props) Modified: lucene/java/trunk/common-build.xml lucene/java/trunk/contrib/benchmark/CHANGES.txt lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Modified: lucene/java/trunk/common-build.xml URL: http://svn.apache.org/viewvc/lucene/java/trunk/common-build.xml?view=diff&rev=560372&r1=560371&r2=560372 ============================================================================== --- lucene/java/trunk/common-build.xml (original) +++ lucene/java/trunk/common-build.xml Fri Jul 27 13:24:52 2007 @@ -284,6 +284,8 @@ + + ################################################################## @@ -299,6 +301,10 @@ + + + + Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?view=diff&rev=560372&r1=560371&r2=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original) +++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Jul 27 13:24:52 2007 @@ -4,6 +4,14 @@ $Id:$ +7/27/07 + LUCENE-836: Add support for search quality benchmarking, running + a set of queries against a searcher, and, optionally produce a 
submission + report, and, if query judgements are available, compute quality measures: + recall, precision_at_N, average_precision, MAP. TREC specific Judge (based + on TREC QRels) and TREC Topics reader are included in o.a.l.benchmark.quality.trec + but any other format of queries and judgements can be implemented and used. + 7/24/07 LUCENE-947: Add support for creating and index "one document per line" from a large text file, which reduces per-document overhead of Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.io.PrintWriter; + +/** + * Judge if a document is relevant for a quality query. 
+ */ +public interface Judge { + + /** + * Judge if document docName is relevant for the given quality query. + * @param docName name of doc tested for relevancy. + * @param query tested quality query. + * @return true if relevant, false if not. + */ + public boolean isRelevant(String docName, QualityQuery query); + + /** + * Validate that queries and this Judge match each other. + * To be perfectly valid, this Judge must have some data for each and every + * input quality query, and must not have any data on any other quality query. + * Note: the quality benchmark run would not fail in case of imperfect + * validity, just a warning message would be logged. + * @param qq quality queries to be validated. + * @param logger if not null, validation issues are logged. + * @return true if perfectly valid, false if not. + */ + public boolean validateData (QualityQuery qq[], PrintWriter logger); + + /** + * Return the maximal recall for the input quality query. + * It is the number of relevant docs this Judge "knows" for the query. + * @param query the query whose maximal recall is needed. 
+ */ + public int maxRecall (QualityQuery query); + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.quality; + +import java.io.IOException; +import java.io.PrintWriter; + +import org.apache.lucene.benchmark.quality.utils.DocNameExtractor; +import org.apache.lucene.benchmark.quality.utils.SubmissionReport; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TopDocs; + +/** + * Main entry point for running a quality benchmark. + *

+ * There are two main configurations for running a quality benchmark:

    + *
  • Against existing judgements.
  • + *
  • For submission (e.g. for a contest).
  • + *
+ * The first configuration requires a non null + * {@link org.apache.lucene.benchmark.quality.Judge Judge}. + * The second configuration requires a non null + * {@link org.apache.lucene.benchmark.quality.utils.SubmissionReport SubmissionLogger}. + */ +public class QualityBenchmark { + + /** Quality Queries that this quality benchmark would execute. */ + protected QualityQuery qualityQueries[]; + + /** Parser for turning QualityQueries into Lucene Queries. */ + protected QualityQueryParser qqParser; + + /** Index to be searched. */ + protected Searcher searcher; + + /** index field to extract doc name for each search result; used for judging the results. */ + protected String docNameField; + + /** + * Create a QualityBenchmark. + * @param qqs quality queries to run. + * @param qqParser parser for turning QualityQueries into Lucene Queries. + * @param searcher index to be searched. + * @param docNameField name of field containg the document name. + * This allows to extract the doc name for search results, + * and is important for judging the results. + */ + public QualityBenchmark(QualityQuery qqs[], QualityQueryParser qqParser, + Searcher searcher, String docNameField) { + this.qualityQueries = qqs; + this.qqParser = qqParser; + this.searcher = searcher; + this.docNameField = docNameField; + } + + /** + * Run the quality benchmark. + * @param maxResults how many results to collect for each quality query. + * @param judge the judge that can tell if a certain result doc is relevant for a certain quality query. + * If null, no judgements would be made. Usually null for a submission run. + * @param submitRep submission report is created if non null. + * @param qualityLog If not null, quality run data would be printed for each query. + * @return QualityStats of each quality query that was executed. + * @throws Exception if quality benchmark failed to run. 
+ */ + public QualityStats [] execute(int maxResults, Judge judge, SubmissionReport submitRep, + PrintWriter qualityLog) throws Exception { + QualityStats stats[] = new QualityStats[qualityQueries.length]; + for (int i=0; i + * The ID allows to map the quality query with its judgements. + *

+ * The name-value pairs are used by a + * {@link org.apache.lucene.benchmark.quality.QualityQueryParser} + * to create a Lucene {@link org.apache.lucene.search.Query}. + *

+ * It is very likely that name-value-pairs would be mapped into fields in a Lucene query, + * but it is up to the QualityQueryParser how to map - e.g. all values in a single field, + * or each pair as its own field, etc., - and this of course must match the way the + * searched index was constructed. + */ +public class QualityQuery implements Comparable { + private String queryID; + private Map nameValPairs; + + /** + * Create a QualityQuery with given ID and name-value pairs. + * @param queryID ID of this quality query. + * @param nameValPairs the contents of this quality query. + */ + public QualityQuery(String queryID, Map nameValPairs) { + this.queryID = queryID; + this.nameValPairs = nameValPairs; + } + + /** + * Return all the names of name-value-pairs in this QualityQuery. + */ + public String[] getNames() { + return (String[]) nameValPairs.keySet().toArray(new String[0]); + } + + /** + * Return the value of a certain name-value pair. + * @param name the name whose value should be returned. + */ + public String getValue(String name) { + return (String) nameValPairs.get(name); + } + + /** + * Return the ID of this query. + * The ID allows to map the quality query with its judgements. + */ + public String getQueryID() { + return queryID; + } + + /* for a nicer sort of input queries before running them. + * Try first as ints, fall back to string if not int. 
*/ + public int compareTo(Object o) { + QualityQuery other = (QualityQuery) o; + try { + // compare as ints when ids ints + int n = Integer.parseInt(queryID); + int nOther = Integer.parseInt(other.queryID); + return n - nOther; + } catch (NumberFormatException e) { + // fall back to string comparison + return queryID.compareTo(other.queryID); + } + } + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Query; + +/** + * Parse a QualityQuery into a Lucene query. + */ +public interface QualityQueryParser { + + /** + * Parse a given QualityQuery into a Lucene query. + * @param qq the quality query to be parsed. + * @throws ParseException if parsing failed. + */ + public Query parse(QualityQuery qq) throws ParseException; + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,266 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.io.PrintWriter; +import java.text.NumberFormat; +import java.util.ArrayList; + +/** + * Results of quality benchmark run for a single query or for a set of queries. + */ +public class QualityStats { + + /** Number of points for which precision is computed. */ + public static final int MAX_POINTS = 20; + + private double maxGoodPoints; + private double recall; + private double pAt[]; + private double pReleventSum = 0; + private double numPoints = 0; + private double numGoodPoints = 0; + private long searchTime; + private long docNamesExtractTime; + + /** + * A certain rank in which a relevant doc was found. + */ + public static class RecallPoint { + private int rank; + private double recall; + private RecallPoint(int rank, double recall) { + this.rank = rank; + this.recall = recall; + } + /** Returns the rank: where on the list of returned docs this relevant doc appeared. */ + public int getRank() { + return rank; + } + /** Returns the recall: how many relevant docs were returned up to this point, inclusive. 
*/ + public double getRecall() { + return recall; + } + } + + private ArrayList recallPoints; + + /** + * Construct a QualityStats object with anticipated maximal number of relevant hits. + * @param maxGoodPoints maximal possible relevant hits. + */ + public QualityStats(double maxGoodPoints, long searchTime) { + this.maxGoodPoints = maxGoodPoints; + this.searchTime = searchTime; + this.recallPoints = new ArrayList(); + pAt = new double[MAX_POINTS+1]; // pAt[0] unused. + } + + /** + * Add a (possibly relevant) doc. + * @param n rank of the added doc (its ordinal position within the query results). + * @param isRelevant true if the added doc is relevant, false otherwise. + */ + public void addResult(int n, boolean isRelevant, long docNameExtractTime) { + if (Math.abs(numPoints+1 - n) > 1E-6) { + throw new IllegalArgumentException("point "+n+" illegal after "+numPoints+" points!"); + } + if (isRelevant) { + numGoodPoints+=1; + recallPoints.add(new RecallPoint(n,numGoodPoints)); + } + numPoints = n; + double p = numGoodPoints / numPoints; + if (isRelevant) { + pReleventSum += p; + } + if (nn hits}| / n. + * @param n requested precision point, must be at least 1 and at most {@link #MAX_POINTS}. + */ + public double getPrecisionAt(int n) { + if (n<1 || n>MAX_POINTS) { + throw new IllegalArgumentException("n="+n+" - but it must be in [1,"+MAX_POINTS+"] range!"); + } + if (n>numPoints) { + return (numPoints * pAt[(int)numPoints])/n; + } + return pAt[n]; + } + + /** + * Return the average precision at recall points: sum of precision at recall points / maxGoodPoints. + */ + public double getAvp() { + return maxGoodPoints==0 ? 0 : pReleventSum/maxGoodPoints; + } + + /** + * Return the recall: |{relevant hits}| / |{hits}|. + */ + public double getRecall() { + return recall; + } + + /** + * Log information on this QualityStats object. + * @param logger Logger. + * @param prefix prefix before each log line. 
+ */ + public void log(String title, int paddLines, PrintWriter logger, String prefix) { + for (int i=0; i0) { + logger.println(title); + } + prefix = prefix==null ? "" : prefix; + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(3); + nf.setMinimumFractionDigits(3); + nf.setGroupingUsed(true); + int M = 19; + logger.println(prefix+format("Search Seconds: ",M)+ + fracFormat(nf.format((double)searchTime/1000))); + logger.println(prefix+format("DocName Seconds: ",M)+ + fracFormat(nf.format((double)docNamesExtractTime/1000))); + logger.println(prefix+format("Num Points: ",M)+ + fracFormat(nf.format(numPoints))); + logger.println(prefix+format("Num Good Points: ",M)+ + fracFormat(nf.format(numGoodPoints))); + logger.println(prefix+format("Max Good Points: ",M)+ + fracFormat(nf.format(maxGoodPoints))); + logger.println(prefix+format("Average Precision: ",M)+ + fracFormat(nf.format(getAvp()))); + logger.println(prefix+format("Recall: ",M)+ + fracFormat(nf.format(getRecall()))); + for (int i=1; i<(int)numPoints && i0) { + m++; + avg.numGoodPoints += stats[i].numGoodPoints; + avg.numPoints += stats[i].numPoints; + avg.pReleventSum += stats[i].getAvp(); + avg.recall += stats[i].recall; + avg.maxGoodPoints += stats[i].maxGoodPoints; + for (int j=1; j0 : "Fishy: no \"good\" queries!"; + // take average: times go by all queries, other meassures go by "good" queries noly. + avg.searchTime /= stats.length; + avg.docNamesExtractTime /= stats.length; + avg.numGoodPoints /= m; + avg.numPoints /= m; + avg.recall /= m; + avg.maxGoodPoints /= m; + for (int j=1; j + +

Search Quality Benchmarking.

+

+This package allows benchmarking the search quality of a Lucene application. +

+In order to use this package you should provide: +

+

+For benchmarking TREC collections with TREC QRels, take a look at the +trec package. +

+Here is sample code used to run the TREC 2006 queries 701-850 on the .Gov2 collection: +

+    File topicsFile = new File("topics-701-850.txt");
+    File qrelsFile = new File("qrels-701-850.txt");
+    Searcher searcher = new IndexSearcher("index");
+
+    int maxResults = 1000;
+    String docNameField = "docname"; 
+    
+    PrintWriter logger = new PrintWriter(System.out,true); 
+
+    // use trec utilities to read trec topics into quality queries
+    TrecTopicsReader qReader = new TrecTopicsReader();
+    QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile)));
+    
+    // prepare judge, with trec utilities that read from a QRels file
+    Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile)));
+    
+    // validate topics & judgments match each other
+    judge.validateData(qqs, logger);
+    
+    // set the parsing of quality queries into Lucene queries.
+    QualityQueryParser qqParser = new SimpleQQParser("title", "body");
+    
+    // run the benchmark
+    QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
+    SubmissionReport submitLog = null;
+    QualityStats stats[] = qrun.execute(maxResults, judge, submitLog, logger);
+    
+    // print an average summary of the results
+    QualityStats avg = QualityStats.average(stats);
+    avg.log("SUMMARY",2,logger, "  ");
+
+ +

+Some immediate ways to modify this program to your needs are: +

+ + + + Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/package.html ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/package.html ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.quality.trec; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.StringTokenizer; + +import org.apache.lucene.benchmark.quality.Judge; +import org.apache.lucene.benchmark.quality.QualityQuery; + +/** + * Judge if given document is relevant to given quality query, based on Trec format for judgements. + */ +public class TrecJudge implements Judge { + + HashMap judgements; + + /** + * Constructor from a reader. + *

+ * Expected input format: + *

+   *     qnum  0   doc-name     is-relevant
+   * 
+ * Two sample lines: + *
 
+   *     19    0   doc303       1
+   *     19    0   doc7295      0
+   * 
+ * @param reader where judgments are read from. + * @throws IOException + */ + public TrecJudge (BufferedReader reader) throws IOException { + judgements = new HashMap(); + QRelJudgement curr = null; + String zero = "0"; + String line; + + try { + while (null!=(line=reader.readLine())) { + line = line.trim(); + if (line.length()==0 || '#'==line.charAt(0)) { + continue; + } + StringTokenizer st = new StringTokenizer(line); + String queryID = st.nextToken(); + st.nextToken(); + String docName = st.nextToken(); + boolean relevant = !zero.equals(st.nextToken()); + assert !st.hasMoreTokens() : "wrong format: "+line+" next: "+st.nextToken(); + if (relevant) { // only keep relevant docs + if (curr==null || !curr.queryID.equals(queryID)) { + curr = (QRelJudgement)judgements.get(queryID); + if (curr==null) { + curr = new QRelJudgement(queryID); + judgements.put(queryID,curr); + } + } + curr.addRelevandDoc(docName); + } + } + } finally { + reader.close(); + } + } + + // inherit javadocs + public boolean isRelevant(String docName, QualityQuery query) { + QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); + return qrj!=null && qrj.isRelevant(docName); + } + + /** single Judgement of a trec quality query */ + private static class QRelJudgement { + private String queryID; + private HashMap relevantDocs; + + QRelJudgement(String queryID) { + this.queryID = queryID; + relevantDocs = new HashMap(); + } + + public void addRelevandDoc(String docName) { + relevantDocs.put(docName,docName); + } + + boolean isRelevant(String docName) { + return relevantDocs.containsKey(docName); + } + + public int maxRecall() { + return relevantDocs.size(); + } + } + + // inherit javadocs + public boolean validateData(QualityQuery[] qq, PrintWriter logger) { + HashMap missingQueries = (HashMap) judgements.clone(); + ArrayList missingJudgements = new ArrayList(); + for (int i=0; i0) { + isValid = false; + if (logger!=null) { + logger.println("WARNING: "+missingJudgements.size()+" 
queries have no judgments! - "); + for (int i=0; i0) { + isValid = false; + if (logger!=null) { + logger.println("WARNING: "+missingQueries.size()+" judgments match no query! - "); + for (Iterator it = missingQueries.keySet().iterator(); it.hasNext();) { + String id = (String) it.next(); + logger.println(" "+id); + } + } + } + return isValid; + } + + // inherit javadocs + public int maxRecall(QualityQuery query) { + QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); + if (qrj!=null) { + return qrj.maxRecall(); + } + return 0; + } +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,123 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.trec; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import org.apache.lucene.benchmark.quality.QualityQuery; + +/** + * Read TREC topics. + *

+ * Expects this topic format - + *

+ *   <top>
+ *   <num> Number: nnn
+ *     
+ *   <title> title of the topic
+ *     
+ *   <desc> Description:
+ *   description of the topic
+ *     
+ *   <narr> Narrative:
+ *   "story" composed by assessors.
+ *    
+ *   </top>
+ * 
+ * Comment lines starting with '#' are ignored. + */ +public class TrecTopicsReader { + + private static final String newline = System.getProperty("line.separator"); + + /** + * Constructor for Trec's TopicsReader + */ + public TrecTopicsReader() { + super(); + } + + /** + * Read quality queries from trec format topics file. + * @param reader where queries are read from. + * @return the result quality queries. + * @throws IOException if cannot read the queries. + */ + public QualityQuery[] readQueries(BufferedReader reader) throws IOException { + ArrayList res = new ArrayList(); + StringBuffer sb; + try { + while (null!=(sb=read(reader,"",null,false,false))) { + HashMap fields = new HashMap(); + // id + sb = read(reader,"",null,true,false); + int k = sb.indexOf(":"); + String id = sb.substring(k+1).trim(); + // title + sb = read(reader,"",null,true,false); + k = sb.indexOf(">"); + String title = sb.substring(k+1).trim(); + // description + sb = read(reader,"<desc>",null,false,false); + sb = read(reader,"<narr>",null,false,true); + String descripion = sb.toString().trim(); + // we got a topic! + fields.put("title",title); + fields.put("description",descripion); + QualityQuery topic = new QualityQuery(id,fields); + res.add(topic); + // skip narrative, get to end of doc + read(reader,"</top>",null,false,false); + } + } finally { + reader.close(); + } + // sort result array (by ID) + QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]); + Arrays.sort(qq); + return qq; + } + + // read until finding a line that starts with the specified prefix + private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException { + sb = (sb==null ? 
new StringBuffer() : sb); + String sep = ""; + while (true) { + String line = reader.readLine(); + if (line==null) { + return null; + } + if (line.startsWith(prefix)) { + if (collectMatchLine) { + sb.append(sep+line); + sep = newline; + } + break; + } + if (collectAll) { + sb.append(sep+line); + sep = newline; + } + } + //System.out.println("read: "+sb); + return sb; + } +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html Fri Jul 27 13:24:52 2007 @@ -0,0 +1,6 @@ +<html> +<body> +Utilities for Trec related quality benchmarking, feeding from Trec Topics and QRels inputs. 
+</body> + +</html> Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.IOException; + +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.search.Searcher; + +/** + * Utility: extract doc names from an index + */ +public class DocNameExtractor { + + private FieldSelector fldSel; + private String docNameField; + + /** + * Constructor for DocNameExtractor. + * @param docNameField name of the stored field containing the doc name. + */ + public DocNameExtractor (final String docNameField) { + this.docNameField = docNameField; + fldSel = new FieldSelector() { + public FieldSelectorResult accept(String fieldName) { + return fieldName.equals(docNameField) ? + FieldSelectorResult.LOAD_AND_BREAK : + FieldSelectorResult.NO_LOAD; + } + }; + } + + /** + * Extract the name of the input doc from the index. + * @param searcher access to the index. + * @param docid ID of doc whose name is needed. + * @return the name of the input doc as extracted from the index. + * @throws IOException if cannot extract the doc name from the index. 
+ */ + public String docName(Searcher searcher, int docid) throws IOException { + return searcher.doc(docid,fldSel).get(docNameField); + } + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.File; +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.PriorityQueue; + +/** + * Suggest Quality queries based on an index contents. + * Utility class, used for making quality test benchmarks. + */ +public class QualityQueriesFinder { + + private static final String newline = System.getProperty("line.separator"); + private Directory dir; + + /** + * Constrctor over a directory containing the index. + * @param dir directory containing the index we search for the quality test. + */ + private QualityQueriesFinder(Directory dir) { + this.dir = dir; + } + + /** + * @param args {index-dir} + * @throws IOException if cannot access the index. 
+ */ + public static void main(String[] args) throws IOException { + if (args.length<1) { + System.err.println("Usage: java QualityQueriesFinder <index-dir>"); + System.exit(1); + } + QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory.getDirectory(new File(args[0]))); + String q[] = qqf.bestQueries("body",20); + for (int i=0; i<q.length; i++) { + System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null)); + } + } + + private String [] bestQueries(String field,int numQueries) throws IOException { + String words[] = bestTerms("body",4*numQueries); + int n = words.length; + int m = n/4; + String res[] = new String[m]; + for (int i=0; i<res.length; i++) { + res[i] = words[i] + " " + words[m+i]+ " " + words[n-1-m-i] + " " + words[n-1-i]; + //System.out.println("query["+i+"]: "+res[i]); + } + return res; + } + + private static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) { + return + "<top>" + newline + + "<num> Number: " + qnum + newline + newline + + "<title> " + (title==null?"":title) + newline + newline + + "<desc> Description:" + newline + + (description==null?"":description) + newline + newline + + "<narr> Narrative:" + newline + + (narrative==null?"":narrative) + newline + newline + + "</top>"; + } + + private String [] bestTerms(String field,int numTerms) throws IOException { + PriorityQueue pq = new TermsDfQueue(numTerms); + IndexReader ir = IndexReader.open(dir); + try { + int threshold = ir.maxDoc() / 10; // ignore words too common. + TermEnum terms = ir.terms(new Term(field,"")); + while (terms.next()) { + if (!field.equals(terms.term().field())) { + break; + } + int df = terms.docFreq(); + if (df<threshold) { + String ttxt = terms.term().text(); + pq.insert(new TermDf(ttxt,df)); + } + } + } finally { + ir.close(); + } + String res[] = new String[pq.size()]; + int i = 0; + while (pq.size()>0) { + TermDf tdf = (TermDf) pq.pop(); + res[i++] = tdf.word; + System.out.println(i+". 
word: "+tdf.df+" "+tdf.word); + } + return res; + } + + private static class TermDf { + String word; + int df; + TermDf (String word, int freq) { + this.word = word; + this.df = freq; + } + } + + private static class TermsDfQueue extends PriorityQueue { + TermsDfQueue (int maxSize) { + initialize(maxSize); + } + protected boolean lessThan(Object a, Object b) { + TermDf tf1 = (TermDf) a; + TermDf tf2 = (TermDf) b; + return tf1.df < tf2.df; + } + } + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.utils; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.benchmark.quality.QualityQueryParser; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Query; + +/** + * Simplistic quality query parser. A Lucene query is created by passing + * the value of the specified QualityQuery name-value pair into + * a Lucene's QueryParser using StandardAnalyzer. */ +public class SimpleQQParser implements QualityQueryParser { + + private String qqName; + private String indexField; + ThreadLocal queryParser = new ThreadLocal(); + + /** + * Constructor of a simple qq parser. 
+ * @param qqName name-value pair of quality query to use for creating the query + * @param indexField corresponding index field + */ + public SimpleQQParser(String qqName, String indexField) { + this.qqName = qqName; + this.indexField = indexField; + } + + /* (non-Javadoc) + * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery) + */ + public Query parse(QualityQuery qq) throws ParseException { + QueryParser qp = (QueryParser) queryParser.get(); + if (qp==null) { + qp = new QueryParser(indexField, new StandardAnalyzer()); + queryParser.set(qp); + } + return qp.parse(qq.getValue(qqName)); + } + +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java ------------------------------------------------------------------------------ svn:executable = * Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.IOException; +import java.io.PrintWriter; +import java.text.NumberFormat; + +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TopDocs; + +/** + * Create a log ready for submission. + * Extend this class and override + * {@link #report(QualityQuery, TopDocs, String, Searcher)} + * to create different reports. + */ +public class SubmissionReport { + + private NumberFormat nf; + private PrintWriter logger; + + /** + * Constructor for SubmissionReport. + * @param logger if null, no submission data is created. + */ + public SubmissionReport (PrintWriter logger) { + this.logger = logger; + nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(4); + nf.setMinimumFractionDigits(4); + } + + /** + * Report a search result for a certain quality query. + * @param qq quality query for which the results are reported. + * @param td search results for the query. + * @param docNameField stored field used for fetching the result doc name. + * @param searcher index access for fetching doc name. + * @throws IOException in case of a problem. 
+ */ + public void report(QualityQuery qq, TopDocs td, String docNameField, Searcher searcher) throws IOException { + if (logger==null) { + return; + } + ScoreDoc sd[] = td.scoreDocs; + String sep = " \t "; + DocNameExtractor xt = new DocNameExtractor(docNameField); + for (int i=0; i<sd.length; i++) { + String docName = xt.docName(searcher,sd[i].doc); + logger.println( + qq.getQueryID() + sep + + '0' + sep + + format(docName,20) + sep + + format(""+i,7) + sep + + nf.format(sd[i].score) + ); + } + } + + private static String padd = " "; + private String format(String s, int minLen) { + s = (s==null ? "" : s); + int n = Math.max(minLen,s.length()); + return (s+padd).substring(0,n); + } +} Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html (added) +++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html Fri Jul 27 13:24:52 2007 @@ -0,0 +1,6 @@ +<html> +<body> +Miscellaneous utilities for search quality benchmarking: query parsing, submission reports. 
+</body> + +</html> Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html ------------------------------------------------------------------------------ svn:eol-style = native Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html ------------------------------------------------------------------------------ svn:executable = * Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?view=diff&rev=560372&r1=560371&r2=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original) +++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Fri Jul 27 13:24:52 2007 @@ -23,6 +23,9 @@ import java.io.BufferedReader; import org.apache.lucene.benchmark.byTask.Benchmark; +import org.apache.lucene.benchmark.byTask.feeds.DocData; +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; +import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; @@ -135,8 +138,8 @@ // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", - "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", - "doc.add.log.step=2697", + "doc.maker="+Reuters20DocMaker.class.getName(), + "doc.add.log.step=3", "doc.term.vector=false", "doc.maker.forever=false", "directory=FSDirectory", @@ -153,7 +156,7 @@ // 3. 
test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); - int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton. + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } @@ -221,7 +224,7 @@ } // create the benchmark and execute it. - private Benchmark execBenchmark(String[] algLines) throws Exception { + public static Benchmark execBenchmark(String[] algLines) throws Exception { String algText = algLinesToText(algLines); logTstLogic(algText); Benchmark benchmark = new Benchmark(new StringReader(algText)); @@ -230,7 +233,7 @@ } // catenate alg lines to make the alg text - private String algLinesToText(String[] algLines) { + private static String algLinesToText(String[] algLines) { String indent = " "; StringBuffer sb = new StringBuffer(); for (int i = 0; i < propLines.length; i++) { @@ -242,11 +245,22 @@ return sb.toString(); } - private void logTstLogic (String txt) { + private static void logTstLogic (String txt) { if (!DEBUG) return; System.out.println("Test logic of:"); System.out.println(txt); } + /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... 
*/ + public static class Reuters20DocMaker extends ReutersDocMaker { + private int nDocs=0; + protected DocData getNextDocData() throws Exception { + if (nDocs>=20 && !forever) { + throw new NoMoreDataException(); + } + nDocs++; + return super.getNextDocData(); + } + } } Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java?view=auto&rev=560372 ============================================================================== --- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (added) +++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java Fri Jul 27 13:24:52 2007 @@ -0,0 +1,174 @@ +package org.apache.lucene.benchmark.quality; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.PrintWriter; + +import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic; +import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; +import org.apache.lucene.benchmark.quality.Judge; +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.benchmark.quality.QualityQueryParser; +import org.apache.lucene.benchmark.quality.QualityBenchmark; +import org.apache.lucene.benchmark.quality.trec.TrecJudge; +import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader; +import org.apache.lucene.benchmark.quality.utils.SimpleQQParser; +import org.apache.lucene.benchmark.quality.utils.SubmissionReport; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.FSDirectory; + +import junit.framework.TestCase; + +/** + * Test that quality run does its job. + */ +public class TestQualityRun extends TestCase { + + private static boolean DEBUG = Boolean.getBoolean("tests.verbose"); + + /** + * @param arg0 + */ + public TestQualityRun(String name) { + super(name); + } + + public void testTrecQuality() throws Exception { + // first create the complete reuters index + createReutersIndex(); + + File workDir = new File(System.getProperty("benchmark.work.dir","work")); + assertTrue("Bad workDir: "+workDir, workDir.exists()&& workDir.isDirectory()); + + int maxResults = 1000; + String docNameField = "docid"; + + PrintWriter logger = DEBUG ? 
new PrintWriter(System.out,true) : null; + + // <tests src dir> for topics/qrels files - src/test/org/apache/lucene/benchmark/quality + File srcTestDir = new File(new File(new File(new File(new File( + new File(new File(workDir.getAbsoluteFile().getParentFile(), + "src"),"test"),"org"),"apache"),"lucene"),"benchmark"),"quality"); + + // prepare topics + File topicsFile = new File(srcTestDir, "trecTopics.txt"); + assertTrue("Bad topicsFile: "+topicsFile, topicsFile.exists()&& topicsFile.isFile()); + TrecTopicsReader qReader = new TrecTopicsReader(); + QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); + + // prepare judge + File qrelsFile = new File(srcTestDir, "trecQRels.txt"); + assertTrue("Bad qrelsFile: "+qrelsFile, qrelsFile.exists()&& qrelsFile.isFile()); + Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); + + // validate topics & judgments match each other + judge.validateData(qqs, logger); + + IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(new File(workDir,"index"))); + + QualityQueryParser qqParser = new SimpleQQParser("title","body"); + QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField); + + SubmissionReport submitLog = DEBUG ? new SubmissionReport(logger) : null; + QualityStats stats[] = qrun.execute(maxResults, judge, submitLog, logger); + + // --------- verify by the way judgments were altered for this test: + // for some queries, depending on m = qnum % 8 + // m==0: avg_precision and recall are hurt, by marking fake docs as relevant + // m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs + // m==2: all precision, precision_at_n and recall are hurt. 
+ // m>=3: these queries remain perfect + for (int i = 0; i < stats.length; i++) { + QualityStats s = stats[i]; + switch (i%8) { + + case 0: + assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp()); + assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); + } + break; + + case 1: + assertTrue("avg-p should be hurt", 1.0 > s.getAvp()); + assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); + } + break; + + case 2: + assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp()); + assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); + } + break; + + default: { + assertEquals("avg-p should be perfect: "+s.getAvp(), 1.0, s.getAvp(), 1E-9); + assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); + } + } + + } + } + + QualityStats avg = QualityStats.average(stats); + if (logger!=null) { + avg.log("Average statistis:",1,logger," "); + } + + assertTrue("mean avg-p should be hurt: "+avg.getAvp(), 1.0 > avg.getAvp()); + assertTrue("avg recall should be hurt: "+avg.getRecall(), 1.0 > avg.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("avg p_at_"+j+" should be hurt: "+avg.getPrecisionAt(j), 1.0 > avg.getPrecisionAt(j)); + } + + + } + + // use benchmark logic to create the full Reuters index + private void createReutersIndex() throws 
Exception { + // 1. alg definition + String algLines[] = { + "# ----- properties ", + "doc.maker="+ReutersDocMaker.class.getName(), + "doc.add.log.step=2500", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=FSDirectory", + "doc.stored=true", + "doc.tokenized=true", + "# ----- alg ", + "ResetSystemErase", + "CreateIndex", + "{ AddDoc } : *", + "CloseIndex", + }; + + // 2. execute the algorithm (required in every "logic" test) + TestPerfTasksLogic.execBenchmark(algLines); + } +} Propchange: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java ------------------------------------------------------------------------------ svn:eol-style = native