lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r835677 - in /lucene/java/trunk/contrib/benchmark: CHANGES.txt src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
Date Fri, 13 Nov 2009 00:47:16 GMT
Author: rmuir
Date: Fri Nov 13 00:47:15 2009
New Revision: 835677

URL: http://svn.apache.org/viewvc?rev=835677&view=rev
Log:
LUCENE-2059: allow TrecContentSource not to change the docname

Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Nov 13 00:47:15 2009
@@ -5,6 +5,13 @@
 $Id:$
 
 11/12/2009
+  LUCENE-2059: allow TrecContentSource not to change the docname.
+  Previously, it would always append the iteration # to the docname.
+  With the new option content.source.excludeIteration, you can disable this.
+  The resulting index can then be used with the quality package to measure
+  relevance. (Robert Muir)
+  
+11/12/2009
   LUCENE-2058: specify trec_eval submission output from the command line.
   Previously, 4 arguments were required, but the third was unused. The 
   third argument is now the desired location of submission.txt  (Robert Muir)

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Fri Nov 13 00:47:15 2009
@@ -48,6 +48,7 @@
  * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
  * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
  * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
+ * <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname
  * </ul>
  */
 public class TrecContentSource extends ContentSource {
@@ -91,6 +92,7 @@
   BufferedReader reader;
   int iteration = 0;
   HTMLParser htmlParser;
+  private boolean excludeDocnameIteration;
   
   private DateFormatInfo getDateFormatInfo() {
     DateFormatInfo dfi = dateFormats.get();
@@ -256,7 +258,8 @@
       read(docBuf, DOCNO, true, false, null);
       name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
           DOCNO.length()));
-      name = name + "_" + iteration;
+      if (!excludeDocnameIteration)
+        name = name + "_" + iteration;
 
       // 3. skip until doc header
       docBuf.setLength(0);
@@ -342,6 +345,7 @@
     if (encoding == null) {
       encoding = "ISO-8859-1";
     }
+    excludeDocnameIteration = config.get("content.source.excludeIteration", false);
   }
 
 }



Mime
View raw message