lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r733697 - in /lucene/java/trunk/contrib/benchmark/src: java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java test/org/apache/lucene/benchmark/byTask/feeds/ test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
Date Mon, 12 Jan 2009 11:37:24 GMT
Author: mikemccand
Date: Mon Jan 12 03:37:23 2009
New Revision: 733697

URL: http://svn.apache.org/viewvc?rev=733697&view=rev
Log:
LUCENE-1479: if the date is missing, don't skip the doc; just don't add the 'docdate' field to the doc

Added:
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
  (with props)
Modified:
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java?rev=733697&r1=733696&r2=733697&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java Mon Jan 12 03:37:23 2009
@@ -33,7 +33,6 @@
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 
-
 /**
  * A DocMaker using the (compressed) Trec collection for its input.
  * <p>
@@ -44,7 +43,14 @@
  */
 public class TrecDocMaker extends BasicDocMaker {
 
-  private static final String newline = System.getProperty("line.separator");
+  private static final String DATE = "Date: ";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERM_DOCHDR = "</DOCHDR>";
+  private static final String TERM_DOCNO = "</DOCNO>";
+  private static final String DOCNO = "<DOCNO>";
+  private static final String TERM_DOC = "</DOC>";
+  private static final String DOC = "<DOC>";
+  private static final String NEW_LINE = System.getProperty("line.separator");
   
   protected ThreadLocal dateFormat = new ThreadLocal();
   protected File dataDir = null;
@@ -135,25 +141,37 @@
   }
   
   // read until finding a line that starts with the specified prefix
-  protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception {
+  protected StringBuffer read(String prefix, StringBuffer sb,
+                              boolean collectMatchLine, boolean collectAll,
+                              String terminatingTag) throws Exception {
     sb = (sb==null ? new StringBuffer() : sb);
     String sep = "";
     while (true) {
       String line = reader.readLine();
-      if (line==null) {
+      if (line == null) {
         openNextFile();
         continue;
       }
       if (line.startsWith(prefix)) {
         if (collectMatchLine) {
-          sb.append(sep+line);
-          sep = newline;
+          sb.append(sep).append(line);
+          sep = NEW_LINE;
         }
         break;
       }
+      
+      if (terminatingTag != null && line.startsWith(terminatingTag)) {
+    	  // didn't find the prefix that was asked, but the terminating
+    	  // tag was found. set the length to 0 to signal no match was
+    	  // found.
+    	  sb.setLength(0);
+    	  break;
+      }
+		
+
       if (collectAll) {
-        sb.append(sep+line);
-        sep = newline;
+        sb.append(sep).append(line);
+        sep = NEW_LINE;
       }
     }
     //System.out.println("read: "+sb);
@@ -165,22 +183,31 @@
       openNextFile();
     }
     // 1. skip until doc start
-    read("<DOC>",null,false,false); 
+    read(DOC,null,false,false,null); 
     // 2. name
-    StringBuffer sb = read("<DOCNO>",null,true,false);
-    String name = sb.substring("<DOCNO>".length());
-    name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration;
+    StringBuffer sb = read(DOCNO,null,true,false,null);
+    String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length()));
+    name = name + "_" + iteration;
     // 3. skip until doc header
-    read("<DOCHDR>",null,false,false); 
+    read(DOCHDR,null,false,false,null);
+    boolean findTerminatingDocHdr = false;
     // 4. date
-    sb = read("Date: ",null,true,false);
-    String dateStr = sb.substring("Date: ".length());
+    sb = read(DATE,null,true,false,TERM_DOCHDR);
+    String dateStr = null;
+    if (sb.length() != 0) {
+      // Date found.
+      dateStr = sb.substring(DATE.length());
+      findTerminatingDocHdr = true;
+    }
+
     // 5. skip until end of doc header
-    read("</DOCHDR>",null,false,false); 
+    if (findTerminatingDocHdr) {
+      read(TERM_DOCHDR,null,false,false,null); 
+    }
     // 6. collect until end of doc
-    sb = read("</DOC>",null,false,true);
+    sb = read(TERM_DOC,null,false,true,null);
     // this is the next document, so parse it 
-    Date date = parseDate(dateStr);
+    Date date = dateStr != null ? parseDate(dateStr) : null;
     HTMLParser p = getHtmlParser();
     DocData docData = p.parse(name, date, sb, getDateFormat(0));
     addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). 
@@ -202,18 +229,14 @@
   }
 
   protected Date parseDate(String dateStr) {
-    Date date = null;
-    for (int i=0; i<DATE_FORMATS.length; i++) {
+    for (int i = 0; i < DATE_FORMATS.length; i++) {
       try {
-        date = getDateFormat(i).parse(dateStr.trim());
-        return date;
-      } catch (ParseException e) {
-      }
+        return getDateFormat(i).parse(dateStr.trim());
+      } catch (ParseException e) {}
     }
     // do not fail test just because a date could not be parsed
-    System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
-    date = new Date(); // now 
-    return date;
+    System.out.println("ignoring date parse exception (assigning 'null') for: "+dateStr);
+    return null;
   }
 
 

Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java?rev=733697&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java Mon Jan 12 03:37:23 2009
@@ -0,0 +1,321 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.util.Date;
+
+import junit.framework.TestCase;
+
+public class TrecDocMakerTest extends TestCase {
+
+  /** A TrecDocMaker which works on a String and not files. */
+  private static class StringableTrecDocMaker extends TrecDocMaker {
+  
+    private String docs = null;
+    
+    public StringableTrecDocMaker(String docs, boolean forever) {
+      this.docs = docs;
+      this.forever = forever;
+    }
+    
+    protected void openNextFile() throws NoMoreDataException, Exception {
+      if (reader != null) {
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        ++iteration;
+      }
+      
+      reader = new BufferedReader(new StringReader(docs));
+    }
+    
+  }
+  
+  private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) {
+    assertNotNull(dd);
+    assertEquals(expName, dd.getName());
+    assertEquals(expTitle, dd.getTitle());
+    assertTrue(dd.getBody().indexOf(expBody) != -1);
+    assertEquals(expDate, dd.getDate());
+  }
+  
+  private void assertNoMoreDataException(StringableTrecDocMaker stdm) throws Exception {
+    boolean thrown = false;
+    try {
+      stdm.getNextDocData();
+    } catch (NoMoreDataException e) {
+      thrown = true;
+    }
+    assertTrue("Expecting NoMoreDataException", thrown);
+  }
+  
+  public void testOneDocument() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+    stdm.setHTMLParser(new DemoHTMLParser());
+    
+    DocData dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    assertNoMoreDataException(stdm);
+  }
+  
+  public void testTwoDocuments() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+    stdm.setHTMLParser(new DemoHTMLParser());
+    
+    DocData dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(stdm);
+  }
+
+  // If a Date: attribute is missing, make sure the document is not skipped, but
+  // rather that a null Date is assigned.
+  public void testMissingDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+    stdm.setHTMLParser(new DemoHTMLParser());
+
+    DocData dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+    
+    dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(stdm);
+  }
+
+  // When a 'bad date' is input (unparsable date), make sure the DocData date is
+  // assigned null.
+  public void testBadDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Bad Date\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
+    stdm.setHTMLParser(new DemoHTMLParser());
+
+    DocData dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+    
+    assertNoMoreDataException(stdm);
+  }
+
+  public void testForever() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
+    stdm.setHTMLParser(new DemoHTMLParser());
+
+    DocData dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    // same document, but the second iteration changes the name.
+    dd = stdm.getNextDocData();
+    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+    // Don't test that NoMoreDataException is thrown, since the forever flag is
+    // turned on.
+  }
+
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message