lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r786233 [2/3] - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/programmatic/ src/java/org/apache/...
Date Thu, 18 Jun 2009 19:59:01 GMT
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java Thu Jun 18 19:58:59 2009
@@ -17,288 +17,54 @@
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Map;
-
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
-import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-import org.xml.sax.helpers.XMLReaderFactory;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
 
 /**
- * A {@link LineDocMaker} which reads the english wikipedia
- * dump.  You can read the .bz2 file directly (it will be
- * decompressed on the fly).
- * Config properties:
- * <ul>
- * <li>keep.image.only.docs=false|true
- * <li>[those available in {@link LineDocMaker}]
- * </ul>
- * 
- * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
+ * A {@link DocMaker} which reads the English Wikipedia dump. Uses
+ * {@link EnwikiContentSource} as its content source, regardless if a different
+ * content source was defined in the configuration.
  */
-public class EnwikiDocMaker extends LineDocMaker {
-  
-  private static final Map ELEMENTS = new HashMap();
-  
-  static final int TITLE = 0;
-  static final int DATE = TITLE + 1;
-  static final int BODY = DATE + 1;
-  static final int ID = BODY + 1;
-  static final int LENGTH = ID + 1;
-  // LENGTH is used as the size of the tuple, so whatever constants we need that
-  // should not be part of the tuple, we should define them after LENGTH.
-  static final int PAGE = LENGTH + 1;
+public class EnwikiDocMaker extends DocMaker {
   
-  static final String[] months = {"JAN", "FEB", "MAR", "APR",
-                                  "MAY", "JUN", "JUL", "AUG",
-                                  "SEP", "OCT", "NOV", "DEC"};
-
-  static {
-    ELEMENTS.put("page", new Integer(PAGE));
-    ELEMENTS.put("text", new Integer(BODY));
-    ELEMENTS.put("timestamp", new Integer(DATE));
-    ELEMENTS.put("title", new Integer(TITLE));
-    ELEMENTS.put("id", new Integer(ID));
+  public Document makeDocument() throws Exception {
+    DocState ds = reuseFields ? getDocState() : localDocState;
+    DocData dd = source.getNextDocData(ds.docData);
+    Document doc = reuseFields ? ds.doc : new Document();
+    doc.getFields().clear();
+
+    Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    body.setValue(dd.getBody());
+    doc.add(body);
+    
+    Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    title.setValue(dd.getTitle());
+    doc.add(title);
+    
+    Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    date.setValue(dd.getDate());
+    doc.add(date);
+    
+    Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
+    id.setValue(dd.getName());
+    doc.add(id);
+    
+    return doc;
   }
-  
-  /**
-   * Returns the type of the element if defined, otherwise returns -1. This
-   * method is useful in startElement and endElement, by not needing to compare
-   * the element qualified name over and over.
-   */
-  private final static int getElementType(String elem) {
-    Integer val = (Integer) ELEMENTS.get(elem);
-    return val == null ? -1 : val.intValue();
+
+  public Document makeDocument(int size) throws Exception {
+    throw new RuntimeException("cannot change document size with EnwikiDocMaker");
   }
-  
-  protected boolean keepImages = true;
 
   public void setConfig(Config config) {
     super.setConfig(config);
-    keepImages = config.get("keep.image.only.docs", true);
+    // Override whatever content source was set in the config
+    source = new EnwikiContentSource();
+    source.setConfig(config);
   }
-
-  class Parser extends DefaultHandler implements Runnable {
-    Thread t;
-    boolean threadDone;
-
-    public void run() {
-
-      try {
-        XMLReader reader =
-          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
-        reader.setContentHandler(this);
-        reader.setErrorHandler(this);
-        while(true){
-          final InputStream localFileIS = fileIS;
-          try {
-            InputSource is = new InputSource(localFileIS);
-            reader.parse(is);
-          } catch (IOException ioe) {
-            synchronized(EnwikiDocMaker.this) {
-              if (localFileIS != fileIS) {
-                // fileIS was closed on us, so, just fall
-                // through
-              } else
-                // Exception is real
-                throw ioe;
-            }
-          }
-          synchronized(this) {
-            if (!forever) {
-              nmde = new NoMoreDataException();
-              notify();
-              return;
-            } else if (localFileIS == fileIS) {
-              // If file is not already re-opened then
-              // re-open it now
-              openFile();
-            }
-          }
-        }
-      } catch (SAXException sae) {
-        throw new RuntimeException(sae);
-      } catch (IOException ioe) {
-        throw new RuntimeException(ioe);
-      } finally {
-        synchronized(this) {
-          threadDone = true;
-          notify();
-        }
-      }
-    }
-
-    String[] tuple;
-    NoMoreDataException nmde;
-
-    String[] next() throws NoMoreDataException {
-      if (t == null) {
-        threadDone = false;
-        t = new Thread(this);
-        t.setDaemon(true);
-        t.start();
-      }
-      String[] result;
-      synchronized(this){
-        while(tuple == null && nmde == null && !threadDone) {
-          try {
-            wait();
-          } catch (InterruptedException ie) {
-          }
-        }
-        if (nmde != null) {
-          // Set to null so we will re-start thread in case
-          // we are re-used:
-          t = null;
-          throw nmde;
-        }
-        if (t != null && threadDone) {
-          // The thread has exited yet did not hit end of
-          // data, so this means it hit an exception.  We
-          // throw NoMorDataException here to force
-          // benchmark to stop the current alg:
-          throw new NoMoreDataException();
-        }
-        result = tuple;
-        tuple = null;
-        notify();
-      }
-      return result;
-    }
-
-    StringBuffer contents = new StringBuffer();
-
-    public void characters(char[] ch, int start, int length) {
-      contents.append(ch, start, length);
-    }
-
-    String title;
-    String body;
-    String time;
-    String id;
-
-    public void startElement(String namespace,
-                             String simple,
-                             String qualified,
-                             Attributes attributes) {
-      int elemType = getElementType(qualified);
-      switch (elemType) {
-        case PAGE:
-          title = null;
-          body = null;
-          time = null;
-          id = null;
-          break;
-        // intentional fall-through.
-        case BODY:
-        case DATE:
-        case TITLE:
-        case ID:
-          contents.setLength(0);
-          break;
-        default:
-          // this element should be discarded.
-      }
-    }
-
-    String time(String original) {
-      StringBuffer buffer = new StringBuffer();
-
-      buffer.append(original.substring(8, 10));
-      buffer.append('-');
-      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
-      buffer.append('-');
-      buffer.append(original.substring(0, 4));
-      buffer.append(' ');
-      buffer.append(original.substring(11, 19));
-      buffer.append(".000");
-
-      return buffer.toString();
-    }
-
-    public void create(String title, String time, String body, String id) {
-      String[] t = new String[LENGTH];
-      t[TITLE] = title.replace('\t', ' ');
-      t[DATE] = time.replace('\t', ' ');
-      t[BODY] = body.replaceAll("[\t\n]", " ");
-      t[ID] = id;
-      synchronized(this) {
-        while(tuple!=null) {
-          try {
-            wait();
-          } catch (InterruptedException ie) {
-          }
-        }
-        tuple = t;
-        notify();
-      }
-    }
-
-    public void endElement(String namespace, String simple, String qualified)
-      throws SAXException {
-      int elemType = getElementType(qualified);
-      switch (elemType) {
-        case PAGE:
-          // the body must be null and we either are keeping image docs or the
-          // title does not start with Image:
-          if (body != null && (keepImages || !title.startsWith("Image:"))) {
-            create(title, time, body, id);
-          }
-          break;
-        case BODY:
-          body = contents.toString();
-          //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
-          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
-          if (startsWith.startsWith("#redirect")) {
-            body = null;
-          }
-          break;
-        case DATE:
-          time = time(contents.toString());
-          break;
-        case TITLE:
-          title = contents.toString();
-          break;
-        case ID:
-          id = contents.toString();
-          break;
-        default:
-          // this element should be discarded.
-      }
-    }
-  }
-
-  Parser parser = new Parser();
-
-  class DocState extends LineDocMaker.DocState {
-    public Document setFields(String[] tuple) {
-      titleField.setValue(tuple[TITLE]);
-      dateField.setValue(tuple[DATE]);
-      bodyField.setValue(tuple[BODY]);
-      idField.setValue(tuple[ID]);
-      return doc;
-    }
-  }
-
-  private DocState getDocState() {
-    DocState ds = (DocState) docState.get();
-    if (ds == null) {
-      ds = new DocState();
-      docState.set(ds);
-    }
-    return ds;
-  }
-
-  public Document makeDocument() throws Exception {
-    String[] tuple = parser.next();
-    return getDocState().setFields(tuple);
-  }
-
+  
 }
\ No newline at end of file

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java Thu Jun 18 19:58:59 2009
@@ -46,7 +46,7 @@
 
     Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer",
             "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
-    String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD);
+    String defaultField = config.get("file.query.maker.default.field", DocMaker.BODY_FIELD);
     QueryParser qp = new QueryParser(defaultField, anlzr);
 
     List qq = new ArrayList();
@@ -55,8 +55,7 @@
     {
       File file = new File(fileName);
       Reader reader = null;
-      if (file != null && file.exists())
-      {
+      if (file.exists()) {
         reader = new FileReader(file);
       } else {
         //see if we can find it as a resource

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Thu Jun 18 19:58:59 2009
@@ -39,13 +39,13 @@
    * @throws IOException
    * @throws InterruptedException
    */
-  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
   
   /**
    * Parse the inputText and return DocData. 
    * @param inputText the html text to parse.
    * @see #parse(String, Date, Reader, DateFormat)
    */
-  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;
+  public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;
 
 }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java Thu Jun 18 19:58:59 2009
@@ -17,246 +17,76 @@
  * limitations under the License.
  */
 
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.util.Random;
 
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
 
 /**
  * A DocMaker reading one line at a time as a Document from a single file. This
- * saves IO cost (over DirDocMaker) of recursing through a directory and opening
- * a new file for every document. It also re-uses its Document and Field
+ * saves IO cost (over DirContentSource) of recursing through a directory and
+ * opening a new file for every document. It also re-uses its Document and Field
  * instance to improve indexing speed.<br>
  * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
  * <i>title, date, body</i>. If a line is read in a different format, a
  * {@link RuntimeException} will be thrown. In general, you should use this doc
- * maker with files that were created with {@link WriteLineDocTask}.<br><br>
- * 
+ * maker with files that were created with {@link WriteLineDocTask}.<br>
+ * <br>
  * Config properties:
  * <ul>
- * <li>docs.file=&lt;path to the file&gt;
- * <li>doc.reuse.fields=true|false (default true)
- * <li>bzip.compression=true|false (default false)
  * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
  * 0..N; this is useful with UpdateDoc to test updating random documents; if
  * this is unspecified or -1, then docid is sequentially assigned
  * </ul>
  */
-public class LineDocMaker extends BasicDocMaker {
+public class LineDocMaker extends DocMaker {
 
-  InputStream fileIS;
-  BufferedReader fileIn;
-  ThreadLocal docState = new ThreadLocal();
-  private String fileName;
-
-  private static int READER_BUFFER_BYTES = 64*1024;
-  private final DocState localDocState = new DocState();
-
-  private boolean doReuseFields = true;
-  private boolean bzipCompressionEnabled = false;
   private Random r;
   private int numDocs;
-  
-  private CompressorStreamFactory csFactory = new CompressorStreamFactory();
-  
-  class DocState {
-    Document doc;
-    Field bodyField;
-    Field titleField;
-    Field dateField;
-    Field idField;
-
-    public DocState() {
-
-      bodyField = new Field(BasicDocMaker.BODY_FIELD,
-                            "",
-                            storeVal,
-                            Field.Index.ANALYZED,
-                            termVecVal);
-      titleField = new Field(BasicDocMaker.TITLE_FIELD,
-                             "",
-                             storeVal,
-                             Field.Index.ANALYZED,
-                             termVecVal);
-      dateField = new Field(BasicDocMaker.DATE_FIELD,
-                            "",
-                            storeVal,
-                            Field.Index.ANALYZED,
-                            termVecVal);
-      idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
-
-      doc = new Document();
-      doc.add(bodyField);
-      doc.add(titleField);
-      doc.add(dateField);
-      doc.add(idField);
-    }
-
-    final static char SEP = WriteLineDocTask.SEP;
-
-    private int numDocsCreated;
-    private synchronized int incrNumDocsCreated() {
-      return numDocsCreated++;
-    }
-
-    public Document setFields(String line) {
-      // A line must be in the following format. If it's not, fail !
-      // title <TAB> date <TAB> body <NEWLINE>
-      int spot = line.indexOf(SEP);
-      if (spot == -1) {
-        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-      }
-      int spot2 = line.indexOf(SEP, 1 + spot);
-      if (spot2 == -1) {
-        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-      }
-      final String title = line.substring(0, spot);
-      final String date = line.substring(1+spot, spot2);
-      final String body = line.substring(1+spot2, line.length());
-      final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
-
-      if (doReuseFields) {
-        idField.setValue(docID);
-        titleField.setValue(title);
-        dateField.setValue(date);
-        bodyField.setValue(body);
-        return doc;
-      } else {
-        Field localIDField = new Field(BasicDocMaker.ID_FIELD,
-                                       docID,
-                                       Field.Store.YES,
-                                       Field.Index.NOT_ANALYZED_NO_NORMS);
-
-        Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
-                                          title,
-                                          storeVal,
-                                          Field.Index.ANALYZED,
-                                          termVecVal);
-        Field localBodyField = new Field(BasicDocMaker.BODY_FIELD,
-                                         body,
-                                         storeVal,
-                                         Field.Index.ANALYZED,
-                                         termVecVal);
-        Field localDateField = new Field(BasicDocMaker.BODY_FIELD,
-                                         date,
-                                         storeVal,
-                                         Field.Index.ANALYZED,
-                                         termVecVal);
-        Document localDoc = new Document();
-        localDoc.add(localIDField);
-        localDoc.add(localBodyField);
-        localDoc.add(localTitleField);
-        localDoc.add(localDateField);
-        return localDoc;
-      }
-    }
-  }
-
-  protected DocData getNextDocData() throws Exception {
-    throw new RuntimeException("not implemented");
-  }
-
-  private DocState getDocState() {
-    DocState ds = (DocState) docState.get();
-    if (ds == null) {
-      ds = new DocState();
-      docState.set(ds);
-    }
-    return ds;
-  }
 
   public Document makeDocument() throws Exception {
 
-    String line;
-    synchronized(this) {
-      line = fileIn.readLine();
-      if (line == null) {
-        if (!forever) {
-          throw new NoMoreDataException();
-        }
-        // Reset the file
-        openFile();
-        return makeDocument();
-      }
-    }
-
-    if (doReuseFields)
-      return getDocState().setFields(line);
-    else
-      return localDocState.setFields(line);
+    DocState ds = reuseFields ? getDocState() : localDocState;
+    DocData dd = source.getNextDocData(ds.docData);
+    Document doc = reuseFields ? ds.doc : new Document();
+    doc.getFields().clear();
+
+    Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    body.setValue(dd.getBody());
+    doc.add(body);
+    
+    Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    title.setValue(dd.getTitle());
+    doc.add(title);
+    
+    Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
+    date.setValue(dd.getDate());
+    doc.add(date);
+    
+    String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
+    Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
+    id.setValue(docID);
+    doc.add(id);
+    
+    return doc;
   }
 
   public Document makeDocument(int size) throws Exception {
-    throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead");
+    throw new RuntimeException("cannot change document size with LineDocMaker");
   }
   
-  public synchronized void resetInputs() {
-    super.resetInputs();
-    openFile();
-  }
-
   public void setConfig(Config config) {
     super.setConfig(config);
-    fileName = config.get("docs.file", null);
-    if (fileName == null) {
-      throw new IllegalArgumentException("docs.file must be set");
-    }
-    doReuseFields = config.get("doc.reuse.fields", true);
-    String doBZCompress = config.get("bzip.compression", null);
-    if (doBZCompress != null) {
-      // Property was set, use the value.
-      bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
-    } else {
-      // Property was not set, attempt to detect based on file's extension
-      bzipCompressionEnabled = fileName.endsWith("bz2");
-    }
+    source = new LineDocSource();
+    source.setConfig(config);
     numDocs = config.get("doc.random.id.limit", -1);
     if (numDocs != -1) {
       r = new Random(179);
     }
   }
 
-  synchronized void openFile() {
-    try {
-      if (fileIn != null) {
-        fileIn.close();
-      }
-      fileIS = new FileInputStream(fileName);
-      if (bzipCompressionEnabled) {
-        // According to BZip2CompressorInputStream's code, it reads the first 
-        // two file header chars ('B' and 'Z'). We only need to wrap the
-        // underlying stream with a BufferedInputStream, since the code uses
-        // the read() method exclusively.
-        fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
-        fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
-      }
-      // Wrap the stream with a BufferedReader for several reasons:
-      // 1. We need the readLine() method.
-      // 2. Even if bzip.compression is enabled, and is wrapped with
-      // BufferedInputStream, wrapping with a buffer can still improve
-      // performance, since the BIS buffer will be used to read from the
-      // compressed stream, while the BR buffer will be used to read from the
-      // uncompressed stream.
-      fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    } catch (CompressorException e) {
-      throw new RuntimeException(e);
-    }
-  }
-
-  public int numUniqueTexts() {
-    return -1;
-  }
-
 }

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,116 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * A {@link ContentSource} reading one line at a time as a
+ * {@link org.apache.lucene.document.Document} from a single file. This saves IO
+ * cost (over DirContentSource) of recursing through a directory and opening a
+ * new file for every document.<br>
+ * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
+ * <i>title, date, body</i>. If a line is read in a different format, a
+ * {@link RuntimeException} will be thrown. In general, you should use this
+ * content source for files that were created with {@link WriteLineDocTask}.<br>
+ * <br>
+ * Config properties:
+ * <ul>
+ * <li>docs.file=&lt;path to the file&gt;
+ * </ul>
+ */
+public class LineDocSource extends ContentSource {
+
+  private final static char SEP = WriteLineDocTask.SEP;
+
+  private File file;
+  private BufferedReader reader;
+
+  private synchronized void openFile() {
+    try {
+      if (reader != null) {
+        reader.close();
+      }
+      InputStream is = getInputStream(file);
+      reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public void close() throws IOException {
+    if (reader != null) {
+      reader.close();
+      reader = null;
+    }
+  }
+  
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    String line;
+    synchronized(this) {
+      line = reader.readLine();
+      if (line == null) {
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        // Reset the file
+        openFile();
+        return getNextDocData(docData);
+      }
+    }
+    
+    // A line must be in the following format. If it's not, fail !
+    // title <TAB> date <TAB> body <NEWLINE>
+    int spot = line.indexOf(SEP);
+    if (spot == -1) {
+      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
+    }
+    int spot2 = line.indexOf(SEP, 1 + spot);
+    if (spot2 == -1) {
+      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
+    }
+    // The date String was written in the format of DateTools.dateToString.
+    docData.clear();
+    docData.setBody(line.substring(1 + spot2, line.length()));
+    docData.setTitle(line.substring(0, spot));
+    docData.setDate(line.substring(1 + spot, spot2));
+    return docData;
+  }
+
+  public void resetInputs() throws IOException {
+    super.resetInputs();
+    openFile();
+  }
+  
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    String fileName = config.get("docs.file", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("docs.file must be set");
+    }
+    file = new File(fileName).getAbsoluteFile();
+  }
+
+}

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,147 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.ParsePosition;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Locale;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * A {@link ContentSource} reading from the Reuters collection.
+ * <p>
+ * Config properties:
+ * <ul>
+ * <li><b>work.dir</b> - path to the root of docs and indexes dirs (default
+ * <b>work</b>).
+ * <li><b>docs.dir</b> - path to the docs dir (default <b>reuters-out</b>).
+ * </ul>
+ */
+public class ReutersContentSource extends ContentSource {
+
+  private static final class DateFormatInfo {
+    DateFormat df;
+    ParsePosition pos;
+  }
+
+  private ThreadLocal dateFormat = new ThreadLocal();
+  private File dataDir = null;
+  private ArrayList inputFiles = new ArrayList();
+  private int nextFile = 0;
+  private int iteration = 0;
+  
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    File workDir = new File(config.get("work.dir", "work"));
+    String d = config.get("docs.dir", "reuters-out");
+    dataDir = new File(d);
+    if (!dataDir.isAbsolute()) {
+      dataDir = new File(workDir, d);
+    }
+    inputFiles.clear();
+    collectFiles(dataDir, inputFiles);
+    if (inputFiles.size() == 0) {
+      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
+    }
+  }
+
+  private synchronized DateFormatInfo getDateFormatInfo() {
+    DateFormatInfo dfi = (DateFormatInfo) dateFormat.get();
+    if (dfi == null) {
+      dfi = new DateFormatInfo();
+      // date format: 30-MAR-1987 14:22:36.87
+      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+      dfi.df.setLenient(true);
+      dfi.pos = new ParsePosition(0);
+      dateFormat.set(dfi);
+    }
+    return dfi;
+  }
+  
+  private Date parseDate(String dateStr) {
+    DateFormatInfo dfi = getDateFormatInfo();
+    dfi.pos.setIndex(0);
+    dfi.pos.setErrorIndex(-1);
+    return dfi.df.parse(dateStr.trim(), dfi.pos);
+  }
+
+
+  public void close() throws IOException {
+    // TODO implement?
+  }
+  
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    File f = null;
+    String name = null;
+    synchronized (this) {
+      if (nextFile >= inputFiles.size()) {
+        // exhausted files, start a new round, unless forever set to false.
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        nextFile = 0;
+        iteration++;
+      }
+      f = (File) inputFiles.get(nextFile++);
+      name = f.getCanonicalPath() + "_" + iteration;
+    }
+
+    BufferedReader reader = new BufferedReader(new FileReader(f));
+    try {
+      // First line is the date, 3rd is the title, rest is body
+      String dateStr = reader.readLine();
+      reader.readLine();// skip an empty line
+      String title = reader.readLine();
+      reader.readLine();// skip an empty line
+      StringBuffer bodyBuf = new StringBuffer(1024);
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        bodyBuf.append(line).append(' ');
+      }
+      reader.close();
+      
+      addBytes(f.length());
+      
+      Date date = parseDate(dateStr.trim());
+      
+      docData.clear();
+      docData.setName(name);
+      docData.setBody(bodyBuf.toString());
+      docData.setTitle(title);
+      docData.setDate(date);
+      return docData;
+    } finally {
+      reader.close();
+    }
+  }
+
+  public synchronized void resetInputs() throws IOException {
+    super.resetInputs();
+    nextFile = 0;
+    iteration = 0;
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java Thu Jun 18 19:58:59 2009
@@ -71,7 +71,7 @@
    * @return array of Lucene queries
    */
   private static Query[] createQueries(List qs, Analyzer a) {
-    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a);
+    QueryParser qp = new QueryParser(DocMaker.BODY_FIELD, a);
     List queries = new ArrayList();
     for (int i = 0; i < qs.size(); i++)  {
       try {
@@ -107,7 +107,7 @@
     
     List queryList = new ArrayList(20);
     queryList.addAll(Arrays.asList(STANDARD_QUERIES));
-    queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD)));
+    queryList.addAll(Arrays.asList(getPrebuiltQueries(DocMaker.BODY_FIELD)));
     return createQueries(queryList, anlzr);
   }
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java Thu Jun 18 19:58:59 2009
@@ -29,7 +29,7 @@
 
 /**
  * A QueryMaker that makes queries for a collection created 
- * using {@link org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker}.
+ * using {@link org.apache.lucene.benchmark.byTask.feeds.SingleDocSource}.
  */
 public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker {
 
@@ -45,11 +45,11 @@
     Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer",
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); 
     
-    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr);
+    QueryParser qp = new QueryParser(DocMaker.BODY_FIELD,anlzr);
     ArrayList qq = new ArrayList();
-    Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2"));
+    Query q1 = new TermQuery(new Term(DocMaker.ID_FIELD,"doc2"));
     qq.add(q1);
-    Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple"));
+    Query q2 = new TermQuery(new Term(DocMaker.BODY_FIELD,"simple"));
     qq.add(q2);
     BooleanQuery bq = new BooleanQuery();
     bq.add(q1,Occur.MUST);

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java Thu Jun 18 19:58:59 2009
@@ -36,7 +36,7 @@
     // exatract some 100 words from doc text to an array
     String words[];
     ArrayList w = new ArrayList();
-    StringTokenizer st = new StringTokenizer(SimpleDocMaker.DOC_TEXT);
+    StringTokenizer st = new StringTokenizer(SingleDocSource.DOC_TEXT);
     while (st.hasMoreTokens() && w.size()<100) {
       w.add(st.nextToken());
     }
@@ -53,7 +53,7 @@
           q.setSlop(slop);
           int wind = wd;
           for (int i=0; i<qlen; i++) {
-            q.add(new Term(BasicDocMaker.BODY_FIELD,words[wind++]));
+            q.add(new Term(DocMaker.BODY_FIELD,words[wind++]));
             if (remainedSlop>0) {
               remainedSlop--;
               wind++;
@@ -66,7 +66,7 @@
           q.setSlop(slop+2*qlen);
           wind = wd+qlen+remainedSlop-1;
           for (int i=0; i<qlen; i++) {
-            q.add(new Term(BasicDocMaker.BODY_FIELD,words[wind--]));
+            q.add(new Term(DocMaker.BODY_FIELD,words[wind--]));
             if (remainedSlop>0) {
               remainedSlop--;
               wind--;

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,69 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+import java.io.IOException;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link ContentSource} which hands out the same document on every call to
+ * {@link #getNextDocData(DocData)}.
+ */
+public class SingleDocSource extends ContentSource {
+  
+  // next document id to hand out; guarded by nextDocId()
+  private int docID = 0;
+
+  static final String DOC_TEXT =  
+    "Well, this is just some plain text we use for creating the " +
+    "test documents. It used to be a text from an online collection " +
+    "devoted to first aid, but if there was there an (online) lawyers " +
+    "first aid collection with legal advices, \"it\" might have quite " +
+    "probably advised one not to include \"it\"'s text or the text of " +
+    "any other online collection in one's code, unless one has money " +
+    "that one don't need and one is happy to donate for lawyers " +
+    "charity. Anyhow at some point, rechecking the usage of this text, " +
+    "it became uncertain that this text is free to use, because " +
+    "the web site in the disclaimer of he eBook containing that text " +
+    "was not responding anymore, and at the same time, in projGut, " +
+    "searching for first aid no longer found that eBook as well. " +
+    "So here we are, with a perhaps much less interesting " +
+    "text for the test, but oh much much safer. ";
+  
+  /**
+   * Hands out the next document id. After the first document has been
+   * produced, refuses to continue unless "forever" is set.
+   */
+  private synchronized int nextDocId() throws NoMoreDataException {
+    if (!forever && docID > 0) {
+      throw new NoMoreDataException();
+    }
+    int id = docID;
+    docID++;
+    return id;
+  }
+
+  public void close() throws IOException {}
+  
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException {
+    final int id = nextDocId();
+    docData.clear();
+    docData.setName("doc" + id);
+    docData.setBody(DOC_TEXT);
+    addBytes(DOC_TEXT.length());
+    return docData;
+  }
+
+  public synchronized void resetInputs() throws IOException {
+    super.resetInputs();
+    docID = 0;
+  }
+
+}

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,95 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Properties;
+import java.util.Random;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * Adds fields appropriate for sorting: country, random_string and sort_field
+ * (int). Supports the following parameters:
+ * <ul>
+ * <li><b>sort.rng</b> - defines the range for sort-by-int field (default
+ * <b>20000</b>).
+ * <li><b>rand.seed</b> - defines the seed to initialize Random with (default
+ * <b>13</b>).
+ * </ul>
+ */
+public class SortableSingleDocSource extends SingleDocSource {
+  
+  // NOTE(review): a few entries below look garbled ("alestinian flag West
+  // Bank and Gaza", and the mojibake in the Sao Tome and Principe entry) -
+  // most likely an encoding/scraping artifact; confirm the intended names.
+  private static String[] COUNTRIES = new String[] {
+    "European Union", "United States", "Japan", "Germany", "China (PRC)", 
+    "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia",
+    "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey", 
+    "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway", 
+    "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran", 
+    "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela", 
+    "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia", 
+    "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel", 
+    "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria", 
+    "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia", 
+    "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq", 
+    "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador", 
+    "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic", 
+    "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia", 
+    "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen", 
+    "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador", 
+    "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania", 
+    "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia", 
+    "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia", 
+    "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia", 
+    "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia", 
+    "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia", 
+    "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius", 
+    "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin", 
+    "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova", 
+    "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados", 
+    "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo", 
+    "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey", 
+    "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea", 
+    "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland", 
+    "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize", 
+    "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana", 
+    "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia", 
+    "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada", 
+    "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor", 
+    "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau", 
+    "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands", 
+    "Palau", "Marshall Islands", "S�o Tom� and Pr�ncipe", "Anguilla", 
+    "Kiribati", "Tuvalu", "Niue" };
+
+  // exclusive upper bound for the random sort_field int; set via sort.rng
+  private int sortRange;
+  // seeded via rand.seed, so runs are reproducible
+  private Random r;
+
+  /**
+   * Delegates to the superclass for the document content, then attaches the
+   * sortable properties: sort_field (random int in [0, sort.rng)),
+   * random_string and country.
+   */
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException {
+    docData = super.getNextDocData(docData);
+    Properties props = new Properties();
+
+    // random int
+    props.put("sort_field", Integer.toString(r.nextInt(sortRange)));
+
+    // random string of 2-19 chars drawn from the 7-bit ASCII range
+    int len = nextInt(2, 20);
+    char[] buffer = new char[len];
+    for (int i = 0; i < len; i++) {
+      buffer[i] = (char) r.nextInt(0x80); 
+    }
+    props.put("random_string", new String(buffer));
+
+    // random country
+    props.put("country", COUNTRIES[r.nextInt(COUNTRIES.length)]);
+    docData.setProps(props);
+    return docData;
+  }
+
+  // returns a random int in [start, end)
+  private int nextInt(int start, int end) {
+    return start + r.nextInt(end - start);
+  }
+
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    sortRange = config.get("sort.rng", 20000);
+    r = new Random(config.get("rand.seed", 13));
+  }
+  
+}

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,339 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.text.DateFormat;
+import java.text.ParsePosition;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Locale;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
+
+/**
+ * Implements a {@link ContentSource} over the TREC collection.
+ * <p>
+ * Supports the following configuration parameters (on top of
+ * {@link ContentSource}):
+ * <ul>
+ * <li><b>work.dir</b> - specifies the working directory. Required if "docs.dir"
+ * denotes a relative path (<b>default=work</b>).
+ * <li><b>docs.dir</b> - specifies the directory where the TREC files reside.
+ * Can be set to a relative path if "work.dir" is also specified
+ * (<b>default=trec</b>).
+ * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
+ * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
+ * </ul>
+ */
+public class TrecContentSource extends ContentSource {
+  // TODO (3.0): change StringBuffer to StringBuilder
+
+  /** Per-thread date parsing state - DateFormat is not thread safe. */
+  private static final class DateFormatInfo {
+    DateFormat[] dfs;
+    ParsePosition pos;
+  }
+
+  private static final String DATE = "Date: ";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
+  private static final String DOCNO = "<DOCNO>";
+  private static final String TERMINATING_DOCNO = "</DOCNO>";
+  private static final String DOC = "<DOC>";
+  private static final String TERMINATING_DOC = "</DOC>";
+
+  private static final String NEW_LINE = System.getProperty("line.separator");
+
+  private static final String DATE_FORMATS [] = {
+       "EEE, dd MMM yyyy kk:mm:ss z",	  // Tue, 09 Dec 2003 22:39:08 GMT
+       "EEE MMM dd kk:mm:ss yyyy z",  	// Tue Dec 09 16:45:08 2003 EST
+       "EEE, dd-MMM-':'y kk:mm:ss z", 	// Tue, 09 Dec 2003 22:39:08 GMT
+       "EEE, dd-MMM-yyy kk:mm:ss z", 	  // Tue, 09 Dec 2003 22:39:08 GMT
+       "EEE MMM dd kk:mm:ss yyyy",  	  // Tue Dec 09 16:45:08 2003
+  };
+
+  // Per-thread caches: date formats, a reusable Reader over the doc buffer,
+  // and the doc buffer itself, so parsing allocates nothing per document.
+  private ThreadLocal dateFormats = new ThreadLocal();
+  private ThreadLocal trecDocReader = new ThreadLocal();
+  private ThreadLocal trecDocBuffer = new ThreadLocal();
+  private File dataDir = null;
+  private ArrayList inputFiles = new ArrayList();
+  private int nextFile = 0;
+  private int rawDocSize;
+
+  // Use to synchronize threads on reading from the TREC documents.
+  private Object lock = new Object();
+
+  // Required for test
+  BufferedReader reader;
+  int iteration = 0;
+  HTMLParser htmlParser;
+  
+  // Lazily builds this thread's set of lenient date formats.
+  private DateFormatInfo getDateFormatInfo() {
+    DateFormatInfo dfi = (DateFormatInfo) dateFormats.get();
+    if (dfi == null) {
+      dfi = new DateFormatInfo();
+      dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
+      for (int i = 0; i < dfi.dfs.length; i++) {
+        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
+        dfi.dfs[i].setLenient(true);
+      }
+      dfi.pos = new ParsePosition(0);
+      dateFormats.set(dfi);
+    }
+    return dfi;
+  }
+
+  // Lazily creates this thread's reusable document buffer.
+  private StringBuffer getDocBuffer() {
+    StringBuffer sb = (StringBuffer) trecDocBuffer.get();
+    if (sb == null) {
+      sb = new StringBuffer();
+      trecDocBuffer.set(sb);
+    }
+    return sb;
+  }
+  
+  // Returns this thread's Reader, re-targeted at the given buffer.
+  private Reader getTrecDocReader(StringBuffer docBuffer) {
+    StringBufferReader r = (StringBufferReader) trecDocReader.get();
+    if (r == null) {
+      r = new StringBufferReader(docBuffer);
+      trecDocReader.set(r);
+    } else {
+      r.set(docBuffer);
+    }
+    return r;
+  }
+
+  // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
+  // collectMatchLine: append the matching line itself to buf.
+  // collectAll: append every line read along the way to buf.
+  // On end-of-file the next input file is opened and reading continues.
+  private void read(StringBuffer buf, String prefix, boolean collectMatchLine,
+                    boolean collectAll, String terminatingTag)
+      throws IOException, NoMoreDataException {
+    String sep = "";
+    while (true) {
+      String line = reader.readLine();
+
+      if (line == null) {
+        openNextFile();
+        continue;
+      }
+
+      rawDocSize += line.length();
+
+      if (line.startsWith(prefix)) {
+        if (collectMatchLine) {
+          buf.append(sep).append(line);
+          sep = NEW_LINE;
+        }
+        break;
+      }
+
+      if (terminatingTag != null && line.startsWith(terminatingTag)) {
+        // didn't find the prefix that was asked, but the terminating
+        // tag was found. set the length to 0 to signal no match was
+        // found.
+        buf.setLength(0);
+        break;
+      }
+
+      if (collectAll) {
+        buf.append(sep).append(line);
+        sep = NEW_LINE;
+      }
+    }
+  }
+  
+  // Closes the current file and opens the next gzipped input file, skipping
+  // up to 20 unreadable files before giving up.
+  void openNextFile() throws NoMoreDataException, IOException {
+    close();
+    int retries = 0;
+    while (true) {
+      if (nextFile >= inputFiles.size()) { 
+        // exhausted files, start a new round, unless forever set to false.
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        nextFile = 0;
+        iteration++;
+      }
+      File f = (File) inputFiles.get(nextFile++);
+      if (verbose) {
+        System.out.println("opening: " + f + " length: " + f.length());
+      }
+      try {
+        GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16);
+        reader = new BufferedReader(new InputStreamReader(zis), 1 << 16);
+        return;
+      } catch (Exception e) {
+        retries++;
+        if (retries < 20 && verbose) {
+          System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + "  #retries=" + retries);
+          continue;
+        }
+        throw new NoMoreDataException();
+      }
+    }
+  }
+
+  // Tries each of the known DATE_FORMATS in turn; returns null when none match.
+  Date parseDate(String dateStr) {
+    dateStr = dateStr.trim();
+    DateFormatInfo dfi = getDateFormatInfo();
+    for (int i = 0; i < dfi.dfs.length; i++) {
+      DateFormat df = dfi.dfs[i];
+      dfi.pos.setIndex(0);
+      dfi.pos.setErrorIndex(-1);
+      Date d = df.parse(dateStr, dfi.pos);
+      if (d != null) {
+        // Parse succeeded.
+        return d;
+      }
+    }
+    // do not fail just because a date could not be parsed - return null and
+    // let the caller treat the date as unknown.
+    if (verbose) {
+      System.out.println("failed to parse date (assigning 'now') for: " + dateStr);
+    }
+    return null; 
+  }
+  
+  /** Closes the current reader, if any; close failures are logged, not thrown. */
+  public void close() throws IOException {
+    if (reader == null) {
+      return;
+    }
+
+    try {
+      reader.close();
+    } catch (IOException e) {
+      if (verbose) {
+        System.out.println("failed to close reader !");
+        e.printStackTrace(System.out);
+      }
+    }
+    reader = null;
+  }
+
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    String dateStr = null, name = null;
+    Reader r = null;
+    // protect reading from the TREC files by multiple threads. The rest of the
+    // method, i.e., parsing the content and returning the DocData can run
+    // unprotected.
+    synchronized (lock) {
+      if (reader == null) {
+        openNextFile();
+      }
+
+      StringBuffer docBuf = getDocBuffer();
+      
+      // 1. skip until doc start
+      docBuf.setLength(0);
+      read(docBuf, DOC, false, false, null);
+
+      // 2. name
+      docBuf.setLength(0);
+      read(docBuf, DOCNO, true, false, null);
+      name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
+          DOCNO.length()));
+      name = name + "_" + iteration;
+
+      // 3. skip until doc header
+      docBuf.setLength(0);
+      read(docBuf, DOCHDR, false, false, null);
+
+      boolean findTerminatingDocHdr = false;
+
+      // 4. date - look for the date only until /DOCHDR
+      docBuf.setLength(0);
+      read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
+      if (docBuf.length() != 0) {
+        // Date found.
+        dateStr = docBuf.substring(DATE.length());
+        findTerminatingDocHdr = true;
+      }
+
+      // 5. skip until end of doc header
+      if (findTerminatingDocHdr) {
+        docBuf.setLength(0);
+        read(docBuf, TERMINATING_DOCHDR, false, false, null);
+      }
+
+      // 6. collect until end of doc
+      docBuf.setLength(0);
+      read(docBuf, TERMINATING_DOC, false, true, null);
+      
+      // 7. Set up a Reader over the read content
+      r = getTrecDocReader(docBuf);
+      // Resetting the thread's reader means it will reuse the instance
+      // allocated as well as re-read from docBuf.
+      r.reset();
+      
+      // count char length of parsed html text (larger than the plain doc body text).
+      addBytes(docBuf.length()); 
+    }
+
+    // This code segment relies on HtmlParser being thread safe. When we get 
+    // here, everything else is already private to that thread, so we're safe.
+    Date date = dateStr != null ? parseDate(dateStr) : null;
+    try {
+      docData = htmlParser.parse(docData, name, date, r, null);
+      addDoc();
+    } catch (InterruptedException e) {
+      // wrap and preserve the original cause (pre-1.6 IOException has no
+      // (String, Throwable) constructor)
+      IOException ex = new IOException(e.getMessage());
+      ex.initCause(e);
+      throw ex;
+    }
+
+    return docData;
+  }
+
+  /** Resets the source: closes the current file and restarts from the first one. */
+  public void resetInputs() throws IOException {
+    synchronized (lock) {
+      super.resetInputs();
+      close();
+      nextFile = 0;
+      iteration = 0;
+    }
+  }
+
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    File workDir = new File(config.get("work.dir", "work"));
+    String d = config.get("docs.dir", "trec");
+    dataDir = new File(d);
+    if (!dataDir.isAbsolute()) {
+      // resolve relative docs.dir against work.dir
+      dataDir = new File(workDir, d);
+    }
+    collectFiles(dataDir, inputFiles);
+    if (inputFiles.size() == 0) {
+      throw new IllegalArgumentException("No files in dataDir: " + dataDir);
+    }
+    try {
+      String parserClassName = config.get("html.parser",
+          "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
+      htmlParser = (HTMLParser) Class.forName(parserClassName).newInstance();
+    } catch (Exception e) {
+      // Should not get here. Throw runtime exception.
+      throw new RuntimeException(e);
+    }
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java Thu Jun 18 19:58:59 2009
@@ -80,9 +80,8 @@
     Properties p = new Properties();
     p.setProperty ( "task.max.depth.log"  , "3" );
     p.setProperty ( "max.buffered"        , "buf:10:10:100:100:10:10:100:100" );
-    p.setProperty ( "doc.maker"           , "org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker" );
-    p.setProperty ( "doc.add.log.step"    , "2000" );
-    p.setProperty ( "doc.delete.log.step" , "2000" );
+    p.setProperty ( "doc.maker"           , "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource" );
+    p.setProperty ( "log.step"            , "2000" );
     p.setProperty ( "doc.delete.step"     , "8" );
     p.setProperty ( "analyzer"            , "org.apache.lucene.analysis.standard.StandardAnalyzer" );
     p.setProperty ( "doc.term.vector"     , "false" );

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java Thu Jun 18 19:58:59 2009
@@ -20,38 +20,23 @@
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.document.Document;
-import java.text.NumberFormat;
-
 
 /**
  * Add a document, optionally with of a certain size.
  * <br>Other side effects: none.
- * <br>Relevant properties: <code>doc.add.log.step</code>.
  * <br>Takes optional param: document size. 
  */
 public class AddDocTask extends PerfTask {
 
-  /**
-   * Default value for property <code>doc.add.log.step<code> - indicating how often 
-   * an "added N docs" message should be logged.  
-   */
-  public static final int DEFAULT_ADD_DOC_LOG_STEP = 500;
-
   public AddDocTask(PerfRunData runData) {
     super(runData);
   }
 
-  private int logStep = -1;
   private int docSize = 0;
-  int count = 0;
   
   // volatile data passed between setup(), doLogic(), tearDown().
   private Document doc = null;
   
-  /*
-   *  (non-Javadoc)
-   * @see PerfTask#setup()
-   */
   public void setup() throws Exception {
     super.setup();
     DocMaker docMaker = getRunData().getDocMaker();
@@ -62,33 +47,20 @@
     }
   }
 
-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
   public void tearDown() throws Exception {
-    log(++count);
     doc = null;
     super.tearDown();
   }
 
+  protected String getLogMessage(int recsCount) {
+    return "added " + recsCount + " docs";
+  }
+  
   public int doLogic() throws Exception {
     getRunData().getIndexWriter().addDocument(doc);
     return 1;
   }
 
-  protected void log (int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
-      NumberFormat nf = NumberFormat.getInstance();
-      nf.setMaximumFractionDigits(2);
-      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs");
-    }
-  }
-
   /**
    * Set the params (docSize only)
    * @param params docSize, or 0 for no limit.

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,67 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
+import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * Consumes a {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}.
+ * Supports the following parameters:
+ * <ul>
+ * <li>content.source - the content source to use. (mandatory)
+ * </ul>
+ */
+public class ConsumeContentSourceTask extends PerfTask {
+
+  private ContentSource source;
+  private DocData dd = new DocData();
+  
+  public ConsumeContentSourceTask(PerfRunData runData) {
+    super(runData);
+    Config config = runData.getConfig();
+    String sourceClass = config.get("content.source", null);
+    if (sourceClass == null) {
+      throw new IllegalArgumentException("content.source must be defined");
+    }
+    try {
+      source = (ContentSource) Class.forName(sourceClass).newInstance();
+      source.setConfig(config);
+      source.resetInputs();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  protected String getLogMessage(int recsCount) {
+    return "read " + recsCount + " documents from the content source";
+  }
+  
+  public void close() throws Exception {
+    source.close();
+    super.close();
+  }
+
+  public int doLogic() throws Exception {
+    dd = source.getNextDocData(dd);
+    return 1;
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java Thu Jun 18 19:58:59 2009
@@ -22,7 +22,7 @@
 /**
  * Delete a document by docid.
  * <br>Other side effects: none.
- * <br>Relevant properties: <code>doc.delete.log.step , doc.delete.step</code>.
+ * <br>Relevant properties: <code>doc.delete.step, delete.log.step</code>.
  * <br>If no docid param is supplied, deletes doc with <code>id = last-deleted-doc + doc.delete.step</code>. 
  * <br>Takes optional param: document id. 
  */
@@ -33,19 +33,16 @@
    */
   public static final int DEFAULT_DOC_DELETE_STEP = 8;
   
-  /**
-   * Default value for property <code>doc.delete.log.step<code> - indicating how often 
-   * an "deleted N docs" message should be logged.  
-   */
-  public static final int DEFAULT_DELETE_DOC_LOG_STEP = 500;
-  
   public DeleteDocTask(PerfRunData runData) {
     super(runData);
+    // Override log.step, which is read by PerfTask
+    int deleteLogStep = runData.getConfig().get("delete.log.step", -1);
+    if (deleteLogStep != -1) {
+      logStep = deleteLogStep;
+    }
   }
 
-  private int logStep = -1;
   private int deleteStep = -1;
-  private static int numDeleted = 0;
   private static int lastDeleted = -1;
 
   private int docid = -1;
@@ -62,10 +59,6 @@
    */
   public void setup() throws Exception {
     super.setup();
-    // one time static initializations
-    if (logStep<0) {
-      logStep = getRunData().getConfig().get("doc.delete.log.step",DEFAULT_DELETE_DOC_LOG_STEP);
-    }
     if (deleteStep<0) {
       deleteStep = getRunData().getConfig().get("doc.delete.step",DEFAULT_DOC_DELETE_STEP);
     }
@@ -73,18 +66,8 @@
     docid = (byStep ? lastDeleted + deleteStep : docid);
   }
 
-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
-  public void tearDown() throws Exception {
-    log(++numDeleted);
-    super.tearDown();
-  }
-
-  private void log (int count) {
-    if (logStep>0 && (count%logStep)==0) {
-      System.out.println("--> processed (delete) "+count+" docs, last deleted: "+lastDeleted);
-    }
+  protected String getLogMessage(int recsCount) {
+    return "deleted " + recsCount + " docs, last deleted: " + lastDeleted;
   }
   
   /**

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Thu Jun 18 19:58:59 2009
@@ -17,54 +17,80 @@
  * limitations under the License.
  */
 
+import java.text.NumberFormat;
+
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.Format;
 
 /**
- * A (abstract)  task to be tested for performance.
- * <br>
- * Every performance task extends this class, and provides its own doLogic() method, 
- * which performss the actual task.
- * <br>
- * Tasks performing some work that should be measured for the task, can overide setup() and/or tearDown() and 
- * placed that work there. 
- * <br>
+ * An abstract task to be tested for performance. <br>
+ * Every performance task extends this class, and provides its own
+ * {@link #doLogic()} method, which performs the actual task. <br>
+ * Tasks performing some work that should be measured for the task, can override
+ * {@link #setup()} and/or {@link #tearDown()} and place that work there. <br>
  * Relevant properties: <code>task.max.depth.log</code>.
  */
 public abstract class PerfTask implements Cloneable {
 
+  private static final int DEFAULT_LOG_STEP = 1000;
+  
   private PerfRunData runData;
   
   // propeties that all tasks have
   private String name;
   private int depth = 0;
+  protected int logStep;
+  private int logStepCount = 0;
   private int maxDepthLogStart = 0;
   private boolean disableCounting = false;
   protected String params = null;
   
   protected static final String NEW_LINE = System.getProperty("line.separator");
 
-  /**
-   * Should not be used externally
-   */
+  /** Should not be used externally */
   private PerfTask() {
-    name =  Format.simpleName(getClass());
+    name = Format.simpleName(getClass());
     if (name.endsWith("Task")) {
-      name = name.substring(0,name.length()-4);
+      name = name.substring(0, name.length() - 4);
     }
   }
 
+  /**
+   * @deprecated will be removed in 3.0. checks if there are any obsolete
+   *             settings, like doc.add.log.step and doc.delete.log.step and
+   *             alerts the user.
+   */
+  private void checkObsoleteSettings(Config config) {
+    if (config.get("doc.add.log.step", null) != null) {
+      throw new RuntimeException("doc.add.log.step is not supported anymore. " +
+      		"Use log.step and refer to CHANGES to read on the recent API changes " +
+      		"done to Benchmark's DocMaker and Task-based logging.");
+    }
+    
+    if (config.get("doc.delete.log.step", null) != null) {
+      throw new RuntimeException("doc.delete.log.step is not supported anymore. " +
+          "Use delete.log.step and refer to CHANGES to read on the recent API changes " +
+          "done to Benchmark's DocMaker and Task-based logging.");
+    }
+  }
+  
   public PerfTask(PerfRunData runData) {
     this();
     this.runData = runData;
-    this.maxDepthLogStart = runData.getConfig().get("task.max.depth.log",0);
+    Config config = runData.getConfig();
+    this.maxDepthLogStart = config.get("task.max.depth.log",0);
+    logStep = config.get("log.step", DEFAULT_LOG_STEP);
+    // To avoid the check 'if (logStep > 0)' in tearDown(). This effectively
+    // turns logging off.
+    if (logStep <= 0) {
+      logStep = Integer.MAX_VALUE;
+    }
+    checkObsoleteSettings(config);
   }
   
-  /* (non-Javadoc)
-   * @see java.lang.Object#clone()
-   */
   protected Object clone() throws CloneNotSupportedException {
     // tasks having non primitive data structures should overide this.
     // otherwise parallel running of a task sequence might not run crrectly. 
@@ -173,6 +199,10 @@
     return maxDepthLogStart;
   }
 
+  protected String getLogMessage(int recsCount) {
+    return "processed " + recsCount + " records";
+  }
+  
   /**
    * Tasks that should never log at start can overide this.  
    * @return true if this task should never log when it start.
@@ -207,7 +237,14 @@
    * Notice that higher level (sequence) tasks containing this task would then 
    * measure larger time than the sum of their contained tasks.
    */
-  public void tearDown () throws Exception {
+  public void tearDown() throws Exception {
+    if (++logStepCount % logStep == 0) {
+      double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println(nf.format(time) + " sec --> "
+          + Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
+    }
   }
 
   /**

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Thu Jun 18 19:58:59 2009
@@ -17,58 +17,44 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.analysis.Token;
+import java.io.Reader;
+import java.util.List;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import java.text.NumberFormat;
-import java.io.Reader;
-import java.util.List;
-
 
 /**
  * Simple task to test performance of tokenizers.  It just
  * creates a token stream for each field of the document and
  * read all tokens out of that stream.
- * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
  */
 public class ReadTokensTask extends PerfTask {
 
-  /**
-   * Default value for property <code>doc.tokenize.log.step<code> - indicating how often 
-   * an "added N docs / M tokens" message should be logged.  
-   */
-  public static final int DEFAULT_DOC_LOG_STEP = 500;
-
   public ReadTokensTask(PerfRunData runData) {
     super(runData);
   }
 
-  private int logStep = -1;
-  int count = 0;
-  int totalTokenCount = 0;
+  private int totalTokenCount = 0;
   
   // volatile data passed between setup(), doLogic(), tearDown().
   private Document doc = null;
   
-  /*
-   *  (non-Javadoc)
-   * @see PerfTask#setup()
-   */
   public void setup() throws Exception {
     super.setup();
     DocMaker docMaker = getRunData().getDocMaker();
     doc = docMaker.makeDocument();
   }
 
-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
+  protected String getLogMessage(int recsCount) {
+    return "read " + recsCount + " docs; " + totalTokenCount + " tokens";
+  }
+  
   public void tearDown() throws Exception {
-    log(++count);
     doc = null;
     super.tearDown();
   }
@@ -117,19 +103,6 @@
     return tokenCount;
   }
 
-  private void log(int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
-      NumberFormat nf = NumberFormat.getInstance();
-      nf.setMaximumFractionDigits(2);
-      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
-    }
-  }
-
   /* Simple StringReader that can be reset to a new string;
    * we use this when tokenizing the string value from a
    * Field. */

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Thu Jun 18 19:58:59 2009
@@ -62,6 +62,7 @@
     for(int i=0;i<tasksArray.length;i++) {
       tasksArray[i].close();
     }
+    getRunData().getDocMaker().close();
   }
 
   private void initTasksArray() {
@@ -106,8 +107,8 @@
       if (isParallel()) {
         throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
       }
-      if (getRunData().getConfig().get("doc.maker.forever",true)) {
-        throw new Exception("REPEAT_EXHAUST requires setting doc.maker.forever=false");
+      if (getRunData().getConfig().get("content.source.forever",true)) {
+        throw new Exception("REPEAT_EXHAUST requires setting content.source.forever=false");
       }
     }
     setSequenceName();

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java Thu Jun 18 19:58:59 2009
@@ -19,17 +19,13 @@
 
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.Term;
-import java.text.NumberFormat;
-
 
 /**
  * Update a document, using IndexWriter.updateDocument,
  * optionally with of a certain size.
  * <br>Other side effects: none.
- * <br>Relevant properties: <code>doc.add.log.step</code>.
  * <br>Takes optional param: document size. 
  */
 public class UpdateDocTask extends PerfTask {
@@ -38,17 +34,11 @@
     super(runData);
   }
 
-  private int logStep = -1;
   private int docSize = 0;
-  int count = 0;
   
   // volatile data passed between setup(), doLogic(), tearDown().
   private Document doc = null;
   
-  /*
-   *  (non-Javadoc)
-   * @see PerfTask#setup()
-   */
   public void setup() throws Exception {
     super.setup();
     DocMaker docMaker = getRunData().getDocMaker();
@@ -59,38 +49,24 @@
     }
   }
 
-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
   public void tearDown() throws Exception {
-    log(++count);
     doc = null;
     super.tearDown();
   }
 
   public int doLogic() throws Exception {
-    final String docID = doc.get(BasicDocMaker.ID_FIELD);
+    final String docID = doc.get(DocMaker.ID_FIELD);
     if (docID == null) {
       throw new IllegalStateException("document must define the docid field");
     }
-    getRunData().getIndexWriter().updateDocument(new Term(BasicDocMaker.ID_FIELD, docID),
-                                                 doc);
+    getRunData().getIndexWriter().updateDocument(new Term(DocMaker.ID_FIELD, docID), doc);
     return 1;
   }
 
-  private void log (int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.add.log.step",AddDocTask.DEFAULT_ADD_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
-      NumberFormat nf = NumberFormat.getInstance();
-      nf.setMaximumFractionDigits(2);
-      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (update) "+count+" docs");
-    }
+  protected String getLogMessage(int recsCount) {
+    return "updated " + recsCount + " docs";
   }
-
+  
   /**
    * Set the params (docSize only)
    * @param params docSize, or 0 for no limit.

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Thu Jun 18 19:58:59 2009
@@ -25,7 +25,6 @@
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
@@ -45,23 +44,13 @@
  * <li>bzip.compression - whether the output should be bzip-compressed. This is
  * recommended when the output file is expected to be large. (optional, default:
  * false).
- * <li>doc.writeline.log.step - controls how many records to process before
- * logging the status of the task. <b>NOTE:</b> to disable logging, set this
- * value to 0 or negative. (optional, default:1000).
  * </ul>
  */
 public class WriteLineDocTask extends PerfTask {
 
-  /**
-   * Default value for property <code>doc.add.log.step<code> - indicating how often 
-   * an "added N docs" message should be logged.  
-   */
-  public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
   public final static char SEP = '\t';
 
-  private int logStep = -1;
   private int docSize = 0;
-  int count = 0;
   private BufferedWriter lineFileOut = null;
   private DocMaker docMaker;
   
@@ -93,30 +82,23 @@
     }
     lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
     docMaker = runData.getDocMaker();
-    logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
-    // To avoid the check 'if (logStep > 0)' in log(). This effectively turns
-    // logging off.
-    if (logStep <= 0) {
-      logStep = Integer.MAX_VALUE;
-    }
   }
 
-  public void tearDown() throws Exception {
-    log(++count);
-    super.tearDown();
+  protected String getLogMessage(int recsCount) {
+    return "Wrote " + recsCount + " line docs";
   }
-
+  
   public int doLogic() throws Exception {
     Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
 
-    Field f = doc.getField(BasicDocMaker.BODY_FIELD);
+    Field f = doc.getField(DocMaker.BODY_FIELD);
     String body = f != null ? f.stringValue().replace('\t', ' ') : null;
     
     if (body != null) {
-      f = doc.getField(BasicDocMaker.TITLE_FIELD);
+      f = doc.getField(DocMaker.TITLE_FIELD);
       String title = f != null ? f.stringValue().replace('\t', ' ') : "";
       
-      f = doc.getField(BasicDocMaker.DATE_FIELD);
+      f = doc.getField(DocMaker.DATE_FIELD);
       String date = f != null ? f.stringValue().replace('\t', ' ') : "";
       
       lineFileOut.write(title, 0, title.length());
@@ -129,17 +111,6 @@
     return 1;
   }
 
-  private void log(int count) {
-    // logStep is initialized in the ctor to a positive value. If the config
-    // file indicates no logging, or contains an invalid value, logStep is init
-    // to Integer.MAX_VALUE, so that logging will not occur (at least for the
-    // first Integer.MAX_VALUE records).
-    if (count % logStep == 0) {
-      System.out.println("--> " + Thread.currentThread().getName()
-          + " processed (write line) " + count + " docs");
-    }
-  }
-
   public void close() throws Exception {
     lineFileOut.close();
     super.close();
@@ -156,9 +127,6 @@
     docSize = (int) Float.parseFloat(params); 
   }
 
-  /* (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
-   */
   public boolean supportsParams() {
     return true;
   }



Mime
View raw message