lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r765543 - in /lucene/java/trunk/contrib: ./ benchmark/ benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ benchmark/src/test/org/apache/lucene/benchmark/ benchmark/src/t...
Date Thu, 16 Apr 2009 09:46:31 GMT
Author: mikemccand
Date: Thu Apr 16 09:46:30 2009
New Revision: 765543

URL: http://svn.apache.org/viewvc?rev=765543&view=rev
Log:
LUCENE-1591: add bzip2 compression/decompress to contrib/benchmark

Added:
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java   (with props)
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java   (with props)
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java   (with props)
Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/build.xml
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Thu Apr 16 09:46:30 2009
@@ -50,6 +50,11 @@
     a field needs to use a custom Collator.  (Steven Rowe via Mike
     McCandless)
 
+ 4. LUCENE-1591: EnWikiDocMaker, LineDocMaker, WriteLineDoc can now
+    read/write bz2 using Apache commons compress library.  This means
+    you can download the .bz2 export from http://wikipedia.org and
+    immediately index it.  (Shai Erera via Mike McCandless)
+
 
 Documentation
 

Modified: lucene/java/trunk/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/build.xml?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/build.xml (original)
+++ lucene/java/trunk/contrib/benchmark/build.xml Thu Apr 16 09:46:30 2009
@@ -100,23 +100,14 @@
         <antcall target="expand-reuters"/>
         <antcall target="extract-reuters"/>
     </target>
-    <property name="digester.jar" value="commons-digester-1.7.jar"/>
-    <property name="collections.jar" value="commons-collections-3.1.jar"/>
-    <property name="logging.jar" value="commons-logging-1.0.4.jar"/>
-    <property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
-    <property name="xercesImpl.jar" value="xerces-2.9.1-patched-XERCESJ-1257.jar"/>
-    <property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
 
     <path id="classpath">
         <pathelement path="${common.dir}/build/classes/java"/>
         <pathelement path="${common.dir}/build/classes/demo"/>
         <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
-        <pathelement path="lib/${digester.jar}"/>
-        <pathelement path="lib/${collections.jar}"/>
-        <pathelement path="lib/${logging.jar}"/>
-        <pathelement path="lib/${bean-utils.jar}"/>
-        <pathelement path="lib/${xercesImpl.jar}"/>
-        <pathelement path="lib/${xml-apis.jar}"/>
+    	<fileset dir="lib">
+    		<include name="**/*.jar"/>
+    	</fileset>
     </path>
     <path id="run.classpath">
         <path refid="classpath"/>

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java Thu Apr 16 09:46:30 2009
@@ -17,49 +17,75 @@
  * limitations under the License.
  */
 
-import org.xml.sax.XMLReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 import org.xml.sax.helpers.XMLReaderFactory;
 
-import java.io.IOException;
-import java.io.FileInputStream;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-
 /**
- * A LineDocMaker which reads the uncompressed english wikipedia dump.
- *
+ * A {@link LineDocMaker} which reads the english wikipedia
+ * dump.  You can read the .bz2 file directly (it will be
+ * decompressed on the fly).
  * Config properties:
- * keep.image.only.docs=false|true
- * <br/>
- * Plus those available in LineDocMaker
- *
- *
+ * <ul>
+ * <li>keep.image.only.docs=false|true
+ * <li>[those available in {@link LineDocMaker}]
+ * </ul>
+ * 
  * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
  */
 public class EnwikiDocMaker extends LineDocMaker {
-  protected boolean keepImages = true;
+  
+  private static final Map ELEMENTS = new HashMap();
+  
   static final int TITLE = 0;
-  static final int DATE = TITLE+1;
-  static final int BODY = DATE+1;
+  static final int DATE = TITLE + 1;
+  static final int BODY = DATE + 1;
   static final int ID = BODY + 1;
-  static final int LENGTH = ID+1;
-
+  static final int LENGTH = ID + 1;
+  // LENGTH is used as the size of the tuple, so whatever constants we need that
+  // should not be part of the tuple, we should define them after LENGTH.
+  static final int PAGE = LENGTH + 1;
+  
   static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                   "MAY", "JUN", "JUL", "AUG",
                                   "SEP", "OCT", "NOV", "DEC"};
 
+  static {
+    ELEMENTS.put("page", new Integer(PAGE));
+    ELEMENTS.put("text", new Integer(BODY));
+    ELEMENTS.put("timestamp", new Integer(DATE));
+    ELEMENTS.put("title", new Integer(TITLE));
+    ELEMENTS.put("id", new Integer(ID));
+  }
+  
+  /**
+   * Returns the type of the element if defined, otherwise returns -1. This
+   * method is useful in startElement and endElement, by not needing to compare
+   * the element qualified name over and over.
+   */
+  private final static int getElementType(String elem) {
+    Integer val = (Integer) ELEMENTS.get(elem);
+    return val == null ? -1 : val.intValue();
+  }
+  
+  protected boolean keepImages = true;
+
   public void setConfig(Config config) {
     super.setConfig(config);
     keepImages = config.get("keep.image.only.docs", true);
   }
 
   class Parser extends DefaultHandler implements Runnable {
-
     Thread t;
     boolean threadDone;
 
@@ -71,7 +97,7 @@
         reader.setContentHandler(this);
         reader.setErrorHandler(this);
         while(true){
-          final FileInputStream localFileIS = fileIS;
+          final InputStream localFileIS = fileIS;
           try {
             InputSource is = new InputSource(localFileIS);
             reader.parse(is);
@@ -133,12 +159,13 @@
           t = null;
           throw nmde;
         }
-        if (t != null && threadDone)
+        if (t != null && threadDone) {
           // The thread has exited yet did not hit end of
           // data, so this means it hit an exception.  We
           // throw NoMorDataException here to force
           // benchmark to stop the current alg:
           throw new NoMoreDataException();
+        }
         result = tuple;
         tuple = null;
         notify();
@@ -157,25 +184,27 @@
     String time;
     String id;
 
-
-    
     public void startElement(String namespace,
                              String simple,
                              String qualified,
                              Attributes attributes) {
-      if (qualified.equals("page")) {
-        title = null;
-        body = null;
-        time = null;
-        id = null;
-      } else if (qualified.equals("text")) {
-        contents.setLength(0);
-      } else if (qualified.equals("timestamp")) {
-        contents.setLength(0);
-      } else if (qualified.equals("title")) {
-        contents.setLength(0);
-      } else if (qualified.equals("id")) {
-        contents.setLength(0);
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          title = null;
+          body = null;
+          time = null;
+          id = null;
+          break;
+        // intentional fall-through.
+        case BODY:
+        case DATE:
+        case TITLE:
+        case ID:
+          contents.setLength(0);
+          break;
+        default:
+          // this element should be discarded.
       }
     }
 
@@ -214,25 +243,34 @@
 
     public void endElement(String namespace, String simple, String qualified)
       throws SAXException {
-      if (qualified.equals("title")) {
-        title = contents.toString();
-      } else if (qualified.equals("text")) {
-        body = contents.toString();
-        //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
-        String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
-        if (startsWith.startsWith("#redirect")) {
-          body = null;
-        }
-      } else if (qualified.equals("timestamp")) {
-        time = time(contents.toString());
-      } else if (qualified.equals("id") && id == null) {//just get the first id
-        id = contents.toString();
-      }
-      else if (qualified.equals("page")) {
-        //the body must be null and we either are keeping image docs or the title does not start with Image:
-        if (body != null && (keepImages == true || title.startsWith("Image:") == false)) {
-          create(title, time, body, id);
-        }
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          // the body must be non-null and we are either keeping image docs or
+          // the title does not start with Image:
+          if (body != null && (keepImages || !title.startsWith("Image:"))) {
+            create(title, time, body, id);
+          }
+          break;
+        case BODY:
+          body = contents.toString();
+          //workaround that startsWith doesn't have an ignore-case option: lowercase at most the first 10 chars.
+          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
+          if (startsWith.startsWith("#redirect")) {
+            body = null;
+          }
+          break;
+        case DATE:
+          time = time(contents.toString());
+          break;
+        case TITLE:
+          title = contents.toString();
+          break;
+        case ID:
+          id = contents.toString();
+          break;
+        default:
+          // this element should be discarded.
       }
     }
   }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java Thu Apr 16 09:46:30 2009
@@ -17,38 +17,44 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
+import java.io.BufferedInputStream;
 import java.io.BufferedReader;
-import java.io.IOException;
 import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Random;
 
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 
 /**
- * A DocMaker reading one line at a time as a Document from
- * a single file.  This saves IO cost (over DirDocMaker) of
- * recursing through a directory and opening a new file for
- * every document.  It also re-uses its Document and Field
- * instance to improve indexing speed.
- *
+ * A DocMaker reading one line at a time as a Document from a single file. This
+ * saves IO cost (over DirDocMaker) of recursing through a directory and opening
+ * a new file for every document. It also re-uses its Document and Field
+ * instance to improve indexing speed.<br>
+ * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
+ * <i>title, date, body</i>. If a line is read in a different format, a
+ * {@link RuntimeException} will be thrown. In general, you should use this doc
+ * maker with files that were created with {@link WriteLineDocTask}.<br><br>
+ * 
  * Config properties:
- * docs.file=&lt;path to the file%gt;
- * doc.reuse.fields=true|false (default true)
- * doc.random.id.limit=N (default -1) -- create random
- *   docid in the range 0..N; this is useful
- *   with UpdateDoc to test updating random documents; if
- *   this is unspecified or -1, then docid is sequentially
- *   assigned
+ * <ul>
+ * <li>docs.file=&lt;path to the file&gt;
+ * <li>doc.reuse.fields=true|false (default true)
+ * <li>bzip.compression=true|false (default false)
+ * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
+ * 0..N; this is useful with UpdateDoc to test updating random documents; if
+ * this is unspecified or -1, then docid is sequentially assigned
+ * </ul>
  */
 public class LineDocMaker extends BasicDocMaker {
 
-  FileInputStream fileIS;
+  InputStream fileIS;
   BufferedReader fileIn;
   ThreadLocal docState = new ThreadLocal();
   private String fileName;
@@ -57,9 +63,12 @@
   private final DocState localDocState = new DocState();
 
   private boolean doReuseFields = true;
+  private boolean bzipCompressionEnabled = false;
   private Random r;
   private int numDocs;
   
+  private CompressorStreamFactory csFactory = new CompressorStreamFactory();
+  
   class DocState {
     Document doc;
     Field bodyField;
@@ -93,7 +102,7 @@
       doc.add(idField);
     }
 
-    final static String SEP = WriteLineDocTask.SEP;
+    final static char SEP = WriteLineDocTask.SEP;
 
     private int numDocsCreated;
     private synchronized int incrNumDocsCreated() {
@@ -101,27 +110,20 @@
     }
 
     public Document setFields(String line) {
+      // A line must be in the following format. If it's not, fail !
       // title <TAB> date <TAB> body <NEWLINE>
-      final String title, date, body;
-
       int spot = line.indexOf(SEP);
-      if (spot != -1) {
-        title = line.substring(0, spot);
-        int spot2 = line.indexOf(SEP, 1+spot);
-        if (spot2 != -1) {
-          date = line.substring(1+spot, spot2);
-          body = line.substring(1+spot2, line.length());
-        } else 
-          date = body = "";
-      } else
-        title = date = body = "";
-
-      final String docID;
-      if (r != null) {
-        docID = "doc" + r.nextInt(numDocs);
-      } else {
-        docID = "doc" + incrNumDocsCreated();
+      if (spot == -1) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
       }
+      int spot2 = line.indexOf(SEP, 1 + spot);
+      if (spot2 == -1) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
+      }
+      final String title = line.substring(0, spot);
+      final String date = line.substring(1+spot, spot2);
+      final String body = line.substring(1+spot2, line.length());
+      final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
 
       if (doReuseFields) {
         idField.setValue(docID);
@@ -130,7 +132,10 @@
         bodyField.setValue(body);
         return doc;
       } else {
-        Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+        Field localIDField = new Field(BasicDocMaker.ID_FIELD,
+                                       docID,
+                                       Field.Store.YES,
+                                       Field.Index.NOT_ANALYZED_NO_NORMS);
 
         Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD,
                                           title,
@@ -174,16 +179,14 @@
 
     String line;
     synchronized(this) {
-      while(true) {
-        line = fileIn.readLine();
-        if (line == null) {
-          // Reset the file
-          openFile();
-          if (!forever)
-            throw new NoMoreDataException();
-        } else {
-          break;
+      line = fileIn.readLine();
+      if (line == null) {
+        if (!forever) {
+          throw new NoMoreDataException();
         }
+        // Reset the file
+        openFile();
+        return makeDocument();
       }
     }
 
@@ -199,15 +202,24 @@
   
   public synchronized void resetInputs() {
     super.resetInputs();
-    fileName = config.get("docs.file", null);
-    if (fileName == null)
-      throw new RuntimeException("docs.file must be set");
     openFile();
   }
 
   public void setConfig(Config config) {
     super.setConfig(config);
+    fileName = config.get("docs.file", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("docs.file must be set");
+    }
     doReuseFields = config.get("doc.reuse.fields", true);
+    String doBZCompress = config.get("bzip.compression", null);
+    if (doBZCompress != null) {
+      // Property was set, use the value.
+      bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
+    } else {
+      // Property was not set, attempt to detect based on file's extension
+      bzipCompressionEnabled = fileName.endsWith("bz2");
+    }
     numDocs = config.get("doc.random.id.limit", -1);
     if (numDocs != -1) {
       r = new Random(179);
@@ -216,16 +228,35 @@
 
   synchronized void openFile() {
     try {
-      if (fileIn != null)
+      if (fileIn != null) {
         fileIn.close();
+      }
       fileIS = new FileInputStream(fileName);
-      fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
+      if (bzipCompressionEnabled) {
+        // According to BZip2CompressorInputStream's code, it reads the first 
+        // two file header chars ('B' and 'Z'). We only need to wrap the
+        // underlying stream with a BufferedInputStream, since the code uses
+        // the read() method exclusively.
+        fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
+        fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
+      }
+      // Wrap the stream with a BufferedReader for several reasons:
+      // 1. We need the readLine() method.
+      // 2. Even if bzip.compression is enabled, and is wrapped with
+      // BufferedInputStream, wrapping with a buffer can still improve
+      // performance, since the BIS buffer will be used to read from the
+      // compressed stream, while the BR buffer will be used to read from the
+      // uncompressed stream.
+      fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
     } catch (IOException e) {
       throw new RuntimeException(e);
+    } catch (CompressorException e) {
+      throw new RuntimeException(e);
     }
   }
 
   public int numUniqueTexts() {
     return -1;
   }
+
 }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Thu Apr 16 09:46:30 2009
@@ -17,18 +17,39 @@
  * limitations under the License.
  */
 
+import java.io.BufferedOutputStream;
 import java.io.BufferedWriter;
 import java.io.FileOutputStream;
+import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
-
+/**
+ * A task which writes documents, one line per document. Each line is in the
+ * following format: title &lt;TAB&gt; date &lt;TAB&gt; body. The output of this
+ * task can be consumed by
+ * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
+ * to save the IO overhead of opening a file per document to be indexed.<br>
+ * 
+ * Supports the following parameters:
+ * <ul>
+ * <li>line.file.out - the name of the file to write the output to. That
+ * parameter is mandatory. <b>NOTE:</b> the file is re-created.
+ * <li>bzip.compression - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large. (optional, default:
+ * false).
+ * <li>doc.writeline.log.step - controls how many records to process before
+ * logging the status of the task. <b>NOTE:</b> to disable logging, set this
+ * value to 0 or negative. (optional, default:1000).
+ * </ul>
+ */
 public class WriteLineDocTask extends PerfTask {
 
   /**
@@ -36,33 +57,48 @@
    * an "added N docs" message should be logged.  
    */
   public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
-
-  public WriteLineDocTask(PerfRunData runData) {
-    super(runData);
-  }
+  public final static char SEP = '\t';
 
   private int logStep = -1;
   private int docSize = 0;
   int count = 0;
-  private BufferedWriter lineFileOut=null;
+  private BufferedWriter lineFileOut = null;
   private DocMaker docMaker;
   
-  public final static String SEP = "\t";
-  
-  /*
-   *  (non-Javadoc)
-   * @see PerfTask#setup()
-   */
-  public void setup() throws Exception {
-    super.setup();
-    if (lineFileOut==null) {
-      Config config = getRunData().getConfig();
-      String fileName = config.get("line.file.out", null);
-      if (fileName == null)
-        throw new Exception("line.file.out must be set");
-      lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
+  public WriteLineDocTask(PerfRunData runData) throws Exception {
+    super(runData);
+    Config config = runData.getConfig();
+    String fileName = config.get("line.file.out", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("line.file.out must be set");
+    }
+
+    OutputStream out = new FileOutputStream(fileName);
+    boolean doBzipCompression = false;
+    String doBZCompress = config.get("bzip.compression", null);
+    if (doBZCompress != null) {
+      // Property was set, use the value.
+      doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
+    } else {
+      // Property was not set, attempt to detect based on file's extension
+      doBzipCompression = fileName.endsWith("bz2");
+    }
+
+    if (doBzipCompression) {
+      // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int) 
+      // and does not use the write(byte[]) version. This proved to speed the 
+      // compression process by 70% !
+      out = new BufferedOutputStream(out, 1 << 16);
+      out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
+    }
+    lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
+    docMaker = runData.getDocMaker();
+    logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+    // To avoid the check 'if (logStep > 0)' in log(). This effectively turns
+    // logging off.
+    if (logStep <= 0) {
+      logStep = Integer.MAX_VALUE;
     }
-    docMaker = getRunData().getDocMaker();
   }
 
   public void tearDown() throws Exception {
@@ -71,61 +107,52 @@
   }
 
   public int doLogic() throws Exception {
-    Document doc;
-    if (docSize > 0) {
-      doc = docMaker.makeDocument(docSize);
-    } else {
-      doc = docMaker.makeDocument();
-    }
+    Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
 
     Field f = doc.getField(BasicDocMaker.BODY_FIELD);
-
-    String body, title, date;
-    if (f != null)
-      body = f.stringValue().replace('\t', ' ');
-    else
-      body = null;
+    String body = f != null ? f.stringValue().replace('\t', ' ') : null;
     
-    f = doc.getField(BasicDocMaker.TITLE_FIELD);
-    if (f != null)
-      title = f.stringValue().replace('\t', ' ');
-    else
-      title = "";
-
-    f = doc.getField(BasicDocMaker.DATE_FIELD);
-    if (f != null)
-      date = f.stringValue().replace('\t', ' ');
-    else
-      date = "";
-
     if (body != null) {
+      f = doc.getField(BasicDocMaker.TITLE_FIELD);
+      String title = f != null ? f.stringValue().replace('\t', ' ') : "";
+      
+      f = doc.getField(BasicDocMaker.DATE_FIELD);
+      String date = f != null ? f.stringValue().replace('\t', ' ') : "";
+      
       lineFileOut.write(title, 0, title.length());
       lineFileOut.write(SEP);
       lineFileOut.write(date, 0, date.length());
       lineFileOut.write(SEP);
       lineFileOut.write(body, 0, body.length());
       lineFileOut.newLine();
-      lineFileOut.flush();
     }
     return 1;
   }
 
-  private void log (int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
+  private void log(int count) {
+    // logStep is initialized in the ctor to a positive value. If the config
+    // file indicates no logging, or contains an invalid value, logStep is init
+    // to Integer.MAX_VALUE, so that logging will not occur (at least for the
+    // first Integer.MAX_VALUE records).
+    if (count % logStep == 0) {
+      System.out.println("--> " + Thread.currentThread().getName()
+          + " processed (write line) " + count + " docs");
     }
   }
 
+  public void close() throws Exception {
+    lineFileOut.close();
+    super.close();
+  }
+  
   /**
    * Set the params (docSize only)
    * @param params docSize, or 0 for no limit.
    */
   public void setParams(String params) {
-    super.setParams(params);
+    if (super.supportsParams()) {
+      super.setParams(params);
+    }
     docSize = (int) Float.parseFloat(params); 
   }
 
@@ -135,4 +162,5 @@
   public boolean supportsParams() {
     return true;
   }
+  
 }

Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java?rev=765543&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java Thu Apr 16 09:46:30 2009
@@ -0,0 +1,38 @@
+package org.apache.lucene.benchmark;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+
+import junit.framework.TestCase;
+
+/** Base class for all Benchmark unit tests. */
+public class BenchmarkTestCase extends TestCase {
+
+  private static final File workDir;
+
+  static {
+    workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile();
+    workDir.mkdirs();
+  }
+  
+  public File getWorkDir() {
+    return workDir;
+  }
+  
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java?rev=765543&r1=765542&r2=765543&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java Thu Apr 16 09:46:30 2009
@@ -17,188 +17,33 @@
 
 package org.apache.lucene.benchmark.byTask;
 
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+
 import junit.framework.TestCase;
+
 import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
 import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
 import org.apache.lucene.benchmark.byTask.utils.Algorithm;
 
-import java.io.File;
-import java.io.StringReader;
-import java.lang.reflect.Modifier;
-import java.util.ArrayList;
-import java.util.Iterator;
-
-/**
- * Test very simply that perf tasks are parses as expected.
- */
+/** Test very simply that perf tasks are parsed as expected. */
 public class TestPerfTasksParse extends TestCase {
 
-  private static final boolean DEBUG = false;
   static final String NEW_LINE = System.getProperty("line.separator");
   static final String INDENT = "  ";
 
   // properties in effect in all tests here
   static final String propPart = 
-    INDENT+"directory=RAMDirectory" + NEW_LINE +
-    INDENT+"print.props=false" + NEW_LINE
+    INDENT + "directory=RAMDirectory" + NEW_LINE +
+    INDENT + "print.props=false" + NEW_LINE
   ;
 
-  /*
-   * All known tasks. 
-   * As new tasks are added, add them here.
-   * It would be nice to do that automatically, unfortunately
-   * Java does not provide a "get all classes in package" or
-   * "get all sub-classes" functionality.  
-   */
-  static String singleTaskAlgs [];
-  
-  /* (non-Javadoc)
-   * @see junit.framework.TestCase#setUp()
-   */
-  protected void setUp() throws Exception {
-    super.setUp();
-    if (singleTaskAlgs==null) {
-      singleTaskAlgs = findTasks();
-    }
-  }
-
-  // one time initialization
-  static String [] findTasks () throws Exception {
-    ArrayList tsks = new ArrayList();
-    // init with tasks we know about
-    tsks.add(  " AddDoc                   "  );
-    tsks.add(  " AddDoc(1000.0)           "  );
-    tsks.add(  " ClearStats               "  );
-    tsks.add(  " CloseIndex               "  );
-    tsks.add(  " CloseReader              "  );
-    tsks.add(  " CreateIndex              "  );
-    tsks.add(  " DeleteDoc                "  );
-    tsks.add(  " DeleteDoc(500.0)         "  );
-    tsks.add(  " NewRound                 "  );
-    tsks.add(  " OpenIndex                "  );
-    tsks.add(  " OpenReader               "  );
-    tsks.add(  " Optimize                 "  );
-    tsks.add(  " RepAll                   "  );
-    tsks.add(  " RepSelectByPref prefix   "  );
-    tsks.add(  " RepSumByNameRound        "  );
-    tsks.add(  " RepSumByName             "  );
-    tsks.add(  " RepSumByPrefRound prefix "  );
-    tsks.add(  " RepSumByPref   prefix    "  );
-    tsks.add(  " ResetInputs              "  );
-    tsks.add(  " ResetSystemErase         "  );
-    tsks.add(  " ResetSystemSoft          "  );
-    tsks.add(  " Search                   "  );
-    tsks.add(  " SearchTravRet            "  );
-    tsks.add(  " SearchTravRet(100.0)     "  );
-    tsks.add(  " SearchTrav               "  );
-    tsks.add(  " SearchTrav(50.0)         "  );
-    tsks.add(  " SetProp                  "  );
-    tsks.add(  " SetProp(name,value)      "  );
-    tsks.add(  " Warm                     "  );
-    tsks.add(  "SearchTravRetLoadFieldSelector");
-    tsks.add("SearchTravRetLoadFieldSelector(body,title)");
-    
-    // if tasks.dir property is defined, look for additional tasks.
-    // this somewhat covers tasks that would be added in the future, in case
-    // the list above is not updated to cover them.
-    // some tasks would be tested more than once this way, but that's ok.
-    String tasksDir = System.getProperty("tasks.dir");
-    if (tasksDir !=null) {
-      String pkgPrefix = PerfTask.class.getPackage().getName()+".";
-      String taskNames[] = new File(tasksDir).list();
-      for (int i = 0; i < taskNames.length; i++) {
-        String name = taskNames[i].trim();
-        if (!name.endsWith("Task.class"))
-          continue; // Task class file only
-        name = name.substring(0,name.length()-6);
-        Class cls = Class.forName(pkgPrefix+name);
-        if (Modifier.isAbstract(cls.getModifiers()) || Modifier.isInterface(cls.getModifiers())) 
-          continue; // skip sbstract classes
-        if (!PerfTask.class.isAssignableFrom(cls))
-          continue; // not a task
-        name = name.substring(0,name.length()-4);
-        if (name.startsWith("Rep") && name.indexOf("Pref")>=0)
-          name += " prefix";
-        tsks.add(" "+name+" ");
-      }
-    }
-    return (String[]) tsks.toArray(new String[0]);
-  }
-  
-  
-  /**
-   * @param name test name
-   */
   public TestPerfTasksParse(String name) {
     super(name);
   }
 
-  /**
-   * Test the parsing of very simple tasks, for all tasks
-   */
-  public void testAllTasksSimpleParse() {
-    doTestAllTasksSimpleParse(false,false);
-  }
-  
-  /**
-   * Test the parsing of simple sequential sequences, for all tasks
-   */
-  public void testAllTasksSimpleParseSequntial() {
-    doTestAllTasksSimpleParse(true,false);
-  }
-
-  /**
-   * Test the parsing of simple parallel sequences, for all tasks
-   */
-  public void testAllTasksSimpleParseParallel() {
-    doTestAllTasksSimpleParse(true,true);
-  }
-  
-  // utility for simple parsing testing of all tasks.
-  private void doTestAllTasksSimpleParse(boolean parOrSeq, boolean par) {
-    for (int i = 0; i < singleTaskAlgs.length; i++) {
-      String testedTask = singleTaskAlgs[i];
-      if (parOrSeq) {
-        if (par) {
-          testedTask = "[ " + testedTask + " ] : 2";
-        } else {
-          testedTask = "{ " + testedTask + " } : 3";
-        }
-      }
-      try {
-        String algText = propPart+INDENT+testedTask;
-        logTstParsing(algText);
-        Benchmark benchmark = new Benchmark(new StringReader(algText));
-        Algorithm alg = benchmark.getAlgorithm();
-        ArrayList algTasks = alg.extractTasks();
-        // must find a task with this name in the algorithm
-        boolean foundName = false;
-        boolean foundPar = false;
-        String theTask = singleTaskAlgs[i].replaceAll(" +"," ").trim();
-        for (Iterator iter = algTasks.iterator(); iter.hasNext();) {
-          PerfTask task = (PerfTask) iter.next();
-          foundName |= (task.toString().indexOf(theTask)>=0);
-          foundPar |= (task instanceof TaskSequence && ((TaskSequence)task).isParallel());
-        }
-        assertTrue("Task "+testedTask+" was not found in "+alg.toString(),foundName);
-        if (parOrSeq) {
-          if (par) {
-            assertTrue("Task "+testedTask+" was supposed to be parallel in "+alg.toString(),foundPar);
-          } else {
-            assertFalse("Task "+testedTask+" was not supposed to be parallel in "+alg.toString(),foundPar);
-          }
-        }
-      } catch (Exception e) {
-        System.out.flush();
-        e.printStackTrace();
-        fail(e.getMessage());
-      }
-    }
-  }
-
-  /**
-   * Test the repetiotion parsing for parallel tasks
-   */
+  /** Test the repetition parsing for parallel tasks */
   public void testParseParallelTaskSequenceRepetition() throws Exception {
     String taskStr = "AddDoc";
     String parsedTasks = "[ "+taskStr+" ] : 1000";
@@ -219,9 +64,7 @@
     }
   }
 
-  /**
-   * Test the repetiotion parsing for sequential  tasks
-   */
+  /** Test the repetition parsing for sequential tasks */
   public void testParseTaskSequenceRepetition() throws Exception {
     String taskStr = "AddDoc";
     String parsedTasks = "{ "+taskStr+" } : 1000";
@@ -242,11 +85,4 @@
     }
   }
 
-  private void logTstParsing (String txt) {
-    if (!DEBUG) 
-      return;
-    System.out.println("Test parsing of");
-    System.out.println(txt);
-  }
-
 }

Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java?rev=765543&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java Thu Apr 16 09:46:30 2009
@@ -0,0 +1,169 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.Properties;
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.benchmark.BenchmarkTestCase;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
+import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
+import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
+import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+
+/** Tests the functionality of {@link LineDocMaker}. */
+public class LineDocMakerTest extends BenchmarkTestCase {
+
+  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+  private void createBZ2LineFile(File file) throws Exception {
+    OutputStream out = new FileOutputStream(file);
+    out = csFactory.createCompressorOutputStream("bzip2", out);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    StringBuffer doc = new StringBuffer();
+    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+    writer.write(doc.toString());
+    writer.newLine();
+    writer.close();
+  }
+
+  private void createRegularLineFile(File file) throws Exception {
+    OutputStream out = new FileOutputStream(file);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    StringBuffer doc = new StringBuffer();
+    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+    writer.write(doc.toString());
+    writer.newLine();
+    writer.close();
+  }
+  
+  private void doIndexAndSearchTest(File file, boolean setBZCompress,
+      String bz2CompressVal) throws Exception {
+
+    Properties props = new Properties();
+    
+    // LineDocMaker specific settings.
+    props.setProperty("docs.file", file.getAbsolutePath());
+    if (setBZCompress) {
+      props.setProperty("bzip.compression", bz2CompressVal);
+    }
+    
+    // Indexing configuration.
+    props.setProperty("analyzer", SimpleAnalyzer.class.getName());
+    props.setProperty("doc.maker", LineDocMaker.class.getName());
+    props.setProperty("directory", "RAMDirectory");
+    
+    // Create PerfRunData
+    Config config = new Config(props);
+    PerfRunData runData = new PerfRunData(config);
+
+    TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
+    tasks.addTask(new CreateIndexTask(runData));
+    tasks.addTask(new AddDocTask(runData));
+    tasks.addTask(new CloseIndexTask(runData));
+    tasks.doLogic();
+    
+    IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
+    TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
+    assertEquals(1, td.totalHits);
+    assertNotNull(td.scoreDocs[0]);
+    searcher.close();
+  }
+  
+  /* Tests LineDocMaker with a bzip2 input stream. */
+  public void testBZip2() throws Exception {
+    File file = new File(getWorkDir(), "one-line.bz2");
+    createBZ2LineFile(file);
+    doIndexAndSearchTest(file, true, "true");
+  }
+  
+  public void testBZip2AutoDetect() throws Exception {
+    File file = new File(getWorkDir(), "one-line.bz2");
+    createBZ2LineFile(file);
+    doIndexAndSearchTest(file, false, null);
+  }
+  
+  public void testBZip2WithBzipCompressionDisabled() throws Exception {
+    File file = new File(getWorkDir(), "one-line.bz2");
+    createBZ2LineFile(file);
+    
+    try {
+      doIndexAndSearchTest(file, true, "false");
+      fail("Some exception should have been thrown !");
+    } catch (Exception e) {
+      // expected.
+    }
+  }
+  
+  public void testRegularFile() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file);
+    doIndexAndSearchTest(file, false, null);
+  }
+  
+  public void testRegularFileWithBZipCompressionEnabled() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file);
+    
+    try {
+      doIndexAndSearchTest(file, true, "true");
+      fail("Some exception should have been thrown !");
+    } catch (Exception e) {
+      // expected.
+    }
+  }
+
+  public void testInvalidFormat() throws Exception {
+    String[] testCases = new String[] {
+      "", // empty line
+      "title", // just title
+      "title" + WriteLineDocTask.SEP, // title + SEP
+      "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
+      // note that title + SEP + body + SEP is a valid line, which results in an
+      // empty body
+    };
+    
+    for (int i = 0; i < testCases.length; i++) {
+      File file = new File(getWorkDir(), "one-line");
+      BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
+      writer.write(testCases[i]);
+      writer.newLine();
+      writer.close();
+      try {
+        doIndexAndSearchTest(file, false, null);
+        fail("Some exception should have been thrown for: [" + testCases[i] + "]");
+      } catch (Exception e) {
+        // expected.
+      }
+    }
+  }
+  
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java?rev=765543&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java Thu Apr 16 09:46:30 2009
@@ -0,0 +1,134 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.BenchmarkTestCase;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+
+/** Tests the functionality of {@link WriteLineDocTask}. */
+public class WriteLineDocTaskTest extends BenchmarkTestCase {
+
+  // class has to be public so that Class.forName.newInstance() will work
+  public static final class WriteLineDocMaker extends BasicDocMaker {
+
+    protected DocData getNextDocData() throws NoMoreDataException, Exception {
+      throw new UnsupportedOperationException("not implemented");
+    }
+
+    public Document makeDocument() throws Exception {
+      Document doc = new Document();
+      doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      return doc;
+    }
+    
+    public int numUniqueTexts() {
+      return 0;
+    }
+    
+  }
+  
+  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+  private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
+    Properties props = new Properties();
+    props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
+    props.setProperty("line.file.out", file.getAbsolutePath());
+    if (setBZCompress) {
+      props.setProperty("bzip.compression", bz2CompressVal);
+    }
+    props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
+    Config config = new Config(props);
+    return new PerfRunData(config);
+  }
+  
+  private void doReadTest(File file, boolean bz2File) throws Exception {
+    InputStream in = new FileInputStream(file);
+    if (bz2File) {
+      in = csFactory.createCompressorInputStream("bzip2", in);
+    }
+    BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
+    try {
+      String line = br.readLine();
+      assertNotNull(line);
+      String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
+      assertEquals(3, parts.length);
+      assertEquals("title", parts[0]);
+      assertEquals("date", parts[1]);
+      assertEquals("body", parts[2]);
+      assertNull(br.readLine());
+    } finally {
+      br.close();
+    }
+  }
+  
+  /* Tests WriteLineDocTask with a bzip2 format. */
+  public void testBZip2() throws Exception {
+    
+    // Create a document in bz2 format.
+    File file = new File(getWorkDir(), "one-line.bz2");
+    PerfRunData runData = createPerfRunData(file, true, "true");
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    doReadTest(file, true);
+  }
+  
+  public void testBZip2AutoDetect() throws Exception {
+    
+    // Create a document in bz2 format.
+    File file = new File(getWorkDir(), "one-line.bz2");
+    PerfRunData runData = createPerfRunData(file, false, null);
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    doReadTest(file, true);
+  }
+  
+  public void testRegularFile() throws Exception {
+    
+    // Create a document in regular format.
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, true, "false");
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    doReadTest(file, false);
+  }
+  
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message