lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dor...@apache.org
Subject svn commit: r1083816 - in /lucene/dev/trunk: ./ lucene/ modules/benchmark/ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ modules/benchmark/src/test/org/apache/l...
Date Mon, 21 Mar 2011 14:59:43 GMT
Author: doronc
Date: Mon Mar 21 14:59:42 2011
New Revision: 1083816

URL: http://svn.apache.org/viewvc?rev=1083816&view=rev
Log:
LUCENE-2958: WriteLineDocTask improvements - flexible line fields definition - port/merge
from 3x.

Modified:
    lucene/dev/trunk/   (props changed)
    lucene/dev/trunk/lucene/   (props changed)
    lucene/dev/trunk/modules/benchmark/CHANGES.txt
    lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
    lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
    lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
    lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
    lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java

Modified: lucene/dev/trunk/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/CHANGES.txt?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/benchmark/CHANGES.txt Mon Mar 21 14:59:42 2011
@@ -3,6 +3,13 @@ Lucene Benchmark Contrib Change Log
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
 03/21/2011
+  LUCENE-2958: WriteLineDocTask improvements - allow to emit line docs also for empty
+  docs, and be flexible about which fields are added to the line file. For this, a header
+  line was added to the line file. That header is examined by LineDocSource. Old line
+  files which have no header line are handled as before, imposing the default header.
+  (Doron Cohen, Shai Erera, Mike McCandless)
+  
+03/21/2011
   LUCENE-2964: Allow benchmark tasks from alternative packages,
   specified through a new property "alt.tasks.packages".
   (Doron Cohen, Shai Erera)

Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
(original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
Mon Mar 21 14:59:42 2011
@@ -22,6 +22,9 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
+import java.util.Arrays;
+import java.util.Properties;
 
 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
 import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -40,16 +43,136 @@ import org.apache.lucene.benchmark.byTas
  * <ul>
  * <li>docs.file=&lt;path to the file&gt;
  * <li>content.source.encoding - default to UTF-8.
+ * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which
differs 
+ *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
  * </ul>
  */
 public class LineDocSource extends ContentSource {
 
-  private final static char SEP = WriteLineDocTask.SEP;
+  /** Reader of a single input line into {@link DocData}. */
+  public static abstract class LineParser {
+    protected final String[] header;
+    /** Construct with the header 
+     * @param header header line found in the input file, or null if none
+     */
+    public LineParser(String[] header) {
+      this.header = header; 
+    }
+    /** parse an input line and fill doc data appropriately */
+    public abstract void parseLine(DocData docData, String line);
+  }
+  
+  /** 
+   * {@link LineParser} which ignores the header passed to its constructor
+   * and assumes simply that field names and their order are the same 
+   * as in {@link WriteLineDocTask#DEFAULT_FIELDS} 
+   */
+  public static class SimpleLineParser extends LineParser {
+    public SimpleLineParser(String[] header) {
+      super(header);
+    }
+    public void parseLine(DocData docData, String line) {
+      int k1 = 0;
+      int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2<0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing:
separator title::date)!");
+      }
+      docData.setTitle(line.substring(k1,k2));
+      k1 = k2+1;
+      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2<0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing:
separator date::body)!");
+      }
+      docData.setDate(line.substring(k1,k2));
+      k1 = k2+1;
+      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2>=0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (too many
separators)!");
+      }
+      // last one
+      docData.setBody(line.substring(k1));
+    }
+  }
+  
+  /** 
+   * {@link LineParser} which sets field names and order by 
+   * the header - any header - of the lines file.
+   * It is less efficient than {@link SimpleLineParser} but more powerful.
+   */
+  public static class HeaderLineParser extends LineParser {
+    private enum FieldName { NAME , TITLE , DATE , BODY, PROP } 
+    private final FieldName[] posToF;
+    public HeaderLineParser(String[] header) {
+      super(header);
+      posToF = new FieldName[header.length];
+      for (int i=0; i<header.length; i++) {
+        String f = header[i];
+        if (DocMaker.NAME_FIELD.equals(f)) {
+          posToF[i] = FieldName.NAME;
+        } else if (DocMaker.TITLE_FIELD.equals(f)) {
+          posToF[i] = FieldName.TITLE;
+        } else if (DocMaker.DATE_FIELD.equals(f)) {
+          posToF[i] = FieldName.DATE;
+        } else if (DocMaker.BODY_FIELD.equals(f)) {
+          posToF[i] = FieldName.BODY;
+        } else {
+          posToF[i] = FieldName.PROP;
+        }
+      }
+    }
+    
+    public void parseLine(DocData docData, String line) {
+      int n = 0;
+      int k1 = 0;
+      int k2;
+      while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
+        if (n>=header.length) {
+          throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead
of "+header.length+" :: [" + line + "]");
+        }
+        setDocDataField(docData, n, line.substring(k1,k2));
+        ++n;
+        k1 = k2 + 1;
+      }
+      if (n!=header.length-1) {
+        throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead
of "+header.length+" :: [" + line + "]");
+      }
+      // last one
+      setDocDataField(docData, n, line.substring(k1)); 
+    }
 
+    private void setDocDataField(DocData docData, int position, String text) {
+      switch(posToF[position]) {
+        case NAME: 
+          docData.setName(text);
+          break;
+        case TITLE: 
+          docData.setTitle(text);
+          break;
+        case DATE: 
+          docData.setDate(text);
+          break;
+        case BODY: 
+          docData.setBody(text);
+          break;
+        case PROP:
+          Properties p = docData.getProps();
+          if (p==null) {
+            p = new Properties();
+            docData.setProps(p);
+          }
+          p.setProperty(header[position], text);
+          break;
+      }
+    }
+  }
+  
   private File file;
   private BufferedReader reader;
   private int readCount;
 
+  private LineParser docDataLineReader = null;
+  private boolean skipHeaderLine = false;
+
   private synchronized void openFile() {
     try {
       if (reader != null) {
@@ -57,6 +180,9 @@ public class LineDocSource extends Conte
       }
       InputStream is = getInputStream(file);
       reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
+      if (skipHeaderLine) {
+        reader.readLine(); // skip one line - the header line - already handled that info
+      }
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -77,7 +203,6 @@ public class LineDocSource extends Conte
     
     synchronized(this) {
       line = reader.readLine();
-      myID = readCount++;
       if (line == null) {
         if (!forever) {
           throw new NoMoreDataException();
@@ -86,27 +211,54 @@ public class LineDocSource extends Conte
         openFile();
         return getNextDocData(docData);
       }
+      if (docDataLineReader == null) { // first line ever, one time initialization,
+        docDataLineReader = createDocDataLineReader(line);
+        if (skipHeaderLine) {
+          return getNextDocData(docData);
+        }
+      }
+      // increment IDS only once...
+      myID = readCount++; 
     }
     
-    // A line must be in the following format. If it's not, fail !
-    // title <TAB> date <TAB> body <NEWLINE>
-    int spot = line.indexOf(SEP);
-    if (spot == -1) {
-      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-    }
-    int spot2 = line.indexOf(SEP, 1 + spot);
-    if (spot2 == -1) {
-      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-    }
     // The date String was written in the format of DateTools.dateToString.
     docData.clear();
     docData.setID(myID);
-    docData.setBody(line.substring(1 + spot2, line.length()));
-    docData.setTitle(line.substring(0, spot));
-    docData.setDate(line.substring(1 + spot, spot2));
+    docDataLineReader.parseLine(docData, line);
     return docData;
   }
 
+  private LineParser createDocDataLineReader(String line) {
+    String[] header;
+    String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
+
+    if (line.startsWith(headIndicator)) {
+      header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
+      skipHeaderLine = true; // mark to skip the header line when input file is reopened
+    } else {
+      header = WriteLineDocTask.DEFAULT_FIELDS;
+    }
+    
+    // if a specific DocDataLineReader was configured, must respect it
+    String docDataLineReaderClassName = getConfig().get("line.parser", null);
+    if (docDataLineReaderClassName!=null) {
+      try {
+        final Class<? extends LineParser> clazz = 
+          Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
+        Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
+        return cnstr.newInstance((Object)header);
+      } catch (Exception e) {
+        throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
+      }
+    }
+
+    // if this is the simple case (header equals the default fields), use the simple parser
+    if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
+      return new SimpleLineParser(header);
+    }
+    return new HeaderLineParser(header);
+  }
+
   @Override
   public void resetInputs() throws IOException {
     super.resetInputs();

Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
(original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
Mon Mar 21 14:59:42 2011
@@ -23,6 +23,8 @@ import java.io.FileOutputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -41,11 +43,17 @@ import org.apache.lucene.document.Field;
  * to save the IO overhead of opening a file per document to be indexed.<br>
  * Supports the following parameters:
  * <ul>
- * <li>line.file.out - the name of the file to write the output to. That
+ * <li><b>line.file.out</b> - the name of the file to write the output to.
That
  * parameter is mandatory. <b>NOTE:</b> the file is re-created.
- * <li>bzip.compression - whether the output should be bzip-compressed. This is
- * recommended when the output file is expected to be large. (optional, default:
- * false).
+ * <li><b>bzip.compression</b> - whether the output should be bzip-compressed.
This is
+ * recommended when the output file is expected to be large. 
+ * <li><b>line.fields</b> - which fields should be written in each line.
+ * (optional, default: {@link #DEFAULT_FIELDS}).
+ * <li><b>sufficient.fields</b> - list of field names, separated by comma,
which, 
+ * if all of them are missing, the document will be skipped. For example, to require 
+ * that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
+ * that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.
   
+ * (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
  * </ul>
  * <b>NOTE:</b> this class is not thread-safe and if used by multiple threads
the
  * output is unspecified (as all will write to the same output file in a
@@ -53,13 +61,32 @@ import org.apache.lucene.document.Field;
  */
 public class WriteLineDocTask extends PerfTask {
 
+  public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
+
   public final static char SEP = '\t';
   
+  /**
+   * Fields to be written by default
+   */
+  public static final String[] DEFAULT_FIELDS = new String[] {
+    DocMaker.TITLE_FIELD,
+    DocMaker.DATE_FIELD,
+    DocMaker.BODY_FIELD,
+  };
+  
+  /**
+   * Default sufficient fields: a doc is skipped unless at least one of them is non-empty.
+   */
+  public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
+  
   private int docSize = 0;
   private PrintWriter lineFileOut = null;
   private DocMaker docMaker;
   private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
   private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
+  private final String[] fieldsToWrite;;
+  private final boolean[] sufficientFields;
+  private final boolean checkSufficientFields;
   
   public WriteLineDocTask(PerfRunData runData) throws Exception {
     super(runData);
@@ -89,6 +116,51 @@ public class WriteLineDocTask extends Pe
     }
     lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"),
1 << 16));
     docMaker = runData.getDocMaker();
+    
+    // init fields 
+    String f2r = config.get("line.fields",null);
+    if (f2r == null) {
+      fieldsToWrite = DEFAULT_FIELDS;
+    } else {
+      if (f2r.indexOf(SEP)>=0) {
+        throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator
char: "+SEP);
+      }
+      fieldsToWrite = f2r.split(","); 
+    }
+    
+    // init sufficient fields
+    sufficientFields = new boolean[fieldsToWrite.length];
+    String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
+    if (",".equals(suff)) {
+      checkSufficientFields = false;
+    } else {
+      checkSufficientFields = true;
+      HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
+      for (int i=0; i<fieldsToWrite.length; i++) {
+        if (sf.contains(fieldsToWrite[i])) {
+          sufficientFields[i] = true;
+        }
+      }
+    }
+    
+    writeHeader();
+  }
+
+  /**
+   * Write a header to the lines file - indicating how to read the file later 
+   */
+  private void writeHeader() {
+    StringBuilder sb = threadBuffer.get();
+    if (sb == null) {
+      sb = new StringBuilder();
+      threadBuffer.set(sb);
+    }
+    sb.setLength(0);
+    sb.append(FIELDS_HEADER_INDICATOR);
+    for (String f : fieldsToWrite) {
+      sb.append(SEP).append(f);
+    }
+    lineFileOut.println(sb.toString());
   }
 
   @Override
@@ -106,27 +178,26 @@ public class WriteLineDocTask extends Pe
       threadNormalizer.set(matcher);
     }
     
-    Field f = doc.getField(DocMaker.BODY_FIELD);
-    String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-    
-    f = doc.getField(DocMaker.TITLE_FIELD);
-    String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-    
-    if (body.length() > 0 || title.length() > 0) {
-      
-      f = doc.getField(DocMaker.DATE_FIELD);
-      String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-      
-      StringBuilder sb = threadBuffer.get();
-      if (sb == null) {
-        sb = new StringBuilder();
-        threadBuffer.set(sb);
-      }
-      sb.setLength(0);
-      sb.append(title).append(SEP).append(date).append(SEP).append(body);
+    StringBuilder sb = threadBuffer.get();
+    if (sb == null) {
+      sb = new StringBuilder();
+      threadBuffer.set(sb);
+    }
+    sb.setLength(0);
+
+    boolean sufficient = !checkSufficientFields;
+    for (int i=0; i<fieldsToWrite.length; i++) {
+      Field f = doc.getField(fieldsToWrite[i]);
+      String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
+      sb.append(text).append(SEP);
+      sufficient |= text.length()>0 && sufficientFields[i];
+    }
+    if (sufficient) {
+      sb.setLength(sb.length()-1); // remove redundant last separator
       // lineFileOut is a PrintWriter, which synchronizes internally in println.
       lineFileOut.println(sb.toString());
     }
+
     return 1;
   }
 

Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
(original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
Mon Mar 21 14:59:42 2011
@@ -36,6 +36,7 @@ import org.apache.lucene.benchmark.byTas
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
 import org.apache.lucene.collation.CollationKeyAnalyzer;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldsEnum;
@@ -393,8 +394,13 @@ public class TestPerfTasksLogic extends 
 
     BufferedReader r = new BufferedReader(new FileReader(lineFile));
     int numLines = 0;
-    while(r.readLine() != null)
+    String line;
+    while((line = r.readLine()) != null) {
+      if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR))
{
+        continue; // do not count the header line as a doc 
+      }
       numLines++;
+    }
     r.close();
     assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but
was " + numLines, NUM_TRY_DOCS, numLines);
     

Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
(original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
Mon Mar 21 14:59:42 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTa
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.util.Properties;
@@ -28,6 +29,8 @@ import org.apache.commons.compress.compr
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
 import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
 import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
 import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
@@ -44,29 +47,71 @@ public class LineDocSourceTest extends B
 
   private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
 
-  private void createBZ2LineFile(File file) throws Exception {
+  private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
     OutputStream out = new FileOutputStream(file);
     out = csFactory.createCompressorOutputStream("bzip2", out);
     BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    writeDocsToFile(writer, addHeader, null);
+    writer.close();
+  }
+
+  private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields)
throws IOException {
+    if (addHeader) {
+      writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.TITLE_FIELD);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.DATE_FIELD);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.BODY_FIELD);
+      if (otherFields!=null) {
+        // additional field names in the header 
+        for (Object fn : otherFields.keySet()) {
+          writer.write(WriteLineDocTask.SEP);
+          writer.write(fn.toString());
+        }
+      }
+      writer.newLine();
+    }
     StringBuilder doc = new StringBuilder();
-    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
+    if (otherFields!=null) {
+      // additional field values in the doc line 
+      for (Object fv : otherFields.values()) {
+        doc.append(WriteLineDocTask.SEP).append(fv.toString());
+      }
+    }
     writer.write(doc.toString());
     writer.newLine();
+  }
+
+  private void createRegularLineFile(File file, boolean addHeader) throws Exception {
+    OutputStream out = new FileOutputStream(file);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    writeDocsToFile(writer, addHeader, null);
     writer.close();
   }
 
-  private void createRegularLineFile(File file) throws Exception {
+  private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws
Exception {
     OutputStream out = new FileOutputStream(file);
     BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
-    StringBuilder doc = new StringBuilder();
-    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
-    writer.write(doc.toString());
-    writer.newLine();
+    Properties p = new Properties();
+    for (String f : extraFields) {
+      p.setProperty(f, f);
+    }
+    writeDocsToFile(writer, true, p);
     writer.close();
   }
   
   private void doIndexAndSearchTest(File file, boolean setBZCompress,
-      String bz2CompressVal) throws Exception {
+      String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField)
throws Exception {
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass,
1, storedField); // no extra repetitions
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass,
2, storedField); // 1 extra repetition
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass,
4, storedField); // 3 extra repetitions
+  }
+  
+  private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
+      String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds,
String storedField) throws Exception {
 
     Properties props = new Properties();
     
@@ -75,11 +120,16 @@ public class LineDocSourceTest extends B
     if (setBZCompress) {
       props.setProperty("bzip.compression", bz2CompressVal);
     }
+    if (lineParserClass != null) {
+      props.setProperty("line.parser", lineParserClass.getName());
+    }
     
     // Indexing configuration.
     props.setProperty("analyzer", MockAnalyzer.class.getName());
     props.setProperty("content.source", LineDocSource.class.getName());
     props.setProperty("directory", "RAMDirectory");
+    props.setProperty("doc.stored", "true");
+    props.setProperty("doc.index.props", "true");
     
     // Create PerfRunData
     Config config = new Config(props);
@@ -87,34 +137,60 @@ public class LineDocSourceTest extends B
 
     TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
     tasks.addTask(new CreateIndexTask(runData));
-    tasks.addTask(new AddDocTask(runData));
+    for (int i=0; i<numAdds; i++) {
+      tasks.addTask(new AddDocTask(runData));
+    }
     tasks.addTask(new CloseIndexTask(runData));
     tasks.doLogic();
     
     IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
     TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
-    assertEquals(1, td.totalHits);
+    assertEquals(numAdds, td.totalHits);
     assertNotNull(td.scoreDocs[0]);
+    
+    if (storedField==null) {
+      storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name ==
value
+    }
+    assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
+
     searcher.close();
   }
   
   /* Tests LineDocSource with a bzip2 input stream. */
   public void testBZip2() throws Exception {
     File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-    doIndexAndSearchTest(file, true, "true");
+    createBZ2LineFile(file,true);
+    doIndexAndSearchTest(file, true, "true", null, null);
+  }
+
+  public void testBZip2NoHeaderLine() throws Exception {
+    File file = new File(getWorkDir(), "one-line.bz2");
+    createBZ2LineFile(file,false);
+    doIndexAndSearchTest(file, true, "true", null, null);
   }
   
   public void testBZip2AutoDetect() throws Exception {
     File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-    doIndexAndSearchTest(file, false, null);
+    createBZ2LineFile(file,false);
+    doIndexAndSearchTest(file, false, null, null, null);
   }
   
   public void testRegularFile() throws Exception {
     File file = new File(getWorkDir(), "one-line");
-    createRegularLineFile(file);
-    doIndexAndSearchTest(file, false, null);
+    createRegularLineFile(file,true);
+    doIndexAndSearchTest(file, false, null, null, null);
+  }
+
+  public void testRegularFileSpecialHeader() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file,true);
+    doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
+  }
+
+  public void testRegularFileNoHeaderLine() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file,false);
+    doIndexAndSearchTest(file, false, null, null, null);
   }
 
   public void testInvalidFormat() throws Exception {
@@ -134,7 +210,7 @@ public class LineDocSourceTest extends B
       writer.newLine();
       writer.close();
       try {
-        doIndexAndSearchTest(file, false, null);
+        doIndexAndSearchTest(file, false, null, null, null);
         fail("Some exception should have been thrown for: [" + testCases[i] + "]");
       } catch (Exception e) {
         // expected.
@@ -142,4 +218,19 @@ public class LineDocSourceTest extends B
     }
   }
   
+  /** Doc Name is not part of the default header */
+  public void testWithDocsName()  throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
+    doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
+  }
+
+  /** Use field names that are not defined in DocMaker and so will go to Properties */
+  public void testWithProperties()  throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    String specialField = "mySpecialField";
+    createRegularLineFileWithMoreFields(file, specialField);
+    doIndexAndSearchTest(file, false, null, null, specialField);
+  }
+  
 }

Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
(original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
Mon Mar 21 14:59:42 2011
@@ -98,6 +98,25 @@ public class WriteLineDocTaskTest extend
       return doc;
     }
   }
+
+  // class has to be public so that Class.forName.newInstance() will work
+  // same as JustDate just that this one is treated as legal
+  public static final class LegalJustDateDocMaker extends DocMaker {
+    @Override
+    public Document makeDocument() throws Exception {
+      Document doc = new Document();
+      doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      return doc;
+    }
+  }
+
+  // class has to be public so that Class.forName.newInstance() will work
+  public static final class EmptyDocMaker extends DocMaker {
+    @Override
+    public Document makeDocument() throws Exception {
+      return new Document();
+    }
+  }
   
   // class has to be public so that Class.forName.newInstance() will work
   public static final class ThreadingDocMaker extends DocMaker {
@@ -117,6 +136,7 @@ public class WriteLineDocTaskTest extend
   private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
 
   private PerfRunData createPerfRunData(File file, boolean setBZCompress,
+                                        boolean allowEmptyDocs,
                                         String bz2CompressVal,
                                         String docMakerName) throws Exception {
     Properties props = new Properties();
@@ -126,6 +146,13 @@ public class WriteLineDocTaskTest extend
       props.setProperty("bzip.compression", bz2CompressVal);
     }
     props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
+    if (allowEmptyDocs) {
+      props.setProperty("sufficient.fields", ",");
+    }
+    if (docMakerName.equals(LegalJustDateDocMaker.class.getName())) {
+      props.setProperty("line.fields", DocMaker.DATE_FIELD);
+      props.setProperty("sufficient.fields", DocMaker.DATE_FIELD);
+    }
     Config config = new Config(props);
     return new PerfRunData(config);
   }
@@ -139,6 +166,8 @@ public class WriteLineDocTaskTest extend
     BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
     try {
       String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
       assertNotNull(line);
       String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
       int numExpParts = expBody == null ? 2 : 3;
@@ -153,13 +182,17 @@ public class WriteLineDocTaskTest extend
       br.close();
     }
   }
+
+  private void assertHeaderLine(String line) {
+    assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
+  }
   
   /* Tests WriteLineDocTask with a bzip2 format. */
   public void testBZip2() throws Exception {
     
     // Create a document in bz2 format.
     File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -171,7 +204,7 @@ public class WriteLineDocTaskTest extend
     
     // Create a document in bz2 format.
     File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -183,7 +216,7 @@ public class WriteLineDocTaskTest extend
     
     // Create a document in regular format.
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -196,7 +229,7 @@ public class WriteLineDocTaskTest extend
     // separator char. However, it didn't replace newline characters, which
     // resulted in errors in LineDocSource.
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -209,7 +242,7 @@ public class WriteLineDocTaskTest extend
     // had a TITLE element (LUCENE-1755). It should throw away documents if they
     // don't have BODY nor TITLE
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -219,7 +252,7 @@ public class WriteLineDocTaskTest extend
   
   public void testEmptyTitle() throws Exception {
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -227,9 +260,10 @@ public class WriteLineDocTaskTest extend
     doReadTest(file, false, "", "date", "body");
   }
   
+  /** Fail by default when there's only date */
   public void testJustDate() throws Exception {
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
@@ -237,15 +271,53 @@ public class WriteLineDocTaskTest extend
     BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
     try {
       String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
       assertNull(line);
     } finally {
       br.close();
     }
   }
 
+  public void testLegalJustDate() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+    try {
+      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
+      assertNotNull(line);
+    } finally {
+      br.close();
+    }
+  }
+
+  public void testEmptyDoc() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+    try {
+      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
+      assertNotNull(line);
+    } finally {
+      br.close();
+    }
+  }
+
   public void testMultiThreaded() throws Exception {
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
     final WriteLineDocTask wldt = new WriteLineDocTask(runData);
     Thread[] threads = new Thread[10];
     for (int i = 0; i < threads.length; i++) {
@@ -269,8 +341,10 @@ public class WriteLineDocTaskTest extend
     Set<String> ids = new HashSet<String>();
     BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
     try {
+      String line = br.readLine();
+      assertHeaderLine(line); // header line is written once, no matter how many threads there are
       for (int i = 0; i < threads.length; i++) {
-        String line = br.readLine();
+        line = br.readLine();
         String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
         assertEquals(3, parts.length);
         // check that all thread names written are the same in the same line



Mime
View raw message