lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r559366 - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/tasks/ src/java/org/apache/lucene/benchm...
Date Wed, 25 Jul 2007 08:55:00 GMT
Author: mikemccand
Date: Wed Jul 25 01:54:58 2007
New Revision: 559366

URL: http://svn.apache.org/viewvc?view=rev&rev=559366
Log:
LUCENE-947: add creation of & indexing from 'one document per line' text files to minimize
IO overhead of creating documents when running tests

Added:
    lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg   (with props)
    lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg   (with props)
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
  (with props)
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
  (with props)
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Wed Jul 25 01:54:58 2007
@@ -4,6 +4,11 @@
 
 $Id:$
 
+7/24/07
+  LUCENE-947: Add support for creating and index "one document per
+  line" from a large text file, which reduces per-document overhead of
+  opening a single file for each document.
+
 6/30/07
   LUCENE-848: Added support for Wikipedia benchmarking.
 

Added: lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg?view=auto&rev=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg Wed Jul 25 01:54:58 2007
@@ -0,0 +1,43 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg will process the Reuters documents feed to produce a
+# single file that contains all documents, one per line.
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/createLineFile.alg
+#
+# Then, to index the documents in the line file, see
+# indexLineFile.alg.
+#
+
+# Where to get documents from:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+
+# Where to write the line file output:
+line.file.out=work/reuters.lines.txt
+
+# Stop after processing the document feed once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{WriteLineDoc()}: * 

Propchange: lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg?view=auto&rev=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg Wed Jul 25 01:54:58 2007
@@ -0,0 +1,53 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This file indexes documents contained in a single text file, one per
+# line.  See createLineFile.alg for how to create this file.  The
+# benefit of this is it removes the IO cost of opening one file per
+# document to let you more accurately measure time spent analyzing and
+# indexing your documents vs time spent creating the documents.
+#
+# To use this, you must first run the createLineFile.alg, then cd to
+# contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/indexLineFile.alg
+#
+
+analyzer=org.apache.lucene.analysis.SimpleAnalyzer
+
+# Feed that knows how to process the line file format:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
+
+# File that contains one document per line:
+docs.file=work/reuters.lines.txt
+
+# Process documents only once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Reset the system, create a new index, index all docs from the line
+# file, close the index, produce a report.
+
+ResetSystemErase
+CreateIndex
+{AddDoc}: *
+CloseIndex
+
+RepSumByPref AddDoc 

Propchange: lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
Wed Jul 25 01:54:58 2007
@@ -70,6 +70,7 @@
   private IndexReader indexReader;
   private IndexWriter indexWriter;
   private Config config;
+  private long startTimeMillis;
   
   // constructor
   public PerfRunData (Config config) throws Exception {
@@ -136,6 +137,15 @@
     // release unused stuff
     System.runFinalization();
     System.gc();
+
+    startTimeMillis = System.currentTimeMillis();
+  }
+
+  /**
+   * @return Start time in milliseconds
+   */
+  public long getStartTimeMillis() {
+    return startTimeMillis;
   }
 
   /**

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
Wed Jul 25 01:54:58 2007
@@ -39,6 +39,8 @@
  * doc.stored=true|FALSE<br/>
  * doc.tokenized=TRUE|false<br/>
  * doc.term.vector=true|FALSE<br/>
+ * doc.term.vector.positions=true|FALSE<br/>
+ * doc.term.vector.offsets=true|FALSE<br/>
  * doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field<br/>
  */
 public abstract class BasicDocMaker implements DocMaker {
@@ -55,7 +57,13 @@
   // leftovers are thread local, because it is unsafe to share residues between threads
   private ThreadLocal leftovr = new ThreadLocal();
 
-  static final String BODY_FIELD = "body";
+  public static final String BODY_FIELD = "body";
+  public static final String TITLE_FIELD = "doctitle";
+  public static final String DATE_FIELD = "docdate";
+  public static final String ID_FIELD = "docid";
+  public static final String BYTES_FIELD = "bytes";
+  public static final String NAME_FIELD = "docname";
+
   private long numBytes = 0;
   private long numUniqueBytes = 0;
 
@@ -97,17 +105,17 @@
   private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException
{
     int docid = incrNumDocsCreated();
     Document doc = new Document();
-    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
+    doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal));
     if (docData.getName()!=null) {
       String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
-      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
+      doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal));
     }
     if (docData.getDate()!=null) {
       String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
-      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
+      doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
     }
     if (docData.getTitle()!=null) {
-      doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
+      doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
     }
     if (docData.getBody()!=null && docData.getBody().length()>0) {
       String bdy;
@@ -127,7 +135,7 @@
       }
       doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
       if (storeBytes == true) {
-        doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
+        doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
       }
     }
 
@@ -188,7 +196,18 @@
     boolean termVec = config.get("doc.term.vector",false);
     storeVal = (stored ? Field.Store.YES : Field.Store.NO);
     indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
-    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
+    boolean termVecPositions = config.get("doc.term.vector.positions",false);
+    boolean termVecOffsets = config.get("doc.term.vector.offsets",false);
+    if (termVecPositions && termVecOffsets)
+      termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
+    else if (termVecPositions)
+      termVecVal = Field.TermVector.WITH_POSITIONS;
+    else if (termVecOffsets)
+      termVecVal = Field.TermVector.WITH_OFFSETS;
+    else if (termVec)
+      termVecVal = Field.TermVector.YES;
+    else
+      termVecVal = Field.TermVector.NO;
     storeBytes = config.get("doc.store.body.bytes", false);
     forever = config.get("doc.maker.forever",true);
   }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
Wed Jul 25 01:54:58 2007
@@ -40,7 +40,7 @@
  */
 public class DirDocMaker extends BasicDocMaker {
 
-  private DateFormat dateFormat;
+  private ThreadLocal dateFormat = new ThreadLocal();
   private File dataDir = null;
   private int iteration=0;
   
@@ -148,11 +148,21 @@
     if (inputFiles==null) {
       throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
     }
-    // date format: 30-MAR-1987 14:22:36
-    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
-    dateFormat.setLenient(true);
   }
 
+  // get/initiate a thread-local simple date format (must do so 
+  // because SimpleDateFormat is not thread-safe).
+  protected DateFormat getDateFormat () {
+    DateFormat df = (DateFormat) dateFormat.get();
+    if (df == null) {
+      // date format: 30-MAR-1987 14:22:36.87
+      df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+      df.setLenient(true);
+      dateFormat.set(df);
+    }
+    return df;
+  }
+  
   protected DocData getNextDocData() throws Exception {
     File f = null;
     String name = null;
@@ -184,7 +194,7 @@
     reader.close();
     addBytes(f.length());
     
-    Date date = dateFormat.parse(dateStr.trim()); 
+    Date date = getDateFormat().parse(dateStr.trim()); 
     return new DocData(name, bodyBuf.toString(), title, null, date);
   }
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
Wed Jul 25 01:54:58 2007
@@ -46,7 +46,7 @@
 
     Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer",
             "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
-    String defaultField = config.get("file.query.maker.default.field", "body");
+    String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD);
     QueryParser qp = new QueryParser(defaultField, anlzr);
 
     List qq = new ArrayList();

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java?view=auto&rev=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
(added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
Wed Jul 25 01:54:58 2007
@@ -0,0 +1,159 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.FileReader;
+
+/**
+ * A DocMaker reading one line at a time as a Document from
+ * a single file.  This saves IO cost (over DirDocMaker) of
+ * recursing through a directory and opening a new file for
+ * every document.  It also re-uses its Document and Field
+ * instance to improve indexing speed.
+ *
+ * Config properties:
+ * docs.file=&lt;path to the file%gt;
+ */
+public class LineDocMaker extends BasicDocMaker {
+
+  private BufferedReader fileIn;
+  private ThreadLocal docState = new ThreadLocal();
+  private String fileName;
+
+  private static int READER_BUFFER_BYTES = 64*1024;
+
+  private class DocState {
+    Document doc;
+    Field bodyField;
+    Field titleField;
+    Field dateField;
+
+    public DocState() {
+
+      bodyField = new Field(BasicDocMaker.BODY_FIELD,
+                            "",
+                            storeVal,
+                            Field.Index.TOKENIZED,
+                            termVecVal);
+      titleField = new Field(BasicDocMaker.TITLE_FIELD,
+                             "",
+                             storeVal,
+                             Field.Index.TOKENIZED,
+                             termVecVal);
+      dateField = new Field(BasicDocMaker.TITLE_FIELD,
+                            "",
+                            storeVal,
+                            Field.Index.TOKENIZED,
+                            termVecVal);
+
+      doc = new Document();
+      doc.add(bodyField);
+      doc.add(titleField);
+      doc.add(dateField);
+    }
+
+    final static String SEP = WriteLineDocTask.SEP;
+
+    public Document setFields(String line) {
+      // title <TAB> date <TAB> body <NEWLINE>
+      int spot = line.indexOf(SEP);
+      titleField.setValue(line.substring(0, spot));
+      int spot2 = line.indexOf(SEP, 1+spot);
+      dateField.setValue(line.substring(1+spot, spot2));
+      bodyField.setValue(line.substring(1+spot2, line.length()));
+      return doc;
+    }
+  }
+
+  /* (non-Javadoc)
+   * @see SimpleDocMaker#setConfig(java.util.Properties)
+   */
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    resetInputs();
+  }
+
+  protected DocData getNextDocData() throws Exception {
+    throw new RuntimeException("not implemented");
+  }
+
+  private DocState getDocState() {
+    DocState ds = (DocState) docState.get();
+    if (ds == null) {
+      ds = new DocState();
+      docState.set(ds);
+    }
+    return ds;
+  }
+
+  public Document makeDocument() throws Exception {
+
+    String line;
+    synchronized(this) {
+      while(true) {
+        line = fileIn.readLine();
+        if (line == null) {
+          if (!forever)
+            throw new NoMoreDataException();
+          else {
+            // Reset the file
+            openFile();
+          }
+        } else {
+          break;
+        }
+      }
+    }
+
+    return getDocState().setFields(line);
+  }
+
+  public Document makeDocument(int size) throws Exception {
+    throw new RuntimeException("cannot change document size with LineDocMaker; please use
DirDocMaker instead");
+  }
+  
+  public synchronized void resetInputs() {
+    super.resetInputs();
+    fileName = config.get("docs.file", null);
+    if (fileName == null)
+      throw new RuntimeException("docs.file must be set");
+    openFile();
+  }
+
+  private void openFile() {
+    try {
+      if (fileIn != null)
+        fileIn.close();
+      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public int numUniqueTexts() {
+    return -1;
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
Wed Jul 25 01:54:58 2007
@@ -71,7 +71,7 @@
    * @return array of Lucene queries
    */
   private static Query[] createQueries(List qs, Analyzer a) {
-    QueryParser qp = new QueryParser("body", a);
+    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a);
     List queries = new ArrayList();
     for (int i = 0; i < qs.size(); i++)  {
       try {
@@ -107,7 +107,7 @@
     
     List queryList = new ArrayList(20);
     queryList.addAll(Arrays.asList(STANDARD_QUERIES));
-    queryList.addAll(Arrays.asList(getPrebuiltQueries("body")));
+    queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD)));
     return createQueries(queryList, anlzr);
   }
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
Wed Jul 25 01:54:58 2007
@@ -45,11 +45,11 @@
     Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer",
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); 
     
-    QueryParser qp = new QueryParser("body",anlzr);
+    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr);
     ArrayList qq = new ArrayList();
-    Query q1 = new TermQuery(new Term("docid","doc2"));
+    Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2"));
     qq.add(q1);
-    Query q2 = new TermQuery(new Term("body","simple"));
+    Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple"));
     qq.add(q2);
     BooleanQuery bq = new BooleanQuery();
     bq.add(q1,Occur.MUST);

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
Wed Jul 25 01:54:58 2007
@@ -519,6 +519,8 @@
     </li><li>doc.stored
     </li><li>doc.tokenized
     </li><li>doc.term.vector
+    </li><li>doc.term.vector.positions
+    </li><li>doc.term.vector.offsets
     </li><li>doc.store.body.bytes
     </li><li>docs.dir
     </li><li>query.maker
@@ -540,6 +542,8 @@
     </li><li>merge.factor
     </li><li>max.buffered
     </li><li>directory
+    </li><li>ram.flush.mb
+    </li><li>autocommit
     </li></ul>
   </li>
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
Wed Jul 25 01:54:58 2007
@@ -20,6 +20,7 @@
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.document.Document;
+import java.text.NumberFormat;
 
 
 /**
@@ -81,7 +82,10 @@
       logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
     }
     if (logStep>0 && (count%logStep)==0) {
-      System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+"
docs");
+      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+"
processed (add) "+count+" docs");
     }
   }
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
Wed Jul 25 01:54:58 2007
@@ -30,7 +30,8 @@
  * Create an index.
  * <br>Other side effects: index writer object in perfRunData is set.
  * <br>Relevant properties: <code>merge.factor, max.buffered,
- *  max.field.length</code>.
+ *  max.field.length, ram.flush.mb [default 0], autocommit
+ *  [default true]</code>.
  */
 public class CreateIndexTask extends PerfTask {
 
@@ -42,19 +43,23 @@
     Directory dir = getRunData().getDirectory();
     Analyzer analyzer = getRunData().getAnalyzer();
     
-    IndexWriter iw = new IndexWriter(dir, analyzer, true);
-    
     Config config = getRunData().getConfig();
     
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
     int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
+    double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+    boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
 
+    IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
+    
     iw.setUseCompoundFile(cmpnd);
     iw.setMergeFactor(mrgf);
     iw.setMaxBufferedDocs(mxbf);
     iw.setMaxFieldLength(mxfl);
+    if (flushAtRAMUsage > 0)
+      iw.setRAMBufferSizeMB(flushAtRAMUsage);
 
     getRunData().setIndexWriter(iw);
     return 1;

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
Wed Jul 25 01:54:58 2007
@@ -30,14 +30,16 @@
  * Open an index writer.
  * <br>Other side effects: index writer object in perfRunData is set.
  * <br>Relevant properties: <code>merge.factor, max.buffered,
- * max.field.length</code>.
-</code>.
+ * max.field.length, ram.flush.mb [default 0], autocommit
+ * [default true]</code>.
  */
 public class OpenIndexTask extends PerfTask {
 
   public static final int DEFAULT_MAX_BUFFERED = 10;
   public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
   public static final int DEFAULT_MERGE_PFACTOR = 10;
+  public static final int DEFAULT_RAM_FLUSH_MB = 0;
+  public static final boolean DEFAULT_AUTO_COMMIT = true;
 
   public OpenIndexTask(PerfRunData runData) {
     super(runData);
@@ -46,7 +48,6 @@
   public int doLogic() throws IOException {
     Directory dir = getRunData().getDirectory();
     Analyzer analyzer = getRunData().getAnalyzer();
-    IndexWriter writer = new IndexWriter(dir, analyzer, false);
     
     Config config = getRunData().getConfig();
     
@@ -54,12 +55,17 @@
     int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
     int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
+    double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+    boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+    IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);
 
     // must update params for newly opened writer
     writer.setMaxBufferedDocs(mxbf);
     writer.setMaxFieldLength(mxfl);
     writer.setMergeFactor(mrgf);
     writer.setUseCompoundFile(cmpnd); // this one redundant?
+    if (flushAtRAMUsage > 0)
+      writer.setRAMBufferSizeMB(flushAtRAMUsage);
     
     getRunData().setIndexWriter(writer);
     return 1;

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?view=auto&rev=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
(added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
Wed Jul 25 01:54:58 2007
@@ -0,0 +1,137 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+
+public class WriteLineDocTask extends PerfTask {
+
+  /**
+   * Default value for property <code>doc.add.log.step<code> - indicating how
often 
+   * an "added N docs" message should be logged.  
+   */
+  public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
+
+  public WriteLineDocTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  private int logStep = -1;
+  private int docSize = 0;
+  int count = 0;
+  private BufferedWriter lineFileOut=null;
+  private DocMaker docMaker;
+  
+  public final static String SEP = "\t";
+  
+  /*
+   *  (non-Javadoc)
+   * @see PerfTask#setup()
+   */
+  public void setup() throws Exception {
+    super.setup();
+    if (lineFileOut==null) {
+      Config config = getRunData().getConfig();
+      String fileName = config.get("line.file.out", null);
+      if (fileName == null)
+        throw new Exception("line.file.out must be set");
+      lineFileOut = new BufferedWriter(new FileWriter(fileName));
+    }
+    docMaker = getRunData().getDocMaker();
+  }
+
+  public void tearDown() throws Exception {
+    log(++count);
+    super.tearDown();
+  }
+
+  public int doLogic() throws Exception {
+    Document doc;
+    if (docSize > 0) {
+      doc = docMaker.makeDocument(docSize);
+    } else {
+      doc = docMaker.makeDocument();
+    }
+
+    Field f = doc.getField(BasicDocMaker.BODY_FIELD);
+
+    String body, title, date;
+    if (f != null)
+      body = f.stringValue().replace('\t', ' ');
+    else
+      body = null;
+    
+    f = doc.getField(BasicDocMaker.TITLE_FIELD);
+    if (f != null)
+      title = f.stringValue().replace('\t', ' ');
+    else
+      title = "";
+
+    f = doc.getField(BasicDocMaker.DATE_FIELD);
+    if (f != null)
+      date = f.stringValue().replace('\t', ' ');
+    else
+      date = "";
+
+    if (body != null) {
+      lineFileOut.write(title, 0, title.length());
+      lineFileOut.write(SEP);
+      lineFileOut.write(date, 0, date.length());
+      lineFileOut.write(SEP);
+      lineFileOut.write(body, 0, body.length());
+      lineFileOut.newLine();
+      lineFileOut.flush();
+    }
+    return 1;
+  }
+
+  private void log (int count) {
+    if (logStep<0) {
+      // init once per instance
+      logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+    }
+    if (logStep>0 && (count%logStep)==0) {
+      System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+"
docs");
+    }
+  }
+
+  /**
+   * Set the params (docSize only)
+   * @param params docSize, or 0 for no limit.
+   */
+  public void setParams(String params) {
+    super.setParams(params);
+    docSize = (int) Float.parseFloat(params); 
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
+   */
+  public boolean supportsParams() {
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
Wed Jul 25 01:54:58 2007
@@ -22,6 +22,8 @@
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Properties;
@@ -110,7 +112,9 @@
 
   private void printProps() {
     System.out.println("------------> config properties:");
-    for (Iterator it = props.keySet().iterator(); it.hasNext();) {
+    List propKeys = new ArrayList(props.keySet());
+    Collections.sort(propKeys);
+    for (Iterator it = propKeys.iterator(); it.hasNext();) {
       String propName = (String) it.next();
       System.out.println(propName + " = " + props.getProperty(propName));
     }

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?view=diff&rev=559366&r1=559365&r2=559366
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
Wed Jul 25 01:54:58 2007
@@ -18,6 +18,9 @@
 package org.apache.lucene.benchmark.byTask;
 
 import java.io.StringReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.BufferedReader;
 
 import org.apache.lucene.benchmark.byTask.Benchmark;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
@@ -79,6 +82,7 @@
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
+    ir.close();
   }
 
   /**
@@ -121,6 +125,7 @@
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
+    ir.close();
   }
 
   /**
@@ -150,6 +155,69 @@
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
+    ir.close();
+  }
+
+  /**
+   * Test WriteLineDoc and LineDocMaker.
+   */
+  public void testLineDocFile() throws Exception {
+    File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt");
+
+    // We will call WriteLineDocs this many times
+    final int NUM_TRY_DOCS = 500;
+
+    // Creates a line file with first 500 docs from reuters
+    String algLines1[] = {
+      "# ----- properties ",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+      "doc.maker.forever=false",
+      "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
+      "# ----- alg ",
+      "{WriteLineDoc()}:" + NUM_TRY_DOCS,
+    };
+
+    // Run algo
+    Benchmark benchmark = execBenchmark(algLines1);
+
+    // Verify we got somewhere between 1-500 lines (some
+    // Reuters docs have no body, which WriteLineDoc task
+    // skips).
+    BufferedReader r = new BufferedReader(new FileReader(lineFile));
+    int numLines = 0;
+    while(r.readLine() != null)
+      numLines++;
+    r.close();
+    assertTrue("did not see the right number of docs; should be > 0 and <= " + NUM_TRY_DOCS
+ " but was " + numLines, numLines > 0 && numLines <= NUM_TRY_DOCS);
+    
+    // Index the line docs
+    String algLines2[] = {
+      "# ----- properties ",
+      "analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
+      "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
+      "doc.maker.forever=false",
+      "autocommit=false",
+      "ram.flush.mb=4",
+      "# ----- alg ",
+      "ResetSystemErase",
+      "CreateIndex",
+      "{AddDoc}: *",
+      "CloseIndex",
+    };
+    
+    // Run algo
+    benchmark = execBenchmark(algLines2);
+
+    // now we should be able to open the index for write. 
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+    iw.close();
+
+    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals(numLines + " lines were were created but " + ir.numDocs() + " docs are in
the index", numLines, ir.numDocs());
+    ir.close();
+
+    lineFile.delete();
   }
   
   // create the benchmark and execute it. 



Mime
View raw message