lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r561908 - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/benchmark/byTask/tasks/ src/test/org/apache/lucene/benchmark/byTask/
Date Wed, 01 Aug 2007 18:54:46 GMT
Author: mikemccand
Date: Wed Aug  1 11:54:43 2007
New Revision: 561908

URL: http://svn.apache.org/viewvc?view=rev&rev=561908
Log:
LUCENE-967: add ReadTokensTask to allow for benchmarking just tokenization

Added:
    lucene/java/trunk/contrib/benchmark/conf/tokenize.alg   (with props)
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java   (with props)
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Wed Aug  1 11:54:43 2007
@@ -4,6 +4,9 @@
 
 $Id:$
 
+8/1/07
+  LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
+
 7/27/07
   LUCENE-836: Add support for search quality benchmarking, running 
   a set of queries against a searcher, and, optionally produce a submission

Added: lucene/java/trunk/contrib/benchmark/conf/tokenize.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/tokenize.alg?view=auto&rev=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/tokenize.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/tokenize.alg Wed Aug  1 11:54:43 2007
@@ -0,0 +1,36 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg reads all tokens out of a document but does not index them.
+# This is useful for benchmarking tokenizers.
+#
+# To use this, cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/tokenize.alg
+#
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+doc.maker.forever=false
+
+
+#
+-------------------------------------------------------------------------------------
+
+{ReadTokens > : *
+RepSumByName

Propchange: lucene/java/trunk/contrib/benchmark/conf/tokenize.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Wed Aug  1 11:54:43 2007
@@ -69,6 +69,7 @@
       throw new IllegalStateException("Benchmark was already executed");
     }
     executed = true;
+    runData.setStartTimeMillis();
     algorithm.execute();
   }
   

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java Wed Aug  1 11:54:43 2007
@@ -137,8 +137,11 @@
     // release unused stuff
     System.runFinalization();
     System.gc();
-
+  }
+  
+  public long setStartTimeMillis() {
     startTimeMillis = System.currentTimeMillis();
+    return startTimeMillis;
   }
 
   /**

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html Wed Aug  1 11:54:43 2007
@@ -534,6 +534,7 @@
     </li><li>doc.delete.log.step
     </li><li>log.queries
     </li><li>task.max.depth.log
+    </li><li>doc.tokenize.log.step
     </li></ul>
   </li>
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Wed Aug  1 11:54:43 2007
@@ -78,7 +78,7 @@
     if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) {
       System.out.println("------------> starting task: " + getName());
     }
-    if (shouldNotRecordStats() || !reportStats) {
+    if (!reportStats || shouldNotRecordStats()) {
       setup();
       int count = doLogic();
       tearDown();

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?view=auto&rev=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Wed Aug  1 11:54:43 2007
@@ -0,0 +1,168 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import java.text.NumberFormat;
+import java.io.Reader;
+import java.util.List;
+
+
+/**
+ * Simple task to test performance of tokenizers.  It just
+ * creates a token stream for each field of the document and
+ * read all tokens out of that stream.
+ * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
+ */
+public class ReadTokensTask extends PerfTask {
+
+  /**
+   * Default value for property <code>doc.tokenize.log.step<code> - indicating how often 
+   * an "added N docs / M tokens" message should be logged.  
+   */
+  public static final int DEFAULT_DOC_LOG_STEP = 500;
+
+  public ReadTokensTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  private int logStep = -1;
+  int count = 0;
+  int totalTokenCount = 0;
+  
+  // volatile data passed between setup(), doLogic(), tearDown().
+  private Document doc = null;
+  
+  /*
+   *  (non-Javadoc)
+   * @see PerfTask#setup()
+   */
+  public void setup() throws Exception {
+    super.setup();
+    DocMaker docMaker = getRunData().getDocMaker();
+    doc = docMaker.makeDocument();
+  }
+
+  /* (non-Javadoc)
+   * @see PerfTask#tearDown()
+   */
+  public void tearDown() throws Exception {
+    log(++count);
+    doc = null;
+    super.tearDown();
+  }
+
+  Token token = new Token("", 0, 0);
+
+  public int doLogic() throws Exception {
+    List fields = doc.getFields();
+    final int numField = fields.size();
+    Analyzer analyzer = getRunData().getAnalyzer();
+    int tokenCount = 0;
+    for(int i=0;i<numField;i++) {
+      final Field field = (Field) fields.get(i);
+      final TokenStream stream;
+      final TokenStream streamValue = field.tokenStreamValue();
+
+      if (streamValue != null) 
+        stream = streamValue;
+      else {
+        // the field does not have a TokenStream,
+        // so we have to obtain one from the analyzer
+        final Reader reader;			  // find or make Reader
+        final Reader readerValue = field.readerValue();
+
+        if (readerValue != null)
+          reader = readerValue;
+        else {
+          String stringValue = field.stringValue();
+          if (stringValue == null)
+            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
+          stringReader.init(stringValue);
+          reader = stringReader;
+        }
+        
+        // Tokenize field
+        stream = analyzer.tokenStream(field.name(), reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      while(stream.next() != null)
+        tokenCount++;
+    }
+    totalTokenCount += tokenCount;
+    return tokenCount;
+  }
+
+  private void log(int count) {
+    if (logStep<0) {
+      // init once per instance
+      logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
+    }
+    if (logStep>0 && (count%logStep)==0) {
+      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
+    }
+  }
+
+  /* Simple StringReader that can be reset to a new string;
+   * we use this when tokenizing the string value from a
+   * Field. */
+  ReusableStringReader stringReader = new ReusableStringReader();
+
+  private final static class ReusableStringReader extends Reader {
+    int upto;
+    int left;
+    String s;
+    void init(String s) {
+      this.s = s;
+      left = s.length();
+      this.upto = 0;
+    }
+    public int read(char[] c) {
+      return read(c, 0, c.length);
+    }
+    public int read(char[] c, int off, int len) {
+      if (left > len) {
+        s.getChars(upto, upto+len, c, off);
+        upto += len;
+        left -= len;
+        return len;
+      } else if (0 == left) {
+        return -1;
+      } else {
+        s.getChars(upto, upto+left, c, off);
+        int r = left;
+        left = 0;
+        upto = s.length();
+        return r;
+      }
+    }
+    public void close() {};
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Wed Aug  1 11:54:43 2007
@@ -100,15 +100,19 @@
     
     int count = 0;
     boolean exhausted = false;
+    
+    final int numTasks = tasks.size();
+    final PerfTask[] tasksArray = new PerfTask[numTasks];
+    for(int k=0;k<numTasks;k++)
+      tasksArray[k] = (PerfTask) tasks.get(k);
+
     for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
-      for (Iterator it = tasks.iterator(); it.hasNext();) {
-        PerfTask task = (PerfTask) it.next();
+      for(int l=0;l<numTasks;l++)
         try {
-          count += task.runAndMaybeStats(letChildReport);
+          count += tasksArray[l].runAndMaybeStats(letChildReport);
         } catch (NoMoreDataException e) {
           exhausted = true;
         }
-      }
     }
     return count;
   }

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?view=diff&rev=561908&r1=561907&r2=561908
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Wed Aug  1 11:54:43 2007
@@ -21,14 +21,19 @@
 import java.io.File;
 import java.io.FileReader;
 import java.io.BufferedReader;
+import java.util.List;
+import java.util.Iterator;
 
 import org.apache.lucene.benchmark.byTask.Benchmark;
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermDocs;
 
 import junit.framework.TestCase;
 
@@ -221,6 +226,60 @@
     ir.close();
 
     lineFile.delete();
+  }
+  
+  /**
+   * Test ReadTokensTask
+   */
+  public void testReadTokens() throws Exception {
+
+    // We will call ReadTokens on this many docs
+    final int NUM_DOCS = 100;
+
+    // Read tokens from first NUM_DOCS docs from Reuters and
+    // then build index from the same docs
+    String algLines1[] = {
+      "# ----- properties ",
+      "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+      "# ----- alg ",
+      "{ReadTokens}: " + NUM_DOCS,
+      "ResetSystemErase",
+      "CreateIndex",
+      "{AddDoc}: " + NUM_DOCS,
+      "CloseIndex",
+    };
+
+    // Run algo
+    Benchmark benchmark = execBenchmark(algLines1);
+
+    List stats = benchmark.getRunData().getPoints().taskStats();
+
+    // Count how many tokens all ReadTokens saw
+    int totalTokenCount1 = 0;
+    for (Iterator it = stats.iterator(); it.hasNext();) {
+      TaskStats stat = (TaskStats) it.next();
+      if (stat.getTask().getName().equals("ReadTokens")) {
+        totalTokenCount1 += stat.getCount();
+      }
+    }
+
+    // Separately count how many tokens are actually in the index:
+    IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals(NUM_DOCS, reader.numDocs());
+
+    TermEnum terms = reader.terms();
+    TermDocs termDocs = reader.termDocs();
+    int totalTokenCount2 = 0;
+    while(terms.next()) {
+      termDocs.seek(terms.term());
+      while(termDocs.next())
+        totalTokenCount2 += termDocs.freq();
+    }
+    reader.close();
+
+    // Make sure they are the same
+    assertEquals(totalTokenCount1, totalTokenCount2);
   }
   
   // create the benchmark and execute it. 



Mime
View raw message