lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r786233 [3/3] - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/programmatic/ src/java/org/apache/...
Date Thu, 18 Jun 2009 19:59:01 GMT
Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java
(added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java
Thu Jun 18 19:58:59 2009
@@ -0,0 +1,173 @@
+package org.apache.lucene.benchmark.byTask.utils;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Implements a {@link Reader} over a {@link StringBuffer} instance. Although
+ * one can use {@link java.io.StringReader} by passing it
+ * {@link StringBuffer#toString()}, it is better to use this class, as it
+ * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause
+ * inner char[] allocations at the next append() attempt).<br>
+ * Notes:
+ * <ul>
+ * <li>This implementation assumes the underlying {@link StringBuffer} is not
+ * changed during the use of this {@link Reader} implementation.
+ * <li>This implementation is thread-safe.
+ * <li>The implementation looks very much like {@link java.io.StringReader} (for
+ * the right reasons).
+ * <li>If one wants to reuse that instance, then the following needs to be done:
+ * <pre>
+ * StringBuffer sb = new StringBuffer("some text");
+ * Reader reader = new StringBufferReader(sb);
+ * ... read from reader - don't close it ! ...
+ * sb.setLength(0);
+ * sb.append("some new text");
+ * reader.reset();
+ * ... read the new string from the reader ...
+ * </pre>
+ * </ul>
+ */
+public class StringBufferReader extends Reader {
+  
+  // TODO (3.0): change to StringBuilder (including the name of the class)
+  
+  // The StringBuffer to read from.
+  private StringBuffer sb;
+
+  // The length of 'sb'.
+  private int length;
+
+  // The next position to read from the StringBuffer.
+  private int next = 0;
+
+  // The mark position. The default value 0 means the start of the text.
+  private int mark = 0;
+
+  public StringBufferReader(StringBuffer sb) {
+    set(sb);
+  }
+
+  /** Check to make sure that the stream has not been closed. */
+  private void ensureOpen() throws IOException {
+    if (sb == null) {
+      throw new IOException("Stream has already been closed");
+    }
+  }
+
+  public void close() {
+    synchronized (lock) {
+      sb = null;
+    }
+  }
+
+  /**
+   * Mark the present position in the stream. Subsequent calls to reset() will
+   * reposition the stream to this point.
+   * 
+   * @param readAheadLimit Limit on the number of characters that may be read
+   *        while still preserving the mark. Because the stream's input comes
+   *        from a StringBuffer, there is no actual limit, so this argument 
+   *        must not be negative, but is otherwise ignored.
+   * @exception IllegalArgumentException If readAheadLimit is < 0
+   * @exception IOException If an I/O error occurs
+   */
+  public void mark(int readAheadLimit) throws IOException {
+    if (readAheadLimit < 0){
+      throw new IllegalArgumentException("Read-ahead limit cannpt be negative: " + readAheadLimit);
+    }
+    synchronized (lock) {
+      ensureOpen();
+      mark = next;
+    }
+  }
+
+  public boolean markSupported() {
+    return true;
+  }
+
+  public int read() throws IOException {
+    synchronized (lock) {
+      ensureOpen();
+      return next >= length ? -1 : sb.charAt(next++);
+    }
+  }
+
+  public int read(char cbuf[], int off, int len) throws IOException {
+    synchronized (lock) {
+      ensureOpen();
+
+      // Validate parameters
+      if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) {
+        throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length);
+      }
+
+      if (len == 0) {
+        return 0;
+      }
+
+      if (next >= length) {
+        return -1;
+      }
+
+      int n = Math.min(length - next, len);
+      sb.getChars(next, next + n, cbuf, off);
+      next += n;
+      return n;
+    }
+  }
+
+  public boolean ready() throws IOException {
+    synchronized (lock) {
+      ensureOpen();
+      return true;
+    }
+  }
+
+  public void reset() throws IOException {
+    synchronized (lock) {
+      ensureOpen();
+      next = mark;
+      length = sb.length();
+    }
+  }
+
+  public void set(StringBuffer sb) {
+    synchronized (lock) {
+      this.sb = sb;
+      length = sb.length();
+    }
+  }
+  public long skip(long ns) throws IOException {
+    synchronized (lock) {
+      ensureOpen();
+      if (next >= length) {
+        return 0;
+      }
+
+      // Bound skip by beginning and end of the source
+      long n = Math.min(length - next, ns);
+      n = Math.max(-next, n);
+      next += n;
+      return n;
+    }
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
Thu Jun 18 19:58:59 2009
@@ -17,18 +17,17 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Properties;
+
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
 
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Properties;
-
 /**
  * Extract the downloaded Wikipedia dump into separate files for indexing.
  */
@@ -51,7 +50,6 @@
     }
   }
 
-
   public File directory(int count, File directory) {
     if (directory == null) {
       directory = outputDir;
@@ -99,7 +97,8 @@
     long start = System.currentTimeMillis();
     try {
       while ((doc = docMaker.makeDocument()) != null) {
-        create(doc.get(BasicDocMaker.ID_FIELD), doc.get(BasicDocMaker.TITLE_FIELD), doc.get(BasicDocMaker.DATE_FIELD), doc.get(BasicDocMaker.BODY_FIELD));
+        create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
+            .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
       }
     } catch (NoMoreDataException e) {
       //continue
@@ -130,7 +129,7 @@
     Properties properties = new Properties();
 
     properties.setProperty("docs.file", wikipedia.getAbsolutePath());
-    properties.setProperty("doc.maker.forever", "false");
+    properties.setProperty("content.source.forever", "false");
     properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
     docMaker.setConfig(new Config(properties));
     docMaker.resetInputs();

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
Thu Jun 18 19:58:59 2009
@@ -17,6 +17,7 @@
 
 package org.apache.lucene.benchmark.byTask;
 
+import java.io.IOException;
 import java.io.StringReader;
 import java.io.File;
 import java.io.FileReader;
@@ -26,7 +27,7 @@
 
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
@@ -114,7 +115,7 @@
     };
 
     CountingSearchTestTask.numSearches = 0;
-    Benchmark benchmark = execBenchmark(algLines);
+    execBenchmark(algLines);
     assertTrue(CountingSearchTestTask.numSearches > 0);
     long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
     assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
@@ -124,7 +125,7 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "doc.stored=true",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -162,7 +163,7 @@
     String algLines[] = {
         "doc.stored=true",//doc storage is required in order to have text to highlight
         "doc.term.vector.offsets=true",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -199,7 +200,7 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "doc.stored=false",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -227,14 +228,14 @@
   /**
    * Test Exhasting Doc Maker logic
    */
-  public void testExhaustDocMaker() throws Exception {
+  public void testExhaustContentSource() throws Exception {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
-        "doc.add.log.step=1",
+        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
+        "content.source.log.step=1",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -274,10 +275,10 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=FSDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -292,7 +293,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -309,8 +310,8 @@
     // Creates a line file with first 500 docs from reuters
     String algLines1[] = {
       "# ----- properties ",
-      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
-      "doc.maker.forever=false",
+      "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
+      "content.source.forever=false",
       "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
       "# ----- alg ",
       "{WriteLineDoc()}:" + NUM_TRY_DOCS,
@@ -335,7 +336,7 @@
       "analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
       "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
       "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
-      "doc.maker.forever=false",
+      "content.source.forever=false",
       "doc.reuse.fields=false",
       "autocommit=false",
       "ram.flush.mb=4",
@@ -373,7 +374,7 @@
     String algLines1[] = {
       "# ----- properties ",
       "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
-      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+      "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
       "# ----- alg ",
       "{ReadTokens}: " + NUM_DOCS,
       "ResetSystemErase",
@@ -421,10 +422,10 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -442,7 +443,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 2 * 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 2 * 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -477,16 +478,19 @@
   }
 
   /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... */
-  public static class Reuters20DocMaker extends ReutersDocMaker {
-    private int nDocs=0;
-    protected synchronized DocData getNextDocData() throws Exception {
-      if (nDocs>=20 && !forever) {
+  public static class Reuters20ContentSource extends ReutersContentSource {
+    private int nDocs = 0;
+
+    public synchronized DocData getNextDocData(DocData docData)
+        throws NoMoreDataException, IOException {
+      if (nDocs >= 20 && !forever) {
         throw new NoMoreDataException();
       }
       nDocs++;
-      return super.getNextDocData();
+      return super.getNextDocData(docData);
     }
-    public synchronized void resetInputs() {
+
+    public synchronized void resetInputs() throws IOException {
       super.resetInputs();
       nDocs = 0;
     }
@@ -499,10 +503,10 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -521,7 +525,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -533,12 +537,12 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "ram.flush.mb=-1",
         "max.buffered=2",
-        "doc.add.log.step=3",
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -557,7 +561,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -577,10 +581,10 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.scheduler=" + MyMergeScheduler.class.getName(),
         "doc.stored=false",
@@ -601,7 +605,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -620,12 +624,12 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=2",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.policy=" + MyMergePolicy.class.getName(),
         "doc.stored=false",
@@ -646,7 +650,7 @@
     
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -658,13 +662,13 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=2",
         "compound=cmpnd:true:false",
         "doc.term.vector=vector:false:true",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "merge.factor=3",
@@ -702,12 +706,12 @@
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
         "doc.stored=false",
@@ -728,7 +732,7 @@
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
 
@@ -780,10 +784,10 @@
     String dis = disable ? "-" : "";
     return new String[] {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=30",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=30",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
Thu Jun 18 19:58:59 2009
@@ -111,35 +111,11 @@
     doIndexAndSearchTest(file, false, null);
   }
   
-  public void testBZip2WithBzipCompressionDisabled() throws Exception {
-    File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-    
-    try {
-      doIndexAndSearchTest(file, true, "false");
-      fail("Some exception should have been thrown !");
-    } catch (Exception e) {
-      // expected.
-    }
-  }
-  
   public void testRegularFile() throws Exception {
     File file = new File(getWorkDir(), "one-line");
     createRegularLineFile(file);
     doIndexAndSearchTest(file, false, null);
   }
-  
-  public void testRegularFileWithBZipCompressionEnabled() throws Exception {
-    File file = new File(getWorkDir(), "one-line");
-    createRegularLineFile(file);
-    
-    try {
-      doIndexAndSearchTest(file, true, "true");
-      fail("Some exception should have been thrown !");
-    } catch (Exception e) {
-      // expected.
-    }
-  }
 
   public void testInvalidFormat() throws Exception {
     String[] testCases = new String[] {

Added: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
(added)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
Thu Jun 18 19:58:59 2009
@@ -0,0 +1,332 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.Date;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.DateTools;
+
+public class TrecContentSourceTest extends TestCase {
+
+  /** A TrecContentSource which works on a String and not files. */
+  private static class StringableTrecSource extends TrecContentSource {
+  
+    private String docs = null;
+    
+    public StringableTrecSource(String docs, boolean forever) {
+      this.docs = docs;
+      this.forever = forever;
+    }
+    
+    protected void openNextFile() throws NoMoreDataException, IOException {
+      if (reader != null) {
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        ++iteration;
+      }
+      
+      reader = new BufferedReader(new StringReader(docs));
+    }
+    
+    public void setConfig(Config config) {
+      htmlParser = new DemoHTMLParser();
+    }
+  }
+  
+  private void assertDocData(DocData dd, String expName, String expTitle,
+                             String expBody, Date expDate)
+      throws ParseException {
+    assertNotNull(dd);
+    assertEquals(expName, dd.getName());
+    assertEquals(expTitle, dd.getTitle());
+    assertTrue(dd.getBody().indexOf(expBody) != -1);
+    Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
+    assertEquals(expDate, date);
+  }
+  
+  private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
+    boolean thrown = false;
+    try {
+      stdm.getNextDocData(null);
+    } catch (NoMoreDataException e) {
+      thrown = true;
+    }
+    assertTrue("Expecting NoMoreDataException", thrown);
+  }
+  
+  public void testOneDocument() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+  
+  public void testTwoDocuments() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+
+  // If a Date: attribute is missing, make sure the document is not skipped, but
+  // rather that a null Date is assigned.
+  public void testMissingDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+    
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+
+  // When a 'bad date' is input (unparsable date), make sure the document is
+  // still returned and that its DocData date is assigned null.
+  public void testBadDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Bad Date\r\n" + // deliberately not a parsable date value
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false); // NOTE(review): second argument looks like the "forever" flag (off here) - confirm
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null); // name, title and text are still checked; null is passed where other tests pass a parsed date
+    
+    assertNoMoreDataException(source); // only one document was supplied
+  }
+
+  public void testForever() throws Exception { // with the forever flag on, the single document is returned again under a new name
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, true); // true: the forever flag referred to in the closing comment of this test
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    // Same document again, but on the second iteration the name suffix changes from _0 to _1.
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+    // Don't test that NoMoreDataException is thrown, since the forever flag is
+    // turned on.
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
Thu Jun 18 19:58:59 2009
@@ -27,8 +27,8 @@
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
@@ -40,7 +40,7 @@
 public class WriteLineDocTaskTest extends BenchmarkTestCase {
 
   // class has to be public so that Class.forName.newInstance() will work
-  public static final class WriteLineDocMaker extends BasicDocMaker {
+  public static final class WriteLineDocMaker extends DocMaker {
 
     protected DocData getNextDocData() throws NoMoreDataException, Exception {
       throw new UnsupportedOperationException("not implemented");

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
(original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
Thu Jun 18 19:58:59 2009
@@ -23,7 +23,7 @@
 import java.io.PrintWriter;
 
 import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
 import org.apache.lucene.benchmark.quality.Judge;
 import org.apache.lucene.benchmark.quality.QualityQuery;
 import org.apache.lucene.benchmark.quality.QualityQueryParser;
@@ -155,10 +155,10 @@
     // 1. alg definition
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+ReutersDocMaker.class.getName(),
-        "doc.add.log.step=2500",
+        "content.source="+ReutersContentSource.class.getName(),
+        "content.source.log.step=2500",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=FSDirectory",
         "doc.stored=true",
         "doc.tokenized=true",



Mime
View raw message