lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From markrmil...@apache.org
Subject svn commit: r786233 [1/3] - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/programmatic/ src/java/org/apache/...
Date Thu, 18 Jun 2009 19:59:01 GMT
Author: markrmiller
Date: Thu Jun 18 19:58:59 2009
New Revision: 786233

URL: http://svn.apache.org/viewvc?rev=786233&view=rev
Log:
LUCENE-1595: Separate DocMaker into DocMaker and ContentSource.

Added:
    lucene/java/trunk/contrib/benchmark/conf/readContentSource.alg
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
Removed:
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSimpleDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/conf/analyzer.alg
    lucene/java/trunk/contrib/benchmark/conf/autoCommit.alg
    lucene/java/trunk/contrib/benchmark/conf/compound-penalty.alg
    lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg
    lucene/java/trunk/contrib/benchmark/conf/deletepercent.alg
    lucene/java/trunk/contrib/benchmark/conf/deletes.alg
    lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg
    lucene/java/trunk/contrib/benchmark/conf/highlight-profile.alg
    lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg
    lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg
    lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM.alg
    lucene/java/trunk/contrib/benchmark/conf/indexing-multithreaded.alg
    lucene/java/trunk/contrib/benchmark/conf/indexing.alg
    lucene/java/trunk/contrib/benchmark/conf/micro-standard-flush-by-ram.alg
    lucene/java/trunk/contrib/benchmark/conf/micro-standard.alg
    lucene/java/trunk/contrib/benchmark/conf/sample.alg
    lucene/java/trunk/contrib/benchmark/conf/sloppy-phrase.alg
    lucene/java/trunk/contrib/benchmark/conf/sort-standard.alg
    lucene/java/trunk/contrib/benchmark/conf/standard-flush-by-RAM.alg
    lucene/java/trunk/contrib/benchmark/conf/standard-highlights-notv.alg
    lucene/java/trunk/contrib/benchmark/conf/standard-highlights-tv.alg
    lucene/java/trunk/contrib/benchmark/conf/standard.alg
    lucene/java/trunk/contrib/benchmark/conf/tokenize.alg
    lucene/java/trunk/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg
    lucene/java/trunk/contrib/benchmark/conf/wikipedia.alg
    lucene/java/trunk/contrib/benchmark/conf/wikipediaOneRound.alg
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Thu Jun 18 19:58:59 2009
@@ -3,6 +3,34 @@
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
 $Id:$
+6/17/09 
+  LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been 
+  replaced with a concrete class which accepts a ContentSource for iterating over 
+  a content source's documents. Most of the old DocMakers were changed to a 
+  ContentSource implementation, and DocMaker is now a default document creation impl
+  that provides an easy way for reusing fields. When [doc.maker] is not defined in 
+  an algorithm, the new DocMaker is the default. If you have .alg files which 
+  specify a DocMaker (like ReutersDocMaker), you should change the [doc.maker] line to: 
+  [content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource]
+  
+  i.e.
+  doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+  becomes
+  content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+  
+  doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
+  becomes
+  content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+ 	
+  Also, PerfTask now logs a message in tearDown() rather than each Task doing its
+  own logging. A new setting called [log.step] is consulted to determine how often 
+  to log. [doc.add.log.step] is no longer a valid setting. For easy migration of 
+  current .alg files, rename [doc.add.log.step] to [log.step] and [doc.delete.log.step] 
+  to [delete.log.step]. 
+  
+  Additionally, [doc.maker.forever] should be changed to [content.source.forever].
+  (Shai Erera via Mark Miller)
+
 6/12/09 
   LUCENE-1539: Added DeleteByPercentTask which enables deleting a
   percentage of documents and searching on them.  Changed CommitIndex

Modified: lucene/java/trunk/contrib/benchmark/conf/analyzer.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/analyzer.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/analyzer.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/analyzer.alg Thu Jun 18 19:58:59 2009
@@ -30,13 +30,12 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/autoCommit.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/autoCommit.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/autoCommit.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/autoCommit.alg Thu Jun 18 19:58:59 2009
@@ -38,7 +38,7 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 

Modified: lucene/java/trunk/contrib/benchmark/conf/compound-penalty.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/compound-penalty.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/compound-penalty.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/compound-penalty.alg Thu Jun 18 19:58:59 2009
@@ -34,14 +34,13 @@
 doc.stored=stored:true:true:false:false
 doc.tokenized=true
 doc.term.vector=vector:true:true:false:false
-doc.add.log.step=500
-doc.delete.log.step=100
+log.step=500
+delete.log.step=100
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/createLineFile.alg Thu Jun 18 19:58:59 2009
@@ -29,13 +29,13 @@
 #
 
 # Where to get documents from:
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 # Where to write the line file output:
 line.file.out=work/reuters.lines.txt
 
 # Stop after processing the document feed once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 

Modified: lucene/java/trunk/contrib/benchmark/conf/deletepercent.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/deletepercent.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/deletepercent.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/deletepercent.alg Thu Jun 18 19:58:59 2009
@@ -25,13 +25,14 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
 #doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/deletes.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/deletes.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/deletes.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/deletes.alg Thu Jun 18 19:58:59 2009
@@ -32,14 +32,14 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=10000
-doc.delete.log.step=100
+log.step=10000
+delete.log.step=100
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg Thu Jun 18 19:58:59 2009
@@ -36,7 +36,7 @@
 line.file.out=work/enwiki.txt
 
 # Stop after processing the document feed once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 

Modified: lucene/java/trunk/contrib/benchmark/conf/highlight-profile.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/highlight-profile.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/highlight-profile.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/highlight-profile.alg Thu Jun 18 19:58:59 2009
@@ -28,11 +28,11 @@
 doc.term.vector=true
 doc.term.vector.offsets=true
 doc.term.vector.positions=true
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 

Modified: lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/indexLineFile.alg Thu Jun 18 19:58:59 2009
@@ -38,7 +38,7 @@
 docs.file=work/reuters.lines.txt
 
 # Process documents only once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 

Modified: lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg Thu Jun 18 19:58:59 2009
@@ -30,13 +30,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/indexing-flush-by-RAM.alg Thu Jun 18 19:58:59 2009
@@ -30,13 +30,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/indexing-multithreaded.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexing-multithreaded.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexing-multithreaded.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/indexing-multithreaded.alg Thu Jun 18 19:58:59 2009
@@ -30,13 +30,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/indexing.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/indexing.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/indexing.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/indexing.alg Thu Jun 18 19:58:59 2009
@@ -30,13 +30,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/micro-standard-flush-by-ram.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/micro-standard-flush-by-ram.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/micro-standard-flush-by-ram.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/micro-standard-flush-by-ram.alg Thu Jun 18 19:58:59 2009
@@ -29,13 +29,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/micro-standard.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/micro-standard.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/micro-standard.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/micro-standard.alg Thu Jun 18 19:58:59 2009
@@ -28,13 +28,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Added: lucene/java/trunk/contrib/benchmark/conf/readContentSource.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/readContentSource.alg?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/readContentSource.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/readContentSource.alg Thu Jun 18 19:58:59 2009
@@ -0,0 +1,45 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg reads the information from a ContentSource. It is useful for 
+# measuring the performance of a particular ContentSource implementation, or 
+# gathering baselines for operations like indexing (if reading from the content 
+# source takes 'X' time, we cannot index faster).
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/readContentSource.alg
+#
+
+# Where to get documents from:
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
+
+# Stop after processing the document feed once:
+content.source.forever=false
+
+# Log messages every:
+log.step=100000
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{ ConsumeContentSource } : *
+
+RepSumByPref ConsumeContentSource

Modified: lucene/java/trunk/contrib/benchmark/conf/sample.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/sample.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/sample.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/sample.alg Thu Jun 18 19:58:59 2009
@@ -40,13 +40,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/sloppy-phrase.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/sloppy-phrase.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/sloppy-phrase.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/sloppy-phrase.alg Thu Jun 18 19:58:59 2009
@@ -28,13 +28,13 @@
 doc.stored=false
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleSloppyPhraseQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/sort-standard.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/sort-standard.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/sort-standard.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/sort-standard.alg Thu Jun 18 19:58:59 2009
@@ -29,11 +29,11 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=100000
+log.step=100000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SortableSimpleDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 

Modified: lucene/java/trunk/contrib/benchmark/conf/standard-flush-by-RAM.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/standard-flush-by-RAM.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/standard-flush-by-RAM.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/standard-flush-by-RAM.alg Thu Jun 18 19:58:59 2009
@@ -29,13 +29,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/standard-highlights-notv.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/standard-highlights-notv.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/standard-highlights-notv.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/standard-highlights-notv.alg Thu Jun 18 19:58:59 2009
@@ -28,11 +28,11 @@
 doc.term.vector=false
 doc.term.vector.offsets=false
 doc.term.vector.positions=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 

Modified: lucene/java/trunk/contrib/benchmark/conf/standard-highlights-tv.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/standard-highlights-tv.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/standard-highlights-tv.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/standard-highlights-tv.alg Thu Jun 18 19:58:59 2009
@@ -28,11 +28,11 @@
 doc.term.vector=true
 doc.term.vector.offsets=true
 doc.term.vector.positions=true
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 

Modified: lucene/java/trunk/contrib/benchmark/conf/standard.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/standard.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/standard.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/standard.alg Thu Jun 18 19:58:59 2009
@@ -28,13 +28,13 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

Modified: lucene/java/trunk/contrib/benchmark/conf/tokenize.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/tokenize.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/tokenize.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/tokenize.alg Thu Jun 18 19:58:59 2009
@@ -25,8 +25,8 @@
 #   ant run-task -Dtask.alg=conf/tokenize.alg
 #
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
-doc.maker.forever=false
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
 
 
 #

Modified: lucene/java/trunk/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg Thu Jun 18 19:58:59 2009
@@ -37,7 +37,7 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 

Modified: lucene/java/trunk/contrib/benchmark/conf/wikipedia.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/wikipedia.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/wikipedia.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/wikipedia.alg Thu Jun 18 19:58:59 2009
@@ -33,7 +33,7 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 

Modified: lucene/java/trunk/contrib/benchmark/conf/wikipediaOneRound.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/wikipediaOneRound.alg?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/wikipediaOneRound.alg (original)
+++ lucene/java/trunk/contrib/benchmark/conf/wikipediaOneRound.alg Thu Jun 18 19:58:59 2009
@@ -33,7 +33,7 @@
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java Thu Jun 18 19:58:59 2009
@@ -17,9 +17,13 @@
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
@@ -33,11 +37,6 @@
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.RAMDirectory;
 
-import java.io.File;
-import java.util.HashMap;
-import java.util.Iterator;
-
-
 /**
  * Data maintained by a performance test run.
  * <p>
@@ -62,7 +61,6 @@
   private Directory directory;
   private Analyzer analyzer;
   private DocMaker docMaker;
-  private HTMLParser htmlParser;
   
   // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately.
   private HashMap readTaskQueryMaker;
@@ -82,14 +80,11 @@
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
     // doc maker
     docMaker = (DocMaker) Class.forName(config.get("doc.maker",
-        "org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker")).newInstance();
+        "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).newInstance();
     docMaker.setConfig(config);
     // query makers
     readTaskQueryMaker = new HashMap();
     qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
-    // html parser, used for some doc makers
-    htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance();
-    docMaker.setHTMLParser(htmlParser);
 
     // index stuff
     reinit(false);
@@ -229,9 +224,7 @@
     this.analyzer = analyzer;
   }
 
-  /**
-   * @return Returns the docMaker.
-   */
+  /** Returns the docMaker. */
   public DocMaker getDocMaker() {
     return docMaker;
   }
@@ -243,7 +236,7 @@
     return config;
   }
 
-  public void resetInputs() {
+  public void resetInputs() throws IOException {
     docMaker.resetInputs();
     Iterator it = readTaskQueryMaker.values().iterator();
     while (it.hasNext()) {
@@ -271,11 +264,4 @@
     return qm;
   }
 
-  /**
-   * @return Returns the htmlParser.
-   */
-  public HTMLParser getHtmlParser() {
-    return htmlParser;
-  }
-
 }

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,201 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * Represents content from a specified source, such as TREC, Reuters etc. A
+ * {@link ContentSource} is responsible for creating {@link DocData} objects for
+ * its documents to be consumed by {@link DocMaker}. It also keeps track of
+ * various statistics, such as how many documents were generated, size in bytes
+ * etc.
+ * <p>
+ * Supports the following configuration parameters:
+ * <ul>
+ * <li><b>content.source.forever</b> - specifies whether to generate documents
+ * forever (<b>default=true</b>).
+ * <li><b>content.source.verbose</b> - specifies whether messages should be
+ * output by the content source (<b>default=false</b>).
+ * <li><b>content.source.log.step</b> - specifies after how many documents a
+ * message should be logged. If set to 0 it means no logging should occur.
+ * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
+ * logStep is not 0 (<b>default=0</b>).
+ * </ul>
+ */
+public abstract class ContentSource {
+  
+  private static final int BZIP = 0;
+  private static final int OTHER = 1;
+  private static final Map extensionToType = new HashMap();
+  static {
+    extensionToType.put(".bz2", Integer.valueOf(BZIP));
+    extensionToType.put(".bzip", Integer.valueOf(BZIP));
+  }
+  
+  protected static final int BUFFER_SIZE = 1 << 16; // 64K
+
+  private long bytesCount;
+  private long totalBytesCount;
+  private int docsCount;
+  private int totalDocsCount;
+  private Config config;
+
+  protected boolean forever;
+  protected int logStep;
+  protected boolean verbose;
+  
+  private CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+  protected final synchronized void addBytes(long numBytes) {
+    bytesCount += numBytes;
+    totalBytesCount += numBytes;
+  }
+  
+  protected final synchronized void addDoc() {
+    ++docsCount;
+    ++totalDocsCount;
+  }
+
+  /**
+   * A convenience method for collecting all the files of a content source from
+   * a given directory. The collected {@link File} instances are stored in the
+   * given <code>files</code>.
+   */
+  protected final void collectFiles(File dir, ArrayList files) {
+    if (!dir.canRead()) {
+      return;
+    }
+    
+    File[] dirFiles = dir.listFiles();
+    Arrays.sort(dirFiles);
+    for (int i = 0; i < dirFiles.length; i++) {
+      File file = dirFiles[i];
+      if (file.isDirectory()) {
+        collectFiles(file, files);
+      } else if (file.canRead()) {
+        files.add(file);
+      }
+    }
+  }
+
+  /**
+   * Returns an {@link InputStream} over the requested file. This method
+   * attempts to identify the appropriate {@link InputStream} instance to return
+   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
+   * 'bzip' {@link InputStream}).
+   */
+  protected InputStream getInputStream(File file) throws IOException {
+    // First, create a FileInputStream, as this will be required by all types.
+    // Wrap with BufferedInputStream for better performance
+    InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
+    
+    String fileName = file.getName();
+    int idx = fileName.lastIndexOf('.');
+    int type = OTHER;
+    if (idx != -1) {
+      Integer typeInt = (Integer) extensionToType.get(fileName.substring(idx));
+      if (typeInt != null) {
+        type = typeInt.intValue();
+      }
+    }
+    switch (type) {
+      case BZIP:
+        try {
+          // According to BZip2CompressorInputStream's code, it reads the first 
+          // two file header chars ('B' and 'Z'). It is important to wrap the
+          // underlying input stream with a buffered one since
+          // Bzip2CompressorInputStream uses the read() method exclusively.
+          is = csFactory.createCompressorInputStream("bzip2", is);
+        } catch (CompressorException e) {
+          IOException ioe = new IOException(e.getMessage());
+          ioe.initCause(e);
+          throw ioe;
+        }
+        break;
+      default: // Do nothing, stay with FileInputStream
+    }
+    
+    return is;
+  }
+  
+  /**
+   * Returns true if it's time to log a message (depending on verbose and
+   * the number of documents generated).
+   */
+  protected final boolean shouldLog() {
+    return verbose && logStep > 0 && docsCount % logStep == 0;
+  }
+
+  /** Called when reading from this content source is no longer required. */
+  public abstract void close() throws IOException;
+  
+  /** Returns the number of bytes generated since last reset. */
+  public final long getBytesCount() { return bytesCount; }
+
+  /** Returns the number of generated documents since last reset. */
+  public final int getDocsCount() { return docsCount; }
+  
+  public final Config getConfig() { return config; }
+
+  /** Returns the next {@link DocData} from the content source. */
+  public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;
+
+  /** Returns the total number of bytes that were generated by this source. */ 
+  public final long getTotalBytesCount() { return totalBytesCount; }
+
+  /** Returns the total number of generated documents. */
+  public final int getTotalDocsCount() { return totalDocsCount; }
+
+  /**
+   * Resets the input for this content source, so that the test would behave as
+   * if it was just started, input-wise.
+   * <p>
+   * <b>NOTE:</b> the default implementation resets the number of bytes and
+   * documents generated since the last reset, so it's important to call
+   * super.resetInputs in case you override this method.
+   */
+  public void resetInputs() throws IOException {
+    bytesCount = 0;
+    docsCount = 0;
+  }
+
+  /**
+   * Sets the {@link Config} for this content source. If you override this
+   * method, you must call super.setConfig.
+   */
+  public void setConfig(Config config) {
+    this.config = config;
+    forever = config.get("content.source.forever", true);
+    logStep = config.get("content.source.log.step", 0);
+    verbose = config.get("content.source.verbose", false);
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Thu Jun 18 19:58:59 2009
@@ -30,14 +30,7 @@
  */
 public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
 
-  public DemoHTMLParser () {
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
-   */
-  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
     org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
     
     // title
@@ -64,16 +57,22 @@
         date = new Date(); // now 
       }
     }
-      
-    return new DocData(name, bodyBuf.toString(), title, props, date);
+    
+    docData.clear();
+    docData.setName(name);
+    docData.setBody(bodyBuf.toString());
+    docData.setTitle(title);
+    docData.setProps(props);
+    docData.setDate(date);
+    return docData;
   }
 
   /*
    *  (non-Javadoc)
    * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.lang.StringBuffer, java.text.DateFormat)
    */
-  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
-    return parse(name, date, new StringReader(inputText.toString()), dateFormat);
+  public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
+    return parse(docData, name, date, new StringReader(inputText.toString()), dateFormat);
   }
 
 }

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,246 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileReader;
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.ParsePosition;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Stack;
+
+/**
+ * A {@link ContentSource} using the Dir collection for its input. Supports
+ * the following configuration parameters (on top of {@link ContentSource}):
+ * <ul>
+ * <li><b>work.dir</b> - specifies the working directory. Required if "docs.dir"
+ * denotes a relative path (<b>default=work</b>).
+ * <li><b>docs.dir</b> - specifies the directory holding the Dir collection. Can
+ * be set to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).
+ * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
+ * parsing the documents content (<b>default=DemoHTMLParser</b>).
+ * </ul>
+ */
+public class DirContentSource extends ContentSource {
+
+  private static final class DateFormatInfo {
+    DateFormat df;
+    ParsePosition pos;
+  }
+  
+  public static class Iterator implements java.util.Iterator {
+
+    static class Comparator implements java.util.Comparator {
+      public int compare(Object _a, Object _b) {
+        String a = _a.toString();
+        String b = _b.toString();
+
+        int diff = a.length() - b.length();
+
+        if (diff > 0) {
+          while (diff-- > 0) {
+            b = "0" + b;
+          }
+        } else if (diff < 0) {
+          diff = -diff;
+          while (diff-- > 0) {
+            a = "0" + a;
+          }
+        }
+
+        /* note it's reversed because we're going to push,
+           which reverses again */
+        return b.compareTo(a);
+      }
+    }
+
+    int count = 0;
+
+    Stack stack = new Stack();
+
+    /* this seems silly ... there must be a better way ...
+       not that this is good, but can it matter? */
+
+    Comparator c = new Comparator();
+
+    public Iterator(File f) {
+      push(f);
+    }
+
+    void find() {
+      if (stack.empty()) {
+        return;
+      }
+      if (!((File)stack.peek()).isDirectory()) {
+        return;
+      }
+      File f = (File)stack.pop();
+      push(f);
+    }
+
+    void push(File f) {
+      push(f.listFiles(new FileFilter() {
+
+        public boolean accept(File file) {
+          return file.isDirectory();
+        }
+      }));
+      push(f.listFiles(new FileFilter() {
+
+        public boolean accept(File file) {
+          return file.getName().endsWith(".txt");
+        }
+      }));
+      find();
+    }
+
+    void push(File[] files) {
+      Arrays.sort(files, c);
+      for(int i = 0; i < files.length; i++) {
+        // System.err.println("push " + files[i]);
+        stack.push(files[i]);
+      }
+    }
+
+    public int getCount(){
+      return count;
+    }
+
+    public boolean hasNext() {
+      return stack.size() > 0;
+    }
+    
+    public Object next() {
+      assert hasNext();
+      count++;
+      Object object = stack.pop();
+      // System.err.println("pop " + object);
+      find();
+      return object;
+    }
+
+    public void remove() {
+      throw new RuntimeException("cannot");
+    }
+
+  }
+  
+  private ThreadLocal dateFormat = new ThreadLocal();
+  private File dataDir = null;
+  private int iteration = 0;
+  private Iterator inputFiles = null;
+
+  // get/initiate a thread-local simple date format (must do so 
+  // because SimpleDateFormat is not thread-safe).
+  private DateFormatInfo getDateFormatInfo() {
+    DateFormatInfo dfi = (DateFormatInfo) dateFormat.get();
+    if (dfi == null) {
+      dfi = new DateFormatInfo();
+      dfi.pos = new ParsePosition(0);
+      // date format: 30-MAR-1987 14:22:36.87
+      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
+      dfi.df.setLenient(true);
+      dateFormat.set(dfi);
+    }
+    return dfi;
+  }
+  
+  private Date parseDate(String dateStr) {
+    DateFormatInfo dfi = getDateFormatInfo();
+    dfi.pos.setIndex(0);
+    dfi.pos.setErrorIndex(-1);
+    return dfi.df.parse(dateStr.trim(), dfi.pos);
+  }
+
+  public void close() throws IOException {
+    inputFiles = null;
+  }
+  
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    File f = null;
+    String name = null;
+    synchronized (this) {
+      if (!inputFiles.hasNext()) { 
+        // exhausted files, start a new round, unless forever set to false.
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        inputFiles = new Iterator(dataDir);
+        iteration++;
+      }
+      f = (File) inputFiles.next();
+      // System.err.println(f);
+      name = f.getCanonicalPath()+"_"+iteration;
+    }
+    
+    BufferedReader reader = new BufferedReader(new FileReader(f));
+    String line = null;
+    //First line is the date, 3rd is the title, rest is body
+    String dateStr = reader.readLine();
+    reader.readLine();//skip an empty line
+    String title = reader.readLine();
+    reader.readLine();//skip an empty line
+    StringBuffer bodyBuf = new StringBuffer(1024);
+    while ((line = reader.readLine()) != null) {
+      bodyBuf.append(line).append(' ');
+    }
+    reader.close();
+    addBytes(f.length());
+    
+    Date date = parseDate(dateStr);
+    
+    docData.clear();
+    docData.setName(name);
+    docData.setBody(bodyBuf.toString());
+    docData.setTitle(title);
+    docData.setDate(date);
+    return docData;
+  }
+  
+  public synchronized void resetInputs() throws IOException {
+    super.resetInputs();
+    inputFiles = new Iterator(dataDir);
+    iteration = 0;
+  }
+
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    
+    File workDir = new File(config.get("work.dir", "work"));
+    String d = config.get("docs.dir", "dir-out");
+    dataDir = new File(d);
+    if (!dataDir.isAbsolute()) {
+      dataDir = new File(workDir, d);
+    }
+
+    inputFiles = new Iterator(dataDir);
+
+    if (inputFiles == null) {
+      throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
+    }
+  }
+
+}

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java Thu Jun 18 19:58:59 2009
@@ -20,94 +20,77 @@
 import java.util.Date;
 import java.util.Properties;
 
-/**
- * Output of parsing (e.g. HTML parsing) of an input document.
- */
+import org.apache.lucene.document.DateTools;
 
+/** Output of parsing (e.g. HTML parsing) of an input document. */
 public class DocData {
   
   private String name;
   private String body;
   private String title;
-  private Date date;
+  private String date;
   private Properties props;
   
-  public DocData(String name, String body, String title, Properties props, Date date) {
-    this.name = name;
-    this.body = body;
-    this.title = title;
-    this.date = date;
-    this.props = props;
+  public void clear() {
+    name = null;
+    body = null;
+    title = null;
+    date = null;
+    props = null;
+  }
+  
+  public String getBody() {
+    return body;
   }
 
   /**
-   * @return Returns the name.
+   * @return the date. If {@link #setDate(Date)} was called, then the String
+   *         returned is the output of
+   *         {@link DateTools#dateToString(Date, org.apache.lucene.document.DateTools.Resolution)}.
+   *         Otherwise it's the String passed to {@link #setDate(String)}.
    */
-  public String getName() {
-    return name;
+  public String getDate() {
+    return date;
   }
 
-  /**
-   * @param name The name to set.
-   */
-  public void setName(String name) {
-    this.name = name;
+  public String getName() {
+    return name;
   }
 
-  /**
-   * @return Returns the props.
-   */
   public Properties getProps() {
     return props;
   }
 
-  /**
-   * @param props The props to set.
-   */
-  public void setProps(Properties props) {
-    this.props = props;
-  }
-
-  /**
-   * @return Returns the body.
-   */
-  public String getBody() {
-    return body;
+  public String getTitle() {
+    return title;
   }
 
-  /**
-   * @param body The body to set.
-   */
   public void setBody(String body) {
     this.body = body;
   }
 
-  /**
-   * @return Returns the title.
-   */
-  public String getTitle() {
-    return title;
+  public void setDate(Date date) {
+    if (date != null) {
+      setDate(DateTools.dateToString(date, DateTools.Resolution.SECOND));
+    } else {
+      this.date = null;
+    }
   }
 
-  /**
-   * @param title The title to set.
-   */
-  public void setTitle(String title) {
-    this.title = title;
+  public void setDate(String date) {
+    this.date = date;
   }
 
-  /**
-   * @return Returns the date.
-   */
-  public Date getDate() {
-    return date;
+  public void setName(String name) {
+    this.name = name;
   }
 
-  /**
-   * @param date The date to set.
-   */
-  public void setDate(Date date) {
-    this.date = date;
+  public void setProps(Properties props) {
+    this.props = props;
+  }
+
+  public void setTitle(String title) {
+    this.title = title;
   }
 
 }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java?rev=786233&r1=786232&r2=786233&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java Thu Jun 18 19:58:59 2009
@@ -17,55 +17,373 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.document.Document;
-import org.apache.lucene.benchmark.byTask.utils.Config;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Map.Entry;
 
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
 
 /**
- * Create documents for the test.
- * <br>Each call to makeDocument would create the next document.
- * When input is exhausted, the DocMaker iterates over the input again,
- * providing a source for unlimited number of documents,
- * though not all of them are unique. 
+ * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
+ * {@link DocData} objects. Supports the following parameters:
+ * <ul>
+ * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
+ * (default <b>SingleDocSource</b>).
+ * <li><b>doc.stored</b> - specifies whether fields should be stored (default
+ * <b>false</b>).
+ * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
+ * (default <b>true</b>).
+ * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
+ * for fields (default <b>false</b>).
+ * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
+ * be stored with positions (default <b>false</b>).
+ * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
+ * stored with offsets (default <b>false</b>).
+ * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
+ * the document's content in the document (default <b>false</b>).
+ * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
+ * should be reused (default <b>true</b>).
+ * </ul>
  */
-public interface DocMaker {
+public class DocMaker {
 
-  /** 
-   * Create the next document, of the given size by input bytes.
-   * If the implementation does not support control over size, an exception is thrown.
-   * @param size size of document, or 0 if there is no size requirement.
-   * @exception if cannot make the document, or if size>0 was specified but this feature is not supported.
-   */ 
-  public Document makeDocument (int size) throws Exception;
-
-  /** Create the next document. */
-  public Document makeDocument () throws Exception;
-
-  /** Set the properties */
-  public void setConfig (Config config);
+  private static class LeftOver {
+    private DocData docdata;
+    private int cnt;
+  }
+
+  static class DocState {
+    
+    private Map fields;
+    private boolean reuseFields;
+    Document doc;
+    DocData docData = new DocData();
+    
+    public DocState(boolean reuseFields, Store store, Index index, TermVector termVector) {
+
+      this.reuseFields = reuseFields;
+      
+      if (reuseFields) {
+        fields =  new HashMap();
+        
+        // Initialize the map with the default fields.
+        fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, index, termVector));
+        fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
+        fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
+        fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
+        fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
+        
+        doc = new Document();
+      }
+    }
+
+    /**
+     * Returns a field corresponding to the field name. If
+     * <code>reuseFields</code> was set to true, then it attempts to reuse a
+     * Field instance. If such a field does not exist, it creates a new one.
+     */
+    Field getField(String name, Store store, Index index, TermVector termVector) {
+      if (!reuseFields) {
+        return new Field(name, "", store, index, termVector);
+      }
+      
+      Field f = (Field) fields.get(name);
+      if (f == null) {
+        f = new Field(name, "", store, index, termVector);
+        fields.put(name, f);
+      }
+      return f;
+    }
+  }
   
-  /** Reset inputs so that the test run would behave, input wise, as if it just started. */
-  public void resetInputs();
+  private int numDocsCreated = 0;
+  private boolean storeBytes = false;
+
+  // leftovers are thread local, because it is unsafe to share residues between threads
+  private ThreadLocal leftovr = new ThreadLocal();
+  private ThreadLocal docState = new ThreadLocal();
+
+  public static final String BODY_FIELD = "body";
+  public static final String TITLE_FIELD = "doctitle";
+  public static final String DATE_FIELD = "docdate";
+  public static final String ID_FIELD = "docid";
+  public static final String BYTES_FIELD = "bytes";
+  public static final String NAME_FIELD = "docname";
+
+  protected Config config;
+
+  protected Store storeVal = Store.NO;
+  protected Index indexVal = Index.ANALYZED;
+  protected TermVector termVecVal = TermVector.NO;
   
-  /** Return how many real unique texts are available, 0 if not applicable. */ 
-  public int numUniqueTexts();
+  protected ContentSource source;
+  protected boolean reuseFields;
+  protected DocState localDocState;
   
-  /** Return total bytes of all available unique texts, 0 if not applicable */ 
-  public long numUniqueBytes();
-
-  /** Return number of docs made since last reset. */
-  public int getCount();
+  private int lastPrintedNumUniqueTexts = 0;
 
-  /** Return total byte size of docs made since last reset. */
-  public long getByteCount();
+  private long lastPrintedNumUniqueBytes = 0;
 
-  /** Print some statistics on docs available/added/etc. */ 
-  public void printDocStatistics();
+  private int printNum = 0;
 
-  /** Set the html parser to use, when appropriate */
-  public void setHTMLParser(HTMLParser htmlParser);
+  // create a doc
+  // use only part of the body, modify it to keep the rest (or use all if size==0).
+  // reset the docdata properties so they are not added more than once.
+  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
+    int docid = incrNumDocsCreated();
+    DocState ds = reuseFields ? getDocState() : localDocState;
+    Document doc = reuseFields ? ds.doc : new Document();
+    doc.getFields().clear();
+    
+    // Set ID_FIELD
+    Field idField = ds.getField(ID_FIELD, storeVal, indexVal, termVecVal);
+    idField.setValue("doc" + docid);
+    doc.add(idField);
+    
+    // Set NAME_FIELD
+    String name = docData.getName();
+    if (name == null) name = "";
+    name = cnt < 0 ? name : name + "_" + cnt;
+    Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
+    nameField.setValue(name);
+    doc.add(nameField);
+    
+    // Set DATE_FIELD
+    String date = docData.getDate();
+    if (date == null) {
+      date = "";
+    }
+    Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
+    dateField.setValue(date);
+    doc.add(dateField);
+    
+    // Set TITLE_FIELD
+    String title = docData.getTitle();
+    Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
+    titleField.setValue(title == null ? "" : title);
+    doc.add(titleField);
+    
+    String body = docData.getBody();
+    if (body != null && body.length() > 0) {
+      String bdy;
+      if (size <= 0 || size >= body.length()) {
+        bdy = body; // use all
+        docData.setBody(""); // nothing left
+      } else {
+        // attempt not to break words - if whitespace found within next 20 chars...
+        for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
+          if (Character.isWhitespace(body.charAt(n))) {
+            size = n;
+            break;
+          }
+        }
+        bdy = body.substring(0, size); // use part
+        docData.setBody(body.substring(size)); // some left
+      }
+      Field bodyField = ds.getField(BODY_FIELD, storeVal, indexVal, termVecVal);
+      bodyField.setValue(bdy);
+      doc.add(bodyField);
+      
+      if (storeBytes) {
+        Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
+        bytesField.setValue(bdy.getBytes("UTF-8"));
+        doc.add(bytesField);
+      }
+    }
+
+    Properties props = docData.getProps();
+    if (props != null) {
+      for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) {
+        Entry entry = (Entry) iterator.next();
+        Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
+        f.setValue((String) entry.getValue());
+        doc.add(f);
+      }
+      docData.setProps(null);
+    }
+    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
+    return doc;
+  }
+
+  private void resetLeftovers() {
+    leftovr.set(null);
+  }
+
+  protected DocState getDocState() {
+    DocState ds = (DocState) docState.get();
+    if (ds == null) {
+      ds = new DocState(true, storeVal, indexVal, termVecVal);
+      docState.set(ds);
+    }
+    return ds;
+  }
+
+  protected synchronized int incrNumDocsCreated() {
+    return numDocsCreated++;
+  }
+
+  /**
+   * Closes the {@link DocMaker}. The base implementation closes the
+   * {@link ContentSource}, and it can be overridden to do more work (but make
+   * sure to call super.close()).
+   */
+  public void close() throws IOException {
+    source.close();
+  }
+  
+  /**
+   * Returns the number of bytes generated by the content source since last
+   * reset.
+   */
+  public synchronized long getBytesCount() {
+    return source.getBytesCount();
+  }
+
+  /**
+   * Returns the total number of bytes that were generated by the content source
+   * defined for this doc maker.
+   */ 
+  public long getTotalBytesCount() {
+    return source.getTotalBytesCount();
+  }
+
+  /**
+   * Creates a {@link Document} object ready for indexing. This method uses the
+   * {@link ContentSource} to get the next document from the source, and creates
+   * a {@link Document} object from the returned fields. If
+   * <code>reuseFields</code> was set to true, it will reuse {@link Document}
+   * and {@link Field} instances.
+   */
+  public Document makeDocument() throws Exception {
+    resetLeftovers();
+    DocData docData = source.getNextDocData(reuseFields ? getDocState().docData : localDocState.docData);
+    Document doc = createDocument(docData, 0, -1);
+    return doc;
+  }
+
+  /**
+   * Same as {@link #makeDocument()}, except that the created document's body is
+   * limited to approximately <code>size</code> characters (word boundaries permitting).
+   */
+  public Document makeDocument(int size) throws Exception {
+    LeftOver lvr = (LeftOver) leftovr.get();
+    if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
+        || lvr.docdata.getBody().length() == 0) {
+      resetLeftovers();
+    }
+    DocData docData = reuseFields ? getDocState().docData : localDocState.docData;
+    DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
+    int cnt = (lvr == null ? 0 : lvr.cnt);
+    while (dd.getBody() == null || dd.getBody().length() < size) {
+      DocData dd2 = dd;
+      dd = source.getNextDocData(new DocData());
+      cnt = 0;
+      dd.setBody(dd2.getBody() + dd.getBody());
+    }
+    Document doc = createDocument(dd, size, cnt);
+    if (dd.getBody() == null || dd.getBody().length() == 0) {
+      resetLeftovers();
+    } else {
+      if (lvr == null) {
+        lvr = new LeftOver();
+        leftovr.set(lvr);
+      }
+      lvr.docdata = dd;
+      lvr.cnt = ++cnt;
+    }
+    return doc;
+  }
+  
+  public void printDocStatistics() {
+    boolean print = false;
+    String col = "                  ";
+    StringBuffer sb = new StringBuffer();
+    String newline = System.getProperty("line.separator");
+    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
+    int nut = source.getTotalDocsCount();
+    if (nut > lastPrintedNumUniqueTexts) {
+      print = true;
+      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
+      lastPrintedNumUniqueTexts = nut;
+    }
+    long nub = getTotalBytesCount();
+    if (nub > lastPrintedNumUniqueBytes) {
+      print = true;
+      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
+      lastPrintedNumUniqueBytes = nub;
+    }
+    if (source.getDocsCount() > 0) {
+      print = true;
+      sb.append("num docs added since last inputs reset:   ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
+      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
+    }
+    if (print) {
+      System.out.println(sb.append(newline).toString());
+      printNum++;
+    }
+  }
+  
+  /** Reset inputs so that the test run would behave, input wise, as if it just started. */
+  public synchronized void resetInputs() throws IOException {
+    printDocStatistics();
+    // re-initiate since properties by round may have changed.
+    setConfig(config);
+    source.resetInputs();
+    numDocsCreated = 0;
+    resetLeftovers();
+  }
   
-  /** Returns the htmlParser. */
-  public HTMLParser getHtmlParser();
+  /** Set the configuration parameters of this doc maker. */
+  public void setConfig(Config config) {
+    this.config = config;
+    try {
+      String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
+      source = (ContentSource) Class.forName(sourceClass).newInstance();
+      source.setConfig(config);
+    } catch (Exception e) {
+      // Should not get here. Throw runtime exception.
+      throw new RuntimeException(e);
+    }
+
+    boolean stored = config.get("doc.stored", false);
+    boolean tokenized = config.get("doc.tokenized", true);
+    boolean termVec = config.get("doc.term.vector", false);
+    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
+    indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED);
+    boolean termVecPositions = config.get("doc.term.vector.positions", false);
+    boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
+    if (termVecPositions && termVecOffsets) {
+      termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
+    } else if (termVecPositions) {
+      termVecVal = TermVector.WITH_POSITIONS;
+    } else if (termVecOffsets) {
+      termVecVal = TermVector.WITH_OFFSETS;
+    } else if (termVec) {
+      termVecVal = TermVector.YES;
+    } else {
+      termVecVal = TermVector.NO;
+    }
+    storeBytes = config.get("doc.store.body.bytes", false);
+    
+    reuseFields = config.get("doc.reuse.fields", true);
+    if (!reuseFields) {
+      localDocState = new DocState(false, storeVal, indexVal, termVecVal);
+    } else {
+      // In a multi-rounds run, it is important to reset DocState since settings
+      // of fields may change between rounds, and this is the only way to reset
+      // the cache of all threads.
+      docState = new ThreadLocal();
+    }
+  }
 
-}
\ No newline at end of file
+}

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=786233&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Jun 18 19:58:59 2009
@@ -0,0 +1,294 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+/**
+ * A {@link ContentSource} which reads the English Wikipedia dump. You can read
+ * the .bz2 file directly (it will be decompressed on the fly). Config
+ * properties:
+ * <ul>
+ * <li>keep.image.only.docs=false|true (default <b>true</b>).
+ * <li>docs.file=&lt;path to the file&gt;
+ * </ul>
+ */
+public class EnwikiContentSource extends ContentSource {
+
+  private class Parser extends DefaultHandler implements Runnable {
+    private Thread t;
+    private boolean threadDone;
+    private String[] tuple;
+    private NoMoreDataException nmde;
+    private StringBuffer contents = new StringBuffer();
+    private String title;
+    private String body;
+    private String time;
+    private String id;
+    
+    String[] next() throws NoMoreDataException {
+      if (t == null) {
+        threadDone = false;
+        t = new Thread(this);
+        t.setDaemon(true);
+        t.start();
+      }
+      String[] result;
+      synchronized(this){
+        while(tuple == null && nmde == null && !threadDone) {
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+          }
+        }
+        if (nmde != null) {
+          // Set to null so we will re-start thread in case
+          // we are re-used:
+          t = null;
+          throw nmde;
+        }
+        if (t != null && threadDone) {
+          // The thread has exited yet did not hit end of
+          // data, so this means it hit an exception.  We
+          // throw NoMoreDataException here to force the
+          // benchmark to stop the current alg:
+          throw new NoMoreDataException();
+        }
+        result = tuple;
+        tuple = null;
+        notify();
+      }
+      return result;
+    }
+    
+    String time(String original) {
+      StringBuffer buffer = new StringBuffer();
+
+      buffer.append(original.substring(8, 10));
+      buffer.append('-');
+      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
+      buffer.append('-');
+      buffer.append(original.substring(0, 4));
+      buffer.append(' ');
+      buffer.append(original.substring(11, 19));
+      buffer.append(".000");
+
+      return buffer.toString();
+    }
+    
+    public void characters(char[] ch, int start, int length) {
+      contents.append(ch, start, length);
+    }
+
+    public void endElement(String namespace, String simple, String qualified)
+      throws SAXException {
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          // the body must be null and we either are keeping image docs or the
+          // title does not start with Image:
+          if (body != null && (keepImages || !title.startsWith("Image:"))) {
+            String[] tmpTuple = new String[LENGTH];
+            tmpTuple[TITLE] = title.replace('\t', ' ');
+            tmpTuple[DATE] = time.replace('\t', ' ');
+            tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
+            tmpTuple[ID] = id;
+            synchronized(this) {
+              while (tuple != null) {
+                try {
+                  wait();
+                } catch (InterruptedException ie) {
+                }
+              }
+              tuple = tmpTuple;
+              notify();
+            }
+          }
+          break;
+        case BODY:
+          body = contents.toString();
+          // Workaround: String.startsWith has no ignore-case option, so lowercase the
+          // first 10 chars (enough to cover "#redirect") before comparing.
+          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
+          if (startsWith.startsWith("#redirect")) {
+            body = null;
+          }
+          break;
+        case DATE:
+          time = time(contents.toString());
+          break;
+        case TITLE:
+          title = contents.toString();
+          break;
+        case ID:
+          id = contents.toString();
+          break;
+        default:
+          // this element should be discarded.
+      }
+    }
+
+    public void run() {
+
+      try {
+        XMLReader reader = XMLReaderFactory.createXMLReader();
+        reader.setContentHandler(this);
+        reader.setErrorHandler(this);
+        while(true){
+          final InputStream localFileIS = is;
+          try {
+            reader.parse(new InputSource(localFileIS));
+          } catch (IOException ioe) {
+            synchronized(EnwikiContentSource.this) {
+              if (localFileIS != is) {
+                // fileIS was closed on us, so, just fall
+                // through
+              } else
+                // Exception is real
+                throw ioe;
+            }
+          }
+          synchronized(this) {
+            if (!forever) {
+              nmde = new NoMoreDataException();
+              notify();
+              return;
+            } else if (localFileIS == is) {
+              // If file is not already re-opened then re-open it now
+              is = getInputStream(file);
+            }
+          }
+        }
+      } catch (SAXException sae) {
+        throw new RuntimeException(sae);
+      } catch (IOException ioe) {
+        throw new RuntimeException(ioe);
+      } finally {
+        synchronized(this) {
+          threadDone = true;
+          notify();
+        }
+      }
+    }
+
+    public void startElement(String namespace, String simple, String qualified,
+                             Attributes attributes) {
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          title = null;
+          body = null;
+          time = null;
+          id = null;
+          break;
+        // intentional fall-through.
+        case BODY:
+        case DATE:
+        case TITLE:
+        case ID:
+          contents.setLength(0);
+          break;
+        default:
+          // this element should be discarded.
+      }
+    }
+  }
+
+  private static final Map ELEMENTS = new HashMap();
+  private static final int TITLE = 0;
+  private static final int DATE = TITLE + 1;
+  private static final int BODY = DATE + 1;
+  private static final int ID = BODY + 1;
+  private static final int LENGTH = ID + 1;
+  // LENGTH is used as the size of the tuple, so whatever constants we need that
+  // should not be part of the tuple, we should define them after LENGTH.
+  private static final int PAGE = LENGTH + 1;
+
+  private static final String[] months = {"JAN", "FEB", "MAR", "APR",
+                                  "MAY", "JUN", "JUL", "AUG",
+                                  "SEP", "OCT", "NOV", "DEC"};
+
+  static {
+    ELEMENTS.put("page", Integer.valueOf(PAGE));
+    ELEMENTS.put("text", Integer.valueOf(BODY));
+    ELEMENTS.put("timestamp", Integer.valueOf(DATE));
+    ELEMENTS.put("title", Integer.valueOf(TITLE));
+    ELEMENTS.put("id", Integer.valueOf(ID));
+  }
+  
+  /**
+   * Returns the type of the element if defined, otherwise returns -1. This
+   * method is useful in startElement and endElement, by not needing to compare
+   * the element qualified name over and over.
+   */
+  private final static int getElementType(String elem) {
+    Integer val = (Integer) ELEMENTS.get(elem);
+    return val == null ? -1 : val.intValue();
+  }
+  
+  private File file;
+  private boolean keepImages = true;
+  private InputStream is;
+  private Parser parser = new Parser();
+  
+  public void close() throws IOException {
+    synchronized (EnwikiContentSource.this) {
+      if (is != null) {
+        is.close();
+        is = null;
+      }
+    }
+  }
+  
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    String[] tuple = parser.next();
+    docData.clear();
+    docData.setName(tuple[ID]);
+    docData.setBody(tuple[BODY]);
+    docData.setDate(tuple[DATE]);
+    docData.setTitle(tuple[TITLE]);
+    return docData;
+  }
+
+  public void resetInputs() throws IOException {
+    super.resetInputs();
+    is = getInputStream(file);
+  }
+  
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    keepImages = config.get("keep.image.only.docs", true);
+    String fileName = config.get("docs.file", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("docs.file must be set");
+    }
+    file = new File(fileName).getAbsoluteFile();
+  }
+  
+}



Mime
View raw message