lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Otis Gospodnetic <otis_gospodne...@yahoo.com>
Subject Re: [Performance] Streaming main memory indexing of single strings
Date Thu, 14 Apr 2005 05:09:51 GMT
It looks like you are calling that IndexWriter code in some loops,
opening it and closing it in every iteration of the loop and also
calling optimize.  All of those things could be improved.
Keep your IndexWriter open, don't close it, and optimize the index only
once you are done adding documents to it.

See the highlights and the snipets in the first hit:
  http://www.lucenebook.com/search?query=when+to+optimize

Otis


--- Wolfgang Hoschek <whoschek@lbl.gov> wrote:

> Hi,
> 
> I'm wondering if anyone could let me know how to improve Lucene 
> performance for "streaming main memory indexing of single strings". 
> This would help to effectively integrate Lucene with the Nux XQuery 
> engine.
> 
> Below is a small microbenchmark simulating STREAMING XQuery fulltext 
> search as typical for XML network routers, message queuing system,
> P2P 
> networks, etc. In this on-the-fly main memory indexing scenario, each
> 
> individual string is immediately matched as soon as it becomes 
> available without any persistance involved. This usage scenario and 
> corresponding performance profile is quite different in comparison to
> 
> fulltext search over persistent (read-mostly) indexes.
> 
> The benchmark runs at some 3000 lucene queries/sec (lucene-1.4.3)
> which 
> is unfortunate news considering the XQuery engine can easily walk 
> hundreds of thousands of XML nodes per second. Ideally I'd like to
> run 
> at some 100000 queries/sec. Runnning this through the JDK 1.5
> profiler 
> it seems that most time is spent in and below the following calls:
> 
> writer = new IndexWriter(dir, analyzer, true);
> writer.addDocument(...);
> writer.close();
> 
> I tried quite a few variants of the benchmark with various options, 
> unfortunately with little or no effect.
> Lucene just does not seem to designed to do this sort of "transient 
> single string index" thing. All code paths related to opening,
> closing, 
> reading, writing, querying and object creation seem to be designed
> for 
> large persistent indexes.
> 
> Any advice on what I'm missing or what could be done about it would
> be 
> greatly appreciated.
> 
> Wolfgang.
> 
> P.S. the benchmark code is attached as a file below:
> 
> > package nux.xom.pool;
> 
> import java.io.IOException;
> //import java.io.Reader;
> 
> import org.apache.lucene.analysis.Analyzer;
> //import org.apache.lucene.analysis.LowerCaseTokenizer;
> //import org.apache.lucene.analysis.PorterStemFilter;
> //import org.apache.lucene.analysis.SimpleAnalyzer;
> //import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> //import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.queryParser.ParseException;
> import org.apache.lucene.queryParser.QueryParser;
> import org.apache.lucene.search.Hits;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.Searcher;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> 
> public final class LuceneMatcher { // TODO: make non-public
> 
> 	private final Analyzer analyzer;
> //	private final Directory dir = new RAMDirectory();
> 
> 	public LuceneMatcher() {
> 		this(new StandardAnalyzer());
> //		this(new SimpleAnalyzer());
> //		this(new StopAnalyzer());
> //		this(new Analyzer() {
> //			public final TokenStream tokenStream(String fieldName, Reader
> reader) {
> //				return new PorterStemFilter(new LowerCaseTokenizer(reader));
> //			}
> //		});
> 	}
> 
> 	public LuceneMatcher(Analyzer analyzer) {
> 		if (analyzer == null) 
> 			throw new IllegalArgumentException("analyzer must not be null");
> 		this.analyzer = analyzer;
> 	}
> 
> 	public Query parseQuery(String expression) throws ParseException {
> 		QueryParser parser = new QueryParser("content", analyzer);
> //		parser.setPhraseSlop(0);
> 		return parser.parse(expression);
> 	}
> 
> 	/**
> 	 * Returns the relevance score by matching the given index against
> the given
> 	 * Lucene query expression. The index must not contain more than one
> Lucene
> 	 * "document" (aka string to be searched).
> 	 */
> 	public float match(Directory index, Query query) {
> 		Searcher searcher = null;
> 		try {
> 			searcher = new IndexSearcher(index);
> 			Hits hits = searcher.search(query);
> 			float score = hits.length() > 0 ? hits.score(0) : 0.0f;						
> 			return score;
> 		} catch (IOException e) { // should never happen (RAMDirectory)
> 			throw new RuntimeException(e);
> 		} finally {
> 			try {
> 				if (searcher != null) searcher.close();
> 			} catch (IOException e) { // should never happen (RAMDirectory)
> 				throw new RuntimeException(e);
> 			}
> 		}
> 	}
> 	
> //	public float match(String text, Query query) {
> //		return match(createIndex(text), query);
> //	}
> 	
> 	public Directory createIndex(String text) {
> 		Directory dir = new RAMDirectory();
> 		IndexWriter writer = null;
> 		try {
> 			writer = new IndexWriter(dir, analyzer, true);
> //			writer.setUseCompoundFile(false);
> //			writer.mergeFactor = 2;
> //			writer.minMergeDocs = 1;
> //			writer.maxMergeDocs = 1;
> 	
> 			writer.addDocument(createDocument(text));
> //			writer.optimize();
> 			return dir;
> 		} catch (IOException e) { // should never happen (RAMDirectory)
> 			throw new RuntimeException(e);
> 		} finally {
> 			try {
> 				if (writer != null) writer.close();
> 			} catch (IOException e) { // should never happen (RAMDirectory)
> 				throw new RuntimeException(e);
> 			}
> 		}
> 	}
> 
> 	private Document createDocument(String content) {
> 		Document doc = new Document();
> 		doc.add(Field.UnStored("content", content));
> //		doc.add(Field.Text("x", content));
> 		return doc;
> 	}
> 
> 	/**
> 	 * Lucene microbenchmark simulating STREAMING XQuery fulltext search
> as
> 	 * typical for XML network routers, message queuing system, P2P
> networks,
> 	 * etc. In this on-the-fly main memory indexing scenario, each
> individual
> 	 * string is immediately matched as soon as it becomes available
> without any
> 	 * persistance involved. This usage scenario and corresponding
> performance
> 	 * profile is quite different in comparison to fulltext search over
> 	 * persistent (read-mostly) indexes.
> 	 * 
> 	 * Example XPath: count(/table/row[lucene:match(string(./firstname),
> 	 * "James") > 0.0])
> 	 */
> 	public static void main(String[] args) throws Exception {
> 		int k = -1;		
> 		int runs = 5;
> 		if (args.length > ++k) runs = Integer.parseInt(args[k]); 
> 		
> 		int nodes = 10000;
> 		if (args.length > ++k) nodes = Integer.parseInt(args[k]);
> 		
> 		String content = "James is out in the woods";
> 		if (args.length > ++k) content = args[k];
> 		
> 		String expression = "James";
> 		if (args.length > ++k) expression = args[k];
> 		
> 		LuceneMatcher matcher = new LuceneMatcher();
> 		Query query = matcher.parseQuery(expression); // to be reused N
> times
> 		
> 		for (int r = 0; r < runs; r++) {
> 			long start = System.currentTimeMillis();
> 			int matches = 0;
> 			
> 			for (int i = 0; i < nodes; i++) {
> //				if (LuceneUtil.match(content + i, expression) > 0.0f) {
> 				if (matcher.match(matcher.createIndex(content + i), query) >
> 0.0f) {
> 					matches++;
> 				}
> 			}
> 			
> 			long end = System.currentTimeMillis();
> 			System.out.println("matches=" + matches);
> 			System.out.println("secs=" + ((end-start) / 1000.0f));
> 			System.out.println("queries/sec=" + (nodes / ((end-start) /
> 1000.0f)));
> 			System.out.println();
> 		}
> 	}
> }
> > 
> 
> >
---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Mime
View raw message