Return-Path: Delivered-To: apmail-jakarta-lucene-user-archive@apache.org Received: (qmail 30106 invoked from network); 6 Dec 2002 17:01:49 -0000 Received: from unknown (HELO nagoya.betaversion.org) (192.18.49.131) by daedalus.apache.org with SMTP; 6 Dec 2002 17:01:49 -0000 Received: (qmail 16931 invoked by uid 97); 6 Dec 2002 17:02:51 -0000 Delivered-To: qmlist-jakarta-archive-lucene-user@jakarta.apache.org Received: (qmail 16904 invoked by uid 97); 6 Dec 2002 17:02:50 -0000 Mailing-List: contact lucene-user-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Lucene Users List" Reply-To: "Lucene Users List" Delivered-To: mailing list lucene-user@jakarta.apache.org Received: (qmail 16883 invoked by uid 98); 6 Dec 2002 17:02:50 -0000 X-Antivirus: nagoya (v4218 created Aug 14 2002) Message-Id: <818623B5FD23D51193200002B32C076106FE4938@excsrv44.mayo.edu> From: "Armbrust, Daniel C." To: 'Lucene Users List' Subject: RE: Lucene Speed under diff JVMs Date: Fri, 6 Dec 2002 11:01:04 -0600 MIME-Version: 1.0 X-Mailer: Internet Mail Service (5.5.2653.19) Content-Type: multipart/mixed; boundary="----_=_NextPart_000_01C29D49.0FFD57F0" X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N ------_=_NextPart_000_01C29D49.0FFD57F0 Content-Type: text/plain Class that was used (attached) And correction, the UnStored field had 1000 words, not 500. -----Original Message----- From: Otis Gospodnetic [mailto:otis_gospodnetic@yahoo.com] Sent: Friday, December 06, 2002 10:57 AM To: Lucene Users List Subject: RE: Lucene Speed under diff JVMs Otis doesn't mind. ------------- One more bit of info that I should have included: The randomly generated documents consisted of 2 fields, one Text with 3 words, and one UnStored with 500 words. Average word length was 7 characters. If Otis (he wrote it, I just made a tweak or two) doesn't mind, I'll post the source code. Dan ------_=_NextPart_000_01C29D49.0FFD57F0 Content-Type: application/octet-stream; name="Words2Index.java" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="Words2Index.java" import java.io.*; import java.util.Set; import java.util.HashSet; import java.util.Random; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // TODO: use RAMDirectory or BatchIndexWriter // try with multiple threads and multiple indices public class Words2Index { // avg. word length =3D 8, so 8 * 3 bytes =3D> 24 bytes private static final short WORDS_PER_TITLE =3D 3; // avg. word length =3D 8, so 8 * 2000 bytes =3D> 16000 bytes private static final short WORDS_PER_BODY =3D 1000; private static String _dict; private static String _index; private static int _indexSize; private static String[] _words; private static Random _random; private Words2Index(String dict, String index, int indexSize) { _dict =3D dict; _index =3D index; _indexSize =3D indexSize; _random =3D new Random(); } public static void main(String[] args) throws Exception { Words2Index wi =3D new Words2Index(args[0], args[1], = Integer.parseInt(args[2])); System.out.println("DICT: " + _dict); System.out.println("INDEX: " + _index); System.out.println("INDEX SIZE: " + _indexSize + " document"); System.out.println("Loading words from " + _dict); Set wordSet =3D wi.loadWords(); _words =3D (String[])wordSet.toArray(new = String[wordSet.size()]); int maxRandNumber =3D _words.length; IndexWriter writer =3D new IndexWriter(_index, new = StandardAnalyzer(), true); writer.mergeFactor =3D 500; //BatchIndexWriter batchWriter =3D new BatchIndexWriter(writer, 2000); long startTime =3D System.currentTimeMillis(); long batchStartTime =3D System.currentTimeMillis(); long optimizeTime =3D 0; System.out.println("Adding documents to index " + _index); for (int i =3D 0; i < _indexSize; i++) { //batchWriter.addDocument(wi.makeDocument(maxRandNumber)); writer.addDocument(wi.makeDocument(maxRandNumber)); if (i % 100 =3D=3D 0) { System.out.println(i + " " + = String.valueOf(System.currentTimeMillis() - batchStartTime) + " ms"); batchStartTime =3D System.currentTimeMillis(); } if (i % 50000 =3D=3D 0) { System.out.print("Optimizing..."); long startOptimizeTime =3D System.currentTimeMillis(); //batchWriter.optimize(); writer.optimize(); long endOptimizeTime =3D System.currentTimeMillis(); long thisOptimizeTime =3D endOptimizeTime - = startOptimizeTime; System.out.println("done " + = String.valueOf(thisOptimizeTime)); optimizeTime +=3D thisOptimizeTime; batchStartTime =3D System.currentTimeMillis(); } } System.out.println("Optimizing and closing index " + _index); long startOptimizeTime =3D System.currentTimeMillis(); //batchWriter.optimize(); writer.optimize(); long endOptimizeTime =3D System.currentTimeMillis(); long thisOptimizeTime =3D endOptimizeTime - startOptimizeTime; System.out.println("done " + String.valueOf(thisOptimizeTime)); optimizeTime +=3D thisOptimizeTime; writer.close(); long endTime =3D System.currentTimeMillis(); long totalTime =3D System.currentTimeMillis() - startTime; =20 System.out.println("Done"); System.out.println("Time spent optimizing: " + = String.valueOf(optimizeTime) + " ms"); System.out.println("Total time: " + String.valueOf(totalTime) + = " ms"); System.out.println("Avg time per document (not counting = optimizing) " + String.valueOf((totalTime - optimizeTime)/ = wi._indexSize) + " ms"); System.out.println("Avg time per document (counting optimizing) = " + String.valueOf(totalTime/ wi._indexSize) + " ms"); System.out.println("Avg time per 1000 documents (not counting = optimizing) " + String.valueOf((totalTime - optimizeTime)/ = (wi._indexSize/1000)) + " ms"); System.out.println("Avg time per 1000 documents (counting = optimizing) " + String.valueOf(totalTime/ (wi._indexSize/1000)) + " = ms"); } private Set loadWords() throws IOException { int wordLength =3D 0; String word; Set wordSet =3D new HashSet(99905); File file =3D new File(_dict); BufferedReader br =3D new BufferedReader(new FileReader(file)); while ((word =3D br.readLine()) !=3D null) { //System.out.println("WORD: " + word); wordSet.add(word); wordLength +=3D word.length(); } System.out.println("WORD COUNT: " + wordSet.size()); System.out.println("TOTAL WORD LENGTH: " + wordLength); System.out.println("AVG WORD LENGTH: " + = wordLength/wordSet.size()); return wordSet; } private Document makeDocument(int maxRandNumber) { Document doc =3D new Document(); StringBuffer fieldValue =3D new StringBuffer(WORDS_PER_TITLE * = 8); for (int i =3D 0; i < WORDS_PER_TITLE; i++) { int rand =3D _random.nextInt(maxRandNumber); fieldValue.append(" ").append(_words[rand]); } //System.out.println("Title: " + fieldValue); doc.add(Field.Text("title", fieldValue.toString())); fieldValue.setLength(0); fieldValue.setLength(WORDS_PER_BODY * 8); for (int i =3D 0; i < WORDS_PER_BODY; i++) { int rand =3D _random.nextInt(maxRandNumber); fieldValue.append(" ").append(_words[rand]); } //System.out.println("Body: " + fieldValue); doc.add(Field.UnStored("body", fieldValue.toString())); return doc; } } ------_=_NextPart_000_01C29D49.0FFD57F0 Content-Type: text/plain; charset=us-ascii -- To unsubscribe, e-mail: For additional commands, e-mail: ------_=_NextPart_000_01C29D49.0FFD57F0--