lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Grant Ingersoll <gsing...@apache.org>
Subject Fwd: Search Test file
Date Sun, 04 Jan 2009 02:23:42 GMT


Begin forwarded message:

> From: Grant Ingersoll <gsingers@apache.org>
> Date: January 3, 2009 8:19:14 PM EST
> To: java-dev@lucene.apache.org
> Subject: Fwd: Search Test file
> Reply-To: java-dev@lucene.apache.org
>
> Hi Amin,
>
> I see a couple of issues with your program below, and one that is  
> the cause of the problem of not finding "amin" as a query term.
>
> When you construct your IndexWriter, you are doing:
>> IndexWriter indexWriter = new  
>> IndexWriter(getDirectory(),getAnalyzer(),new  
>> IndexWriter.MaxFieldLength(2));
>
> The MaxFieldLength parameter specifies the maximum number of tokens
> that will be indexed for a Field.  Every token beyond that limit is
> dropped.  With a limit of 2, only the first two tokens of each field
> are indexed, so "amin" cannot be found unless it is one of them.  See
> http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
> and
> http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>
> Also,
>  TopDocs topDocs = multiSearcher.search(query,  
> BooleanQuery.getMaxClauseCount());
>
> strikes me as really odd.  Why are you passing in the max clause  
> count as the number of results you want returned?
>
> Cheers,
> Grant
>
>
>
> Begin forwarded message:
>
>> From: "aminmc@gmail.com" <aminmc@gmail.com>
>> Date: January 3, 2009 3:24:52 PM EST
>> To: gsingers@apache.org
>> Subject: Search Test file
>>
>> I've shared a document with you called "Search Test file":
>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>
>> It's not an attachment -- it's stored online at Google Docs. To  
>> open this document, just click the link above.
>> ---
>>
>> Hi
>>
>> I have uploaded the test file at google docs. It is currently a txt  
>> file but if you change the extension to .java it should work.
>>
>> package com.amin.app.lucene.search.impl;
>>
>> import static org.junit.Assert.assertEquals;
>> import static org.junit.Assert.assertNotNull;
>> import static org.junit.Assert.assertNotSame;
>> import static org.junit.Assert.assertTrue;
>>
>> import java.io.File;
>> import java.io.FileInputStream;
>> import java.io.FileOutputStream;
>> import java.io.IOException;
>> import java.io.InputStream;
>> import java.io.OutputStream;
>>
>> import javax.swing.text.BadLocationException;
>> import javax.swing.text.DefaultStyledDocument;
>> import javax.swing.text.rtf.RTFEditorKit;
>>
>> import org.apache.commons.lang.StringUtils;
>> import org.apache.lucene.analysis.Analyzer;
>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>> import org.apache.lucene.ant.DocumentHandler;
>> import org.apache.lucene.ant.DocumentHandlerException;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.document.Field;
>> import org.apache.lucene.index.CorruptIndexException;
>> import org.apache.lucene.index.IndexReader;
>> import org.apache.lucene.index.IndexWriter;
>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>> import org.apache.lucene.queryParser.QueryParser;
>> import org.apache.lucene.search.BooleanQuery;
>> import org.apache.lucene.search.IndexSearcher;
>> import org.apache.lucene.search.MultiSearcher;
>> import org.apache.lucene.search.Query;
>> import org.apache.lucene.search.ScoreDoc;
>> import org.apache.lucene.search.Searchable;
>> import org.apache.lucene.search.TopDocs;
>> import org.apache.lucene.store.Directory;
>> import org.apache.lucene.store.FSDirectory;
>> import org.junit.After;
>> import org.junit.Before;
>> import org.junit.Test;
>>
>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>
>> public class SearchTest {
>>
>> private File rtfFile = null;
>> private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";
>>
>> @Before
>> public void setUp() throws Exception {
>> InputStream inputStream =  
>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>> rtfFile = new File(RTF_FILE_NAME);
>> convertInputStreamToFile(inputStream, rtfFile);
>> }
>>
>>
>>
>> @Test
>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>> Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> assertNotNull(document);
>> String value = document.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(value);
>> assertNotSame("", value);
>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>> assertTrue(value.contains("This is a test rtf document that will be  
>> indexed."));
>> String path = document.get(FieldNameEnum.PATH.getDescription());
>> assertNotNull(path);
>> assertTrue(path.contains(".rtf"));
>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>> assertNotNull(fileName);
>> assertEquals(RTF_FILE_NAME, fileName);
>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>> document.get(FieldNameEnum.TYPE.getDescription()));
>>
>> }
>>
>>
>>
>> @Test
>> public void testCanSearchRtfDocument() throws Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> IndexWriter indexWriter = new  
>> IndexWriter(getDirectory(),getAnalyzer(),new  
>> IndexWriter.MaxFieldLength(2));
>> try {
>> indexWriter.addDocument(document);
>> commitAndCloseWriter(indexWriter);
>> } catch (CorruptIndexException e) {
>> throw new IllegalStateException(e);
>> } catch (IOException e) {
>> throw new IllegalStateException(e);
>> }
>>
>> //I plan to use other searchers later
>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>> {indexSearcher});
>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>> Query query = queryParser.parse("amin");
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>> assertNotNull(topDocs);
>> assertEquals(1, topDocs.totalHits);
>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>> for (ScoreDoc scoreDoc : scoreDocs) {
>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>> assertNotNull(documentFromSearch);
>> String bodyText =  
>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(bodyText);
>> assertNotSame("", bodyText);
>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>> assertTrue(bodyText.contains("This is a test rtf document that will  
>> be indexed."));
>>
>> }
>> multiSearcher.close();
>>
>> }
>>
>> @After
>> public void tearDown() throws Exception {
>> rtfFile.delete();
>> if (getDirectory().list() != null && getDirectory().list().length >  
>> 0) {
>> IndexReader reader = IndexReader.open(getDirectory());
>> for(int i = 0; i < reader.maxDoc();i++) {
>> reader.deleteDocument(i);
>> }
>> reader.close();
>> }
>> }
>>
>> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
>> CorruptIndexException,IOException {
>> indexWriter.commit();
>> indexWriter.close();
>> }
>>
>>
>> public Directory getDirectory() throws IOException {
>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>> }
>>
>> public Analyzer getAnalyzer() {
>> return new StandardAnalyzer();
>> }
>> private static void convertInputStreamToFile(InputStream  
>> inputStream, File file) {
>> try
>>     {
>>     OutputStream out=new FileOutputStream(file);
>>     byte buf[]=new byte[1024];
>>     int len;
>>     while((len=inputStream.read(buf))>0)
>>     out.write(buf,0,len);
>>     out.close();
>>     inputStream.close();
>>
>>     }catch (IOException e){
>>     throw new IllegalStateException(e);
>>     }
>> }
>> private static class JavaBuiltInRTFHandler implements  
>> DocumentHandler{
>>
>> public Document getDocument(File file) throws  
>> DocumentHandlerException {
>> String bodyText = null;
>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>> try {
>> InputStream inputStream = new FileInputStream(file);
>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>> } catch (IOException ioex) {
>> throw new IllegalStateException(ioex);
>> } catch (BadLocationException e) {
>> throw new IllegalArgumentException(e);
>> }
>> //create Document object using body
>> if (bodyText != null) {
>> Document document = new Document();
>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>> Field field = new  
>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>> Field.Store.YES, Field.Index.ANALYZED);
>> document.add(field);
>>
>> String pathToFile = file.getPath();
>> Field pathToFileField = new  
>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(pathToFileField);
>>
>> String fileName = file.getName();
>> Field fileNameField = new  
>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(fileNameField);
>>
>> Field typeField = new  
>> Field 
>> (FieldNameEnum 
>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(typeField);
>>
>> String summary = bodyText.substring(0, 10);
>>
>> Field summaryField = new  
>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(summaryField);
>>
>> return document;
>> }
>> return null;
>> }
>> }
>>
>> private static class WorkItem {
>>
>> public enum WorkItemEvent {
>> ADD,
>> UPDATE,
>> DELETE;
>> }
>>
>> public enum IndexerType {
>> RTF_INDEXER,
>> PDF_INDEXER,
>> XML_INDEXER,
>> PLAIN_TEXT_INDEXER,
>> MS_WORD_INDEXER,
>> MS_EXCEL_INDEXER,
>> MS_POWERPOINT_INDEXER;
>> }
>>
>>
>> private final Document workLoad;
>>
>> private final WorkItemEvent workItemEvent;
>>
>> private final IndexerType indexerType;
>>
>>
>> public WorkItem(final Document workLoad, final WorkItemEvent  
>> workItemEvent) {
>> this.workLoad = workLoad;
>> this.workItemEvent = workItemEvent;
>> String type = this.workLoad.get("type");
>> this.indexerType = IndexerType.valueOf(type);
>> }
>>
>> public IndexerType getIndexerType() {
>> return indexerType;
>> }
>>
>> public Document getWorkLoad() {
>> return workLoad;
>> }
>>
>> public WorkItemEvent getWorkItemEvent() {
>> return workItemEvent;
>> }
>> }
>>
>> private enum FieldNameEnum {
>>
>> AUTHOR("author"),
>> BODY("body"),
>> TITLE("title"),
>> SUBJECT("subject"),
>> KEYWORDS("keywords"),
>> PATH("path"), NAME ("name"),
>> TYPE("type"),
>> ID ("id"),
>> SUMMARY ("summary");
>>
>> private final String description;
>>
>> private FieldNameEnum(final String description) {
>> this.description = description;
>> }
>>
>> public String getDescription() {
>> return this.description;
>> }
>> }
>> }
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>

--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ











Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message