lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Amin Mohammed-Coleman <ami...@gmail.com>
Subject Re: Search Test file
Date Sun, 04 Jan 2009 07:55:57 GMT
Hi,

Please ignore my last email.  I had just woken up when I wrote it.  After  
looking at the index further in Luke, it looks like the token is being  
stored as "indexed.amin", which is why "amin" wasn't working.  Making the  
changes that you recommended worked.

I will investigate further why the "amin" token is being stored as  
"indexed.amin".


Thanks again for all the help.

Cheers

Amin


On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:

>
>
> Begin forwarded message:
>
>> From: Grant Ingersoll <gsingers@apache.org>
>> Date: January 3, 2009 8:19:14 PM EST
>> To: java-dev@lucene.apache.org
>> Subject: Fwd: Search Test file
>> Reply-To: java-dev@lucene.apache.org
>>
>> Hi Amin,
>>
>> I see a couple of issues with your program below, and one that is  
>> the cause of the problem of not finding "amin" as a query term.
>>
>> When you construct your IndexWriter, you are doing:
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>
>> The MaxFieldLength parameter specifies the maximum number of tokens  
>> allowed in a Field.  Everything else after that is dropped.  See
>> http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
>> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd.  Why are you passing in the max clause  
>> count as the number of results you want returned?
>>
>> Cheers,
>> Grant
>>
>>
>>
>> Begin forwarded message:
>>
>>> From: "aminmc@gmail.com" <aminmc@gmail.com>
>>> Date: January 3, 2009 3:24:52 PM EST
>>> To: gsingers@apache.org
>>> Subject: Search Test file
>>>
>>> I've shared a document with you called "Search Test file":
>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>
>>> It's not an attachment -- it's stored online at Google Docs. To  
>>> open this document, just click the link above.
>>> ---
>>>
>>> Hi
>>>
>>> I have uploaded the test file at google docs. It is currently a  
>>> txt file but if you change the extension to .java it should work.
>>>
>>> package com.amin.app.lucene.search.impl;
>>>
>>> import static org.junit.Assert.assertEquals;
>>> import static org.junit.Assert.assertNotNull;
>>> import static org.junit.Assert.assertNotSame;
>>> import static org.junit.Assert.assertTrue;
>>>
>>> import java.io.File;
>>> import java.io.FileInputStream;
>>> import java.io.FileOutputStream;
>>> import java.io.IOException;
>>> import java.io.InputStream;
>>> import java.io.OutputStream;
>>>
>>> import javax.swing.text.BadLocationException;
>>> import javax.swing.text.DefaultStyledDocument;
>>> import javax.swing.text.rtf.RTFEditorKit;
>>>
>>> import org.apache.commons.lang.StringUtils;
>>> import org.apache.lucene.analysis.Analyzer;
>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>> import org.apache.lucene.ant.DocumentHandler;
>>> import org.apache.lucene.ant.DocumentHandlerException;
>>> import org.apache.lucene.document.Document;
>>> import org.apache.lucene.document.Field;
>>> import org.apache.lucene.index.CorruptIndexException;
>>> import org.apache.lucene.index.IndexReader;
>>> import org.apache.lucene.index.IndexWriter;
>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>> import org.apache.lucene.queryParser.QueryParser;
>>> import org.apache.lucene.search.BooleanQuery;
>>> import org.apache.lucene.search.IndexSearcher;
>>> import org.apache.lucene.search.MultiSearcher;
>>> import org.apache.lucene.search.Query;
>>> import org.apache.lucene.search.ScoreDoc;
>>> import org.apache.lucene.search.Searchable;
>>> import org.apache.lucene.search.TopDocs;
>>> import org.apache.lucene.store.Directory;
>>> import org.apache.lucene.store.FSDirectory;
>>> import org.junit.After;
>>> import org.junit.Before;
>>> import org.junit.Test;
>>>
>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>
>>> public class SearchTest {
>>>
>>> private File rtfFile = null;
>>> private static final String RTF_FILE_NAME =  
>>> "rtfDocumentToIndex.rtf";
>>>
>>> @Before
>>> public void setUp() throws Exception {
>>> InputStream inputStream =  
>>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>> rtfFile = new File(RTF_FILE_NAME);
>>> convertInputStreamToFile(inputStream, rtfFile);
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>>> Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> assertNotNull(document);
>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(value);
>>> assertNotSame("", value);
>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>> assertTrue(value.contains("This is a test rtf document that will  
>>> be indexed."));
>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>> assertNotNull(path);
>>> assertTrue(path.contains(".rtf"));
>>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>>> assertNotNull(fileName);
>>> assertEquals(RTF_FILE_NAME, fileName);
>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanSearchRtfDocument() throws Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>> try {
>>> indexWriter.addDocument(document);
>>> commitAndCloseWriter(indexWriter);
>>> } catch (CorruptIndexException e) {
>>> throw new IllegalStateException(e);
>>> } catch (IOException e) {
>>> throw new IllegalStateException(e);
>>> }
>>>
>>> //I plan to use other searchers later
>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>>> {indexSearcher});
>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>> Query query = queryParser.parse("amin");
>>> TopDocs topDocs = multiSearcher.search(query,  
>>> BooleanQuery.getMaxClauseCount());
>>> assertNotNull(topDocs);
>>> assertEquals(1, topDocs.totalHits);
>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>> assertNotNull(documentFromSearch);
>>> String bodyText =  
>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(bodyText);
>>> assertNotSame("", bodyText);
>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>> assertTrue(bodyText.contains("This is a test rtf document that  
>>> will be indexed."));
>>>
>>> }
>>> multiSearcher.close();
>>>
>>> }
>>>
>>> @After
>>> public void tearDown() throws Exception {
>>> rtfFile.delete();
>>> if (getDirectory().list() != null && getDirectory().list().length  
>>> > 0) {
>>> IndexReader reader = IndexReader.open(getDirectory());
>>> for(int i = 0; i < reader.maxDoc();i++) {
>>> reader.deleteDocument(i);
>>> }
>>> reader.close();
>>> }
>>> }
>>>
>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
>>> CorruptIndexException,IOException {
>>> indexWriter.commit();
>>> indexWriter.close();
>>> }
>>>
>>>
>>> public Directory getDirectory() throws IOException {
>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>> }
>>>
>>> public Analyzer getAnalyzer() {
>>> return new StandardAnalyzer();
>>> }
>>> private static void convertInputStreamToFile(InputStream  
>>> inputStream, File file) {
>>> try
>>>    {
>>>    OutputStream out=new FileOutputStream(file);
>>>    byte buf[]=new byte[1024];
>>>    int len;
>>>    while((len=inputStream.read(buf))>0)
>>>    out.write(buf,0,len);
>>>    out.close();
>>>    inputStream.close();
>>>
>>>    }catch (IOException e){
>>>    throw new IllegalStateException(e);
>>>    }
>>> }
>>> private static class JavaBuiltInRTFHandler implements  
>>> DocumentHandler{
>>>
>>> public Document getDocument(File file) throws  
>>> DocumentHandlerException {
>>> String bodyText = null;
>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>> try {
>>> InputStream inputStream = new FileInputStream(file);
>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>> } catch (IOException ioex) {
>>> throw new IllegalStateException(ioex);
>>> } catch (BadLocationException e) {
>>> throw new IllegalArgumentException(e);
>>> }
>>> //create Document object using body
>>> if (bodyText != null) {
>>> Document document = new Document();
>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>> Field field = new  
>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>>> Field.Store.YES, Field.Index.ANALYZED);
>>> document.add(field);
>>>
>>> String pathToFile = file.getPath();
>>> Field pathToFileField = new  
>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(pathToFileField);
>>>
>>> String fileName = file.getName();
>>> Field fileNameField = new  
>>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(fileNameField);
>>>
>>> Field typeField = new  
>>> Field 
>>> (FieldNameEnum 
>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(typeField);
>>>
>>> String summary = bodyText.substring(0, 10);
>>>
>>> Field summaryField = new  
>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(summaryField);
>>>
>>> return document;
>>> }
>>> return null;
>>> }
>>> }
>>>
>>> private static class WorkItem {
>>>
>>> public enum WorkItemEvent {
>>> ADD,
>>> UPDATE,
>>> DELETE;
>>> }
>>>
>>> public enum IndexerType {
>>> RTF_INDEXER,
>>> PDF_INDEXER,
>>> XML_INDEXER,
>>> PLAIN_TEXT_INDEXER,
>>> MS_WORD_INDEXER,
>>> MS_EXCEL_INDEXER,
>>> MS_POWERPOINT_INDEXER;
>>> }
>>>
>>>
>>> private final Document workLoad;
>>>
>>> private final WorkItemEvent workItemEvent;
>>>
>>> private final IndexerType indexerType;
>>>
>>>
>>> public WorkItem(final Document workLoad, final WorkItemEvent  
>>> workItemEvent) {
>>> this.workLoad = workLoad;
>>> this.workItemEvent = workItemEvent;
>>> String type = this.workLoad.get("type");
>>> this.indexerType = IndexerType.valueOf(type);
>>> }
>>>
>>> public IndexerType getIndexerType() {
>>> return indexerType;
>>> }
>>>
>>> public Document getWorkLoad() {
>>> return workLoad;
>>> }
>>>
>>> public WorkItemEvent getWorkItemEvent() {
>>> return workItemEvent;
>>> }
>>> }
>>>
>>> private enum FieldNameEnum {
>>>
>>> AUTHOR("author"),
>>> BODY("body"),
>>> TITLE("title"),
>>> SUBJECT("subject"),
>>> KEYWORDS("keywords"),
>>> PATH("path"), NAME ("name"),
>>> TYPE("type"),
>>> ID ("id"),
>>> SUMMARY ("summary");
>>>
>>> private final String description;
>>>
>>> private FieldNameEnum(final String description) {
>>> this.description = description;
>>> }
>>>
>>> public String getDescription() {
>>> return this.description;
>>> }
>>> }
>>> }
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message