lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Liaqat Ali <liaqatalim...@gmail.com>
Subject Re: StopWords problem
Date Thu, 27 Dec 2007 08:22:28 GMT
Doron Cohen wrote:
> Hi Liaqat,
>
> This part of the code seems correct and should work, so the problem
> must be elsewhere.
>
> Can you post a short program that demonstrates the problem?
>
> You can start with something like this:
>       Document doc = new Document();
>       doc.add(new Field("text",URDU_STOP_WORDS[0] +
>                   " regular text",Store.YES, Index.TOKENIZED));
>       indexWriter.addDocument(doc);
>
> Now URDU_STOP_WORDS[0] should not appear within the index terms.
> You can easily verify this by iterating IndexReader.terms();
>
> Regards, Doron
>
> On Dec 27, 2007 9:36 AM, Liaqat Ali <liaqatalimian@gmail.com> wrote:
>
>   
>> Hi, Grant
>>
>> I think I did not make myself clear. I am trying to pass a list of Urdu
>> stop words as an argument to the StandardAnalyzer. But it does not work
>> well for me.
>>
>> public static final String[] URDU_STOP_WORDS = { "کی" ,"کا" ,"کو" ,"ہے"
>> ,"کے" ,"نے" ,"پر" ,"اور" ,"سے","میں" ,"بھی"
>> ,"ان" ,"ایک" ,"تھا" ,"تھی" ,"کیا" ,"ہیں" ,"کر" ,"وہ" ,"جس"
,"نہں" ,"تک" };
>> Analyzer analyzer = new StandardAnalyzer(URDU_STOP_WORDS);
>>
>>
>> Kindly give some guidelines.
>>
>> Regards,
>> Liaqat
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-user-help@lucene.apache.org
>>
>>
>>     

The whole program is given below. But it does not eliminate stop words 
from the index.


import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.*;


public class urduIndexer1  {

   

    Reader file;
    BufferedReader buff;
    String line;
    IndexWriter writer;
    String indexDir;
    Directory dir;

    public static final String[] URDU_STOP_WORDS = { "کی" ,"کا" ,"کو" 
,"ہے" ,"کے" ,"نے" ,"پر" ,"اور" ,"سے","میں" ,"بھی"
,"ان" ,"ایک" ,"تھا" ,"تھی" ,"کیا" ,"ہیں" ,"کر" ,"وہ" ,"جس" ,"نہں"
,"تک"  };


    public void index() throws IOException,
     UnsupportedEncodingException {

       
        indexDir = "D:\\UIR\\index";
        Analyzer analyzer = new StandardAnalyzer(URDU_STOP_WORDS);
            boolean createFlag = true;

        dir = FSDirectory.getDirectory(indexDir);
        writer = new IndexWriter(dir, analyzer, createFlag);

        for (int i=1;i<=201;i++)  {



            file = new InputStreamReader(new 
FileInputStream("corpus\\doc" + i + ".txt"), "UTF-8");

            StringBuffer sb = new StringBuffer();

            buff = new BufferedReader(file);

            //line = buff.readLine();


            while( (line = buff.readLine()) != null) {
                        sb.append(line);
                }



            boolean eof = false;

                Document document  = new Document();
            document.add(new Field("contents",sb.toString(), 
Field.Store.NO, Field.Index.TOKENIZED));
           

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message