lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Bianca Pereira <aivykar...@gmail.com>
Subject Re: EnglishAnalyzer vs WhiteSpaceAnalyzer in getting Term Frequency
Date Thu, 07 Aug 2014 12:46:45 GMT
Hi Jack,

  Thank you very much. I just switched to the StandardAnalyzer and it is
working as I would like. But there is something I still cannot understand.
If I use the same analyzer for indexing and for searching, the same term
should be parsed in the same way at both times, shouldn't it? That is why I
still don't understand why the EnglishAnalyzer was not working. Any idea
about that?

  Best Regards,
  Bianca


2014-08-07 12:40 GMT+01:00 Jack Krupansky <jack@basetechnology.com>:

> Generally, the standard analyzer will be a better choice, unless you have
> some special need.
>
> A language-specific analyzer will include stemming. The English analyzer
> includes the Porter stemmer.
>
> Generally, you need to apply a compatible analyzer to query terms to match
> the index, or you need to manually filter your query terms. Sounds like
> maybe a term got stemmed.
>
> -- Jack Krupansky
>
> -----Original Message----- From: Bianca Pereira
> Sent: Thursday, August 7, 2014 7:28 AM
> To: java-user@lucene.apache.org
> Subject: EnglishAnalyzer vs WhiteSpaceAnalyzer in getting Term Frequency
>
>
> Hi,
>
>  I am new to the list and I have been working on a problem for some time
> already. I would like to know if someone has any idea of how I can solve
> it.
>
> Given a term, I want to get the term frequency in a Lucene document. When
> I use the WhitespaceAnalyzer my code works properly, but when I use the
> EnglishAnalyzer it returns 0 as the frequency for any term.
>
>  In order to match the term whether it appears as "term" or "term," in the
> text, I suppose the EnglishAnalyzer is the best one to use.
>
>  Any help is more than welcome.
>
>  Best Regards,
>  Bianca
>
> ----------------------------
>  Here is my code:
>
> TO INDEX
>
> public class LuceneDescriptionIndexer implements Closeable {
> // Wraps a Lucene IndexWriter and adds documents made of an "id" field and a "description" field.
> private IndexWriter descWriter;
>
> // Opens the index writer on luceneDirectory with the supplied analyzer (delegates to openIndex).
> public LuceneDescriptionIndexer(Directory luceneDirectory, Analyzer
> analyzer)
>
> throws IOException {
>
>  openIndex(luceneDirectory, analyzer);
>
> }
> // Builds an IndexWriterConfig from LuceneConfig.INDEX_VERSION and the analyzer, then creates descWriter on the directory.
> private void openIndex(Directory directory, Analyzer analyzer) throws
> IOException {
>
>  IndexWriterConfig descIwc = new IndexWriterConfig(LuceneConfig.
> INDEX_VERSION, analyzer);
>
>  descWriter = new IndexWriter(directory, descIwc);
>
> }
> // Adds one document: a stored "id" StringField plus a stored "description" field indexed with frequencies and term vectors.
> public void indexDocument(String id, String text) throws IOException {
>
>    IndexableField idField = new StringField("id",id,Field.Store.YES);
>    // Field type for "description": term vectors with positions, indexed with DOCS_AND_FREQS, and stored.
>     FieldType fieldType = new FieldType();
>
>    fieldType.setStoreTermVectors(true);
>
>    fieldType.setStoreTermVectorPositions(true);
>
>    fieldType.setIndexed(true);
>
>    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
>
>    fieldType.setStored(true);
>
>    // NOTE(review): the description text is presumably tokenized by the analyzer given to the writer config, so indexed terms may differ from the raw words (e.g. after stemming) - confirm.
>
>    Document doc = new Document();
>
>    doc.add(idField);
>
>    doc.add(new Field("description", text, fieldType));
>
>
>
>    descWriter.addDocument(doc);
>
> }
> // Commits pending changes and then closes the writer.
> @Override
>
> public void close() throws IOException {
>
>  descWriter.commit();
>
>  descWriter.close();
>
> }
>
> }
>
>
> TO QUERY
>
> public class LuceneTermStatistics implements TermKBStatistics {
> // Looks up how often a term occurs in the "description" field of a document identified by its "id" field.
>
> private IndexReader luceneIndexReader;
> // NOTE(review): analyzer is stored by the constructor but never used in the code shown here; query terms are not analyzed.
> private Analyzer analyzer;
>
> private IndexSearcher searcher;
>
> // Stores the reader and analyzer and creates an IndexSearcher over the reader.
> public LuceneTermStatistics(IndexReader reader, Analyzer analyzer) {
>
>  this.luceneIndexReader = reader;
>
>  this.analyzer = analyzer;
>
>  this.searcher = new IndexSearcher(reader);
>
> }
>
> /**
>
> * Create an instance of LuceneTermStatistics from the Config options.
>
> */
> // Opens an FSDirectory at indexPath and a DirectoryReader on it. NOTE(review): neither is closed in the code shown.
> public static LuceneTermStatistics configureInstance(String indexPath,
> Analyzer analyzer)
>
>  throws IOException {
>
>  FSDirectory index = FSDirectory.open(new File(indexPath));
>
>  DirectoryReader indexReader = DirectoryReader.open(index);
>
>  return new LuceneTermStatistics(indexReader, analyzer);
>
> }
> // Returns the frequency of term in the "description" field of the document whose "id" equals id, or 0 when no match is found.
> @Override
>
> public int getTermFrequency(String term, String id)
>
> throws Exception {
>
>   int docId = getDocId(id);
>
>   // Get the vector with the frequency for the term in all documents
>
>  DocsEnum de = MultiFields.getTermDocsEnum(
>
>       luceneIndexReader, MultiFields.getLiveDocs(luceneIndexReader),
> "description",
> // NOTE(review): the raw term string is looked up as-is, without the analyzer; if indexing stemmed or otherwise altered the text, these bytes may match no indexed term - confirm.
>       new BytesRef(term));
>
>   // Get the frequency for the document of interest
>
>  if (de != null) {
>
>      int docNo;
>      // Walk the enumerated documents until the target doc number is seen.
>      while((docNo = de.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
>
>         if(docNo == docId)
>
>           return de.freq();
>
>       }
>
>  }
> // Reached when the enum is null or the target document was not among the enumerated documents.
>  return 0;
>
> }
>
> // Returns the doc number of the top hit for a query on the "id" field, or -1 when there are no hits.
> private int getDocId(String id) throws IOException {
>
>  BooleanQuery idQuery = new BooleanQuery();
>
>  idQuery.add(new TermQuery(new Term("id", id)), Occur.MUST);
>
> // The collector is created with 1 as its first argument; only scoreDocs[0] is read below.
>  TopScoreDocCollector collector = TopScoreDocCollector.create(1, false);
>
>  searcher.search(idQuery, collector);
>
>   TopDocs topDocs = collector.topDocs();
>
>  if (topDocs.totalHits == 0)
>
>    return -1;
>
>   return topDocs.scoreDocs[0].doc;
>
> }
>
> }
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message