lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Jack Krupansky" <j...@basetechnology.com>
Subject Re: EnglishAnalyzer vs WhiteSpaceAnalyzer in getting Term Frequency
Date Thu, 07 Aug 2014 11:40:48 GMT
Generally, the standard analyzer will be a better choice, unless you have 
some special need.

A language-specific analyzer will include stemming. The English analyzer 
includes the Porter stemmer.

Generally, you need to apply a compatible analyzer to query terms to match 
the index, or you need to manually filter your query terms. Sounds like 
maybe a term got stemmed.

-- Jack Krupansky

-----Original Message----- 
From: Bianca Pereira
Sent: Thursday, August 7, 2014 7:28 AM
To: java-user@lucene.apache.org
Subject: EnglishAnalyzer vs WhiteSpaceAnalyzer in getting Term Frequency

Hi,

  I am new to the list and have been working on a problem for some time.
I would like to know if anyone has an idea of how I can solve it.

Given a term, I want to get the term frequency in a lucene document. When
I use the WhiteSpaceAnalyzer my code works properly but when I use the
EnglishAnalyzer it returns 0 as frequency for any term.

  To match a term whether it appears as "term" or "term," in the text,
I suppose the EnglishAnalyzer is the best one to use.

  Any help is more than welcome.

  Best Regards,
  Bianca

----------------------------
  Here is my code:

TO INDEX

/**
 * Writes (id, description) pairs into a Lucene index through a single {@link IndexWriter}.
 *
 * <p>The description text is tokenized by the analyzer handed to the constructor. Anything
 * that later looks terms up in the "description" field therefore has to present them in the
 * form that analyzer produces.
 */
public class LuceneDescriptionIndexer implements Closeable {

  private IndexWriter descWriter;

  /**
   * Opens a writer on the given directory.
   *
   * @param luceneDirectory directory that holds (or will hold) the index
   * @param analyzer analyzer applied to tokenized fields at index time
   * @throws IOException if the writer cannot be opened
   */
  public LuceneDescriptionIndexer(Directory luceneDirectory, Analyzer analyzer)
      throws IOException {
    openIndex(luceneDirectory, analyzer);
  }

  /** Creates the {@link IndexWriter} configured with {@code LuceneConfig.INDEX_VERSION}. */
  private void openIndex(Directory directory, Analyzer analyzer) throws IOException {
    IndexWriterConfig descIwc = new IndexWriterConfig(LuceneConfig.INDEX_VERSION, analyzer);
    descWriter = new IndexWriter(directory, descIwc);
  }

  /**
   * Adds one document with an untokenized, stored "id" field and an analyzed, stored
   * "description" field.
   *
   * @param id value of the "id" field
   * @param text value of the "description" field
   * @throws IOException if the document cannot be added
   */
  public void indexDocument(String id, String text) throws IOException {
    Document doc = new Document();
    doc.add(new StringField("id", id, Field.Store.YES));
    doc.add(new Field("description", text, descriptionFieldType()));
    descWriter.addDocument(doc);
  }

  /** Builds the field type used for the "description" field. */
  private static FieldType descriptionFieldType() {
    FieldType fieldType = new FieldType();
    fieldType.setStoreTermVectors(true);
    // NOTE(review): term-vector positions are requested while the postings only keep
    // documents and frequencies - confirm this combination is intended.
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setIndexed(true);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    fieldType.setStored(true);
    return fieldType;
  }

  /**
   * Commits pending documents and closes the writer.
   *
   * <p>The writer is closed even when the commit fails, so it is not left open after an
   * error.
   *
   * @throws IOException if committing or closing fails
   */
  @Override
  public void close() throws IOException {
    try {
      descWriter.commit();
    } finally {
      descWriter.close();
    }
  }
}


TO QUERY

public class LuceneTermStatistics implements TermKBStatistics {


private IndexReader luceneIndexReader;

private Analyzer analyzer;

private IndexSearcher searcher;


public LuceneTermStatistics(IndexReader reader, Analyzer analyzer) {

  this.luceneIndexReader = reader;

  this.analyzer = analyzer;

  this.searcher = new IndexSearcher(reader);

}

/**

* Create an instance of LuceneTermStatistics from the Config options.

*/

public static LuceneTermStatistics configureInstance(String indexPath,
Analyzer analyzer)

  throws IOException {

  FSDirectory index = FSDirectory.open(new File(indexPath));

  DirectoryReader indexReader = DirectoryReader.open(index);

  return new LuceneTermStatistics(indexReader, analyzer);

}

@Override

public int getTermFrequency(String term, String id)

throws Exception {

   int docId = getDocId(id);

   // Get the vector with the frequency for the term in all documents

  DocsEnum de = MultiFields.getTermDocsEnum(

       luceneIndexReader, MultiFields.getLiveDocs(luceneIndexReader),
"description",

       new BytesRef(term));

   // Get the frequency for the document of interest

  if (de != null) {

      int docNo;

      while((docNo = de.nextDoc()) != DocsEnum.NO_MORE_DOCS) {

         if(docNo == docId)

           return de.freq();

       }

  }

  return 0;

}


private int getDocId(String id) throws IOException {

  BooleanQuery idQuery = new BooleanQuery();

  idQuery.add(new TermQuery(new Term("id", id)), Occur.MUST);


  TopScoreDocCollector collector = TopScoreDocCollector.create(1, false);

  searcher.search(idQuery, collector);

   TopDocs topDocs = collector.topDocs();

  if (topDocs.totalHits == 0)

    return -1;

   return topDocs.scoreDocs[0].doc;

}

} 


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message