lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "beatriz ramos" <beatriz.ramos.mor...@gmail.com>
Subject wrong BM25 implementation in Lucene
Date Wed, 25 Oct 2006 15:00:31 GMT
Hello, this is the BM25 algorithm I implemented in Lucene.

It doesn't work: I have compared my results with the results of MG4J
(with the same document set) and they differ.

I don't know if my formula is wrong or if there is another mistake.

Could you help me ?

--------------------------------------------------------------------------------------------------------------------------------

public class BM25Scorer extends Scorer {

    private final static double EPSILON_SCORE = 1.000000082240371E-9;
    private final static double DEFAULT_K1 = 0.75d;
    private final static double DEFAULT_B = 0.95d;
    private double b = DEFAULT_B;
    private double k1 = DEFAULT_K1;

    private IndexReader reader;
    private Term term;
    private Hits hits;
    private int position;   // document position in hits
    private IndexSearcher searcher;

    private int cooc = 0;    // How many times a term appears in the
document
    private float idf;


    public float score() throws IOException {
        TermFreqVector tfv = reader.getTermFreqVector( hits.id(position),
term.field() );

        String[] terms = tfv.getTerms();
        int[] freqs = tfv.getTermFrequencies();
        for (int i = 0 ; i < terms.length ; i++) {
            if( terms[i].equalsIgnoreCase(term.text()) ){
                cooc = freqs[i];
            }
        }

        idf = searcher.getSimilarity().idf(term, searcher);

        Document document = (Document)hits.doc(position);
        String[] values = document.getValues("DOCUMENT_LENGTH");  //
document length is a field of my index

        long docLength = Long.valueOf(values[0]).longValue();  // document
lenght (number of words)
        long averageLength = 200;

        double loga =  Math.max( EPSILON_SCORE, new Float(idf
).doubleValue());
        double score = ( loga * (k1 + 1) * cooc ) / (cooc + k1*( (1-b) +
(b*docLength/averageLength) ) );

        return new Float(score).floatValue();
    }

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message