cutting 2002/11/07 09:31:27 Modified: . CHANGES.txt build.xml default.properties src/java/org/apache/lucene/document Field.java src/java/org/apache/lucene/index DocumentWriter.java IndexWriter.java src/java/org/apache/lucene/search BooleanQuery.java BooleanScorer.java ExactPhraseScorer.java MultiTermQuery.java PhrasePrefixQuery.java PhraseQuery.java PhraseScorer.java PrefixQuery.java Query.java RangeQuery.java Scorer.java Searcher.java Similarity.java SloppyPhraseScorer.java TermQuery.java TermScorer.java src/test/org/apache/lucene/index DocTest.java src/test/org/apache/lucene/search TestDocBoost.java Added: src/java/org/apache/lucene/analysis/ru package.html src/java/org/apache/lucene/search DefaultSimilarity.java src/test/org/apache/lucene/search TestSimilarity.java Log: Added a public, extensible scoring API. Revision Changes Path 1.34 +4 -1 jakarta-lucene/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v retrieving revision 1.33 retrieving revision 1.34 diff -u -r1.33 -r1.34 --- CHANGES.txt 16 Sep 2002 04:11:36 -0000 1.33 +++ CHANGES.txt 7 Nov 2002 17:31:25 -0000 1.34 @@ -93,6 +93,9 @@ 17. Added Russian Analyzer. (Boris Okner via otis) + 18. Added a public, extensible scoring API. For details, see the + javadoc for org.apache.lucene.search.Similarity. 
+ 1.2 RC6 1.35 +15 -17 jakarta-lucene/build.xml Index: build.xml =================================================================== RCS file: /home/cvs/jakarta-lucene/build.xml,v retrieving revision 1.34 retrieving revision 1.35 diff -u -r1.34 -r1.35 --- build.xml 19 Oct 2002 16:08:59 -0000 1.34 +++ build.xml 7 Nov 2002 17:31:25 -0000 1.35 @@ -12,14 +12,21 @@ - - - + + + + + + + + + + @@ -245,7 +252,7 @@ includes="**/*.java" destdir="${build.demo.classes}" debug="${debug}"> - + @@ -255,23 +262,14 @@ - - - - - - - - - - + @@ -295,7 +293,7 @@ includes="**/*.java" destdir="${junit.classes}" debug="${debug}"> - + @@ -565,7 +563,7 @@ - + 1.12 +0 -1 jakarta-lucene/default.properties Index: default.properties =================================================================== RCS file: /home/cvs/jakarta-lucene/default.properties,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- default.properties 19 Oct 2002 16:05:12 -0000 1.11 +++ default.properties 7 Nov 2002 17:31:25 -0000 1.12 @@ -50,7 +50,6 @@ build.docweb.war.name = lucenedocweb build.test = ${build.dir}/test -build.test.src = ${build.test}/src build.test.classes = ${build.test}/classes junit.src = ${basedir}/src/test 1.1 jakarta-lucene/src/java/org/apache/lucene/analysis/ru/package.html Index: package.html =================================================================== Support for indexing and searching Russian text. 1.8 +2 -2 jakarta-lucene/src/java/org/apache/lucene/document/Field.java Index: Field.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- Field.java 29 Jul 2002 19:11:14 -0000 1.7 +++ Field.java 7 Nov 2002 17:31:26 -0000 1.8 @@ -85,13 +85,13 @@ *

The boost is multiplied by {@link Document#getBoost()} of the document * containing this field. If a document has multiple fields with the same * name, all such values are multiplied together. This product is then - * multipled by the value {@link Similarity#normalizeLength(int)}, and + * multipled by the value {@link Similarity#lengthNorm(String,int)}, and * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * * @see Document#setBoost(float) - * @see Similarity#normalizeLength(int) + * @see Similarity#lengthNorm(String, int) * @see Similarity#encodeNorm(float) */ public void setBoost(float boost) { 1.5 +10 -7 jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java Index: DocumentWriter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- DocumentWriter.java 7 Nov 2002 05:55:39 -0000 1.4 +++ DocumentWriter.java 7 Nov 2002 17:31:26 -0000 1.5 @@ -73,13 +73,16 @@ final class DocumentWriter { private Analyzer analyzer; private Directory directory; + private Similarity similarity; private FieldInfos fieldInfos; private int maxFieldLength; - - DocumentWriter(Directory d, Analyzer a, int mfl) { - directory = d; - analyzer = a; - maxFieldLength = mfl; + + DocumentWriter(Directory directory, Analyzer analyzer, + Similarity similarity, int maxFieldLength) { + this.directory = directory; + this.analyzer = analyzer; + this.similarity = similarity; + this.maxFieldLength = maxFieldLength; } final void addDocument(String segment, Document doc) @@ -320,10 +323,10 @@ if (field.isIndexed()) { int n = fieldInfos.fieldNumber(field.name()); float norm = - fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]); + fieldBoosts[n] * 
similarity.lengthNorm(field.name(),fieldLengths[n]); OutputStream norms = directory.createFile(segment + ".f" + n); try { - norms.writeByte(Similarity.encodeNorm(norm)); + norms.writeByte(similarity.encodeNorm(norm)); } finally { norms.close(); } 1.10 +20 -11 jakarta-lucene/src/java/org/apache/lucene/index/IndexWriter.java Index: IndexWriter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexWriter.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- IndexWriter.java 7 Nov 2002 05:55:39 -0000 1.9 +++ IndexWriter.java 7 Nov 2002 17:31:26 -0000 1.10 @@ -68,6 +68,8 @@ import org.apache.lucene.search.Similarity; import org.apache.lucene.document.Document; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.Similarity; + /** An IndexWriter creates and maintains an index. @@ -89,12 +91,28 @@ private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text + private Similarity similarity = Similarity.getDefault(); // how to normalize + private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private final Directory ramDirectory = new RAMDirectory(); // for temp segs private Lock writeLock; - private Similarity similarity; + /** Expert: Set the Similarity implementation used by this IndexWriter. + * + * @see Similarity#setDefault(Similarity) + */ + public void setSimilarity(Similarity similarity) { + this.similarity = similarity; + } + + /** Expert: Return the Similarity implementation used by this IndexWriter. + * + *

This defaults to the current value of {@link Similarity#getDefault()}. + */ + public Similarity getSimilarity() { + return this.similarity; + } /** Constructs an IndexWriter for the index in path. Text will be analyzed with a. If create is true, then a @@ -186,7 +204,7 @@ /** Adds a document to this index.*/ public void addDocument(Document doc) throws IOException { DocumentWriter dw = - new DocumentWriter(ramDirectory, analyzer, maxFieldLength); + new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength); String segmentName = newSegmentName(); dw.addDocument(segmentName, doc); synchronized (this) { @@ -406,14 +424,5 @@ output.close(); } directory.renameFile("deleteable.new", "deletable"); - } - - /** - * Sets the Similarity implementation to use. - * - * @param sim an instance of a class that implements Similarityterm. */ public MultiTermQuery(Term term) { this.term = term; - this.query = query; } /** Set the TermEnum to be used */ @@ -105,8 +104,9 @@ } } - final Scorer scorer(IndexReader reader) throws IOException { - return getQuery().scorer(reader); + final Scorer scorer(IndexReader reader, Similarity similarity) + throws IOException { + return getQuery().scorer(reader, similarity); } private final BooleanQuery getQuery() throws IOException { 1.3 +7 -5 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java Index: PhrasePrefixQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- PhrasePrefixQuery.java 29 Jul 2002 19:11:15 -0000 1.2 +++ PhrasePrefixQuery.java 7 Nov 2002 17:31:26 -0000 1.3 @@ -147,7 +147,7 @@ _termArrays.add(terms); } - Scorer scorer(IndexReader reader) + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { if (_termArrays.size() == 0) // optimize zero-term case @@ -161,7 +161,7 @@ for (int i=0; i 0.0) { - 
float score = Similarity.tf(freq)*weight; // compute score + float score = similarity.tf(freq)*weight; // compute score score *= Similarity.decodeNorm(norms[first.doc]); // normalize results.collect(first.doc, score); // add to results } 1.4 +2 -2 jakarta-lucene/src/java/org/apache/lucene/search/PrefixQuery.java Index: PrefixQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PrefixQuery.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- PrefixQuery.java 17 Jul 2002 17:38:04 -0000 1.3 +++ PrefixQuery.java 7 Nov 2002 17:31:26 -0000 1.4 @@ -90,8 +90,8 @@ } } - Scorer scorer(IndexReader reader) throws IOException { - return getQuery().scorer(reader); + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { + return getQuery().scorer(reader, similarity); } private BooleanQuery getQuery() throws IOException { 1.8 +9 -8 jakarta-lucene/src/java/org/apache/lucene/search/Query.java Index: Query.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Query.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- Query.java 7 Nov 2002 05:55:40 -0000 1.7 +++ Query.java 7 Nov 2002 17:31:26 -0000 1.8 @@ -86,18 +86,19 @@ abstract void normalize(float norm); // query evaluation - abstract Scorer scorer(IndexReader reader) throws IOException; + abstract Scorer scorer(IndexReader reader, Similarity similarity) + throws IOException; void prepare(IndexReader reader) {} static Scorer scorer(Query query, Searcher searcher, IndexReader reader) - throws IOException - { - query.prepare(reader); - float sum = query.sumOfSquaredWeights(searcher); - float norm = 1.0f / (float)Math.sqrt(sum); - query.normalize(norm); - return query.scorer(reader); + throws IOException { + Similarity similarity = searcher.getSimilarity(); + 
query.prepare(reader); + float sum = query.sumOfSquaredWeights(searcher); + float norm = similarity.queryNorm(sum); + query.normalize(norm); + return query.scorer(reader, similarity); } /** 1.5 +2 -2 jakarta-lucene/src/java/org/apache/lucene/search/RangeQuery.java Index: RangeQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/RangeQuery.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- RangeQuery.java 17 Jul 2002 17:38:04 -0000 1.4 +++ RangeQuery.java 7 Nov 2002 17:31:26 -0000 1.5 @@ -113,9 +113,9 @@ } } - Scorer scorer(IndexReader reader) throws IOException + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { - return getQuery().scorer(reader); + return getQuery().scorer(reader, similarity); } private BooleanQuery getQuery() throws IOException 1.2 +10 -0 jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java Index: Scorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- Scorer.java 18 Sep 2001 16:29:58 -0000 1.1 +++ Scorer.java 7 Nov 2002 17:31:26 -0000 1.2 @@ -57,5 +57,15 @@ import java.io.IOException; abstract class Scorer { + private Similarity similarity; + + protected Scorer(Similarity similarity) { + this.similarity = similarity; + } + + public Similarity getSimilarity() { + return this.similarity; + } + abstract void score(HitCollector hc, int maxDoc) throws IOException; } 1.7 +15 -8 jakarta-lucene/src/java/org/apache/lucene/search/Searcher.java Index: Searcher.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Searcher.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- Searcher.java 7 Nov 2002 05:55:40 -0000 
1.6 +++ Searcher.java 7 Nov 2002 17:31:26 -0000 1.7 @@ -63,9 +63,6 @@ * Implements some common utility methods. */ public abstract class Searcher implements Searchable { - - protected Similarity similarity; - /** Returns the documents matching query. */ public final Hits search(Query query) throws IOException { return search(query, (Filter)null); @@ -91,12 +88,22 @@ search(query, (Filter)null, results); } - /** - * Sets the Similarity implementation to use. + /** The Similarity implementation used by this searcher. */ + private Similarity similarity = Similarity.getDefault(); + + /** Expert: Set the Similarity implementation used by this Searcher. + * + * @see Similarity#setDefault(Similarity) + */ + public void setSimilarity(Similarity similarity) { + this.similarity = similarity; + } + + /** Expert: Return the Similarity implementation used by this Searcher. * - * @param sim an instance of a class that implements SimilarityThis defaults to the current value of {@link Similarity#getDefault()}. */ - public void setSimilarity(Similarity sim) { - similarity = sim; + public Similarity getSimilarity() { + return this.similarity; } } 1.4 +201 -39 jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java Index: Similarity.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- Similarity.java 7 Nov 2002 05:55:40 -0000 1.3 +++ Similarity.java 7 Nov 2002 17:31:26 -0000 1.4 @@ -55,14 +55,73 @@ */ import java.io.IOException; +import java.util.Vector; import org.apache.lucene.index.Term; +import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; -/** Internal class used for scoring. - *

Public only so that the indexing code can compute and store the - * normalization byte for each document. */ +/** Expert: Scoring API. + *

Subclasses implement search scoring. + * + *

The score of query q for document d is defined + * in terms of these methods as follows: + * + * + * + * + * + * + * + * + * + * + * + *
score(q,d) =
+ * Σ + * {@link #tf(int) tf}(t in d) * + * {@link #idf(Term,Searcher) idf}(t) * + * {@link Field#getBoost getBoost}(t.field in d) * + * {@link #lengthNorm(String,int) lengthNorm}(t.field in d) + *  * + * {@link #coord(int,int) coord}(q,d) * + * {@link #queryNorm(float) queryNorm}(q) + *
+ * t in q + *
+ * + * @see #setDefault(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + * @see Searcher#setSimilarity(Similarity) + */ public abstract class Similarity { + /** The Similarity implementation used by default. */ + private static Similarity defaultImpl = new DefaultSimilarity(); + /** Set the default Similarity implementation used by indexing and search + * code. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + public static void setDefault(Similarity similarity) { + Similarity.defaultImpl = similarity; + } + + /** Return the default Similarity implementation used by indexing and search + * code. + * + *

This is initially an instance of {@link DefaultSimilarity}. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + public static Similarity getDefault() { + return Similarity.defaultImpl; + } + + /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; static { @@ -70,37 +129,47 @@ NORM_TABLE[i] = byteToFloat((byte)i); } - private static Similarity similarity; - - private Similarity() {} // no public constructor - - /** - * Sets the Similarity implementation to use. - * - * @param sim an instance of a class that implements SimilarityMatches in longer fields are less precise, so implemenations of this + * method usually return smaller values when numTokens is large, + * and larger values when numTokens is small. + * + *

Note that these values are computed under {@link + * IndexWriter#addDocument(Document)} and then stored using + * {@link #encodeNorm(float)}. Thus they have limited precision, and documents + * must be re-indexed if this method is altered. * - *

The formula used is: 1.0f / Math.sqrt(numTerms) + * @param fieldName the name of the field + * @param numTokens the total number of tokens contained in fields named + * fieldName of doc. + * @return a normalization factor for hits on this field of this document * * @see Field#setBoost(float) */ - public static float normalizeLength(int numTerms) { - return (float)(1.0 / Math.sqrt(numTerms)); - } - - /** Decodes a normalization factor stored in an index. - * @see #encodeNorm(float) + public abstract float lengthNorm(String fieldName, int numTokens); + + /** Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multiplied into the + * weight of each query term. + * + *

This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * @param sumOfSquaredWeights the sum of the squares of query term weights + * @return a normalization factor for query weights */ - public static float decodeNorm(byte b) { - return NORM_TABLE[b & 0xFF]; - } + public abstract float queryNorm(float sumOfSquaredWeights); /** Encodes a normalization factor for storage in an index. * @@ -151,25 +220,118 @@ return (byte)((exponent << 3) | mantissa); // pack into a byte } - static final float tf(int freq) { - return (float)Math.sqrt(freq); - } - static final float tf(float freq) { - return (float)Math.sqrt(freq); + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + *

The default implementation calls {@link #tf(float)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public float tf(int freq) { + return tf((float)freq); } + + /** Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to {@link #tf(float)}. + * + *

A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * @see PhraseQuery#setSlop(int) + * @param distance the edit distance of this sloppy phrase match + * @return the frequency increment for this match + */ + public abstract float sloppyFreq(int distance); + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public abstract float tf(float freq); - static final float idf(Term term, Searcher searcher) throws IOException { - // Use maxDoc() instead of numDocs() because its proportional to docFreq(), - // i.e., when one is inaccurate, so is the other, and in the same way. + /** Computes a score factor for a simple term. + * + *

The default implementation is:

  +   *   return idf(searcher.docFreq(term), searcher.maxDoc());
  +   * 
+ * + * Note that {@link Searcher#maxDoc()} is used instead of {@link + * IndexReader#numDocs()} because it is proportional to {@link + * Searcher#docFreq(Term)} , i.e., when one is inaccurate, so is the other, + * and in the same direction. + * + * @param term the term in question + * @param searcher the document collection being searched + * @return a score factor for the term + */ + public float idf(Term term, Searcher searcher) throws IOException { return idf(searcher.docFreq(term), searcher.maxDoc()); } - static final float idf(int docFreq, int numDocs) { - return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); + /** Computes a score factor for a phrase. + * + *

The default implementation sums the {@link #idf(Term,Searcher)} factor + * for each term in the phrase. + * + * @param terms the vector of terms in the phrase + * @param searcher the document collection being searched + * @return a score factor for the phrase + */ + public float idf(Vector terms, Searcher searcher) throws IOException { + float idf = 0.0f; + for (int i = 0; i < terms.size(); i++) { + idf += idf((Term)terms.elementAt(i), searcher); + } + return idf; } + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + *

Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + protected abstract float idf(int docFreq, int numDocs); - static final float coord(int overlap, int maxOverlap) { - return overlap / (float)maxOverlap; - } + /** Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + *

The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * @param overlap the number of query terms matched in the document + * @param maxOverlap the total number of terms in the query + * @return a score factor based on term overlap with the query + */ + public abstract float coord(int overlap, int maxOverlap); } 1.2 +5 -5 jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Index: SloppyPhraseScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- SloppyPhraseScorer.java 18 Sep 2001 16:29:58 -0000 1.1 +++ SloppyPhraseScorer.java 7 Nov 2002 17:31:26 -0000 1.2 @@ -62,10 +62,10 @@ final class SloppyPhraseScorer extends PhraseScorer { private int slop; - SloppyPhraseScorer(TermPositions[] tps, int s, byte[] n, float w) - throws IOException { - super(tps, n, w); - slop = s; + SloppyPhraseScorer(TermPositions[] tps, Similarity similarity, + int slop, byte[] norms, float weight) throws IOException { + super(tps, similarity, norms, weight); + this.slop = slop; } protected final float phraseFreq() throws IOException { @@ -94,7 +94,7 @@ int matchLength = end - start; if (matchLength <= slop) - freq += 1.0 / (matchLength + 1); // penalize longer matches + freq += getSimilarity().sloppyFreq(matchLength); // score match if (pp.position > end) end = pp.position; 1.4 +4 -3 jakarta-lucene/src/java/org/apache/lucene/search/TermQuery.java Index: TermQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermQuery.java,v retrieving revision 1.3 retrieving revision 
1.4 diff -u -r1.3 -r1.4 --- TermQuery.java 17 Jul 2002 17:38:04 -0000 1.3 +++ TermQuery.java 7 Nov 2002 17:31:26 -0000 1.4 @@ -73,7 +73,7 @@ } final float sumOfSquaredWeights(Searcher searcher) throws IOException { - idf = Similarity.idf(term, searcher); + idf = searcher.getSimilarity().idf(term, searcher); weight = idf * boost; return weight * weight; // square term weights } @@ -83,14 +83,15 @@ weight *= idf; // factor from document } - Scorer scorer(IndexReader reader) + Scorer scorer(IndexReader reader, Similarity similarity) throws IOException { TermDocs termDocs = reader.termDocs(term); if (termDocs == null) return null; - return new TermScorer(termDocs, reader.norms(term.field()), weight); + return new TermScorer(termDocs, similarity, + reader.norms(term.field()), weight); } /** Prints a user-readable version of this query. */ 1.3 +11 -8 jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java Index: TermScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- TermScorer.java 29 Jul 2002 19:11:15 -0000 1.2 +++ TermScorer.java 7 Nov 2002 17:31:26 -0000 1.3 @@ -63,21 +63,23 @@ private float weight; private int doc; - private final int[] docs = new int[128]; // buffered doc numbers - private final int[] freqs = new int[128]; // buffered term freqs + private final int[] docs = new int[32]; // buffered doc numbers + private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; - TermScorer(TermDocs td, byte[] n, float w) throws IOException { - termDocs = td; - norms = n; - weight = w; + TermScorer(TermDocs td, Similarity similarity, byte[] norms, float weight) + throws IOException { + super(similarity); + this.termDocs = td; + 
this.norms = norms; + this.weight = weight; for (int i = 0; i < SCORE_CACHE_SIZE; i++) - scoreCache[i] = Similarity.tf(i) * weight; + scoreCache[i] = getSimilarity().tf(i) * weight; pointerMax = termDocs.read(docs, freqs); // fill buffers @@ -91,12 +93,13 @@ final void score(HitCollector c, final int end) throws IOException { int d = doc; // cache doc in local + Similarity similarity = getSimilarity(); // cache sim in local while (d < end) { // for docs in window final int f = freqs[pointer]; float score = // compute tf(f)*weight f < SCORE_CACHE_SIZE // check cache ? scoreCache[f] // cache hit - : Similarity.tf(f)*weight; // cache miss + : similarity.tf(f)*weight; // cache miss score *= Similarity.decodeNorm(norms[d]); // normalize for field 1.1 jakarta-lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java Index: DefaultSimilarity.java =================================================================== package org.apache.lucene.search; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." 
* Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * . */ import org.apache.lucene.document.Document; /** Expert: Default scoring implementation. */ public class DefaultSimilarity extends Similarity { /** Implemented as 1/sqrt(numTerms). */ public float lengthNorm(String fieldName, int numTerms) { return (float)(1.0 / Math.sqrt(numTerms)); } /** Implemented as 1/sqrt(sumOfSquaredWeights). 
*/ public float queryNorm(float sumOfSquaredWeights) { return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); } /** Implemented as sqrt(freq). */ public float tf(float freq) { return (float)Math.sqrt(freq); } /** Implemented as 1 / (distance + 1). */ public float sloppyFreq(int distance) { return 1.0f / (distance + 1); } /** Implemented as log(numDocs/(docFreq+1)) + 1. */ public float idf(int docFreq, int numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } /** Implemented as overlap / maxOverlap. */ public float coord(int overlap, int maxOverlap) { return overlap / (float)maxOverlap; } } 1.4 +3 -1 jakarta-lucene/src/test/org/apache/lucene/index/DocTest.java Index: DocTest.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/index/DocTest.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- DocTest.java 26 Jan 2002 15:01:32 -0000 1.3 +++ DocTest.java 7 Nov 2002 17:31:27 -0000 1.4 @@ -59,6 +59,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.document.Document; +import org.apache.lucene.search.Similarity; import org.apache.lucene.demo.FileDocument; import java.io.File; @@ -95,7 +96,8 @@ throws Exception { Directory directory = FSDirectory.getDirectory("test", false); Analyzer analyzer = new SimpleAnalyzer(); - DocumentWriter writer = new DocumentWriter(directory, analyzer, 1000); + DocumentWriter writer = + new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000); File file = new File(fileName); Document doc = FileDocument.Document(file); 1.2 +2 -2 jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java Index: TestDocBoost.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 
--- TestDocBoost.java 29 Jul 2002 19:11:15 -0000 1.1 +++ TestDocBoost.java 7 Nov 2002 17:31:27 -0000 1.2 @@ -76,7 +76,7 @@ super(name); } - public static void test() throws Exception { + public void testDocBoost() throws Exception { RAMDirectory store = new RAMDirectory(); IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true); 1.1 jakarta-lucene/src/test/org/apache/lucene/search/TestSimilarity.java Index: TestSimilarity.java =================================================================== package org.apache.lucene.search; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. 
Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Query; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import junit.framework.TestCase; import java.util.Vector; /** Similarity unit test. 
* * @author Doug Cutting * @version $Revision: 1.1 $ */
public class TestSimilarity extends TestCase {

  public TestSimilarity(String name) {
    super(name);
  }

  /**
   * Similarity whose factors are all constants, except tf(), which returns
   * the raw frequency unchanged.  The assertions in testSimilarity() compare
   * scores against exact values, so any change here must be mirrored there.
   */
  public static class SimpleSimilarity extends Similarity {
    public float lengthNorm(String field, int numTerms) { return 1.0f; }
    public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
    public float tf(float freq) { return freq; }
    public float sloppyFreq(int distance) { return 2.0f; }
    public float idf(Vector terms, Searcher searcher) { return 1.0f; }
    public float idf(int docFreq, int numDocs) { return 1.0f; }
    public float coord(int overlap, int maxOverlap) { return 1.0f; }
  }

  /**
   * Indexes the two documents "a c" and "a b c" with SimpleSimilarity
   * installed on the writer, then runs a term query, a boolean query, an
   * exact phrase query and a sloppy phrase query with SimpleSimilarity
   * installed on the searcher, asserting the exact score of every hit.
   *
   * @throws Exception if indexing or searching fails
   */
  public void testSimilarity() throws Exception {
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
    writer.setSimilarity(new SimpleSimilarity());

    Document d1 = new Document();
    d1.add(Field.Text("field", "a c"));

    Document d2 = new Document();
    d2.add(Field.Text("field", "a b c"));

    writer.addDocument(d1);
    writer.addDocument(d2);
    writer.optimize();
    writer.close();

    Searcher searcher = new IndexSearcher(store);
    try {
      searcher.setSimilarity(new SimpleSimilarity());

      Term a = new Term("field", "a");
      Term b = new Term("field", "b");
      Term c = new Term("field", "c");

      // Single term: every hit must score exactly 1.
      searcher.search
        (new TermQuery(b),
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertTrue(score == 1.0f);
           }
         });

      // Two optional clauses: the expected score is doc + 1.
      // NOTE(review): this relies on d1 and d2 receiving document numbers
      // 0 and 1 in insertion order -- confirm if the indexing above changes.
      BooleanQuery bq = new BooleanQuery();
      bq.add(new TermQuery(a), false, false);
      bq.add(new TermQuery(b), false, false);
      searcher.search
        (bq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertTrue(score == (float)doc+1);
           }
         });

      // Exact phrase "a c": every hit must score exactly 1.
      PhraseQuery pq = new PhraseQuery();
      pq.add(a);
      pq.add(c);
      searcher.search
        (pq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertTrue(score == 1.0f);
           }
         });

      // Same phrase with slop 2: every hit must score exactly 2, the
      // constant that SimpleSimilarity.sloppyFreq() returns.
      pq.setSlop(2);
      searcher.search
        (pq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertTrue(score == 2.0f);
           }
         });
    } finally {
      // Release the searcher even when an assertion above fails.
      searcher.close();
    }
  }
}
-- To unsubscribe, e-mail: For additional commands, e-mail: