lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestSimilarity.java TestDocBoost.java
Date Thu, 07 Nov 2002 17:31:27 GMT
cutting     2002/11/07 09:31:27

  Modified:    .        CHANGES.txt build.xml default.properties
               src/java/org/apache/lucene/document Field.java
               src/java/org/apache/lucene/index DocumentWriter.java
                        IndexWriter.java
               src/java/org/apache/lucene/search BooleanQuery.java
                        BooleanScorer.java ExactPhraseScorer.java
                        MultiTermQuery.java PhrasePrefixQuery.java
                        PhraseQuery.java PhraseScorer.java PrefixQuery.java
                        Query.java RangeQuery.java Scorer.java
                        Searcher.java Similarity.java
                        SloppyPhraseScorer.java TermQuery.java
                        TermScorer.java
               src/test/org/apache/lucene/index DocTest.java
               src/test/org/apache/lucene/search TestDocBoost.java
  Added:       src/java/org/apache/lucene/analysis/ru package.html
               src/java/org/apache/lucene/search DefaultSimilarity.java
               src/test/org/apache/lucene/search TestSimilarity.java
  Log:
  Added a public, extensible scoring API.
  
  Revision  Changes    Path
  1.34      +4 -1      jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.33
  retrieving revision 1.34
  diff -u -r1.33 -r1.34
  --- CHANGES.txt	16 Sep 2002 04:11:36 -0000	1.33
  +++ CHANGES.txt	7 Nov 2002 17:31:25 -0000	1.34
  @@ -93,6 +93,9 @@
    17. Added Russian Analyzer.
        (Boris Okner via otis)
   
  + 18. Added a public, extensible scoring API.  For details, see the
  +     javadoc for org.apache.lucene.search.Similarity.
  +
   
   1.2 RC6
   
  
  
  
  1.35      +15 -17    jakarta-lucene/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/build.xml,v
  retrieving revision 1.34
  retrieving revision 1.35
  diff -u -r1.34 -r1.35
  --- build.xml	19 Oct 2002 16:08:59 -0000	1.34
  +++ build.xml	7 Nov 2002 17:31:25 -0000	1.35
  @@ -12,14 +12,21 @@
     <!-- Build classpath -->
     <path id="classpath">
       <pathelement location="${build.classes}"/>
  -    <pathelement location="${build.demo.classes}"/>
  -    <pathelement location="${build.test.classes}"/>
  -    <pathelement location="."/>
       <fileset dir="lib">
         <include name="*.jar" />
       </fileset>
     </path>
   
  +  <path id="demo.classpath">
  +    <path refid="classpath"/>
  +    <pathelement location="${build.demo.classes}"/>
  +  </path>
  +
  +  <path id="test.classpath">
  +    <path refid="demo.classpath"/>
  +    <pathelement location="${build.test.classes}"/>
  +  </path>
  +
     <path id="junit.classpath">
       <pathelement location="${junit.classes}" />
       <pathelement location="${build.classes}"/>
  @@ -245,7 +252,7 @@
         includes="**/*.java"
         destdir="${build.demo.classes}"
         debug="${debug}">
  -      <classpath refid="classpath"/>
  +      <classpath refid="demo.classpath"/>
       </javac>
     </target>
   
  @@ -255,23 +262,14 @@
     <!--                                                                    -->
     <!-- ================================================================== -->
     <target name="test" depends="compile,demo">
  -    <mkdir dir="${build.test}"/>
  -
  -    <copy todir="${build.test.src}">
  -      <fileset dir="${test.src}">
  -        <include name="**/*.java"/>
  -      </fileset>
  -    </copy>
  -
       <mkdir dir="${build.test.classes}"/>
  -
       <javac
         encoding="${build.encoding}"
  -      srcdir="${build.test.src}"
  +      srcdir="${test.src}"
         includes="**/*.java"
         destdir="${build.test.classes}"
         debug="${debug}">
  -      <classpath refid="classpath"/>
  +      <classpath refid="test.classpath"/>
       </javac>
     </target>
   
  @@ -295,7 +293,7 @@
         includes="**/*.java"
         destdir="${junit.classes}"
         debug="${debug}">
  -      <classpath refid="classpath"/>
  +      <classpath refid="test.classpath"/>
       </javac>
   
       <junit printsummary="yes" haltonfailure="no" >
  @@ -565,7 +563,7 @@
     <!-- ================================================================== -->
     <!--                                                                    -->
     <!-- ================================================================== -->
  -  <target name="clean" depends="init">
  +  <target name="clean">
       <delete dir="${build.dir}"/>
       <delete dir="${dist.dir}"/>
       <delete file="${basedir}/${final.name}.tar"/>
  
  
  
  1.12      +0 -1      jakarta-lucene/default.properties
  
  Index: default.properties
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/default.properties,v
  retrieving revision 1.11
  retrieving revision 1.12
  diff -u -r1.11 -r1.12
  --- default.properties	19 Oct 2002 16:05:12 -0000	1.11
  +++ default.properties	7 Nov 2002 17:31:25 -0000	1.12
  @@ -50,7 +50,6 @@
   build.docweb.war.name = lucenedocweb
   
   build.test = ${build.dir}/test
  -build.test.src = ${build.test}/src
   build.test.classes = ${build.test}/classes
   
   junit.src = ${basedir}/src/test
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/ru/package.html
  
  Index: package.html
  ===================================================================
  <html>
  <body>
  Support for indexing and searching Russian text.
  </body>
  </html>
  
  
  
  1.8       +2 -2      jakarta-lucene/src/java/org/apache/lucene/document/Field.java
  
  Index: Field.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Field.java	29 Jul 2002 19:11:14 -0000	1.7
  +++ Field.java	7 Nov 2002 17:31:26 -0000	1.8
  @@ -85,13 +85,13 @@
      * <p>The boost is multiplied by {@link Document#getBoost()} of the document
      * containing this field.  If a document has multiple fields with the same
      * name, all such values are multiplied together.  This product is then
  -   * multipled by the value {@link Similarity#normalizeLength(int)}, and
  +   * multiplied by the value {@link Similarity#lengthNorm(String,int)}, and
      * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
      * index.  One should attempt to ensure that this product does not overflow
      * the range of that encoding.
      *
      * @see Document#setBoost(float)
  -   * @see Similarity#normalizeLength(int)
  +   * @see Similarity#lengthNorm(String, int)
      * @see Similarity#encodeNorm(float)
      */
     public void setBoost(float boost) {
  
  
  
  1.5       +10 -7     jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java
  
  Index: DocumentWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- DocumentWriter.java	7 Nov 2002 05:55:39 -0000	1.4
  +++ DocumentWriter.java	7 Nov 2002 17:31:26 -0000	1.5
  @@ -73,13 +73,16 @@
   final class DocumentWriter {
     private Analyzer analyzer;
     private Directory directory;
  +  private Similarity similarity;
     private FieldInfos fieldInfos;
     private int maxFieldLength;
  -
  -  DocumentWriter(Directory d, Analyzer a, int mfl) {
  -    directory = d;
  -    analyzer = a;
  -    maxFieldLength = mfl;
  +  
  +  DocumentWriter(Directory directory, Analyzer analyzer,
  +                 Similarity similarity, int maxFieldLength) {
  +    this.directory = directory;
  +    this.analyzer = analyzer;
  +    this.similarity = similarity;
  +    this.maxFieldLength = maxFieldLength;
     }
   
     final void addDocument(String segment, Document doc)
  @@ -320,10 +323,10 @@
         if (field.isIndexed()) {
   	int n = fieldInfos.fieldNumber(field.name());
           float norm =
  -          fieldBoosts[n] * Similarity.normalizeLength(fieldLengths[n]);
  +          fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
   	OutputStream norms = directory.createFile(segment + ".f" + n);
   	try {
  -	  norms.writeByte(Similarity.encodeNorm(norm));
  +	  norms.writeByte(similarity.encodeNorm(norm));
   	} finally {
   	  norms.close();
   	}
  
  
  
  1.10      +20 -11    jakarta-lucene/src/java/org/apache/lucene/index/IndexWriter.java
  
  Index: IndexWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexWriter.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- IndexWriter.java	7 Nov 2002 05:55:39 -0000	1.9
  +++ IndexWriter.java	7 Nov 2002 17:31:26 -0000	1.10
  @@ -68,6 +68,8 @@
   import org.apache.lucene.search.Similarity;
   import org.apache.lucene.document.Document;
   import org.apache.lucene.analysis.Analyzer;
  +import org.apache.lucene.search.Similarity;
  +
   
   /**
     An IndexWriter creates and maintains an index.
  @@ -89,12 +91,28 @@
     private Directory directory;			  // where this index resides
     private Analyzer analyzer;			  // how to analyze text
   
  +  private Similarity similarity = Similarity.getDefault(); // how to normalize
  +
     private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
     private final Directory ramDirectory = new RAMDirectory(); // for temp segs
   
     private Lock writeLock;
   
  -  private Similarity similarity;
  +  /** Expert: Set the Similarity implementation used by this IndexWriter.
  +   *
  +   * @see Similarity#setDefault(Similarity)
  +   */
  +  public void setSimilarity(Similarity similarity) {
  +    this.similarity = similarity;
  +  }
  +
  +  /** Expert: Return the Similarity implementation used by this IndexWriter.
  +   *
  +   * <p>This defaults to the current value of {@link Similarity#getDefault()}.
  +   */
  +  public Similarity getSimilarity() {
  +    return this.similarity;
  +  }
   
     /** Constructs an IndexWriter for the index in <code>path</code>.  Text will
       be analyzed with <code>a</code>.  If <code>create</code> is true, then a
  @@ -186,7 +204,7 @@
     /** Adds a document to this index.*/
     public void addDocument(Document doc) throws IOException {
       DocumentWriter dw =
  -      new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
  +      new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
       String segmentName = newSegmentName();
       dw.addDocument(segmentName, doc);
       synchronized (this) {
  @@ -406,14 +424,5 @@
         output.close();
       }
       directory.renameFile("deleteable.new", "deletable");
  -  }
  -
  -  /**
  -   * Sets the <code>Similarity</code> implementation to use.
  -   *
  -   * @param sim an instance of a class that implements  <code>Similarity</code
  -   */
  -  public void setSimilarity(Similarity sim) {
  -    similarity = sim;
     }
   }
  
  
  
  1.5       +4 -4      jakarta-lucene/src/java/org/apache/lucene/search/BooleanQuery.java
  
  Index: BooleanQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/BooleanQuery.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- BooleanQuery.java	17 Jul 2002 17:38:04 -0000	1.4
  +++ BooleanQuery.java	7 Nov 2002 17:31:26 -0000	1.5
  @@ -116,20 +116,20 @@
       }
     }
   
  -  Scorer scorer(IndexReader reader)
  +  Scorer scorer(IndexReader reader, Similarity similarity)
          throws IOException {
   
       if (clauses.size() == 1) {			  // optimize 1-term queries
         BooleanClause c = (BooleanClause)clauses.elementAt(0);
         if (!c.prohibited)			  // just return term scorer
  -	return c.query.scorer(reader);
  +	return c.query.scorer(reader, similarity);
       }
   
  -    BooleanScorer result = new BooleanScorer();
  +    BooleanScorer result = new BooleanScorer(similarity);
   
       for (int i = 0 ; i < clauses.size(); i++) {
         BooleanClause c = (BooleanClause)clauses.elementAt(i);
  -      Scorer subScorer = c.query.scorer(reader);
  +      Scorer subScorer = c.query.scorer(reader, similarity);
         if (subScorer != null)
   	result.add(subScorer, c.required, c.prohibited);
         else if (c.required)
  
  
  
  1.2       +5 -1      jakarta-lucene/src/java/org/apache/lucene/search/BooleanScorer.java
  
  Index: BooleanScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/BooleanScorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- BooleanScorer.java	18 Sep 2001 16:29:56 -0000	1.1
  +++ BooleanScorer.java	7 Nov 2002 17:31:26 -0000	1.2
  @@ -70,6 +70,10 @@
     private int prohibitedMask = 0;
     private int nextMask = 1;
   
  +  BooleanScorer(Similarity similarity) {
  +    super(similarity);
  +  }
  +
     static final class SubScorer {
       public Scorer scorer;
       public boolean required = false;
  @@ -113,7 +117,7 @@
     private final void computeCoordFactors() throws IOException {
       coordFactors = new float[maxCoord];
       for (int i = 0; i < maxCoord; i++)
  -      coordFactors[i] = Similarity.coord(i, maxCoord);
  +      coordFactors[i] = getSimilarity().coord(i, maxCoord);
     }
   
     final void score(HitCollector results, int maxDoc) throws IOException {
  
  
  
  1.2       +3 -3      jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
  
  Index: ExactPhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ExactPhraseScorer.java	18 Sep 2001 16:29:56 -0000	1.1
  +++ ExactPhraseScorer.java	7 Nov 2002 17:31:26 -0000	1.2
  @@ -61,9 +61,9 @@
   
   final class ExactPhraseScorer extends PhraseScorer {
   
  -  ExactPhraseScorer(TermPositions[] tps, byte[] n, float w)
  -       throws IOException {
  -    super(tps, n, w);
  +  ExactPhraseScorer(TermPositions[] tps, Similarity similarity,
  +                    byte[] norms, float weight) throws IOException {
  +    super(tps, similarity, norms, weight);
     }
   
     protected final float phraseFreq() throws IOException {
  
  
  
  1.6       +3 -3      jakarta-lucene/src/java/org/apache/lucene/search/MultiTermQuery.java
  
  Index: MultiTermQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/MultiTermQuery.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- MultiTermQuery.java	7 Nov 2002 05:55:40 -0000	1.5
  +++ MultiTermQuery.java	7 Nov 2002 17:31:26 -0000	1.6
  @@ -85,7 +85,6 @@
       /** Constructs a query for terms matching <code>term</code>. */
       public MultiTermQuery(Term term) {
           this.term = term;
  -        this.query = query;
       }
       
       /** Set the TermEnum to be used */
  @@ -105,8 +104,9 @@
           }
       }
       
  -    final Scorer scorer(IndexReader reader) throws IOException {
  -        return getQuery().scorer(reader);
  +    final Scorer scorer(IndexReader reader, Similarity similarity)
  +      throws IOException {
  +      return getQuery().scorer(reader, similarity);
       }
       
       private final BooleanQuery getQuery() throws IOException {
  
  
  
  1.3       +7 -5      jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
  
  Index: PhrasePrefixQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- PhrasePrefixQuery.java	29 Jul 2002 19:11:15 -0000	1.2
  +++ PhrasePrefixQuery.java	7 Nov 2002 17:31:26 -0000	1.3
  @@ -147,7 +147,7 @@
   	_termArrays.add(terms);
       }
   
  -    Scorer scorer(IndexReader reader)
  +    Scorer scorer(IndexReader reader, Similarity similarity)
   	throws IOException
       {
       	if (_termArrays.size() == 0)  // optimize zero-term case
  @@ -161,7 +161,7 @@
   	    for (int i=0; i<terms.length; i++)
   		boq.add(new TermQuery(terms[i]), false, false);
   
  -	    return boq.scorer(reader);
  +	    return boq.scorer(reader, similarity);
       	}
   
       	TermPositions[] tps = new TermPositions[_termArrays.size()];
  @@ -182,9 +182,11 @@
   	}
   
   	if (_slop == 0)
  -	    return new ExactPhraseScorer(tps, reader.norms(_field), _weight);
  +	    return new ExactPhraseScorer(tps, similarity,
  +                                         reader.norms(_field), _weight);
   	else
  -	    return new SloppyPhraseScorer(tps, _slop, reader.norms(_field), _weight);
  +	    return new SloppyPhraseScorer(tps, similarity, _slop,
  +                                          reader.norms(_field), _weight);
       }
   
       float sumOfSquaredWeights(Searcher searcher)
  @@ -195,7 +197,7 @@
   	{
   	    Term[] terms = (Term[])i.next();
   	    for (int j=0; j<terms.length; j++)
  -		_idf += Similarity.idf(terms[j], searcher);
  +		_idf += searcher.getSimilarity().idf(terms[j], searcher);
   	}
   
   	_weight = _idf * boost;
  
  
  
  1.6       +9 -8      jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java
  
  Index: PhraseQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- PhraseQuery.java	3 Sep 2002 21:13:32 -0000	1.5
  +++ PhraseQuery.java	7 Nov 2002 17:31:26 -0000	1.6
  @@ -108,10 +108,7 @@
     }
   
     final float sumOfSquaredWeights(Searcher searcher) throws IOException {
  -    idf = 0.0f;
  -    for (int i = 0; i < terms.size(); i++)	  // sum term IDFs
  -      idf += Similarity.idf((Term)terms.elementAt(i), searcher);
  -
  +    idf = searcher.getSimilarity().idf(terms, searcher);
       weight = idf * boost;
       return weight * weight;			  // square term weights
     }
  @@ -121,7 +118,8 @@
       weight *= idf;				  // factor from document
     }
   
  -  final Scorer scorer(IndexReader reader) throws IOException {
  +  final Scorer scorer(IndexReader reader, Similarity similarity)
  +    throws IOException {
       if (terms.size() == 0)			  // optimize zero-term case
         return null;
       if (terms.size() == 1) {			  // optimize one-term case
  @@ -129,7 +127,8 @@
         TermDocs docs = reader.termDocs(term);
         if (docs == null)
   	return null;
  -      return new TermScorer(docs, reader.norms(term.field()), weight);
  +      return new TermScorer(docs, similarity,
  +                            reader.norms(term.field()), weight);
       }
   
       TermPositions[] tps = new TermPositions[terms.size()];
  @@ -141,10 +140,12 @@
       }
   
       if (slop == 0)				  // optimize exact case
  -      return new ExactPhraseScorer(tps, reader.norms(field), weight);
  +      return new ExactPhraseScorer(tps, similarity,
  +                                   reader.norms(field), weight);
       else
         return
  -	new SloppyPhraseScorer(tps, slop, reader.norms(field), weight);
  +	new SloppyPhraseScorer(tps, similarity, slop,
  +                               reader.norms(field), weight);
   
     }
   
  
  
  
  1.4       +7 -4      jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java
  
  Index: PhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- PhraseScorer.java	7 Nov 2002 05:55:40 -0000	1.3
  +++ PhraseScorer.java	7 Nov 2002 17:31:26 -0000	1.4
  @@ -66,9 +66,11 @@
     protected PhraseQueue pq;
     protected PhrasePositions first, last;
   
  -  PhraseScorer(TermPositions[] tps, byte[] n, float w) throws IOException {
  -    norms = n;
  -    weight = w;
  +  PhraseScorer(TermPositions[] tps, Similarity similarity,
  +               byte[] norms, float weight) throws IOException {
  +    super(similarity);
  +    this.norms = norms;
  +    this.weight = weight;
   
       // use PQ to build a sorted list of PhrasePositions
       pq = new PhraseQueue(tps.length);
  @@ -78,6 +80,7 @@
     }
   
     final void score(HitCollector results, int end) throws IOException {
  +    Similarity similarity = getSimilarity();
       while (last.doc < end) {			  // find doc w/ all the terms
         while (first.doc < last.doc) {		  // scan forward in first
   	do {
  @@ -92,7 +95,7 @@
         float freq = phraseFreq();		  // check for phrase
   
         if (freq > 0.0) {
  -	float score = Similarity.tf(freq)*weight; // compute score
  +	float score = similarity.tf(freq)*weight; // compute score
   	score *= Similarity.decodeNorm(norms[first.doc]); // normalize
   	results.collect(first.doc, score);	  // add to results
         }
  
  
  
  1.4       +2 -2      jakarta-lucene/src/java/org/apache/lucene/search/PrefixQuery.java
  
  Index: PrefixQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PrefixQuery.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- PrefixQuery.java	17 Jul 2002 17:38:04 -0000	1.3
  +++ PrefixQuery.java	7 Nov 2002 17:31:26 -0000	1.4
  @@ -90,8 +90,8 @@
       }
     }
   
  -  Scorer scorer(IndexReader reader) throws IOException {
  -    return getQuery().scorer(reader);
  +  Scorer scorer(IndexReader reader, Similarity similarity) throws IOException {
  +    return getQuery().scorer(reader, similarity);
     }
   
     private BooleanQuery getQuery() throws IOException {
  
  
  
  1.8       +9 -8      jakarta-lucene/src/java/org/apache/lucene/search/Query.java
  
  Index: Query.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Query.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Query.java	7 Nov 2002 05:55:40 -0000	1.7
  +++ Query.java	7 Nov 2002 17:31:26 -0000	1.8
  @@ -86,18 +86,19 @@
       abstract void normalize(float norm);
   
       // query evaluation
  -    abstract Scorer scorer(IndexReader reader) throws IOException;
  +    abstract Scorer scorer(IndexReader reader, Similarity similarity)
  +      throws IOException;
   
       void prepare(IndexReader reader) {}
   
       static Scorer scorer(Query query, Searcher searcher, IndexReader reader)
  -	throws IOException
  -    {
  -	query.prepare(reader);
  -	float sum = query.sumOfSquaredWeights(searcher);
  -	float norm = 1.0f / (float)Math.sqrt(sum);
  -	query.normalize(norm);
  -	return query.scorer(reader);
  +      throws IOException {
  +      Similarity similarity = searcher.getSimilarity();
  +      query.prepare(reader);
  +      float sum = query.sumOfSquaredWeights(searcher);
  +      float norm = similarity.queryNorm(sum);
  +      query.normalize(norm);
  +      return query.scorer(reader, similarity);
       }
   
       /**
  
  
  
  1.5       +2 -2      jakarta-lucene/src/java/org/apache/lucene/search/RangeQuery.java
  
  Index: RangeQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/RangeQuery.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- RangeQuery.java	17 Jul 2002 17:38:04 -0000	1.4
  +++ RangeQuery.java	7 Nov 2002 17:31:26 -0000	1.5
  @@ -113,9 +113,9 @@
           }
       }
       
  -    Scorer scorer(IndexReader reader) throws IOException
  +    Scorer scorer(IndexReader reader, Similarity similarity) throws IOException
       {
  -        return getQuery().scorer(reader);
  +        return getQuery().scorer(reader, similarity);
       }
       
       private BooleanQuery getQuery() throws IOException
  
  
  
  1.2       +10 -0     jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java
  
  Index: Scorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Scorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- Scorer.java	18 Sep 2001 16:29:58 -0000	1.1
  +++ Scorer.java	7 Nov 2002 17:31:26 -0000	1.2
  @@ -57,5 +57,15 @@
   import java.io.IOException;
   
   abstract class Scorer {
  +  private Similarity similarity;
  +
  +  protected Scorer(Similarity similarity) {
  +    this.similarity = similarity;
  +  }
  +
  +  public Similarity getSimilarity() {
  +    return this.similarity;
  +  }
  +
     abstract void score(HitCollector hc, int maxDoc) throws IOException;
   }
  
  
  
  1.7       +15 -8     jakarta-lucene/src/java/org/apache/lucene/search/Searcher.java
  
  Index: Searcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Searcher.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- Searcher.java	7 Nov 2002 05:55:40 -0000	1.6
  +++ Searcher.java	7 Nov 2002 17:31:26 -0000	1.7
  @@ -63,9 +63,6 @@
    * Implements some common utility methods.
    */
   public abstract class Searcher implements Searchable {
  -
  -  protected Similarity similarity;
  -
     /** Returns the documents matching <code>query</code>. */
     public final Hits search(Query query) throws IOException {
       return search(query, (Filter)null);
  @@ -91,12 +88,22 @@
       search(query, (Filter)null, results);
     }    
   
  -  /**
  -   * Sets the <code>Similarity</code> implementation to use.
  +  /** The Similarity implementation used by this searcher. */
  +  private Similarity similarity = Similarity.getDefault();
  +
  +  /** Expert: Set the Similarity implementation used by this Searcher.
  +   *
  +   * @see Similarity#setDefault(Similarity)
  +   */
  +  public void setSimilarity(Similarity similarity) {
  +    this.similarity = similarity;
  +  }
  +
  +  /** Expert: Return the Similarity implementation used by this Searcher.
      *
  -   * @param sim an instance of a class that implements  <code>Similarity</code
  +   * <p>This defaults to the current value of {@link Similarity#getDefault()}.
      */
  -  public void setSimilarity(Similarity sim) {
  -    similarity = sim;
  +  public Similarity getSimilarity() {
  +    return this.similarity;
     }
   }
  
  
  
  1.4       +201 -39   jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java
  
  Index: Similarity.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/Similarity.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Similarity.java	7 Nov 2002 05:55:40 -0000	1.3
  +++ Similarity.java	7 Nov 2002 17:31:26 -0000	1.4
  @@ -55,14 +55,73 @@
    */
   
   import java.io.IOException;
  +import java.util.Vector;
   import org.apache.lucene.index.Term;
  +import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
  +import org.apache.lucene.index.IndexReader;
  +import org.apache.lucene.index.IndexWriter;
   
  -/** Internal class used for scoring.
  - * <p>Public only so that the indexing code can compute and store the
  - * normalization byte for each document. */
  +/** Expert: Scoring API.
  + * <p>Subclasses implement search scoring.
  + *
  + * <p>The score of query <code>q</code> for document <code>d</code> is defined
  + * in terms of these methods as follows:
  + *
  + * <table cellpadding="0" cellspacing="0" border="0">
  + *  <tr>
  + *    <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
  + *    <td valign="middle" align="center">
  + *    <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
  + *    <td valign="middle"><small>
  + *    {@link #tf(int) tf}(t in d) *
  + *    {@link #idf(Term,Searcher) idf}(t) *
  + *    {@link Field#getBoost getBoost}(t.field in d) *
  + *    {@link #lengthNorm(String,int) lengthNorm}(t.field in d)
  + *    </small></td>
  + *    <td valign="middle" rowspan="2">&nbsp;*
  + *    {@link #coord(int,int) coord}(q,d) *
  + *    {@link #queryNorm(float) queryNorm}(q)
  + *    </td>
  + *  </tr>
  + *  <tr> 
  + *   <td valign="top" align="right">
  + *    <small>t in q</small>
  + *    </td>
  + *  </tr>
  + * </table>
  + *
  + * @see #setDefault(Similarity)
  + * @see IndexWriter#setSimilarity(Similarity)
  + * @see Searcher#setSimilarity(Similarity)
  + */
   public abstract class Similarity {
  +  /** The Similarity implementation used by default. */
  +  private static Similarity defaultImpl = new DefaultSimilarity();
   
  +  /** Set the default Similarity implementation used by indexing and search
  +   * code.
  +   *
  +   * @see Searcher#setSimilarity(Similarity)
  +   * @see IndexWriter#setSimilarity(Similarity)
  +   */
  +  public static void setDefault(Similarity similarity) {
  +    Similarity.defaultImpl = similarity;
  +  }
  +
  +  /** Return the default Similarity implementation used by indexing and search
  +   * code.
  +   *
  +   * <p>This is initially an instance of {@link DefaultSimilarity}.
  +   *
  +   * @see Searcher#setSimilarity(Similarity)
  +   * @see IndexWriter#setSimilarity(Similarity)
  +   */
  +  public static Similarity getDefault() {
  +    return Similarity.defaultImpl;
  +  }
  +
  +  /** Cache of decoded bytes. */
     private static final float[] NORM_TABLE = new float[256];
   
     static {
  @@ -70,37 +129,47 @@
         NORM_TABLE[i] = byteToFloat((byte)i);
     }
   
  -  private static Similarity similarity;
  -
  -  private Similarity() {}			  // no public constructor
  -
  -  /**
  -   * Sets the <code>Similarity</code> implementation to use.
  -   *
  -   * @param sim an instance of a class that implements  <code>Similarity</code
  +  /** Decodes a normalization factor stored in an index.
  +   * @see #encodeNorm(float)
      */
  -  public static void setDefaultSimilarity(Similarity sim) {
  -    similarity = sim;
  +  public static float decodeNorm(byte b) {
  +    return NORM_TABLE[b & 0xFF];
     }
   
  -  /** Computes the normalization value for a document given the total number of
  -   * terms contained in a field.  These values are stored in an index and used
  -   * by the search code.
  +  /** Computes the normalization value for a field given the total number of
  +   * terms contained in a field.  These values, together with field boosts, are
  +   * stored in an index and multiplied into scores for hits on each field by the
  +   * search code.
  +   *
  +   * <p>Matches in longer fields are less precise, so implementations of this
  +   * method usually return smaller values when <code>numTokens</code> is large,
  +   * and larger values when <code>numTokens</code> is small.
  +   *
  +   * <p>Note that these values are computed under {@link
  +   * IndexWriter#addDocument(Document)} and then stored using
  +   * {@link #encodeNorm(float)}.  Thus they have limited precision, and documents
  +   * must be re-indexed if this method is altered.
      *
  -   * <p>The formula used is: <code>1.0f / Math.sqrt(numTerms)</code>
  +   * @param fieldName the name of the field
  +   * @param numTokens the total number of tokens contained in fields named
  +   * <i>fieldName</i> of <i>doc</i>.
  +   * @return a normalization factor for hits on this field of this document
      *
      * @see Field#setBoost(float)
      */
  -  public static float normalizeLength(int numTerms) {
  -    return (float)(1.0 / Math.sqrt(numTerms));
  -  }
  -  
  -  /** Decodes a normalization factor stored in an index.
  -   * @see #encodeNorm(float)
  +  public abstract float lengthNorm(String fieldName, int numTokens);
  +
  +  /** Computes the normalization value for a query given the sum of the squared
  +   * weights of each of the query terms.  This value is then multiplied into the
  +   * weight of each query term.
  +   *
  +   * <p>This does not affect ranking, but rather just attempts to make scores
  +   * from different queries comparable.
  +   *
  +   * @param sumOfSquaredWeights the sum of the squares of query term weights
  +   * @return a normalization factor for query weights
      */
  -  public static float decodeNorm(byte b) {
  -    return NORM_TABLE[b & 0xFF];
  -  }
  +  public abstract float queryNorm(float sumOfSquaredWeights);
   
     /** Encodes a normalization factor for storage in an index.  
      *
  @@ -151,25 +220,118 @@
       return (byte)((exponent << 3) | mantissa);    // pack into a byte
      }
   
  -  static final float tf(int freq) {
  -    return (float)Math.sqrt(freq);
  -  }
   
  -  static final float tf(float freq) {
  -    return (float)Math.sqrt(freq);
  +  /** Computes a score factor based on a term or phrase's frequency in a
  +   * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
  +   * factor for each term in the query and these products are then summed to
  +   * form the initial score for a document.
  +   *
  +   * <p>Terms and phrases repeated in a document indicate the topic of the
  +   * document, so implementations of this method usually return larger values
  +   * when <code>freq</code> is large, and smaller values when <code>freq</code>
  +   * is small.
  +   *
  +   * <p>The default implementation calls {@link #tf(float)}.
  +   *
  +   * @param freq the frequency of a term within a document
  +   * @return a score factor based on a term's within-document frequency
  +   */
  +  public float tf(int freq) {
  +    return tf((float)freq);
     }
  +
  +  /** Computes the amount of a sloppy phrase match, based on an edit distance.
  +   * This value is summed for each sloppy phrase match in a document to form
  +   * the frequency that is passed to {@link #tf(float)}.
  +   *
  +   * <p>A phrase match with a small edit distance to a document passage more
  +   * closely matches the document, so implementations of this method usually
  +   * return larger values when the edit distance is small and smaller values
  +   * when it is large.
  +   *
  +   * @see PhraseQuery#setSlop(int)
  +   * @param distance the edit distance of this sloppy phrase match
  +   * @return the frequency increment for this match
  +   */
  +  public abstract float sloppyFreq(int distance);
  +
  +  /** Computes a score factor based on a term or phrase's frequency in a
  +   * document.  This value is multiplied by the {@link #idf(Term, Searcher)}
  +   * factor for each term in the query and these products are then summed to
  +   * form the initial score for a document.
  +   *
  +   * <p>Terms and phrases repeated in a document indicate the topic of the
  +   * document, so implementations of this method usually return larger values
  +   * when <code>freq</code> is large, and smaller values when <code>freq</code>
  +   * is small.
  +   *
  +   * @param freq the frequency of a term within a document
  +   * @return a score factor based on a term's within-document frequency
  +   */
  +  public abstract float tf(float freq);
       
  -  static final float idf(Term term, Searcher searcher) throws IOException {
  -    // Use maxDoc() instead of numDocs() because its proportional to docFreq(),
  -    // i.e., when one is inaccurate, so is the other, and in the same way.
  +  /** Computes a score factor for a simple term.
  +   *
  +   * <p>The default implementation is:<pre>
  +   *   return idf(searcher.docFreq(term), searcher.maxDoc());
  +   * </pre>
  +   *
  +   * Note that {@link Searcher#maxDoc()} is used instead of {@link
  +   * IndexReader#numDocs()} because it is proportional to {@link
  +   * Searcher#docFreq(Term)}, i.e., when one is inaccurate, so is the other,
  +   * and in the same direction.
  +   *
  +   * @param term the term in question
  +   * @param searcher the document collection being searched
  +   * @return a score factor for the term
  +   */
  +  public float idf(Term term, Searcher searcher) throws IOException {
       return idf(searcher.docFreq(term), searcher.maxDoc());
     }
   
  -  static final float idf(int docFreq, int numDocs) {
  -    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
  +  /** Computes a score factor for a phrase.
  +   *
  +   * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
  +   * for each term in the phrase.
  +   *
  +   * @param terms the vector of terms in the phrase
  +   * @param searcher the document collection being searched
  +   * @return a score factor for the phrase
  +   */
  +  public float idf(Vector terms, Searcher searcher) throws IOException {
  +    float idf = 0.0f;
  +    for (int i = 0; i < terms.size(); i++) {
  +      idf += idf((Term)terms.elementAt(i), searcher);
  +    }
  +    return idf;
     }
  +
  +  /** Computes a score factor based on a term's document frequency (the number
  +   * of documents which contain the term).  This value is multiplied by the
  +   * {@link #tf(int)} factor for each term in the query and these products are
  +   * then summed to form the initial score for a document.
  +   *
  +   * <p>Terms that occur in fewer documents are better indicators of topic, so
  +   * implementations of this method usually return larger values for rare terms,
  +   * and smaller values for common terms.
  +   *
  +   * @param docFreq the number of documents which contain the term
  +   * @param numDocs the total number of documents in the collection
  +   * @return a score factor based on the term's document frequency
  +   */
  +  protected abstract float idf(int docFreq, int numDocs);
       
  -  static final float coord(int overlap, int maxOverlap) {
  -    return overlap / (float)maxOverlap;
  -  }
  +  /** Computes a score factor based on the fraction of all query terms that a
  +   * document contains.  This value is multiplied into scores.
  +   *
  +   * <p>The presence of a large portion of the query terms indicates a better
  +   * match with the query, so implementations of this method usually return
  +   * larger values when the ratio between these parameters is large and smaller
  +   * values when the ratio between them is small.
  +   *
  +   * @param overlap the number of query terms matched in the document
  +   * @param maxOverlap the total number of terms in the query
  +   * @return a score factor based on term overlap with the query
  +   */
  +  public abstract float coord(int overlap, int maxOverlap);
   }
  
  
  
  1.2       +5 -5      jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
  
  Index: SloppyPhraseScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- SloppyPhraseScorer.java	18 Sep 2001 16:29:58 -0000	1.1
  +++ SloppyPhraseScorer.java	7 Nov 2002 17:31:26 -0000	1.2
  @@ -62,10 +62,10 @@
   final class SloppyPhraseScorer extends PhraseScorer {
     private int slop;
   
  -  SloppyPhraseScorer(TermPositions[] tps, int s, byte[] n, float w)
  -       throws IOException {
  -    super(tps, n, w);
  -    slop = s;
  +  SloppyPhraseScorer(TermPositions[] tps, Similarity similarity,
  +                     int slop, byte[] norms, float weight) throws IOException {
  +    super(tps, similarity, norms, weight);
  +    this.slop = slop;
     }
   
     protected final float phraseFreq() throws IOException {
  @@ -94,7 +94,7 @@
   
         int matchLength = end - start;
         if (matchLength <= slop)
  -	freq += 1.0 / (matchLength + 1);	  // penalize longer matches
  +	freq += getSimilarity().sloppyFreq(matchLength); // score match
   
         if (pp.position > end)
   	end = pp.position;
  
  
  
  1.4       +4 -3      jakarta-lucene/src/java/org/apache/lucene/search/TermQuery.java
  
  Index: TermQuery.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermQuery.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- TermQuery.java	17 Jul 2002 17:38:04 -0000	1.3
  +++ TermQuery.java	7 Nov 2002 17:31:26 -0000	1.4
  @@ -73,7 +73,7 @@
     }
   
     final float sumOfSquaredWeights(Searcher searcher) throws IOException {
  -    idf = Similarity.idf(term, searcher);
  +    idf = searcher.getSimilarity().idf(term, searcher);
       weight = idf * boost;
       return weight * weight;			  // square term weights
     }
  @@ -83,14 +83,15 @@
       weight *= idf;				  // factor from document
     }
   
  -  Scorer scorer(IndexReader reader)
  +  Scorer scorer(IndexReader reader, Similarity similarity)
          throws IOException {
       TermDocs termDocs = reader.termDocs(term);
   
       if (termDocs == null)
         return null;
       
  -    return new TermScorer(termDocs, reader.norms(term.field()), weight);
  +    return new TermScorer(termDocs, similarity,
  +                          reader.norms(term.field()), weight);
     }
   
     /** Prints a user-readable version of this query. */
  
  
  
  1.3       +11 -8     jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java
  
  Index: TermScorer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- TermScorer.java	29 Jul 2002 19:11:15 -0000	1.2
  +++ TermScorer.java	7 Nov 2002 17:31:26 -0000	1.3
  @@ -63,21 +63,23 @@
     private float weight;
     private int doc;
   
  -  private final int[] docs = new int[128];	  // buffered doc numbers
  -  private final int[] freqs = new int[128];	  // buffered term freqs
  +  private final int[] docs = new int[32];	  // buffered doc numbers
  +  private final int[] freqs = new int[32];	  // buffered term freqs
     private int pointer;
     private int pointerMax;
   
     private static final int SCORE_CACHE_SIZE = 32;
     private float[] scoreCache = new float[SCORE_CACHE_SIZE];
   
  -  TermScorer(TermDocs td, byte[] n, float w) throws IOException {
  -    termDocs = td;
  -    norms = n;
  -    weight = w;
  +  TermScorer(TermDocs td, Similarity similarity, byte[] norms, float weight)
  +    throws IOException {
  +    super(similarity);
  +    this.termDocs = td;
  +    this.norms = norms;
  +    this.weight = weight;
   
       for (int i = 0; i < SCORE_CACHE_SIZE; i++)
  -      scoreCache[i] = Similarity.tf(i) * weight;
  +      scoreCache[i] = getSimilarity().tf(i) * weight;
   
       pointerMax = termDocs.read(docs, freqs);	  // fill buffers
   
  @@ -91,12 +93,13 @@
   
     final void score(HitCollector c, final int end) throws IOException {
       int d = doc;				  // cache doc in local
  +    Similarity similarity = getSimilarity();      // cache sim in local
       while (d < end) {				  // for docs in window
         final int f = freqs[pointer];
         float score =				  // compute tf(f)*weight
   	f < SCORE_CACHE_SIZE			  // check cache
   	 ? scoreCache[f]			  // cache hit
  -	 : Similarity.tf(f)*weight;		  // cache miss
  +	 : similarity.tf(f)*weight;		  // cache miss
   
         score *= Similarity.decodeNorm(norms[d]);	  // normalize for field
   
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java
  
  Index: DefaultSimilarity.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.document.Document;
  
  /**
   * Expert: the stock {@link Similarity} implementation.
   *
   * <p>Each scoring factor below is a small closed-form formula; the formula
   * is quoted in the documentation of the corresponding method.
   */
  public class DefaultSimilarity extends Similarity {

    /**
     * Length normalization, computed as <code>1/sqrt(numTerms)</code>.
     *
     * @param fieldName the name of the field (not used by this implementation)
     * @param numTerms the number of terms in the field
     * @return the reciprocal of the square root of <code>numTerms</code>
     */
    public float lengthNorm(String fieldName, int numTerms) {
      final double root = Math.sqrt(numTerms);
      return (float)(1.0 / root);
    }

    /**
     * Query normalization, computed as
     * <code>1/sqrt(sumOfSquaredWeights)</code>.
     *
     * @param sumOfSquaredWeights the sum of the squares of query term weights
     * @return the reciprocal of the square root of the argument
     */
    public float queryNorm(float sumOfSquaredWeights) {
      final double root = Math.sqrt(sumOfSquaredWeights);
      return (float)(1.0 / root);
    }

    /**
     * Term-frequency factor, computed as <code>sqrt(freq)</code>.
     *
     * @param freq the frequency of a term or phrase within a document
     * @return the square root of <code>freq</code>
     */
    public float tf(float freq) {
      final double root = Math.sqrt(freq);
      return (float)root;
    }

    /**
     * Sloppy-phrase increment, computed as <code>1 / (distance + 1)</code>.
     *
     * @param distance the edit distance of a sloppy phrase match
     * @return the frequency increment for that match
     */
    public float sloppyFreq(int distance) {
      final int span = distance + 1;
      return 1.0f / span;
    }

    /**
     * Inverse-document-frequency factor, computed as
     * <code>log(numDocs/(docFreq+1)) + 1</code> using a floating-point
     * division.
     *
     * @param docFreq the number of documents which contain the term
     * @param numDocs the total number of documents in the collection
     * @return the idf factor for the given counts
     */
    public float idf(int docFreq, int numDocs) {
      final double ratio = numDocs / (double)(docFreq + 1);
      return (float)(Math.log(ratio) + 1.0);
    }

    /**
     * Coordination factor, computed as <code>overlap / maxOverlap</code>
     * using a floating-point division.
     *
     * @param overlap the number of query terms matched in the document
     * @param maxOverlap the total number of terms in the query
     * @return the fraction of query terms that matched
     */
    public float coord(int overlap, int maxOverlap) {
      final float total = maxOverlap;
      return overlap / total;
    }
  }
  
  
  
  1.4       +3 -1      jakarta-lucene/src/test/org/apache/lucene/index/DocTest.java
  
  Index: DocTest.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/index/DocTest.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- DocTest.java	26 Jan 2002 15:01:32 -0000	1.3
  +++ DocTest.java	7 Nov 2002 17:31:27 -0000	1.4
  @@ -59,6 +59,7 @@
   import org.apache.lucene.store.FSDirectory;
   import org.apache.lucene.store.Directory;
   import org.apache.lucene.document.Document;
  +import org.apache.lucene.search.Similarity;
   import org.apache.lucene.demo.FileDocument;
   
   import java.io.File;
  @@ -95,7 +96,8 @@
          throws Exception {
       Directory directory = FSDirectory.getDirectory("test", false);
       Analyzer analyzer = new SimpleAnalyzer();
  -    DocumentWriter writer = new DocumentWriter(directory, analyzer, 1000);
  +    DocumentWriter writer =
  +      new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000);
   
       File file = new File(fileName);
       Document doc = FileDocument.Document(file);
  
  
  
  1.2       +2 -2      jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java
  
  Index: TestDocBoost.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/search/TestDocBoost.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TestDocBoost.java	29 Jul 2002 19:11:15 -0000	1.1
  +++ TestDocBoost.java	7 Nov 2002 17:31:27 -0000	1.2
  @@ -76,7 +76,7 @@
       super(name);
     }
     
  -  public static void test() throws Exception {
  +  public void testDocBoost() throws Exception {
       RAMDirectory store = new RAMDirectory();
       IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
       
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/search/TestSimilarity.java
  
  Index: TestSimilarity.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.analysis.SimpleAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import junit.framework.TestCase;
  
  import java.util.Vector;
  
   /** Similarity unit test.
    *
    * @author Doug Cutting
    * @version $Revision: 1.1 $
    */
  public class TestSimilarity extends TestCase {
    public TestSimilarity(String name) {
      super(name);
    }
    
    /**
     * A {@link Similarity} whose factors are all constants, except
     * {@link #tf(float)}, which returns the frequency unchanged.  This keeps
     * the scores asserted in {@link TestSimilarity#testSimilarity()} small
     * exact numbers.
     */
    public static class SimpleSimilarity extends Similarity {
      public float lengthNorm(String field, int numTerms) { return 1.0f; }
      public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
      public float tf(float freq) { return freq; }
      public float sloppyFreq(int distance) { return 2.0f; }
      public float idf(Vector terms, Searcher searcher) { return 1.0f; }
      public float idf(int docFreq, int numDocs) { return 1.0f; }
      public float coord(int overlap, int maxOverlap) { return 1.0f; }
    }

    /**
     * Indexes two small documents with {@link SimpleSimilarity}, searches
     * them with the same similarity, and checks the score reported for
     * every hit of a term query, a boolean query, an exact phrase query and
     * a sloppy phrase query.
     *
     * <p>Scores are compared with a delta of zero, i.e. exactly; the
     * assertion messages name the document so a failure shows which hit
     * carried the unexpected score.
     */
    public void testSimilarity() throws Exception {
      RAMDirectory store = new RAMDirectory();
      IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
      writer.setSimilarity(new SimpleSimilarity());
      
      Document d1 = new Document();
      d1.add(Field.Text("field", "a c"));

      Document d2 = new Document();
      d2.add(Field.Text("field", "a b c"));
      
      writer.addDocument(d1);
      writer.addDocument(d2);
      writer.optimize();
      writer.close();

      Searcher searcher = new IndexSearcher(store);
      searcher.setSimilarity(new SimpleSimilarity());

      Term a = new Term("field", "a");
      Term b = new Term("field", "b");
      Term c = new Term("field", "c");

      // Single term: every hit is expected to score exactly 1.
      searcher.search
        (new TermQuery(b),
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertEquals("term query, doc " + doc, 1.0f, score, 0.0f);
           }
         });

      // Two optional clauses: the expected score is doc + 1.
      // NOTE(review): presumably doc 0 is d1 (only "a" matches) and doc 1 is
      // d2 (both "a" and "b" match) -- this relies on doc numbers following
      // insertion order.
      BooleanQuery bq = new BooleanQuery();
      bq.add(new TermQuery(a), false, false);
      bq.add(new TermQuery(b), false, false);
      searcher.search
        (bq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertEquals("boolean query, doc " + doc, doc + 1.0f, score, 0.0f);
           }
         });

      // Exact phrase "a c": every hit is expected to score exactly 1.
      PhraseQuery pq = new PhraseQuery();
      pq.add(a);
      pq.add(c);
      searcher.search
        (pq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertEquals("exact phrase query, doc " + doc, 1.0f, score, 0.0f);
           }
         });

      // Same phrase with slop: SimpleSimilarity.sloppyFreq always returns 2
      // and tf is the identity, so every hit is expected to score exactly 2.
      pq.setSlop(2);
      searcher.search
        (pq,
         new HitCollector() {
           public final void collect(int doc, float score) {
             assertEquals("sloppy phrase query, doc " + doc, 2.0f, score, 0.0f);
           }
         });
    }
  }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message