lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r827042 - in /lucene/java/trunk/contrib: CHANGES.txt queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
Date Tue, 20 Oct 2009 11:59:54 GMT
Author: mikemccand
Date: Tue Oct 20 11:59:53 2009
New Revision: 827042

URL: http://svn.apache.org/viewvc?rev=827042&view=rev
Log:
LUCENE-1993: add maxDocFreq to MoreLikeThis

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=827042&r1=827041&r2=827042&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Tue Oct 20 11:59:53 2009
@@ -53,6 +53,10 @@
    any number of output parts, at the cost of doing multiple passes over
    the input index. (Andrzej Bialecki)
 
+ * LUCENE-1993: Add maxDocFreq setting to MoreLikeThis, to exclude
+   from consideration terms that match more than the specified number
+   of documents.  (Christian Steinert via Mike McCandless)
+
 Optimizations
 
  * LUCENE-1965, LUCENE-1962: Arabic-, Persian- and SmartChineseAnalyzer

Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java?rev=827042&r1=827041&r2=827042&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java Tue Oct 20 11:59:53 2009
@@ -128,6 +128,8 @@
  * <ul>
  * <li> {@link #setMinTermFreq setMinTermFreq(...)}
  * <li> {@link #setMinDocFreq setMinDocFreq(...)}
+ * <li> {@link #setMaxDocFreq setMaxDocFreq(...)}
+ * <li> {@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
  * <li> {@link #setMinWordLen setMinWordLen(...)}
  * <li> {@link #setMaxWordLen setMaxWordLen(...)}
  * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
@@ -177,6 +179,14 @@
     public static final int DEFAULT_MIN_DOC_FREQ = 5;
 
     /**
+     * Ignore words which occur in more than this many docs.
+	 * @see #getMaxDocFreq
+	 * @see #setMaxDocFreq	 
+	 * @see #setMaxDocFreqPct	 
+     */
+    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
+    
+    /**
      * Boost terms in query based on score.
 	 * @see #isBoost
 	 * @see #setBoost 
@@ -241,6 +251,11 @@
      */
     private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
 
+	/**
+     * Ignore words which occur in more than this many docs.
+	 */
+	private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+    
     /**
      * Should we apply a boost to the Query based on the scores?
      */
@@ -388,6 +403,43 @@
     }
 
     /**
+     * Returns the maximum frequency in which words may still appear. 
+     * Words that appear in more than this many docs will be ignored. The default frequency is 
+     * {@link #DEFAULT_MAX_DOC_FREQ}.
+     *
+     * @return get the maximum frequency at which words are still allowed,  
+     * words which occur in more docs than this are ignored.
+     */
+    public int getMaxDocFreq() {
+        return maxDocFreq;
+    }
+
+	/**
+     * Set the maximum frequency in which words may still appear. Words that appear
+     * in more than this many docs will be ignored.
+	 * 
+	 * @param maxFreq
+	 *            the maximum count of documents that a term may appear 
+	 *            in to be still considered relevant
+	 */
+	public void setMaxDocFreq(int maxFreq) {
+		this.maxDocFreq = maxFreq;
+	}
+
+	/**
+     * Set the maximum percentage in which words may still appear. Words that appear
+     * in more than this many percent of all docs will be ignored.
+	 * 
+	 * @param maxPercentage
+	 *            the maximum percentage of documents (0-100) that a term may appear 
+	 *            in to be still considered relevant
+	 */
+	public void setMaxDocFreqPct(int maxPercentage) {
+		this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
+	}
+
+	
+    /**
      * Returns whether to boost terms in query based on "score" or not. The default is
      * {@link #DEFAULT_BOOST}.
      *
@@ -660,6 +712,10 @@
                 continue; // filter out words that don't occur in enough docs
             }
 
+            if (docFreq > maxDocFreq) {
+                continue; // filter out words that occur in too many docs            	
+            }
+
             if (docFreq == 0) {
                 continue; // index update problem?
             }



Mime
View raw message