lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dspen...@apache.org
Subject svn commit: r169512 - /lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
Date Tue, 10 May 2005 19:29:58 GMT
Author: dspencer
Date: Tue May 10 12:29:56 2005
New Revision: 169512

URL: http://svn.apache.org/viewcvs?rev=169512&view=rev
Log:
Logic to ignore stop words was in an early version of this code, but it was taken out in the
belief that there
was no point in explicitly looking for them as the scoring algorithm would effectively ignore
them.

I did a test and indexed 700 pages on a corporate web site and then ran the MoreLikeThis code
on them
and 1/2 of the docs had stop words identified as interesting.

So - I added code to ignore stop words, but made it backward compatible so that by default
this code
is not used.



Modified:
    lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java

Modified: lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
URL: http://svn.apache.org/viewcvs/lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java?rev=169512&r1=169511&r2=169512&view=diff
==============================================================================
--- lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java (original)
+++ lucene/java/trunk/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java Tue May 10 12:29:56 2005
@@ -32,6 +32,7 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 
+import java.util.Set;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Collection;
@@ -128,6 +129,7 @@
  * <li> {@link #setMaxWordLen setMaxWordLen(...)}
  * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
  * <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
+ * <li> {@link #setStopWords setStopWords(...)}
  * </ul> 
  *
  * <hr>
@@ -201,6 +203,20 @@
      */
     public static final int DEFAULT_MAX_WORD_LENGTH = 0;
 
+	/**
+	 * Default set of stopwords.
+	 * If null means to allow stop words.
+	 *
+	 * @see #setStopWords
+	 * @see #getStopWords
+	 */
+	public static final Set DEFAULT_STOP_WORDS = null;
+
+	/**
+	 * Current set of stop words.
+	 */
+	private Set stopWords = DEFAULT_STOP_WORDS;
+
     /**
      * Return a Query with no more than this many terms.
      *
@@ -417,6 +433,30 @@
         this.maxWordLen = maxWordLen;
     }
 
+	/**
+	 * Set the set of stopwords.
+	 * Any word in this set is considered "uninteresting" and ignored.
+	 * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+	 * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+	 * 
+	 * @param stopWords set of stopwords, if null it means to allow stop words
+	 *
+	 * @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
+	 * @see #getStopWords	 
+	 */
+	public void setStopWords(Set stopWords) {
+		this.stopWords = stopWords;
+	}
+
+	/**
+	 * Get the current stop words being used.
+	 * @see #setStopWords
+	 */
+	public Set getStopWords() {
+		return stopWords;
+	}
+		
+
     /**
      * Returns the maximum number of query terms that will be included in any generated query.
      * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
@@ -791,6 +831,9 @@
 			return true;
 		}
 		if (maxWordLen > 0 && len > maxWordLen) {
+			return true;
+		}
+		if (stopWords != null && stopWords.contains( term)) {
 			return true;
 		}
 		return false;



Mime
View raw message