ehatcher 2003/11/27 18:03:14
Modified: src/java/org/apache/lucene/analysis StopFilter.java
. CHANGES.txt
Added: src/test/org/apache/lucene/analysis TestStopAnalyzer.java
Log:
Use position increments to account for removed stop words
Revision Changes Path
1.4 +22 -10 jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java
Index: StopFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- StopFilter.java 9 Dec 2002 19:02:20 -0000 1.3
+++ StopFilter.java 28 Nov 2003 02:03:14 -0000 1.4
@@ -57,29 +57,33 @@
import java.io.IOException;
import java.util.Hashtable;
-/** Removes stop words from a token stream. */
-
+/**
+ * Removes stop words from a token stream. Position increments
+ * on tokens emitted are adjusted to account for words
+ * removed. Exact phrase queries will not match across holes left
+ * by stop word removal, but sloppy phrase queries may match.
+ */
public final class StopFilter extends TokenFilter {
private Hashtable table;
/** Constructs a filter which removes words from the input
- TokenStream that are named in the array of words. */
+ TokenStream that are named in the array of words. */
public StopFilter(TokenStream in, String[] stopWords) {
super(in);
table = makeStopTable(stopWords);
}
/** Constructs a filter which removes words from the input
- TokenStream that are named in the Hashtable. */
+ TokenStream that are named in the Hashtable. */
public StopFilter(TokenStream in, Hashtable stopTable) {
super(in);
table = stopTable;
}
-
+
/** Builds a Hashtable from an array of stop words, appropriate for passing
- into the StopFilter constructor. This permits this table construction to
- be cached once when an Analyzer is constructed. */
+ into the StopFilter constructor. This permits this table construction to
+ be cached once when an Analyzer is constructed. */
public static final Hashtable makeStopTable(String[] stopWords) {
Hashtable stopTable = new Hashtable(stopWords.length);
for (int i = 0; i < stopWords.length; i++)
@@ -89,10 +93,18 @@
/** Returns the next input Token whose termText() is not a stop word. */
public final Token next() throws IOException {
+ int position = 1;
+
// return the first non-stop word found
- for (Token token = input.next(); token != null; token = input.next())
- if (table.get(token.termText) == null)
- return token;
+ for (Token token = input.next(); token != null; token = input.next()) {
+ if (table.get(token.termText) == null) {
+ token.setPositionIncrement(position);
+ position = 1;
+ return token;
+ }
+
+ position++;
+ }
// reached EOS -- return null
return null;
}
1.1 jakarta-lucene/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
Index: TestStopAnalyzer.java
===================================================================
package org.apache.lucene.analysis;
import junit.framework.TestCase;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Hits;
/**
 * Tests that {@link StopAnalyzer} records removed stop words as position
 * increments greater than one on the tokens it emits, and that phrase
 * queries against an index built with it respect the resulting holes.
 */
public class TestStopAnalyzer extends TestCase {

  private final StopAnalyzer stopAnalyzer = new StopAnalyzer();

  /**
   * Runs <code>text</code> through <code>analyzer</code> and collects every
   * token the resulting stream returns.
   *
   * @param analyzer analyzer used to create the token stream
   * @param text text to tokenize
   * @return the tokens, in the order the stream returned them
   * @throws Exception if reading from or closing the stream fails
   */
  public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
      throws Exception {
    TokenStream stream =
        analyzer.tokenStream("contents", new StringReader(text));
    ArrayList tokenList = new ArrayList();
    try {
      for (Token token = stream.next(); token != null; token = stream.next()) {
        tokenList.add(token);
      }
    } finally {
      // release the stream even when next() throws
      stream.close();
    }
    return (Token[]) tokenList.toArray(new Token[0]);
  }

  /** Text without stop words must yield tokens in successive positions. */
  public void testNoHoles() throws Exception {
    Token[] tokens = tokensFromAnalyzer(stopAnalyzer, "non-stop words");

    assertEquals(3, tokens.length);

    // make sure the increments below are checked against the intended terms
    assertEquals("non", tokens[0].termText());
    assertEquals("stop", tokens[1].termText());
    assertEquals("words", tokens[2].termText());

    // ensure all words are in successive positions
    assertEquals("non", 1, tokens[0].getPositionIncrement());
    assertEquals("stop", 1, tokens[1].getPositionIncrement());
    assertEquals("words", 1, tokens[2].getPositionIncrement());
  }

  /**
   * A token that follows a removed stop word must carry a position
   * increment of two; a token that directly follows another emitted token
   * keeps an increment of one.
   */
  public void testHoles() throws Exception {
    Token[] tokens =
        tokensFromAnalyzer(stopAnalyzer, "the stop words are here");

    assertEquals(3, tokens.length);

    // make sure the increments below are checked against the intended terms
    assertEquals("stop", tokens[0].termText());
    assertEquals("words", tokens[1].termText());
    assertEquals("here", tokens[2].termText());

    // check for the holes noted by position gaps
    assertEquals("stop", 2, tokens[0].getPositionIncrement());
    assertEquals("words", 1, tokens[1].getPositionIncrement());
    assertEquals("here", 2, tokens[2].getPositionIncrement());
  }

  /**
   * Indexes one document and checks three phrase queries: an exact phrase
   * over adjacent terms matches, an exact phrase spanning a stop word hole
   * does not, and the same phrase with a slop of one does.
   */
  public void testPhraseQuery() throws Exception {
    RAMDirectory directory = new RAMDirectory();

    IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
    try {
      Document doc = new Document();
      doc.add(Field.Text("field", "the stop words are here"));
      writer.addDocument(doc);
    } finally {
      // close the writer even if adding the document fails
      writer.close();
    }

    IndexSearcher searcher = new IndexSearcher(directory);
    try {
      // valid exact phrase query
      PhraseQuery query = new PhraseQuery();
      query.add(new Term("field", "stop"));
      query.add(new Term("field", "words"));
      Hits hits = searcher.search(query);
      assertEquals(1, hits.length());

      // incorrect attempt at exact phrase query over stop word hole
      query = new PhraseQuery();
      query.add(new Term("field", "words"));
      query.add(new Term("field", "here"));
      hits = searcher.search(query);
      assertEquals(0, hits.length());

      // add some slop, and match over the hole
      query.setSlop(1);
      hits = searcher.search(query);
      assertEquals(1, hits.length());
    } finally {
      // a failed assertion above must not leave the searcher open
      searcher.close();
    }
  }
}
1.60 +7 -1 jakarta-lucene/CHANGES.txt
Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.59
retrieving revision 1.60
diff -u -r1.59 -r1.60
--- CHANGES.txt 26 Nov 2003 11:10:54 -0000 1.59
+++ CHANGES.txt 28 Nov 2003 02:03:14 -0000 1.60
@@ -7,6 +7,12 @@
1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
throw ParseException instead. (Erik Hatcher)
+ 2. Modified StopFilter to increment positions to account for
+ stop words removed. This prevents exact phrase queries from
+ matching erroneously (use slop factor to account for missing
+ stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer
+ and some others. (Erik Hatcher)
+
1.3 RC3
1. Added minMergeDocs in IndexWriter. This can be raised to speed
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org
|