Return-Path: Delivered-To: apmail-lucene-java-commits-archive@www.apache.org Received: (qmail 8094 invoked from network); 12 Nov 2005 09:04:02 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur.apache.org with SMTP; 12 Nov 2005 09:04:02 -0000 Received: (qmail 2185 invoked by uid 500); 12 Nov 2005 09:04:01 -0000 Delivered-To: apmail-lucene-java-commits-archive@lucene.apache.org Received: (qmail 2132 invoked by uid 500); 12 Nov 2005 09:04:01 -0000 Mailing-List: contact java-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: java-dev@lucene.apache.org Delivered-To: mailing list java-commits@lucene.apache.org Received: (qmail 2121 invoked by uid 99); 12 Nov 2005 09:04:01 -0000 Received: from asf.osuosl.org (HELO asf.osuosl.org) (140.211.166.49) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 12 Nov 2005 01:04:01 -0800 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received: from [209.237.227.194] (HELO minotaur.apache.org) (209.237.227.194) by apache.org (qpsmtpd/0.29) with SMTP; Sat, 12 Nov 2005 01:03:52 -0800 Received: (qmail 7957 invoked by uid 65534); 12 Nov 2005 09:03:39 -0000 Message-ID: <20051112090339.7956.qmail@minotaur.apache.org> Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r332747 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/search/regex/ src/test/org/apache/lucene/search/regex/ Date: Sat, 12 Nov 2005 09:03:36 -0000 To: java-commits@lucene.apache.org From: ehatcher@apache.org X-Mailer: svnmailer-1.0.5 X-Virus-Checked: Checked by ClamAV on apache.org X-Spam-Rating: minotaur.apache.org 1.6.2 0/1000/N Author: ehatcher Date: Sat Nov 12 01:03:26 2005 New Revision: 332747 URL: http://svn.apache.org/viewcvs?rev=332747&view=rev Log: Add RegexQuery and SpanRegexQuery Added: lucene/java/trunk/src/java/org/apache/lucene/search/regex/ lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexQuery.java lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexTermEnum.java lucene/java/trunk/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java lucene/java/trunk/src/test/org/apache/lucene/search/regex/ lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestRegexQuery.java lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java Modified: lucene/java/trunk/CHANGES.txt Modified: lucene/java/trunk/CHANGES.txt URL: http://svn.apache.org/viewcvs/lucene/java/trunk/CHANGES.txt?rev=332747&r1=332746&r2=332747&view=diff ============================================================================== --- lucene/java/trunk/CHANGES.txt (original) +++ lucene/java/trunk/CHANGES.txt Sat Nov 12 01:03:26 2005 @@ -163,6 +163,11 @@ highlighting entire documents or fields. (Erik Hatcher) +23. Added regular expression queries, RegexQuery and SpanRegexQuery. + Note the same term enumeration caveats apply with these queries as + apply to WildcardQuery and other term expanding queries. + (Erik Hatcher) + API Changes 1. Several methods and fields have been deprecated. The API documentation Added: lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexQuery.java URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexQuery.java?rev=332747&view=auto ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexQuery.java (added) +++ lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexQuery.java Sat Nov 12 01:03:26 2005 @@ -0,0 +1,26 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; + +public class RegexQuery extends MultiTermQuery { + public RegexQuery(Term term) { + super(term); + } + + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + Term term = new Term(getTerm().field(), getTerm().text()); + return new RegexTermEnum(reader, term); + } + + public boolean equals(Object o) { + if (o instanceof RegexQuery) + return super.equals(o); + + return false; + } +} Added: lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexTermEnum.java URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexTermEnum.java?rev=332747&view=auto ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexTermEnum.java (added) +++ lucene/java/trunk/src/java/org/apache/lucene/search/regex/RegexTermEnum.java Sat Nov 12 01:03:26 2005 @@ -0,0 +1,65 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import java.util.regex.Pattern; +import java.io.IOException; + +public class RegexTermEnum extends FilteredTermEnum { + private String field = ""; + private String pre = ""; + boolean endEnum = false; + private Pattern pattern; + + public RegexTermEnum(IndexReader reader, Term term) throws IOException { + super(); + field = term.field(); + String text = term.text(); + + pattern = Pattern.compile(text); + + // Find the first regex character position, to find the + // maximum prefix to use for term enumeration + int index = 0; + while (index < text.length()) { + char c = text.charAt(index); + + // TODO: improve the logic here. There are other types of patterns + // that could break this, such as "\d*" and "\*abc" + if (c == '*' || c == '[' || c == '?' || c == '.') break; + + index++; + } + + pre = text.substring(0, index); + + setEnum(reader.terms(new Term(term.field(), pre))); + } + + protected final boolean termCompare(Term term) { + if (field == term.field()) { + String searchText = term.text(); + if (searchText.startsWith(pre)) { + return pattern.matcher(searchText).matches(); + } + } + endEnum = true; + return false; + } + + public final float difference() { +// TODO: adjust difference based on distance of searchTerm.text() and term().text() + return 1.0f; + } + + public final boolean endEnum() { + return endEnum; + } + + public void close() throws IOException { + super.close(); + field = null; + } +} Added: lucene/java/trunk/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java?rev=332747&view=auto ============================================================================== --- lucene/java/trunk/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java (added) +++ lucene/java/trunk/src/java/org/apache/lucene/search/regex/SpanRegexQuery.java Sat Nov 12 01:03:26 2005 @@ -0,0 +1,85 @@ +package org.apache.lucene.search.regex; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.util.ToStringUtils; + +import java.io.IOException; +import java.util.Collection; +import java.util.ArrayList; + +public class SpanRegexQuery extends SpanQuery { + private Term term; + + public SpanRegexQuery(Term term) { + this.term = term; + } + + public Query rewrite(IndexReader reader) throws IOException { + Query orig = new RegexQuery(term).rewrite(reader); + + // RegexQuery (via MultiTermQuery).rewrite always returns a BooleanQuery + BooleanQuery bq = (BooleanQuery) orig; + + BooleanClause[] clauses = bq.getClauses(); + SpanQuery[] sqs = new SpanQuery[clauses.length]; + for (int i = 0; i < clauses.length; i++) { + BooleanClause clause = clauses[i]; + + // Clauses from RegexQuery.rewrite are always TermQuery's + TermQuery tq = (TermQuery) clause.getQuery(); + + sqs[i] = new SpanTermQuery(tq.getTerm()); + sqs[i].setBoost(tq.getBoost()); + } + + SpanOrQuery query = new SpanOrQuery(sqs); + query.setBoost(orig.getBoost()); + + return query; + } + + public Spans getSpans(IndexReader reader) throws IOException { + throw new UnsupportedOperationException("Query should have been rewritten"); + } + + public String getField() { + return term.field(); + } + + public Collection getTerms() { + Collection terms = new ArrayList(); + terms.add(term); + return terms; + } + + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final SpanRegexQuery that = (SpanRegexQuery) o; + + return term.equals(that.term) && getBoost() == that.getBoost(); + } + + public int hashCode() { + return term.hashCode(); + } + + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("spanRegexQuery("); + buffer.append(term); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } +} Added: lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestRegexQuery.java URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestRegexQuery.java?rev=332747&view=auto ============================================================================== --- lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestRegexQuery.java (added) +++ lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestRegexQuery.java Sat Nov 12 01:03:26 2005 @@ -0,0 +1,30 @@ +package org.apache.lucene.search.regex; + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Query; + +public class TestRegexQuery extends TestCase { + public void testRegex() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true); + Document doc = new Document(); + doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(directory); + Query query = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*")); + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + } +} + Added: lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java URL: http://svn.apache.org/viewcvs/lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java?rev=332747&view=auto ============================================================================== --- lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java (added) +++ lucene/java/trunk/src/test/org/apache/lucene/search/regex/TestSpanRegexQuery.java Sat Nov 12 01:03:26 2005 @@ -0,0 +1,33 @@ +package org.apache.lucene.search.regex; + +import junit.framework.TestCase; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; + +public class TestSpanRegexQuery extends TestCase { + public void testSpanRegex() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true); + Document doc = new Document(); + doc.add(new Field("field", "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + + IndexSearcher searcher = new IndexSearcher(directory); + SpanRegexQuery srq = new SpanRegexQuery(new Term("field", "q.[aeiou]c.*")); + SpanTermQuery stq = new SpanTermQuery(new Term("field","dog")); + SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {srq, stq}, 6, true); + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + } +}