Return-Path: Delivered-To: apmail-incubator-lucene-net-commits-archive@locus.apache.org Received: (qmail 28027 invoked from network); 4 Jun 2006 02:43:37 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (209.237.227.199) by minotaur.apache.org with SMTP; 4 Jun 2006 02:43:37 -0000 Received: (qmail 88898 invoked by uid 500); 4 Jun 2006 02:43:37 -0000 Delivered-To: apmail-incubator-lucene-net-commits-archive@incubator.apache.org Received: (qmail 88848 invoked by uid 500); 4 Jun 2006 02:43:36 -0000 Mailing-List: contact lucene-net-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@incubator.apache.org Delivered-To: mailing list lucene-net-commits@incubator.apache.org Received: (qmail 88812 invoked by uid 99); 4 Jun 2006 02:43:36 -0000 Received: from asf.osuosl.org (HELO asf.osuosl.org) (140.211.166.49) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 03 Jun 2006 19:43:36 -0700 X-ASF-Spam-Status: No, hits=-9.4 required=10.0 tests=ALL_TRUSTED,NO_REAL_NAME X-Spam-Check-By: apache.org Received-SPF: pass (asf.osuosl.org: local policy) Received: from [140.211.166.113] (HELO eris.apache.org) (140.211.166.113) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 03 Jun 2006 19:43:32 -0700 Received: by eris.apache.org (Postfix, from userid 65534) id 0DE8C1A9859; Sat, 3 Jun 2006 19:42:26 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r411501 [19/30] - in /incubator/lucene.net/trunk/C#/src: ./ Demo/DeleteFiles/ Demo/DemoLib/ Demo/DemoLib/HTML/ Demo/IndexFiles/ Demo/IndexHtml/ Demo/SearchFiles/ Lucene.Net/ Lucene.Net/Analysis/ Lucene.Net/Analysis/Standard/ Lucene.Net/Docu... 
Date: Sun, 04 Jun 2006 02:41:25 -0000 To: lucene-net-commits@incubator.apache.org From: aroush@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20060604024226.0DE8C1A9859@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org X-Spam-Rating: minotaur.apache.org 1.6.2 0/1000/N Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/FuzzyTermEnum.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/FuzzyTermEnum.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/FuzzyTermEnum.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/FuzzyTermEnum.cs Sat Jun 3 19:41:13 2006 @@ -13,123 +13,147 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using IndexReader = Lucene.Net.Index.IndexReader; using Term = Lucene.Net.Index.Term; + namespace Lucene.Net.Search { - /// Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term. + /// Subclass of FilteredTermEnum for enumerating all terms that are similiar + /// to the specified filter term. + /// ///

Term enumerations are always ordered by Term.compareTo(). Each term in - /// the enumeration is greater than all that precede it. + /// the enumeration is greater than all that precede it. ///

- public sealed class FuzzyTermEnum:FilteredTermEnum + public sealed class FuzzyTermEnum : FilteredTermEnum { - private void InitBlock() + + /* This should be somewhere around the average long word. + * If it is longer, we waste time and space. If it is shorter, we waste a + * little bit of time growing the array as we encounter longer words. + */ + private const int TYPICAL_LONGEST_WORD_IN_INDEX = 19; + + /* Allows us save time required to create a new array + * everytime similarity is called. + */ + private int[][] d; + + private float similarity; + private bool endEnum = false; + + private Term searchTerm = null; + private System.String field; + private System.String text; + private System.String prefix; + + private float minimumSimilarity; + private float scale_factor; + private int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; + + /// Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f. + ///

+ /// After calling the constructor the enumeration is already pointing to the first + /// valid term if such a term exists. + /// + ///

+ /// + /// + /// + /// + /// IOException + /// + /// + public FuzzyTermEnum(IndexReader reader, Term term) : this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength) + { + } + + /// Creates a FuzzyTermEnum with an empty prefix. + ///

+ /// After calling the constructor the enumeration is already pointing to the first + /// valid term if such a term exists. + /// + ///

+ /// + /// + /// + /// + /// + /// + /// IOException + /// + /// + public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) : this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength) + { + } + + /// Constructor for enumeration of all terms from specified reader which share a prefix of + /// length prefixLength with term and which have a fuzzy similarity > + /// minSimilarity. + ///

+ /// After calling the constructor the enumeration is already pointing to the first + /// valid term if such a term exists. + /// + ///

+ /// Delivers terms. + /// + /// Pattern term. + /// + /// Minimum required similarity for terms from the reader. Default value is 0.5f. + /// + /// Length of required common prefix. Default value is 0. + /// + /// IOException + public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) : base() + { + + if (minSimilarity >= 1.0f) + throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1"); + else if (minSimilarity < 0.0f) + throw new System.ArgumentException("minimumSimilarity cannot be less than 0"); + if (prefixLength < 0) + throw new System.ArgumentException("prefixLength cannot be less than 0"); + + this.minimumSimilarity = minSimilarity; + this.scale_factor = 1.0f / (1.0f - minimumSimilarity); + this.searchTerm = term; + this.field = searchTerm.Field(); + + //The prefix could be longer than the word. + //It's kind of silly though. It means we must match the entire word. + int fullSearchTermLength = searchTerm.Text().Length; + int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength; + + this.text = searchTerm.Text().Substring(realPrefixLength); + this.prefix = searchTerm.Text().Substring(0, (realPrefixLength) - (0)); + + InitializeMaxDistances(); + this.d = InitDistanceArray(); + + SetEnum(reader.Terms(new Term(searchTerm.Field(), prefix))); + } + + /// The termCompare method in FuzzyTermEnum uses Levenshtein distance to + /// calculate the distance between the given term and the comparing term. 
+ /// + protected internal override bool TermCompare(Term term) { - for (int i = 0; i < 1; i++) + if (field == term.Field() && term.Text().StartsWith(prefix)) { - e[i] = new int[1]; + System.String target = term.Text().Substring(prefix.Length); + this.similarity = Similarity(target); + return (similarity > minimumSimilarity); } + endEnum = true; + return false; } - internal double distance; - internal bool endEnum = false; - internal Term searchTerm = null; - internal System.String field = ""; - internal System.String text = ""; - internal int textlen; - internal System.String prefix = ""; - internal int prefixLength = 0; - internal float minimumSimilarity; - internal double scale_factor; - - - /// Empty prefix and minSimilarity of 0.5f are used. - /// - /// - /// reader - /// - /// term - /// - /// IOException - /// - /// - public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, 0) - { - } - - /// This is the standard FuzzyTermEnum with an empty prefix. - /// - /// - /// reader - /// - /// term - /// - /// minSimilarity - /// - /// IOException - /// - /// - public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, 0) - { - } - - /// Constructor for enumeration of all terms from specified reader which share a prefix of - /// length prefixLength with term and which have a fuzzy similarity > - /// minSimilarity. - /// - /// - /// Delivers terms. - /// - /// Pattern term. - /// - /// Minimum required similarity for terms from the reader. Default value is 0.5f. - /// - /// Length of required common prefix. Default value is 0. 
- /// - /// IOException - public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base() - { - InitBlock(); - minimumSimilarity = minSimilarity; - scale_factor = 1.0f / (1.0f - minimumSimilarity); - searchTerm = term; - field = searchTerm.Field(); - text = searchTerm.Text(); - textlen = text.Length; - if (prefixLength > 0 && prefixLength < textlen) - { - this.prefixLength = prefixLength; - prefix = text.Substring(0, (prefixLength) - (0)); - text = text.Substring(prefixLength); - textlen = text.Length; - } - SetEnum(reader.Terms(new Term(searchTerm.Field(), prefix))); - } - - /// The termCompare method in FuzzyTermEnum uses Levenshtein distance to - /// calculate the distance between the given term and the comparing term. - /// - protected internal override bool TermCompare(Term term) - { - System.String termText = term.Text(); - if (field == term.Field() && termText.StartsWith(prefix)) - { - System.String target = termText.Substring(prefixLength); - int targetlen = target.Length; - int dist = EditDistance(text, target, textlen, targetlen); - distance = 1 - ((double) dist / (double) System.Math.Min(textlen, targetlen)); - return (distance > minimumSimilarity); - } - endEnum = true; - return false; - } - - public override float Difference() - { - return (float) ((distance - minimumSimilarity) * scale_factor); - } + public override float Difference() + { + return (float) ((similarity - minimumSimilarity) * scale_factor); + } public override bool EndEnum() { @@ -141,76 +165,184 @@ /// **************************** ///
- /// Finds and returns the smallest of three integers - private static int Min(int a, int b, int c) + /// Finds and returns the smallest of three integers + private static int min(int a, int b, int c) { int t = (a < b) ? a : b; return (t < c) ? t : c; } - /// This static array saves us from the time required to create a new array - /// everytime editDistance is called. - /// - private int[][] e = new int[1][]; + private int[][] InitDistanceArray() + { + int[][] tmpArray = new int[this.text.Length + 1][]; + for (int i = 0; i < this.text.Length + 1; i++) + { + tmpArray[i] = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; + } + return tmpArray; + } - /// Levenshtein distance also known as edit distance is a measure of similiarity - /// between two strings where the distance is measured as the number of character - /// deletions, insertions or substitutions required to transform one string to - /// the other string. - ///

This method takes in four parameters; two strings and their respective - /// lengths to compute the Levenshtein distance between the two strings. - /// The result is returned as an integer. + ///

Similarity returns a number that is 1.0f or less (including negative numbers) + /// based on how similar the Term is compared to a target term. It returns + /// exactly 0.0f when + ///

+		/// editDistance > maximumEditDistance
+ /// Otherwise it returns: + ///
+		/// 1 - (editDistance / length)
+ /// where length is the length of the shortest term (text or target) including a + /// prefix that is identical and editDistance is the Levenshtein distance for + /// the two words.

+ /// + ///

Embedded within this algorithm is a fail-fast Levenshtein distance + /// algorithm. The fail-fast algorithm differs from the standard Levenshtein + /// distance algorithm in that it is aborted if it is discovered that the + /// minimum distance between the words is greater than some threshold. + /// + ///

To calculate the maximum distance threshold we use the following formula: + ///

+		/// (1 - minimumSimilarity) * length
+ /// where length is the shortest term including any prefix that is not part of the + /// similarity comparison. This formula was derived by solving for what maximum value + /// of distance returns false for the following statements: + ///
+		/// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+		/// return (similarity > minimumSimilarity);
+ /// where distance is the Levenshtein distance for the two words. + ///

+ ///

Levenshtein distance (also known as edit distance) is a measure of similarity + /// between two strings where the distance is measured as the number of character + /// deletions, insertions or substitutions required to transform one string to + /// the other string. ///

- private int EditDistance(System.String s, System.String t, int n, int m) + /// the target word or phrase + /// + /// the similarity, 0.0 or less indicates that it matches less than the required + /// threshold and 1.0 indicates that the text and target are identical + /// + private float Similarity(System.String target) { - if (e.Length <= n || e[0].Length <= m) + lock (this) { - int[][] tmpArray = new int[System.Math.Max(e.Length, n + 1)][]; - for (int i = 0; i < System.Math.Max(e.Length, n + 1); i++) + int m = target.Length; + int n = text.Length; + if (n == 0) { - tmpArray[i] = new int[System.Math.Max(e[0].Length, m + 1)]; + //we don't have anything to compare. That means if we just add + //the letters for m we get the new word + return prefix.Length == 0 ? 0.0f : 1.0f - ((float) m / prefix.Length); } - e = tmpArray; - } - int[][] d = e; // matrix - int i2; // iterates through s - int j; // iterates through t - char s_i; // ith character of s - - if (n == 0) - return m; - if (m == 0) - return n; - - // init matrix d - for (i2 = 0; i2 <= n; i2++) - d[i2][0] = i2; - for (j = 0; j <= m; j++) - d[0][j] = j; - - // start computing edit distance - for (i2 = 1; i2 <= n; i2++) - { - s_i = s[i2 - 1]; - for (j = 1; j <= m; j++) + if (m == 0) + { + return prefix.Length == 0 ? 0.0f : 1.0f - ((float) n / prefix.Length); + } + + int maxDistance = GetMaxDistance(m); + + if (maxDistance < System.Math.Abs(m - n)) + { + //just adding the characters of m to n or vice-versa results in + //too many edits + //for example "pre" length is 3 and "prefixes" length is 8. We can see that + //given this optimal circumstance, the edit distance cannot be less than 5. + //which is 8-3 or more precisesly Math.abs(3-8). + //if our maximum edit distance is 4, then we can discard this word + //without looking at it. + return 0.0f; + } + + //let's make sure we have enough room in our array to do the distance calculations. 
+ if (d[0].Length <= m) + { + GrowDistanceArray(m); + } + + // init matrix d + for (int i = 0; i <= n; i++) + d[i][0] = i; + for (int j = 0; j <= m; j++) + d[0][j] = j; + + // start computing edit distance + for (int i = 1; i <= n; i++) { - if (s_i != t[j - 1]) - d[i2][j] = Min(d[i2 - 1][j], d[i2][j - 1], d[i2 - 1][j - 1]) + 1; - else - d[i2][j] = Min(d[i2 - 1][j] + 1, d[i2][j - 1] + 1, d[i2 - 1][j - 1]); + int bestPossibleEditDistance = m; + char s_i = text[i - 1]; + for (int j = 1; j <= m; j++) + { + if (s_i != target[j - 1]) + { + d[i][j] = min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1]) + 1; + } + else + { + d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]); + } + bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i][j]); + } + + //After calculating row i, the best possible edit distance + //can be found by found by finding the smallest value in a given column. + //If the bestPossibleEditDistance is greater than the max distance, abort. + + if (i > maxDistance && bestPossibleEditDistance > maxDistance) + { + //equal is okay, but not greater + //the closest the target can be to the text is just too far away. + //this target is leaving the party early. + return 0.0f; + } } + + // this will return less than 0.0 when the edit distance is + // greater than the number of characters in the shorter word. + // but this was the formula that was previously used in FuzzyTermEnum, + // so it has not been changed (even though minimumSimilarity must be + // greater than 0.0) + return 1.0f - ((float) d[n][m] / (float) (prefix.Length + System.Math.Min(n, m))); } - - // we got the result! - return d[n][m]; + } + + /// Grow the second dimension of the array, so that we can calculate the + /// Levenshtein difference. 
+ /// + private void GrowDistanceArray(int m) + { + for (int i = 0; i < d.Length; i++) + { + d[i] = new int[m + 1]; + } + } + + /// The max Distance is the maximum Levenshtein distance for the text + /// compared to some other value that results in score that is + /// better than the minimum similarity. + /// + /// the length of the "other value" + /// + /// the maximum levenshtein distance that we care about + /// + private int GetMaxDistance(int m) + { + return (m < maxDistances.Length)?maxDistances[m]:CalculateMaxDistance(m); + } + + private void InitializeMaxDistances() + { + for (int i = 0; i < maxDistances.Length; i++) + { + maxDistances[i] = CalculateMaxDistance(i); + } + } + + private int CalculateMaxDistance(int m) + { + return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length)); } public override void Close() { - base.Close(); - searchTerm = null; - field = null; - text = null; + base.Close(); //call super.close() and let the garbage collector do its work. } } } Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hit.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/Hit.cs?rev=411501&view=auto ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hit.cs (added) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hit.cs Sat Jun 3 19:41:13 2006 @@ -0,0 +1,135 @@ +/* + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using Document = Lucene.Net.Documents.Document; + +namespace Lucene.Net.Search +{ + + /// Wrapper used by {@link HitIterator} to provide a lazily loaded hit + /// from {@link Hits}. + /// + /// + /// Jeremy Rayner + /// + [Serializable] + public class Hit + { + + private Document doc = null; + + private bool resolved = false; + + private Hits hits = null; + private int hitNumber; + + /// Constructed from {@link HitIterator} + /// Hits returned from a search + /// + /// Hit index in Hits + /// + internal Hit(Hits hits, int hitNumber) + { + this.hits = hits; + this.hitNumber = hitNumber; + } + + /// Returns document for this hit. + /// + /// + /// + /// + public virtual Document GetDocument() + { + if (!resolved) + FetchTheHit(); + return doc; + } + + /// Returns score for this hit. + /// + /// + /// + /// + public virtual float GetScore() + { + return hits.Score(hitNumber); + } + + /// Returns id for this hit. + /// + /// + /// + /// + public virtual int GetId() + { + return hits.Id(hitNumber); + } + + private void FetchTheHit() + { + doc = hits.Doc(hitNumber); + resolved = true; + } + + // provide some of the Document style interface (the simple stuff) + + /// Returns the boost factor for this hit on any field of the underlying document. + /// + /// + /// + /// + public virtual float GetBoost() + { + return GetDocument().GetBoost(); + } + + /// Returns the string value of the field with the given name if any exist in + /// this document, or null. If multiple fields exist with this name, this + /// method returns the first value added. If only binary fields with this name + /// exist, returns null. + /// + /// + /// + /// + public virtual System.String Get(System.String name) + { + return GetDocument().Get(name); + } + + /// Prints the parameters to be used to discover the promised result. 
+ public override System.String ToString() + { + System.Text.StringBuilder buffer = new System.Text.StringBuilder(); + buffer.Append("Hit<"); + buffer.Append(hits.ToString()); + buffer.Append(" ["); + buffer.Append(hitNumber); + buffer.Append("] "); + if (resolved) + { + buffer.Append("resolved"); + } + else + { + buffer.Append("unresolved"); + } + buffer.Append(">"); + return buffer.ToString(); + } + } +} \ No newline at end of file Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitCollector.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/HitCollector.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitCollector.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitCollector.cs Sat Jun 3 19:41:13 2006 @@ -13,13 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; + namespace Lucene.Net.Search { - /// Lower-level search API. - /// + + /// Lower-level search API. + ///
HitCollectors are primarily meant to be used to implement queries, + /// sorting and filtering. + ///
+ /// /// - /// $Id: HitCollector.java,v 1.6 2004/03/29 22:48:03 cutting Exp $ + /// $Id: HitCollector.java 155607 2005-02-27 01:29:53Z otis $ /// public abstract class HitCollector { @@ -40,7 +46,7 @@ ///

Note: This is called in an inner search loop. For good search /// performance, implementations of this method should not call /// {@link Searcher#Doc(int)} or - /// {@link Lucene.Net.Index.IndexReader#Document(int)} on every + /// {@link Lucene.Net.index.IndexReader#Document(int)} on every /// document number encountered. Doing so can slow searches by an order /// of magnitude or more. ///

Note: The score passed to this method is a raw score. Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitIterator.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/HitIterator.cs?rev=411501&view=auto ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitIterator.cs (added) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitIterator.cs Sat Jun 3 19:41:13 2006 @@ -0,0 +1,84 @@ +/* + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; + +namespace Lucene.Net.Search +{ + + ///

An iterator over {@link Hits} that provides lazy fetching of each document. + /// {@link Hits#Iterator()} returns an instance of this class. Calls to {@link #next()} + /// return a {@link Hit} instance. + /// + /// + /// Jeremy Rayner + /// + public class HitIterator : System.Collections.IEnumerator + { + /// Returns a {@link Hit} instance representing the next hit in {@link Hits}. + /// + /// + /// Next {@link Hit}. + /// + public virtual System.Object Current + { + get + { + if (hitNumber == hits.Length()) + throw new System.ArgumentOutOfRangeException(); + + System.Object next = new Hit(hits, hitNumber); + hitNumber++; + return next; + } + + } + private Hits hits; + private int hitNumber = 0; + + /// Constructed from {@link Hits#Iterator()}. + internal HitIterator(Hits hits) + { + this.hits = hits; + } + + /// true if current hit is less than the total number of {@link Hits}. + /// + public virtual bool MoveNext() + { + return hitNumber < hits.Length(); + } + + /// Unsupported operation. + /// + /// + /// UnsupportedOperationException + public virtual void Remove() + { + throw new System.NotSupportedException(); + } + + /// Returns the total number of hits. + public virtual int Length() + { + return hits.Length(); + } + //UPGRADE_TODO: The following method was automatically generated and it must be implemented in order to preserve the class logic. 
"ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1232'" + virtual public void Reset() + { + } + } +} \ No newline at end of file Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitQueue.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/HitQueue.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitQueue.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/HitQueue.cs Sat Jun 3 19:41:13 2006 @@ -13,8 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using PriorityQueue = Lucene.Net.Util.PriorityQueue; + namespace Lucene.Net.Search { Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hits.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/Hits.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hits.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/Hits.cs Sat Jun 3 19:41:13 2006 @@ -13,15 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using Document = Lucene.Net.Documents.Document; + namespace Lucene.Net.Search { /// A ranked list of documents, used to hold search results. 
public sealed class Hits { - private Query query; + private Weight weight; private Searcher searcher; private Filter filter = null; private Sort sort = null; @@ -36,7 +38,7 @@ internal Hits(Searcher s, Query q, Filter f) { - query = q; + weight = q.Weight(s); searcher = s; filter = f; GetMoreDocs(50); // retrieve 100 initially @@ -44,7 +46,7 @@ internal Hits(Searcher s, Query q, Filter f, Sort o) { - query = q; + weight = q.Weight(s); searcher = s; filter = f; sort = o; @@ -62,14 +64,15 @@ } int n = min * 2; // double # retrieved - TopDocs topDocs = (sort == null) ? searcher.Search(query, filter, n) : searcher.Search(query, filter, n, sort); + TopDocs topDocs = (sort == null) ? searcher.Search(weight, filter, n) : searcher.Search(weight, filter, n, sort); length = topDocs.totalHits; ScoreDoc[] scoreDocs = topDocs.scoreDocs; float scoreNorm = 1.0f; - if (length > 0 && scoreDocs[0].score > 1.0f) + + if (length > 0 && topDocs.GetMaxScore() > 1.0f) { - scoreNorm = 1.0f / scoreDocs[0].score; + scoreNorm = 1.0f / topDocs.GetMaxScore(); } int end = scoreDocs.Length < length?scoreDocs.Length:length; @@ -124,6 +127,18 @@ return HitDoc(n).id; } + /// Returns a {@link HitIterator} to navigate the Hits. Each item returned + /// from {@link Iterator#next()} is a {@link Hit}. + ///

+ /// Caution: Iterate only over the hits needed. Iterating over all + /// hits is generally not desirable and may be the source of + /// performance issues. + ///

+ ///
+ public System.Collections.IEnumerator Iterator() + { + return new HitIterator(this); + } private HitDoc HitDoc(int n) { Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/IndexSearcher.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/IndexSearcher.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/IndexSearcher.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/IndexSearcher.cs Sat Jun 3 19:41:13 2006 @@ -13,18 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using Document = Lucene.Net.Documents.Document; using IndexReader = Lucene.Net.Index.IndexReader; using Term = Lucene.Net.Index.Term; using Directory = Lucene.Net.Store.Directory; + namespace Lucene.Net.Search { /// Implements search over a single IndexReader. /// ///

Applications usually need only call the inherited {@link #Search(Query)} - /// or {@link #Search(Query,Filter)} methods. + /// or {@link #Search(Query,Filter)} methods. For performance reasons it is + /// recommended to open only one IndexSearcher and use it for all of your searches. + /// + ///

Note that you can only access Hits from an IndexSearcher as long as it is + /// not yet closed, otherwise an IOException will be thrown. ///

public class IndexSearcher : Searcher { @@ -39,13 +45,13 @@ this.bits = bits; this.totalHits = totalHits; this.hq = hq; - this.nDocs = nDocs; - this.enclosingInstance = enclosingInstance; + this.nDocs = nDocs; + this.enclosingInstance = enclosingInstance; } private System.Collections.BitArray bits; private int[] totalHits; private Lucene.Net.Search.HitQueue hq; - private int nDocs; + private int nDocs; private IndexSearcher enclosingInstance; public IndexSearcher Enclosing_Instance { @@ -55,19 +61,19 @@ } } - private float minScore = 0.0f; + private float minScore = 0.0f; public override void Collect(int doc, float score) { if (score > 0.0f && (bits == null || bits.Get(doc))) { // skip docs not in bits totalHits[0]++; - if (hq.Size() < nDocs || score >= minScore) - { - hq.Insert(new ScoreDoc(doc, score)); - minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore - } - } + if (hq.Size() < nDocs || score >= minScore) + { + hq.Insert(new ScoreDoc(doc, score)); + minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore + } + } } } private class AnonymousClassHitCollector1 : HitCollector @@ -137,9 +143,14 @@ } } } - public /*internal*/ IndexReader reader; + internal IndexReader reader; private bool closeReader; + public IndexReader Reader + { + get { return reader; } + } + /// Creates a searcher searching the index in the named directory. public IndexSearcher(System.String path) : this(IndexReader.Open(path), true) { @@ -161,12 +172,18 @@ this.closeReader = closeReader; } + /// Return the {@link IndexReader} this searches. + public virtual IndexReader GetIndexReader() + { + return reader; + } + /// Note that the underlying IndexReader is not closed, if /// IndexSearcher was constructed with IndexSearcher(IndexReader r). /// If the IndexReader was supplied implicitly by specifying a directory, then /// the IndexReader gets closed. 
/// - public override void Close() + public override void Close() { if (closeReader) reader.Close(); @@ -191,31 +208,38 @@ } // inherit javadoc - public override TopDocs Search(Query query, Filter filter, int nDocs) + public override TopDocs Search(Weight weight, Filter filter, int nDocs) { - Scorer scorer = query.Weight(this).Scorer(reader); + + if (nDocs <= 0) + // null might be returned from hq.top() below. + throw new System.ArgumentException("nDocs must be > 0"); + + Scorer scorer = weight.Scorer(reader); if (scorer == null) - return new TopDocs(0, new ScoreDoc[0]); + return new TopDocs(0, new ScoreDoc[0], System.Single.NegativeInfinity); - System.Collections.BitArray bits = filter != null ? filter.Bits(reader) : null; + System.Collections.BitArray bits = filter != null?filter.Bits(reader):null; HitQueue hq = new HitQueue(nDocs); int[] totalHits = new int[1]; scorer.Score(new AnonymousClassHitCollector(bits, totalHits, hq, nDocs, this)); ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()]; for (int i = hq.Size() - 1; i >= 0; i--) - // put docs in array + // put docs in array scoreDocs[i] = (ScoreDoc) hq.Pop(); - return new TopDocs(totalHits[0], scoreDocs); + float maxScore = (totalHits[0] == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score; + + return new TopDocs(totalHits[0], scoreDocs, maxScore); } // inherit javadoc - public override TopFieldDocs Search(Query query, Filter filter, int nDocs, Sort sort) + public override TopFieldDocs Search(Weight weight, Filter filter, int nDocs, Sort sort) { - Scorer scorer = query.Weight(this).Scorer(reader); + Scorer scorer = weight.Scorer(reader); if (scorer == null) - return new TopFieldDocs(0, new ScoreDoc[0], sort.fields); + return new TopFieldDocs(0, new ScoreDoc[0], sort.fields, System.Single.NegativeInfinity); System.Collections.BitArray bits = filter != null ? 
filter.Bits(reader) : null; FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, sort.fields, nDocs); @@ -227,12 +251,11 @@ // put docs in array scoreDocs[i] = hq.FillFields((FieldDoc) hq.Pop()); - return new TopFieldDocs(totalHits[0], scoreDocs, hq.GetFields()); + return new TopFieldDocs(totalHits[0], scoreDocs, hq.GetFields(), hq.GetMaxScore()); } - // inherit javadoc - public override void Search(Query query, Filter filter, HitCollector results) + public override void Search(Weight weight, Filter filter, HitCollector results) { HitCollector collector = results; if (filter != null) @@ -241,7 +264,7 @@ collector = new AnonymousClassHitCollector2(bits, results, this); } - Scorer scorer = query.Weight(this).Scorer(reader); + Scorer scorer = weight.Scorer(reader); if (scorer == null) return ; scorer.Score(collector); @@ -257,9 +280,9 @@ return query; } - public override Explanation Explain(Query query, int doc) + public override Explanation Explain(Weight weight, int doc) { - return query.Weight(this).Explain(reader, doc); + return weight.Explain(reader, doc); } } } Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MatchAllDocsQuery.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/MatchAllDocsQuery.cs?rev=411501&view=auto ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MatchAllDocsQuery.cs (added) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MatchAllDocsQuery.cs Sat Jun 3 19:41:13 2006 @@ -0,0 +1,203 @@ +/* + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using IndexReader = Lucene.Net.Index.IndexReader; +using ToStringUtils = Lucene.Net.Util.ToStringUtils; + +namespace Lucene.Net.Search +{ + + /// A query that matches all documents. + /// + /// + /// John Wang + /// + [Serializable] + public class MatchAllDocsQuery : Query + { + + public MatchAllDocsQuery() + { + } + + private class MatchAllScorer:Scorer + { + private void InitBlock(MatchAllDocsQuery enclosingInstance) + { + this.enclosingInstance = enclosingInstance; + } + private MatchAllDocsQuery enclosingInstance; + public MatchAllDocsQuery Enclosing_Instance + { + get + { + return enclosingInstance; + } + + } + + internal IndexReader reader; + internal int count; + internal int maxDoc; + + internal MatchAllScorer(MatchAllDocsQuery enclosingInstance, IndexReader reader, Similarity similarity) : base(similarity) + { + InitBlock(enclosingInstance); + this.reader = reader; + count = - 1; + maxDoc = reader.MaxDoc(); + } + + public override int Doc() + { + return count; + } + + public override Explanation Explain(int doc) + { + Explanation explanation = new Explanation(); + explanation.SetValue(1.0f); + explanation.SetDescription("MatchAllDocsQuery"); + return explanation; + } + + public override bool Next() + { + while (count < (maxDoc - 1)) + { + count++; + if (!reader.IsDeleted(count)) + { + return true; + } + } + return false; + } + + public override float Score() + { + return 1.0f; + } + + public override bool SkipTo(int target) + { + count = target - 1; + return Next(); + } + } + + [Serializable] + private class 
MatchAllDocsWeight : Weight + { + private void InitBlock(MatchAllDocsQuery enclosingInstance) + { + this.enclosingInstance = enclosingInstance; + } + private MatchAllDocsQuery enclosingInstance; + public MatchAllDocsQuery Enclosing_Instance + { + get + { + return enclosingInstance; + } + + } + private Searcher searcher; + + public MatchAllDocsWeight(MatchAllDocsQuery enclosingInstance, Searcher searcher) + { + InitBlock(enclosingInstance); + this.searcher = searcher; + } + + public override System.String ToString() + { + return "weight(" + Enclosing_Instance + ")"; + } + + public virtual Query GetQuery() + { + return Enclosing_Instance; + } + + public virtual float GetValue() + { + return 1.0f; + } + + public virtual float SumOfSquaredWeights() + { + return 1.0f; + } + + public virtual void Normalize(float queryNorm) + { + } + + public virtual Scorer Scorer(IndexReader reader) + { + return new MatchAllScorer(enclosingInstance, reader, Enclosing_Instance.GetSimilarity(searcher)); + } + + public virtual Explanation Explain(IndexReader reader, int doc) + { + // explain query weight + Explanation queryExpl = new Explanation(); + queryExpl.SetDescription("MatchAllDocsQuery:"); + + Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost"); + if (Enclosing_Instance.GetBoost() != 1.0f) + queryExpl.AddDetail(boostExpl); + queryExpl.SetValue(boostExpl.GetValue()); + + return queryExpl; + } + } + + protected internal override Weight CreateWeight(Searcher searcher) + { + return new MatchAllDocsWeight(this, searcher); + } + + public override System.String ToString(System.String field) + { + System.Text.StringBuilder buffer = new System.Text.StringBuilder(); + buffer.Append("MatchAllDocsQuery"); + buffer.Append(ToStringUtils.Boost(GetBoost())); + return buffer.ToString(); + } + + public override bool Equals(System.Object o) + { + if (!(o is MatchAllDocsQuery)) + return false; + MatchAllDocsQuery other = (MatchAllDocsQuery) o; + return this.GetBoost() == 
other.GetBoost(); + } + + public override int GetHashCode() + { + return BitConverter.ToInt32(BitConverter.GetBytes(GetBoost()), 0); + } + + // {{Aroush-1.9}} Do we need this?! + override public System.Object Clone() + { + return null; + } + } +} \ No newline at end of file Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiPhraseQuery.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/MultiPhraseQuery.cs?rev=411501&view=auto ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiPhraseQuery.cs (added) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiPhraseQuery.cs Sat Jun 3 19:41:13 2006 @@ -0,0 +1,342 @@ +/* + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using IndexReader = Lucene.Net.Index.IndexReader; +using MultipleTermPositions = Lucene.Net.Index.MultipleTermPositions; +using Term = Lucene.Net.Index.Term; +using TermPositions = Lucene.Net.Index.TermPositions; +using ToStringUtils = Lucene.Net.Util.ToStringUtils; + +namespace Lucene.Net.Search +{ + + /// MultiPhraseQuery is a generalized version of PhraseQuery, with an added + /// method {@link #Add(Term[])}. 
+ /// To use this class, to search for the phrase "Microsoft app*" first use + /// add(Term) on the term "Microsoft", then find all terms that have "app" as + /// prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[] + /// terms) to add them to the query. + /// + /// + /// Anders Nielsen + /// + /// 1.0 + /// + [Serializable] + public class MultiPhraseQuery : Query + { + private System.String field; + private System.Collections.ArrayList termArrays = new System.Collections.ArrayList(); + private System.Collections.ArrayList positions = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10)); + + private int slop = 0; + + /// Sets the phrase slop for this query. + /// + /// + public virtual void SetSlop(int s) + { + slop = s; + } + + /// Sets the phrase slop for this query. + /// + /// + public virtual int GetSlop() + { + return slop; + } + + /// Add a single term at the next position in the phrase. + /// + /// + public virtual void Add(Term term) + { + Add(new Term[]{term}); + } + + /// Add multiple terms at the next position in the phrase. Any of the terms + /// may match. + /// + /// + /// + /// + public virtual void Add(Term[] terms) + { + int position = 0; + if (positions.Count > 0) + position = ((System.Int32) positions[positions.Count - 1]) + 1; + + Add(terms, position); + } + + /// Allows to specify the relative position of terms within the phrase. + /// + /// + /// + /// + /// + /// + /// + /// + public virtual void Add(Term[] terms, int position) + { + if (termArrays.Count == 0) + field = terms[0].Field(); + + for (int i = 0; i < terms.Length; i++) + { + if ((System.Object) terms[i].Field() != (System.Object) field) + { + throw new System.ArgumentException("All phrase terms must be in the same field (" + field + "): " + terms[i]); + } + } + + termArrays.Add(terms); + positions.Add((System.Int32) position); + } + + /// Returns the relative positions of terms in this phrase. 
+ public virtual int[] GetPositions() + { + int[] result = new int[positions.Count]; + for (int i = 0; i < positions.Count; i++) + result[i] = ((System.Int32) positions[i]); + return result; + } + + [Serializable] + private class MultiPhraseWeight : Weight + { + private void InitBlock(MultiPhraseQuery enclosingInstance) + { + this.enclosingInstance = enclosingInstance; + } + private MultiPhraseQuery enclosingInstance; + public MultiPhraseQuery Enclosing_Instance + { + get + { + return enclosingInstance; + } + + } + private Similarity similarity; + private float value_Renamed; + private float idf; + private float queryNorm; + private float queryWeight; + + public MultiPhraseWeight(MultiPhraseQuery enclosingInstance, Searcher searcher) + { + InitBlock(enclosingInstance); + this.similarity = Enclosing_Instance.GetSimilarity(searcher); + + // compute idf + System.Collections.IEnumerator i = Enclosing_Instance.termArrays.GetEnumerator(); + while (i.MoveNext()) + { + Term[] terms = (Term[]) i.Current; + for (int j = 0; j < terms.Length; j++) + { + idf += Enclosing_Instance.GetSimilarity(searcher).Idf(terms[j], searcher); + } + } + } + + public virtual Query GetQuery() + { + return Enclosing_Instance; + } + public virtual float GetValue() + { + return value_Renamed; + } + + public virtual float SumOfSquaredWeights() + { + queryWeight = idf * Enclosing_Instance.GetBoost(); // compute query weight + return queryWeight * queryWeight; // square it + } + + public virtual void Normalize(float queryNorm) + { + this.queryNorm = queryNorm; + queryWeight *= queryNorm; // normalize query weight + value_Renamed = queryWeight * idf; // idf for document + } + + public virtual Scorer Scorer(IndexReader reader) + { + if (Enclosing_Instance.termArrays.Count == 0) + // optimize zero-term case + return null; + + TermPositions[] tps = new TermPositions[Enclosing_Instance.termArrays.Count]; + for (int i = 0; i < tps.Length; i++) + { + Term[] terms = (Term[]) Enclosing_Instance.termArrays[i]; 
+ + TermPositions p; + if (terms.Length > 1) + p = new MultipleTermPositions(reader, terms); + else + p = reader.TermPositions(terms[0]); + + if (p == null) + return null; + + tps[i] = p; + } + + if (Enclosing_Instance.slop == 0) + return new ExactPhraseScorer(this, tps, Enclosing_Instance.GetPositions(), similarity, reader.Norms(Enclosing_Instance.field)); + else + return new SloppyPhraseScorer(this, tps, Enclosing_Instance.GetPositions(), similarity, Enclosing_Instance.slop, reader.Norms(Enclosing_Instance.field)); + } + + public virtual Explanation Explain(IndexReader reader, int doc) + { + Explanation result = new Explanation(); + result.SetDescription("weight(" + GetQuery() + " in " + doc + "), product of:"); + + Explanation idfExpl = new Explanation(idf, "idf(" + GetQuery() + ")"); + + // explain query weight + Explanation queryExpl = new Explanation(); + queryExpl.SetDescription("queryWeight(" + GetQuery() + "), product of:"); + + Explanation boostExpl = new Explanation(Enclosing_Instance.GetBoost(), "boost"); + if (Enclosing_Instance.GetBoost() != 1.0f) + queryExpl.AddDetail(boostExpl); + + queryExpl.AddDetail(idfExpl); + + Explanation queryNormExpl = new Explanation(queryNorm, "queryNorm"); + queryExpl.AddDetail(queryNormExpl); + + queryExpl.SetValue(boostExpl.GetValue() * idfExpl.GetValue() * queryNormExpl.GetValue()); + + result.AddDetail(queryExpl); + + // explain field weight + Explanation fieldExpl = new Explanation(); + fieldExpl.SetDescription("fieldWeight(" + GetQuery() + " in " + doc + "), product of:"); + + Explanation tfExpl = Scorer(reader).Explain(doc); + fieldExpl.AddDetail(tfExpl); + fieldExpl.AddDetail(idfExpl); + + Explanation fieldNormExpl = new Explanation(); + byte[] fieldNorms = reader.Norms(Enclosing_Instance.field); + float fieldNorm = fieldNorms != null ? 
Similarity.DecodeNorm(fieldNorms[doc]) : 0.0f; + fieldNormExpl.SetValue(fieldNorm); + fieldNormExpl.SetDescription("fieldNorm(field=" + Enclosing_Instance.field + ", doc=" + doc + ")"); + fieldExpl.AddDetail(fieldNormExpl); + + fieldExpl.SetValue(tfExpl.GetValue() * idfExpl.GetValue() * fieldNormExpl.GetValue()); + + result.AddDetail(fieldExpl); + + // combine them + result.SetValue(queryExpl.GetValue() * fieldExpl.GetValue()); + + if (queryExpl.GetValue() == 1.0f) + return fieldExpl; + + return result; + } + } + + public override Query Rewrite(IndexReader reader) + { + if (termArrays.Count == 1) + { + // optimize one-term case + Term[] terms = (Term[]) termArrays[0]; + BooleanQuery boq = new BooleanQuery(true); + for (int i = 0; i < terms.Length; i++) + { + boq.Add(new TermQuery(terms[i]), BooleanClause.Occur.SHOULD); + } + boq.SetBoost(GetBoost()); + return boq; + } + else + { + return this; + } + } + + protected internal override Weight CreateWeight(Searcher searcher) + { + return new MultiPhraseWeight(this, searcher); + } + + /// Prints a user-readable version of this query. + public override System.String ToString(System.String f) + { + System.Text.StringBuilder buffer = new System.Text.StringBuilder(); + if (!field.Equals(f)) + { + buffer.Append(field); + buffer.Append(":"); + } + + buffer.Append("\""); + System.Collections.IEnumerator i = termArrays.GetEnumerator(); + while (i.MoveNext()) + { + Term[] terms = (Term[]) i.Current; + if (terms.Length > 1) + { + buffer.Append("("); + for (int j = 0; j < terms.Length; j++) + { + buffer.Append(terms[j].Text()); + if (j < terms.Length - 1) + buffer.Append(" "); + } + buffer.Append(")"); + } + else + { + buffer.Append(terms[0].Text()); + } + if (i.MoveNext()) + buffer.Append(" "); + } + buffer.Append("\""); + + if (slop != 0) + { + buffer.Append("~"); + buffer.Append(slop); + } + + buffer.Append(ToStringUtils.Boost(GetBoost())); + + return buffer.ToString(); + } + + // {{Aroush-1.9}} Do we need this?! 
+ override public System.Object Clone() + { + return null; + } + } +} \ No newline at end of file Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiSearcher.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/MultiSearcher.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiSearcher.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiSearcher.cs Sat Jun 3 19:41:13 2006 @@ -13,9 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using Document = Lucene.Net.Documents.Document; using Term = Lucene.Net.Index.Term; + namespace Lucene.Net.Search { @@ -54,6 +56,92 @@ results.Collect(doc + start, score); } } + /// Document Frequency cache acting as a Dummy-Searcher. + /// This class is no full-fledged Searcher, but only supports + /// the methods necessary to initialize Weights. + /// + private class CachedDfSource:Searcher + { + private System.Collections.IDictionary dfMap; // Map from Terms to corresponding doc freqs + private int maxDoc; // document count + + public CachedDfSource(System.Collections.IDictionary dfMap, int maxDoc) + { + this.dfMap = dfMap; + this.maxDoc = maxDoc; + } + + public override int DocFreq(Term term) + { + int df; + try + { + df = ((System.Int32) dfMap[term]); + } + catch (System.NullReferenceException) + { + throw new System.ArgumentException("df for term " + term.Text() + " not available"); + } + return df; + } + + public override int[] DocFreqs(Term[] terms) + { + int[] result = new int[terms.Length]; + for (int i = 0; i < terms.Length; i++) + { + result[i] = DocFreq(terms[i]); + } + return result; + } + + public override int MaxDoc() + { + return maxDoc; + } + + public override Query Rewrite(Query query) + { + // this is a bit of a hack. 
We know that a query which + // creates a Weight based on this Dummy-Searcher is + // always already rewritten (see preparedWeight()). + // Therefore we just return the unmodified query here + return query; + } + + public override void Close() + { + throw new System.NotSupportedException(); + } + + public override Document Doc(int i) + { + throw new System.NotSupportedException(); + } + + public override Explanation Explain(Weight weight, int doc) + { + throw new System.NotSupportedException(); + } + + public override void Search(Weight weight, Filter filter, HitCollector results) + { + throw new System.NotSupportedException(); + } + + public override TopDocs Search(Weight weight, Filter filter, int n) + { + throw new System.NotSupportedException(); + } + + public override TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort) + { + throw new System.NotSupportedException(); + } + } + + + private Lucene.Net.Search.Searchable[] searchables; private int[] starts; private int maxDoc = 0; @@ -72,6 +160,12 @@ starts[searchables.Length] = maxDoc; } + /// Return the array of {@link Searchable}s this searches. 
+ public virtual Lucene.Net.Search.Searchable[] GetSearchables() + { + return searchables; + } + protected internal virtual int[] GetStarts() { return starts; @@ -151,15 +245,16 @@ return maxDoc; } - public override TopDocs Search(Query query, Filter filter, int nDocs) + public override TopDocs Search(Weight weight, Filter filter, int nDocs) { + HitQueue hq = new HitQueue(nDocs); int totalHits = 0; for (int i = 0; i < searchables.Length; i++) { // search each searcher - TopDocs docs = searchables[i].Search(query, filter, nDocs); + TopDocs docs = searchables[i].Search(weight, filter, nDocs); totalHits += docs.totalHits; // update totalHits ScoreDoc[] scoreDocs = docs.scoreDocs; for (int j = 0; j < scoreDocs.Length; j++) @@ -174,25 +269,30 @@ ScoreDoc[] scoreDocs2 = new ScoreDoc[hq.Size()]; for (int i = hq.Size() - 1; i >= 0; i--) - // put docs in array + // put docs in array scoreDocs2[i] = (ScoreDoc) hq.Pop(); - return new TopDocs(totalHits, scoreDocs2); + float maxScore = (totalHits == 0) ? 
System.Single.NegativeInfinity : scoreDocs2[0].score; + + return new TopDocs(totalHits, scoreDocs2, maxScore); } - - public override TopFieldDocs Search(Query query, Filter filter, int n, Sort sort) + public override TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort) { FieldDocSortedHitQueue hq = null; int totalHits = 0; + float maxScore = System.Single.NegativeInfinity; + for (int i = 0; i < searchables.Length; i++) { // search each searcher - TopFieldDocs docs = searchables[i].Search(query, filter, n, sort); + TopFieldDocs docs = searchables[i].Search(weight, filter, n, sort); + if (hq == null) hq = new FieldDocSortedHitQueue(docs.fields, n); totalHits += docs.totalHits; // update totalHits + maxScore = System.Math.Max(maxScore, docs.GetMaxScore()); ScoreDoc[] scoreDocs = docs.scoreDocs; for (int j = 0; j < scoreDocs.Length; j++) { @@ -206,22 +306,22 @@ ScoreDoc[] scoreDocs2 = new ScoreDoc[hq.Size()]; for (int i = hq.Size() - 1; i >= 0; i--) - // put docs in array + // put docs in array scoreDocs2[i] = (ScoreDoc) hq.Pop(); - return new TopFieldDocs(totalHits, scoreDocs2, hq.GetFields()); + return new TopFieldDocs(totalHits, scoreDocs2, hq.GetFields(), maxScore); } // inherit javadoc - public override void Search(Query query, Filter filter, HitCollector results) + public override void Search(Weight weight, Filter filter, HitCollector results) { for (int i = 0; i < searchables.Length; i++) { int start = starts[i]; - searchables[i].Search(query, filter, new AnonymousClassHitCollector(results, start, this)); + searchables[i].Search(weight, filter, new AnonymousClassHitCollector(results, start, this)); } } @@ -232,13 +332,66 @@ { queries[i] = searchables[i].Rewrite(original); } - return original.Combine(queries); + return queries[0].Combine(queries); } - public override Explanation Explain(Query query, int doc) + public override Explanation Explain(Weight weight, int doc) { int i = SubSearcher(doc); // find searcher index - return 
searchables[i].Explain(query, doc - starts[i]); // dispatch to searcher + return searchables[i].Explain(weight, doc - starts[i]); // dispatch to searcher + } + + /// Create weight in multiple index scenario. + /// + /// Distributed query processing is done in the following steps: + /// 1. rewrite query + /// 2. extract necessary terms + /// 3. collect dfs for these terms from the Searchables + /// 4. create query weight using aggregate dfs. + /// 5. distribute that weight to Searchables + /// 6. merge results + /// + /// Steps 1-4 are done here, 5+6 in the search() methods + /// + /// + /// rewritten queries + /// + protected internal override Weight CreateWeight(Query original) + { + // step 1 + Query rewrittenQuery = Rewrite(original); + + // step 2 + System.Collections.Hashtable terms = new System.Collections.Hashtable(); + rewrittenQuery.ExtractTerms(terms); + + // step3 + Term[] allTermsArray = new Term[terms.Count]; + int index = 0; + System.Collections.IEnumerator e = terms.GetEnumerator(); + while (e.MoveNext()) + allTermsArray[index++] = e.Current as Term; + int[] aggregatedDfs = new int[terms.Count]; + for (int i = 0; i < searchables.Length; i++) + { + int[] dfs = searchables[i].DocFreqs(allTermsArray); + for (int j = 0; j < aggregatedDfs.Length; j++) + { + aggregatedDfs[j] += dfs[j]; + } + } + + System.Collections.Hashtable dfMap = new System.Collections.Hashtable(); + for (int i = 0; i < allTermsArray.Length; i++) + { + dfMap[allTermsArray[i]] = (System.Int32) aggregatedDfs[i]; + } + + // step4 + int numDocs = MaxDoc(); + CachedDfSource cacheSim = new CachedDfSource(dfMap, numDocs); + + return rewrittenQuery.Weight(cacheSim); } } } Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiTermQuery.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/MultiTermQuery.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- 
incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiTermQuery.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/MultiTermQuery.cs Sat Jun 3 19:41:13 2006 @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using IndexReader = Lucene.Net.Index.IndexReader; using Term = Lucene.Net.Index.Term; +using ToStringUtils = Lucene.Net.Util.ToStringUtils; + namespace Lucene.Net.Search { @@ -32,7 +35,7 @@ /// {@link FuzzyTermEnum}, respectively. ///
[Serializable] - public abstract class MultiTermQuery:Query + public abstract class MultiTermQuery : Query { private Term term; @@ -54,7 +57,7 @@ public override Query Rewrite(IndexReader reader) { FilteredTermEnum enumerator = GetEnum(reader); - BooleanQuery query = new BooleanQuery(); + BooleanQuery query = new BooleanQuery(true); try { do @@ -64,7 +67,7 @@ { TermQuery tq = new TermQuery(t); // found a match tq.SetBoost(GetBoost() * enumerator.Difference()); // set the boost - query.Add(tq, false, false); // add to query + query.Add(tq, BooleanClause.Occur.SHOULD); // add to query } } while (enumerator.Next()); @@ -76,12 +79,6 @@ return query; } - public override Query Combine(Query[] queries) - { - return Query.MergeBooleanQueries(queries); - } - - /// Prints a user-readable version of this query. public override System.String ToString(System.String field) { @@ -92,15 +89,28 @@ buffer.Append(":"); } buffer.Append(term.Text()); - if (GetBoost() != 1.0f) - { - System.Globalization.NumberFormatInfo nfi = new System.Globalization.CultureInfo("en-US", false).NumberFormat; - nfi.NumberDecimalDigits = 1; - - buffer.Append("^"); - buffer.Append(GetBoost().ToString("N", nfi)); - } + buffer.Append(ToStringUtils.Boost(GetBoost())); return buffer.ToString(); + } + + public override bool Equals(System.Object o) + { + if (this == o) + return true; + if (!(o is MultiTermQuery)) + return false; + + MultiTermQuery multiTermQuery = (MultiTermQuery) o; + + if (!term.Equals(multiTermQuery.term)) + return false; + + return GetBoost() == multiTermQuery.GetBoost(); + } + + public override int GetHashCode() + { + return term.GetHashCode(); } } } Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/NonMatchingScorer.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/NonMatchingScorer.cs?rev=411501&view=auto ============================================================================== --- 
incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/NonMatchingScorer.cs (added) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/NonMatchingScorer.cs Sat Jun 3 19:41:13 2006 @@ -0,0 +1,56 @@ +/* + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; + +namespace Lucene.Net.Search +{ + + /// A scorer that matches no document at all. + class NonMatchingScorer : Scorer + { + public NonMatchingScorer() : base(null) + { + } // no similarity used + + public override int Doc() + { + throw new System.NotSupportedException(); + } + + public override bool Next() + { + return false; + } + + public override float Score() + { + throw new System.NotSupportedException(); + } + + public override bool SkipTo(int target) + { + return false; + } + + public override Explanation Explain(int doc) + { + Explanation e = new Explanation(); + e.SetDescription("No document matches."); + return e; + } + } +} \ No newline at end of file Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/ParallelMultiSearcher.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/ParallelMultiSearcher.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/ParallelMultiSearcher.cs (original) +++ 
incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/ParallelMultiSearcher.cs Sat Jun 3 19:41:13 2006 @@ -13,9 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + using System; using Term = Lucene.Net.Index.Term; using PriorityQueue = Lucene.Net.Util.PriorityQueue; + namespace Lucene.Net.Search { @@ -24,9 +26,9 @@ ///

Applications usually need only call the inherited {@link #Search(Query)} /// or {@link #Search(Query,Filter)} methods. /// - public class ParallelMultiSearcher:MultiSearcher + public class ParallelMultiSearcher : MultiSearcher { - private class AnonymousClassHitCollector1:HitCollector + private class AnonymousClassHitCollector1 : HitCollector { public AnonymousClassHitCollector1(Lucene.Net.Search.HitCollector results, int start, ParallelMultiSearcher enclosingInstance) { @@ -59,7 +61,7 @@ private int[] starts; ///

Creates a searcher which searches searchables. - public ParallelMultiSearcher(Lucene.Net.Search.Searchable[] searchables):base(searchables) + public ParallelMultiSearcher(Lucene.Net.Search.Searchable[] searchables) : base(searchables) { this.searchables = searchables; this.starts = GetStarts(); @@ -68,17 +70,14 @@ /// TODO: parallelize this one too public override int DocFreq(Term term) { - int docFreq = 0; - for (int i = 0; i < searchables.Length; i++) - docFreq += searchables[i].DocFreq(term); - return docFreq; + return base.DocFreq(term); } /// A search implementation which spans a new thread for each /// Searchable, waits for each search to complete and merge /// the results back together. /// - public override TopDocs Search(Query query, Filter filter, int nDocs) + public override TopDocs Search(Weight weight, Filter filter, int nDocs) { HitQueue hq = new HitQueue(nDocs); int totalHits = 0; @@ -87,7 +86,7 @@ { // search each searcher // Assume not too many searchables and cost of creating a thread is by far inferior to a search - msta[i] = new MultiSearcherThread(searchables[i], query, filter, nDocs, hq, i, starts, "MultiSearcher thread #" + (i + 1)); + msta[i] = new MultiSearcherThread(searchables[i], weight, filter, nDocs, hq, i, starts, "MultiSearcher thread #" + (i + 1)); msta[i].Start(); } @@ -97,7 +96,7 @@ { msta[i].Join(); } - catch (System.Threading.ThreadInterruptedException ie) + catch (System.Threading.ThreadInterruptedException) { ; // TODO: what should we do with this??? } @@ -118,14 +117,16 @@ // put docs in array scoreDocs[i] = (ScoreDoc) hq.Pop(); - return new TopDocs(totalHits, scoreDocs); + float maxScore = (totalHits == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score; + + return new TopDocs(totalHits, scoreDocs, maxScore); } /// A search implementation allowing sorting which spans a new thread for each /// Searchable, waits for each search to complete and merges /// the results back together. 
/// - public override TopFieldDocs Search(Query query, Filter filter, int nDocs, Sort sort) + public override TopFieldDocs Search(Weight weight, Filter filter, int nDocs, Sort sort) { // don't specify the fields - we'll wait to do this until we get results FieldDocSortedHitQueue hq = new FieldDocSortedHitQueue(null, nDocs); @@ -135,17 +136,19 @@ { // search each searcher // Assume not too many searchables and cost of creating a thread is by far inferior to a search - msta[i] = new MultiSearcherThread(searchables[i], query, filter, nDocs, hq, sort, i, starts, "MultiSearcher thread #" + (i + 1)); + msta[i] = new MultiSearcherThread(searchables[i], weight, filter, nDocs, hq, sort, i, starts, "MultiSearcher thread #" + (i + 1)); msta[i].Start(); } + float maxScore = System.Single.NegativeInfinity; + for (int i = 0; i < searchables.Length; i++) { try { msta[i].Join(); } - catch (System.Threading.ThreadInterruptedException ie) + catch (System.Threading.ThreadInterruptedException) { ; // TODO: what should we do with this??? } @@ -153,6 +156,7 @@ if (ioe == null) { totalHits += msta[i].Hits(); + maxScore = System.Math.Max(maxScore, msta[i].GetMaxScore()); } else { @@ -166,7 +170,7 @@ // put docs in array scoreDocs[i] = (ScoreDoc) hq.Pop(); - return new TopFieldDocs(totalHits, scoreDocs, hq.GetFields()); + return new TopFieldDocs(totalHits, scoreDocs, hq.GetFields(), maxScore); } /// Lower-level search API. @@ -180,37 +184,32 @@ /// non-high-scoring hits. 
/// /// - /// to match documents + /// to match documents /// /// if non-null, a bitset used to eliminate some documents /// /// to receive hits /// - /// TODO: parallelize this one too /// - public override void Search(Query query, Filter filter, HitCollector results) + /// parallelize this one too + public override void Search(Weight weight, Filter filter, HitCollector results) { for (int i = 0; i < searchables.Length; i++) { int start = starts[i]; - searchables[i].Search(query, filter, new AnonymousClassHitCollector1(results, start, this)); + searchables[i].Search(weight, filter, new AnonymousClassHitCollector1(results, start, this)); } } /* * TODO: this one could be parallelized too - * @see Lucene.Net.Search.Searchable#rewrite(Lucene.Net.Search.Query) + * @see Lucene.Net.search.Searchable#rewrite(Lucene.Net.search.Query) */ public override Query Rewrite(Query original) { - Query[] queries = new Query[searchables.Length]; - for (int i = 0; i < searchables.Length; i++) - { - queries[i] = searchables[i].Rewrite(original); - } - return original.Combine(queries); + return base.Rewrite(original); } } @@ -219,7 +218,7 @@ { private Lucene.Net.Search.Searchable searchable; - private Query query; + private Weight weight; private Filter filter; private int nDocs; private TopDocs docs; @@ -229,10 +228,10 @@ private System.IO.IOException ioe; private Sort sort; - public MultiSearcherThread(Lucene.Net.Search.Searchable searchable, Query query, Filter filter, int nDocs, HitQueue hq, int i, int[] starts, System.String name):base(name) + public MultiSearcherThread(Lucene.Net.Search.Searchable searchable, Weight weight, Filter filter, int nDocs, HitQueue hq, int i, int[] starts, System.String name):base(name) { this.searchable = searchable; - this.query = query; + this.weight = weight; this.filter = filter; this.nDocs = nDocs; this.hq = hq; @@ -240,10 +239,10 @@ this.starts = starts; } - public MultiSearcherThread(Lucene.Net.Search.Searchable searchable, Query query, Filter 
filter, int nDocs, FieldDocSortedHitQueue hq, Sort sort, int i, int[] starts, System.String name):base(name) + public MultiSearcherThread(Lucene.Net.Search.Searchable searchable, Weight weight, Filter filter, int nDocs, FieldDocSortedHitQueue hq, Sort sort, int i, int[] starts, System.String name):base(name) { this.searchable = searchable; - this.query = query; + this.weight = weight; this.filter = filter; this.nDocs = nDocs; this.hq = hq; @@ -256,7 +255,7 @@ { try { - docs = (sort == null)?searchable.Search(query, filter, nDocs):searchable.Search(query, filter, nDocs, sort); + docs = (sort == null)?searchable.Search(weight, filter, nDocs):searchable.Search(weight, filter, nDocs, sort); } // Store the IOException for later use by the caller of this thread catch (System.IO.IOException ioe) @@ -265,7 +264,7 @@ } if (this.ioe == null) { - // if we are sorting by fields, we need to tell the Field sorted hit queue + // if we are sorting by fields, we need to tell the field sorted hit queue // the actual type of fields, in case the original list contained AUTO. // if the searchable returns null for fields, we'll have problems. if (sort != null) @@ -291,6 +290,11 @@ public virtual int Hits() { return docs.totalHits; + } + + public virtual float GetMaxScore() + { + return docs.GetMaxScore(); } public virtual System.IO.IOException GetIOException() Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/PhrasePositions.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Search/PhrasePositions.cs?rev=411501&r1=411500&r2=411501&view=diff ============================================================================== --- incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/PhrasePositions.cs (original) +++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Search/PhrasePositions.cs Sat Jun 3 19:41:13 2006 @@ -13,8 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + using System; using Lucene.Net.Index; + namespace Lucene.Net.Search {