Return-Path: Delivered-To: apmail-lucene-lucene-net-commits-archive@www.apache.org Received: (qmail 47372 invoked from network); 30 May 2010 14:20:59 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 30 May 2010 14:20:59 -0000 Received: (qmail 40632 invoked by uid 500); 30 May 2010 14:20:59 -0000 Delivered-To: apmail-lucene-lucene-net-commits-archive@lucene.apache.org Received: (qmail 40591 invoked by uid 500); 30 May 2010 14:20:59 -0000 Mailing-List: contact lucene-net-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucene.apache.org Delivered-To: mailing list lucene-net-commits@lucene.apache.org Received: (qmail 40584 invoked by uid 99); 30 May 2010 14:20:59 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 30 May 2010 14:20:59 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 30 May 2010 14:20:51 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 2CF1D23888FD; Sun, 30 May 2010 14:20:29 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r949519 [1/2] - in /lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net: SpellChecker.Net/ SpellChecker.Net/Spell/ Test/ Test/Test/ Date: Sun, 30 May 2010 14:20:28 -0000 To: lucene-net-commits@lucene.apache.org From: digy@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100530142029.2CF1D23888FD@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: digy Date: Sun May 30 14:20:28 2010 New Revision: 949519 URL: http://svn.apache.org/viewvc?rev=949519&view=rev Log: LUCENENET-366 Spellchecker issues (SpellChecker 2.9.2) Added: 
lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/JaroWinklerDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LevenshteinDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/NGramDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/StringDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.csproj lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.csproj lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestJaroWinklerDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestLevenshteinDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestLuceneDictionary.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestNGramDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestPlainTextDictionary.cs Removed: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net-2.0.0.csproj lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test-2.0.0.csproj Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/AssemblyInfo.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/Dictionary.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LuceneDictionary.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/SpellChecker.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/TRStringDistance.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.sln lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/AssemblyInfo.cs lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.sln lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestSpellChecker.cs Modified: 
lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/AssemblyInfo.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/AssemblyInfo.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/AssemblyInfo.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/AssemblyInfo.cs Sun May 30 14:20:28 2010 @@ -14,7 +14,7 @@ using System.Runtime.CompilerServices; [assembly: AssemblyDefaultAlias("Lucene.Net.SpellChecker")] [assembly: AssemblyCulture("")] -[assembly: AssemblyInformationalVersionAttribute("2.0")] +[assembly: AssemblyInformationalVersionAttribute("2.9")] // Version information for an assembly consists of the following four values: // @@ -26,7 +26,7 @@ using System.Runtime.CompilerServices; // You can specify all the values or you can default the Revision and Build Numbers // by using the '*' as shown below: -[assembly: AssemblyVersion("2.0.0.2")] +[assembly: AssemblyVersion("2.9.2.1")] // // In order to sign your assembly you must specify a key to use. 
Refer to the @@ -58,3 +58,5 @@ using System.Runtime.CompilerServices; [assembly: AssemblyKeyName("")] + +//[assembly: System.Runtime.CompilerServices.InternalsVisibleTo("SpellcheckTests")] Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/Dictionary.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/Dictionary.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/Dictionary.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/Dictionary.cs Sun May 30 14:20:28 2010 @@ -25,13 +25,9 @@ namespace SpellChecker.Net.Search.Spell { /// A simple interface representing a Dictionary - /// Nicolas Maisonneuve - /// - /// 1.0 - /// public interface Dictionary { - /// return all the words present in the dictionnary + /// return all the words present in the dictionary /// Iterator /// System.Collections.IEnumerator GetWordsIterator(); Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/JaroWinklerDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/JaroWinklerDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/JaroWinklerDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/JaroWinklerDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Text; + +namespace SpellChecker.Net.Search.Spell +{ + public class JaroWinklerDistance : StringDistance + { + private float threshold = 0.7f; + + private int[] Matches(String s1, String s2) + { + String Max, Min; + if (s1.Length > s2.Length) + { + Max = s1; + Min = s2; + } + else + { + Max = s2; + Min = s1; + } + int range = Math.Max(Max.Length / 2 - 1, 0); + int[] matchIndexes = new int[Min.Length]; + for (int i = 0; i < matchIndexes.Length; i++) + matchIndexes[i] = -1; + bool[] matchFlags = new bool[Max.Length]; + int matches = 0; + for (int mi = 0; mi < Min.Length; mi++) + { + char c1 = Min[mi]; + for (int xi = Math.Max(mi - range, 0), xn = Math.Min(mi + range + 1, Max + .Length); xi < xn; xi++) + { + if (!matchFlags[xi] && c1 == Max[xi]) + { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } + } + char[] ms1 = new char[matches]; + char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < Min.Length; i++) + { + if (matchIndexes[i] != -1) + { + ms1[si] = Min[i]; + si++; + } + } + for (int i = 0, si = 0; i < Max.Length; i++) + { + if (matchFlags[i]) + { + ms2[si] = Max[i]; + si++; + } + } + int transpositions = 0; + for (int mi = 0; mi < ms1.Length; mi++) + { + if (ms1[mi] != ms2[mi]) + { + transpositions++; + } + } + int prefix = 0; + for (int mi = 0; mi < Min.Length; mi++) + { + if 
+ (s1[mi] == s2[mi]) + { + prefix++; + } + else + { + break; + } + } + return new int[] { matches, transpositions / 2, prefix, Max.Length }; + } + + public float GetDistance(String s1, String s2) + { + int[] mtp = Matches(s1, s2); + float m = (float)mtp[0]; + if (m == 0) + { + return 0f; + } + float j = ((m / s1.Length + m / s2.Length + (m - mtp[1]) / m)) / 3; + float jw = j < GetThreshold() ? j : j + Math.Min(0.1f, 1f / mtp[3]) * mtp[2] + * (1 - j); + return jw; + } + + /// + ///Sets the threshold used to determine when Winkler bonus should be used. + /// Set to a negative value to get the Jaro distance. + /// + /// the new value of the threshold + public void SetThreshold(float threshold) + { + this.threshold = threshold; + } + + /// + /// Returns the current value of the threshold used for adding the Winkler bonus. + /// The default value is 0.7. + /// + /// the current value of the threshold + public float GetThreshold() + { + return threshold; + } + + } +} Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LevenshteinDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LevenshteinDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LevenshteinDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LevenshteinDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Text; +using SpellChecker.Net.Search.Spell; + +namespace SpellChecker.Net.Search.Spell +{ + /// + /// Levenshtein edit distance + /// + public class LevenshteinDistance : StringDistance + { + /// + /// Returns a float between 0 and 1 based on how similar the specified strings are to one another. + /// Returning a value of 1 means the specified strings are identical and 0 means the + /// string are maximally different. + /// + /// The first string. + /// The second string. + /// a float between 0 and 1 based on how similar the specified strings are to one another. + public float GetDistance(String target, String other) + { + char[] sa; + int n; + int[] p; //'previous' cost array, horizontally + int[] d; // cost array, horizontally + int[] _d; //placeholder to assist in swapping p and d + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length()+1 by t.length()+1, + we maintain two single-dimensional arrays of length s.length()+1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). 
(Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + sa = target.ToCharArray(); + n = sa.Length; + p = new int[n + 1]; + d = new int[n + 1]; + int m = other.Length; + + if (n == 0 || m == 0) + { + if (n == m) + { + return 1; + } + else + { + return 0; + } + } + + + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t + + char t_j; // jth character of t + + int cost; // cost + + for (i = 0; i <= n; i++) + { + p[i] = i; + } + + for (j = 1; j <= m; j++) + { + t_j = other[j - 1]; + d[0] = j; + + for (i = 1; i <= n; i++) + { + cost = sa[i - 1] == t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return 1.0f - ((float)p[n] / Math.Max(other.Length, sa.Length)); + } + } +} Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LuceneDictionary.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LuceneDictionary.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LuceneDictionary.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/LuceneDictionary.cs Sun May 30 14:20:28 2010 @@ -22,18 +22,11 @@ using Term = 
Lucene.Net.Index.Term; namespace SpellChecker.Net.Search.Spell { - /// Lucene Dictionary /// /// - /// Nicolas Maisonneuve - /// public class LuceneDictionary : Dictionary { - virtual public System.Collections.IEnumerator GetWordsIterator() - { - return new LuceneIterator(this); - } internal IndexReader reader; internal System.String field; @@ -42,88 +35,84 @@ namespace SpellChecker.Net.Search.Spell this.reader = reader; this.field = field; } + + virtual public System.Collections.IEnumerator GetWordsIterator() + { + return new LuceneIterator(this); + } + + public System.Collections.IEnumerator GetEnumerator() + { + return GetWordsIterator(); + } internal sealed class LuceneIterator : System.Collections.IEnumerator { - private void InitBlock(LuceneDictionary enclosingInstance) - { - this.enclosingInstance = enclosingInstance; - } - private LuceneDictionary enclosingInstance; - public System.Object Current - { - get - { - if (!has_next_called) - { - MoveNext(); - } - has_next_called = false; - return (actualTerm != null) ? 
actualTerm.Text() : null; - } - - } - public LuceneDictionary Enclosing_Instance - { - get - { - return enclosingInstance; - } - - } private TermEnum termEnum; private Term actualTerm; - private bool has_next_called; + private bool hasNextCalled; + + private LuceneDictionary enclosingInstance; public LuceneIterator(LuceneDictionary enclosingInstance) { - InitBlock(enclosingInstance); + this.enclosingInstance = enclosingInstance; try { - termEnum = Enclosing_Instance.reader.Terms(new Term(Enclosing_Instance.field, "")); + termEnum = enclosingInstance.reader.Terms(new Term(enclosingInstance.field, "")); } catch (System.IO.IOException ex) { System.Console.Error.WriteLine(ex.StackTrace); } } - - - public bool MoveNext() + + //next() + public System.Object Current { - has_next_called = true; - try + get { - // if there is still words - if (!termEnum.Next()) + if (!hasNextCalled) { - actualTerm = null; - return false; - } - // if the next word are in the field - actualTerm = termEnum.Term(); - System.String fieldt = actualTerm.Field(); - if ( fieldt != Enclosing_Instance.field) - { - actualTerm = null; - return false; + MoveNext(); } - return true; + hasNextCalled = false; + return (actualTerm != null) ? 
actualTerm.Text() : null; } - catch (System.IO.IOException ex) + + } + + //hasNext() + public bool MoveNext() + { + hasNextCalled = true; + + actualTerm = termEnum.Term(); + + // if there are no words return false + if (actualTerm == null) return false; + + System.String fieldt = actualTerm.Field(); + termEnum.Next(); + + // if the next word doesn't have the same field return false + if (fieldt != enclosingInstance.field) { - System.Console.Error.WriteLine(ex.StackTrace); + actualTerm = null; return false; } + return true; } - - public void Remove() + + public void Remove() { + throw new NotImplementedException(); } - - public void Reset() + + public void Reset() { + throw new NotImplementedException(); } } } Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/NGramDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/NGramDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/NGramDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/NGramDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,159 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +using System; +using System.Collections.Generic; +using System.Text; + +namespace SpellChecker.Net.Search.Spell +{ + public class NGramDistance : StringDistance + { + private int n; + + /// + /// Creates an N-Gram distance measure using n-grams of the specified size. + /// + /// The size of the n-gram to be used to compute the string distance. + public NGramDistance(int size) + { + this.n = size; + } + + /// + /// Creates an N-Gram distance measure using n-grams of size 2. + /// + public NGramDistance() + : this(2) + { + } + + public float GetDistance(String source, String target) + { + int sl = source.Length; + int tl = target.Length; + + if (sl == 0 || tl == 0) + { + if (sl == tl) + { + return 1; + } + else + { + return 0; + } + } + + int cost = 0; + if (sl < n || tl < n) + { + for (int ii = 0, ni = Math.Min(sl, tl); ii < ni; ii++) + { + if (source[ii] == target[ii]) + { + cost++; + } + } + return (float)cost / Math.Max(sl, tl); + } + + char[] sa = new char[sl + n - 1]; + float[] p; //'previous' cost array, horizontally + float[] d; // cost array, horizontally + float[] _d; //placeholder to assist in swapping p and d + + //construct sa with prefix + for (int ii = 0; ii < sa.Length; ii++) + { + if (ii < n - 1) + { + sa[ii] = (char)0; //add prefix + } + else + { + sa[ii] = source[ii - n + 1]; + } + } + p = new float[sl + 1]; + d = new float[sl + 1]; + + // indexes into strings s and t + int i; // iterates through source + int j; // iterates through target + + char[] t_j = new char[n]; // jth n-gram of t + + for (i = 0; i <= sl; i++) + { + p[i] = i; + } + + for (j = 1; j <= tl; j++) + { + //construct t_j n-gram + if (j < n) + { + for (int ti = 0; ti < n - j; ti++) + { + t_j[ti] = (char)0; //add prefix + } + for (int ti = n - j; ti < n; ti++) + { + t_j[ti] = target[ti - (n - j)]; + } + } + else + { + t_j = target.Substring(j - n, n).ToCharArray(); + } 
+ d[0] = j; + for (i = 1; i <= sl; i++) + { + cost = 0; + int tn = n; + //compare sa to t_j + for (int ni = 0; ni < n; ni++) + { + if (sa[i - 1 + ni] != t_j[ni]) + { + cost++; + } + else if (sa[i - 1 + ni] == 0) + { //discount matches on prefix + tn--; + } + } + float ec = (float)cost / tn; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec); + } + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return 1.0f - ((float)p[sl] / Math.Max(tl, sl)); + } + + + } +} Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/SpellChecker.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/SpellChecker.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/SpellChecker.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/SpellChecker.cs Sun May 30 14:20:28 2010 @@ -30,11 +30,12 @@ using IndexSearcher = Lucene.Net.Search. using Query = Lucene.Net.Search.Query; using TermQuery = Lucene.Net.Search.TermQuery; using Directory = Lucene.Net.Store.Directory; +using SpellChecker.Net.Search.Spell; +using Lucene.Net.Store; +using Lucene.Net.Search; namespace SpellChecker.Net.Search.Spell { - - ///

/// Spell Checker class (Main class)
/// (initially inspired by the David Spencer code). @@ -58,38 +59,117 @@ namespace SpellChecker.Net.Search.Spell /// public class SpellChecker { - virtual public void SetSpellIndex(Directory spellindex) - { - this.spellindex = spellindex; - } - ///

Set the accuracy 0 < min < 1; default 0.5 - virtual public void SetAccuracy(float minScore) - { - this.minScore = minScore; - } - /// Field name for each word in the ngram index. public const System.String F_WORD = "word"; - - + private readonly Term F_WORD_TERM = new Term(F_WORD); + /// the spell index internal Directory spellindex; - + /// Boost value for start and end grams private float bStart = 2.0f; private float bEnd = 1.0f; - - - private IndexReader reader; + + //private IndexReader reader; + // don't use this searcher directly - see #swapSearcher() + private IndexSearcher searcher; + + /// + /// this locks all modifications to the current searcher. + /// + private static System.Object searcherLock = new System.Object(); + + /* + * this lock synchronizes all possible modifications to the + * current index directory. It should not be possible to try modifying + * the same index concurrently. Note: Do not acquire the searcher lock + * before acquiring this lock! + */ + private static System.Object modifyCurrentIndexLock = new System.Object(); + private volatile bool closed = false; + internal float minScore = 0.5f; //LUCENENET-359 Spellchecker accuracy gets overwritten - - - public SpellChecker(Directory gramIndex) + + private StringDistance sd; + + /// + /// Use the given directory as a spell checker index. The directory + /// is created if it doesn't exist yet. + /// + /// the spell index directory + /// the {@link StringDistance} measurement to use + public SpellChecker(Directory gramIndex, StringDistance sd) { this.SetSpellIndex(gramIndex); + this.setStringDistance(sd); + } + + /// + /// Use the given directory as a spell checker index with a + /// {@link LevenshteinDistance} as the default {@link StringDistance}. The + /// directory is created if it doesn't exist yet. 
+ /// + /// the spell index directory + public SpellChecker(Directory gramIndex) + : this(gramIndex, new LevenshteinDistance()) + { } + + /// + /// Use a different index as the spell checker index or re-open + /// the existing index if spellIndex is the same value + /// as given in the constructor. + /// + /// spellIndexDir the spell directory to use + /// AlreadyClosedException if the Spellchecker is already closed + /// IOException if spellchecker can not open the directory + virtual public void SetSpellIndex(Directory spellIndexDir) + { + // this could be the same directory as the current spellIndex + // modifications to the directory should be synchronized + lock (modifyCurrentIndexLock) + { + EnsureOpen(); + if (!IndexReader.IndexExists(spellIndexDir)) + { + IndexWriter writer = new IndexWriter(spellIndexDir, null, true, + IndexWriter.MaxFieldLength.UNLIMITED); + writer.Close(); + } + SwapSearcher(spellIndexDir); + } + } + + /// + /// Sets the {@link StringDistance} implementation for this + /// {@link SpellChecker} instance. + /// + /// the {@link StringDistance} implementation for this + /// {@link SpellChecker} instance. + public void setStringDistance(StringDistance sd) + { + this.sd = sd; + } + + /// + /// Returns the {@link StringDistance} instance used by this + /// {@link SpellChecker} instance. + /// + /// + /// Returns the {@link StringDistance} instance used by this + /// {@link SpellChecker} instance. 
+ /// + public StringDistance GetStringDistance() + { + return sd; + } + + + /// Set the accuracy 0 < min < 1; default 0.5 + virtual public void SetAccuracy(float minScore) + { + this.minScore = minScore; } - - + /// Suggest similar words /// String the word you want a spell check done on /// @@ -102,12 +182,12 @@ namespace SpellChecker.Net.Search.Spell { return this.SuggestSimilar(word, num_sug, null, null, false); } - - + + /// Suggest similar words (restricted or not to a field of a user index) /// String the word you want a spell check done on /// - /// int the number of suggest words + /// int the number of suggest words /// /// the indexReader of the user index (can be null see field param) /// @@ -122,120 +202,133 @@ namespace SpellChecker.Net.Search.Spell /// first criteria: the edit distance, second criteria (only if restricted mode): the popularity /// of the suggest words in the field of the user index /// - public virtual System.String[] SuggestSimilar(System.String word, int num_sug, IndexReader ir, System.String field, bool morePopular) - { - float min = this.minScore; - TRStringDistance sd = new TRStringDistance(word); - int lengthWord = word.Length; - - int goalFreq = (morePopular && ir != null) ? ir.DocFreq(new Term(field, word)) : 0; - if (!morePopular && goalFreq > 0) - { - return new System.String[]{word}; // return the word if it exist in the index and i don't want a more popular word - } - - BooleanQuery query = new BooleanQuery(); - System.String[] grams; - System.String key; - - for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++) - { - - key = "gram" + ng; // form key - - grams = FormGrams(word, ng); // form word into ngrams (allow dups too) - - if (grams.Length == 0) - { - continue; // hmm - } - - if (bStart > 0) - { - // should we boost prefixes? 
- Add(query, "start" + ng, grams[0], bStart); // matches start of word - } - if (bEnd > 0) - { - // should we boost suffixes - Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word - } - for (int i = 0; i < grams.Length; i++) - { - Add(query, key, grams[i]); - } - } - - IndexSearcher searcher = new IndexSearcher(this.spellindex); - Hits hits = searcher.Search(query); - SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug); - - int stop = Math.Min(hits.Length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers - SuggestWord sugword = new SuggestWord(); - for (int i = 0; i < stop; i++) + public virtual System.String[] SuggestSimilar(System.String word, int numSug, IndexReader ir, System.String field, bool morePopular) + { // obtainSearcher calls ensureOpen + IndexSearcher indexSearcher = ObtainSearcher(); + try { - - sugword.string_Renamed = hits.Doc(i).Get(F_WORD); // get orig word) - - if (sugword.string_Renamed.Equals(word)) + float min = this.minScore; + int lengthWord = word.Length; + + int freq = (ir != null && field != null) ? ir.DocFreq(new Term(field, word)) : 0; + int goalFreq = (morePopular && ir != null && field != null) ? 
freq : 0; + // if the word exists in the real index and we don't care for word frequency, return the word itself + if (!morePopular && freq > 0) { - continue; // don't suggest a word for itself, that would be silly + return new String[] { word }; } - - //edit distance/normalize with the min word length - sugword.score = 1.0f - ((float) sd.GetDistance(sugword.string_Renamed) / System.Math.Min(sugword.string_Renamed.Length, lengthWord)); - if (sugword.score < min) + + BooleanQuery query = new BooleanQuery(); + String[] grams; + String key; + + for (int ng = GetMin(lengthWord); ng <= GetMax(lengthWord); ng++) { - continue; + + key = "gram" + ng; // form key + + grams = FormGrams(word, ng); // form word into ngrams (allow dups too) + + if (grams.Length == 0) + { + continue; // hmm + } + + if (bStart > 0) + { // should we boost prefixes? + Add(query, "start" + ng, grams[0], bStart); // matches start of word + + } + if (bEnd > 0) + { // should we boost suffixes + Add(query, "end" + ng, grams[grams.Length - 1], bEnd); // matches end of word + + } + for (int i = 0; i < grams.Length; i++) + { + Add(query, key, grams[i]); + } } - - if (ir != null) + + int maxHits = 10 * numSug; + + // System.out.println("Q: " + query); + ScoreDoc[] hits = indexSearcher.Search(query, null, maxHits).scoreDocs; + // System.out.println("HITS: " + hits.length()); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + + // go thru more than 'maxr' matches in case the distance filter triggers + int stop = Math.Min(hits.Length, maxHits); + SuggestWord sugWord = new SuggestWord(); + for (int i = 0; i < stop; i++) { - // use the user index - sugword.freq = ir.DocFreq(new Term(field, sugword.string_Renamed)); // freq in the index - if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) + + sugWord.string_Renamed = indexSearcher.Doc(hits[i].doc).Get(F_WORD); // get orig word + + // don't suggest a word for itself, that would be silly + if (sugWord.string_Renamed.Equals(word)) + { + 
continue; + } + + // edit distance + sugWord.score = sd.GetDistance(word, sugWord.string_Renamed); + if (sugWord.score < min) { - // don't suggest a word that is not present in the field continue; } + + if (ir != null && field != null) + { // use the user index + sugWord.freq = ir.DocFreq(new Term(field, sugWord.string_Renamed)); // freq in the index + // don't suggest a word that is not present in the field + if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) + { + continue; + } + } + sugQueue.InsertWithOverflow(sugWord); + if (sugQueue.Size() == numSug) + { + // if queue full, maintain the minScore score + min = ((SuggestWord)sugQueue.Top()).score; + } + sugWord = new SuggestWord(); } - sugqueue.Insert(sugword); - if (sugqueue.Size() == num_sug) + + // convert to array string + String[] list = new String[sugQueue.Size()]; + for (int i = sugQueue.Size() - 1; i >= 0; i--) { - //if queue full , maintain the min score - min = ((SuggestWord) sugqueue.Top()).score; + list[i] = ((SuggestWord)sugQueue.Pop()).string_Renamed; } - sugword = new SuggestWord(); + + return list; } - - // convert to array string - System.String[] list = new System.String[sugqueue.Size()]; - for (int i = sugqueue.Size() - 1; i >= 0; i--) + finally { - list[i] = ((SuggestWord) sugqueue.Pop()).string_Renamed; + ReleaseSearcher(indexSearcher); } - - searcher.Close(); - return list; + } - - + + /// Add a clause to a boolean query. - private static void Add(BooleanQuery q, System.String k, System.String v, float boost) + private static void Add(BooleanQuery q, System.String k, System.String v, float boost) { Query tq = new TermQuery(new Term(k, v)); tq.SetBoost(boost); q.Add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); } - - + + /// Add a clause to a boolean query. 
- private static void Add(BooleanQuery q, System.String k, System.String v) + private static void Add(BooleanQuery q, System.String k, System.String v) { q.Add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD)); } - - + + /// Form all ngrams for a given word. /// the word to parse /// @@ -253,16 +346,23 @@ namespace SpellChecker.Net.Search.Spell } return res; } - - - public virtual void ClearIndex() - { - IndexReader.Unlock(spellindex); - IndexWriter writer = new IndexWriter(spellindex, null, true); - writer.Close(); + + /// + /// Removes all terms from the spell check index. + /// + public virtual void ClearIndex() + { + lock (modifyCurrentIndexLock) + { + EnsureOpen(); + Directory dir = this.spellindex; + IndexWriter writer = new IndexWriter(dir, null, true, IndexWriter.MaxFieldLength.UNLIMITED); + writer.Close(); + SwapSearcher(dir); + } } - - + + /// Check whether the word exists in the index. /// String /// @@ -271,56 +371,74 @@ namespace SpellChecker.Net.Search.Spell /// public virtual bool Exist(System.String word) { - if (reader == null) + // obtainSearcher calls ensureOpen + IndexSearcher indexSearcher = ObtainSearcher(); + try + { + return indexSearcher.DocFreq(F_WORD_TERM.CreateTerm(word)) > 0; + } + finally { - reader = IndexReader.Open(spellindex); + ReleaseSearcher(indexSearcher); } - return reader.DocFreq(new Term(F_WORD, word)) > 0; } - - + + /// Index a Dictionary - /// the dictionary to index - /// + /// the dictionary to index + /// mergeFactor to use when indexing + /// the max amount or memory in MB to use /// IOException - public virtual void IndexDictionary(Dictionary dict) + /// AlreadyClosedException if the Spellchecker is already closed + public virtual void IndexDictionary(Dictionary dict, int mergeFactor, int ramMB) + { + lock (modifyCurrentIndexLock) + { + EnsureOpen(); + Directory dir = this.spellindex; + IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(), 
IndexWriter.MaxFieldLength.UNLIMITED); + writer.SetMergeFactor(mergeFactor); + writer.SetMaxBufferedDocs(ramMB); + + System.Collections.IEnumerator iter = dict.GetWordsIterator(); + while (iter.MoveNext()) + { + System.String word = (System.String)iter.Current; + + int len = word.Length; + if (len < 3) + { + continue; // too short we bail but "too long" is fine... + } + + if (this.Exist(word)) + { + // if the word already exist in the gramindex + continue; + } + + // ok index the word + Document doc = CreateDocument(word, GetMin(len), GetMax(len)); + writer.AddDocument(doc); + } + // close writer + writer.Optimize(); + writer.Close(); + // also re-open the spell index to see our own changes when the next suggestion + // is fetched: + SwapSearcher(dir); + } + } + + /// + /// Indexes the data from the given {@link Dictionary}. + /// + /// dict the dictionary to index + public void IndexDictionary(Dictionary dict) { - IndexReader.Unlock(spellindex); - IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.IndexExists(spellindex)); - writer.SetMergeFactor(300); - writer.SetMaxBufferedDocs(150); - - System.Collections.IEnumerator iter = dict.GetWordsIterator(); - while (iter.MoveNext()) - { - System.String word = (System.String) iter.Current; - - int len = word.Length; - if (len < 3) - { - continue; // too short we bail but "too long" is fine... 
- } - - if (this.Exist(word)) - { - // if the word already exist in the gramindex - continue; - } - - // ok index the word - Document doc = CreateDocument(word, GetMin(len), GetMax(len)); - writer.AddDocument(doc); - } - // close writer - writer.Optimize(); - writer.Close(); - - // close reader - reader.Close(); - reader = null; + IndexDictionary(dict, 300, 10); } - - + private int GetMin(int l) { if (l > 5) @@ -333,8 +451,8 @@ namespace SpellChecker.Net.Search.Spell } return 1; } - - + + private int GetMax(int l) { if (l > 5) @@ -347,18 +465,18 @@ namespace SpellChecker.Net.Search.Spell } return 2; } - - + + private static Document CreateDocument(System.String text, int ng1, int ng2) { Document doc = new Document(); - doc.Add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term + doc.Add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term AddGram(text, doc, ng1, ng2); return doc; } - - - private static void AddGram(System.String text, Document doc, int ng1, int ng2) + + + private static void AddGram(System.String text, Document doc, int ng1, int ng2) { int len = text.Length; for (int ng = ng1; ng <= ng2; ng++) @@ -368,28 +486,110 @@ namespace SpellChecker.Net.Search.Spell for (int i = 0; i < len - ng + 1; i++) { System.String gram = text.Substring(i, (i + ng) - (i)); - doc.Add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.Add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); if (i == 0) { - doc.Add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.Add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED)); } end = gram; } if (end != null) { // may not be present if len==ng1 - doc.Add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.Add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED)); } } } - - - ~SpellChecker() + + private IndexSearcher ObtainSearcher() + { + lock 
(searcherLock) + { + EnsureOpen(); + searcher.GetIndexReader().IncRef(); + return searcher; + } + } + + private void ReleaseSearcher(IndexSearcher aSearcher) { - if (reader != null) + // don't check if open - always decRef + // don't decrement the private searcher - could have been swapped + aSearcher.GetIndexReader().DecRef(); + } + + private void EnsureOpen() + { + if (closed) + { + throw new AlreadyClosedException("Spellchecker has been closed"); + } + } + + public void Close() + { + lock (searcherLock) { - reader.Close(); + EnsureOpen(); + closed = true; + if (searcher != null) + { + searcher.Close(); + } + searcher = null; } } + + private void SwapSearcher(Directory dir) + { + /* + * opening a searcher is possibly very expensive. + * We rather close it again if the Spellchecker was closed during + * this operation than block access to the current searcher while opening. + */ + IndexSearcher indexSearcher = CreateSearcher(dir); + lock (searcherLock) + { + if (closed) + { + indexSearcher.Close(); + throw new AlreadyClosedException("Spellchecker has been closed"); + } + if (searcher != null) + { + searcher.Close(); + } + // set the spellindex in the sync block - ensure consistency. + searcher = indexSearcher; + this.spellindex = dir; + } + } + + /// + /// Creates a new read-only IndexSearcher (for testing purposes) + /// + /// dir the directory used to open the searcher + /// a new read-only IndexSearcher. (throws IOException f there is a low-level IO error) + public virtual IndexSearcher CreateSearcher(Directory dir) + { + return new IndexSearcher(dir, true); + } + + /// + /// Returns true if and only if the {@link SpellChecker} is + /// closed, otherwise false. + /// + /// true if and only if the {@link SpellChecker} is + /// closed, otherwise false. 
+ /// + bool IsClosed() + { + return closed; + } + + ~SpellChecker() + { + this.Close(); + } } } \ No newline at end of file Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/StringDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/StringDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/StringDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/StringDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Text; + +namespace SpellChecker.Net.Search.Spell +{ + /// + /// Interface for string distances. + /// + public interface StringDistance + { + /// + /// Returns a float between 0 and 1 based on how similar the specified strings are to one another. + /// Returning a value of 1 means the specified strings are identical and 0 means the + /// string are maximally different. + /// + /// The first string. 
+ /// The second string. + /// a float between 0 and 1 based on how similar the specified strings are to one another. + float GetDistance(String s1, String s2); + + } +} Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/TRStringDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/Spell/TRStringDistance.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/TRStringDistance.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/Spell/TRStringDistance.cs Sun May 30 14:20:28 2010 @@ -16,12 +16,13 @@ */ using System; +using SpellChecker.Net.Search.Spell; namespace SpellChecker.Net.Search.Spell { /// Edit distance class - sealed class TRStringDistance + public class TRStringDistance { internal char[] sa; Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.csproj URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.csproj?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.csproj (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.csproj Sun May 30 14:20:28 2010 @@ -0,0 +1,148 @@ + + + + Local + 9.0.21022 + 2.0 + {FF45EE91-9CA3-443D-8231-75E9FA1AF40E} + Debug + AnyCPU + + + + + SpellChecker.Net + + + JScript + Grid + IE50 + false + Library + SpellChecker.Net + OnBuildSuccess + + + + + + + 3.5 + v2.0 + + + bin\Debug\ + false + 285212672 + false + + + DEBUG;TRACE + SpellChecker.Net.xml + true + 4096 + false + + + false + false + false + false + 4 + full + prompt + AllRules.ruleset + + + bin\Release\ + false + 
285212672 + false + + + TRACE + SpellChecker.Net.xml + false + 4096 + false + + + true + false + false + false + 4 + none + prompt + AllRules.ruleset + + + + False + ..\..\..\src\Lucene.Net\bin\Release\Lucene.Net.dll + + + mscorlib + + + System + + + System.Data + + + System.Design + + + System.Drawing + + + System.Management + + + System.Windows.Forms + + + System.Xml + + + + + Code + + + Code + + + + + Code + + + + Code + + + Code + + + + Code + + + Code + + + Code + + + + + + + + + + + + \ No newline at end of file Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.sln URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.sln?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.sln (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/SpellChecker.Net/SpellChecker.Net.sln Sun May 30 14:20:28 2010 @@ -1,24 +1,22 @@ -Microsoft Visual Studio Solution File, Format Version 8.00 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Net-2.0.0", "SpellChecker.Net-2.0.0.csproj", "{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}" - ProjectSection(ProjectDependencies) = postProject - EndProjectSection +Microsoft Visual Studio Solution File, Format Version 10.00 +# Visual C# Express 2008 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Net", "SpellChecker.Net.csproj", "{FF45EE91-9CA3-443D-8231-75E9FA1AF40E}" EndProject Global - GlobalSection(DPCodeReviewSolutionGUID) = preSolution - DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000} - EndGlobalSection - GlobalSection(SolutionConfiguration) = preSolution - Debug = Debug - Release = Release + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = 
Release|Any CPU EndGlobalSection - GlobalSection(ProjectConfiguration) = postSolution - {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug.ActiveCfg = Debug|.NET - {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug.Build.0 = Debug|.NET - {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release.ActiveCfg = Release|.NET - {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release.Build.0 = Release|.NET + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FF45EE91-9CA3-443D-8231-75E9FA1AF40E}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE EndGlobalSection - GlobalSection(ExtensibilityAddIns) = postSolution + GlobalSection(DPCodeReviewSolutionGUID) = preSolution + DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000} EndGlobalSection EndGlobal Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/AssemblyInfo.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/Test/AssemblyInfo.cs?rev=949519&r1=949518&r2=949519&view=diff ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/AssemblyInfo.cs (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/AssemblyInfo.cs Sun May 30 14:20:28 2010 @@ -16,7 +16,7 @@ using System.Runtime.CompilerServices; [assembly: AssemblyDefaultAlias("Lucene.Net.SpellChecker")] [assembly: AssemblyCulture("")] -[assembly: AssemblyInformationalVersionAttribute("2.0")] +[assembly: AssemblyInformationalVersionAttribute("2.9")] // // Version information for an assembly consists of the following four values: @@ -29,7 +29,7 @@ using 
System.Runtime.CompilerServices; // You can specify all the values or you can default the Revision and Build Numbers // by using the '*' as shown below: -[assembly: AssemblyVersion("2.0.0.1")] +[assembly: AssemblyVersion("2.9.2.1")] // // In order to sign your assembly you must specify a key to use. Refer to the Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.csproj URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.csproj?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.csproj (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.csproj Sun May 30 14:20:28 2010 @@ -0,0 +1,125 @@ + + + Local + 8.0.50727 + 2.0 + {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C} + Debug + AnyCPU + + + + + SpellChecker.Net.Test + + + JScript + Grid + IE50 + false + Library + SpellChecker.Net.Test + OnBuildSuccess + + + + + + + 2.0 + v3.5 + + + bin\Debug\ + false + 285212672 + false + + + DEBUG;TRACE + + + true + 4096 + false + + + false + false + false + false + 4 + full + prompt + + + bin\Release\ + false + 285212672 + false + + + TRACE + + + false + 4096 + false + + + true + false + false + false + 4 + none + prompt + + + + False + ..\..\..\src\Lucene.Net\bin\Release\Lucene.Net.dll + + + nunit.framework + D:\DEVS\NUnit\bin\nunit.framework.dll + hklm\dn\nunit.framework + + + False + ..\SpellChecker.Net\bin\Release\SpellChecker.Net.dll + + + System + + + System.Data + + + System.XML + + + + + Code + + + + + + + + Code + + + Code + + + + + + + + + + \ No newline at end of file Modified: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.sln URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.sln?rev=949519&r1=949518&r2=949519&view=diff 
============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.sln (original) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/SpellChecker.Net.Test.sln Sun May 30 14:20:28 2010 @@ -1,24 +1,22 @@ -Microsoft Visual Studio Solution File, Format Version 8.00 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Net.Test-2.0.0", "SpellChecker.Net.Test-2.0.0.csproj", "{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}" - ProjectSection(ProjectDependencies) = postProject - EndProjectSection +Microsoft Visual Studio Solution File, Format Version 10.00 +# Visual C# Express 2008 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpellChecker.Net.Test", "SpellChecker.Net.Test.csproj", "{4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}" EndProject Global - GlobalSection(DPCodeReviewSolutionGUID) = preSolution - DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000} - EndGlobalSection - GlobalSection(SolutionConfiguration) = preSolution - Debug = Debug - Release = Release + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU EndGlobalSection - GlobalSection(ProjectConfiguration) = postSolution - {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug.ActiveCfg = Debug|.NET - {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug.Build.0 = Debug|.NET - {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release.ActiveCfg = Release|.NET - {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release.Build.0 = Release|.NET + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4DCB81AA-ECC1-4B3D-A0C9-28E54F5B125C}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection - 
GlobalSection(ExtensibilityGlobals) = postSolution + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE EndGlobalSection - GlobalSection(ExtensibilityAddIns) = postSolution + GlobalSection(DPCodeReviewSolutionGUID) = preSolution + DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000} EndGlobalSection EndGlobal Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestJaroWinklerDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/Test/Test/TestJaroWinklerDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestJaroWinklerDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestJaroWinklerDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +using System; +using System.Collections.Generic; +using System.Text; + +using NUnit.Framework; + +using SpellChecker.Net.Search.Spell; + +namespace SpellChecker.Net.Test.Search.Spell +{ + [TestFixture] + public class TestJaroWinklerDistance + { + private StringDistance sd = new JaroWinklerDistance(); + + [Test] + public void TestGetDistance() + { + float d = sd.GetDistance("al", "al"); + Assert.IsTrue(d == 1.0f); + d = sd.GetDistance("martha", "marhta"); + Assert.IsTrue(d > 0.961 && d < 0.962); + d = sd.GetDistance("jones", "johnson"); + Assert.IsTrue(d > 0.832 && d < 0.833); + d = sd.GetDistance("abcvwxyz", "cabvwxyz"); + Assert.IsTrue(d > 0.958 && d < 0.959); + d = sd.GetDistance("dwayne", "duane"); + Assert.IsTrue(d > 0.84 && d < 0.841); + d = sd.GetDistance("dixon", "dicksonx"); + Assert.IsTrue(d > 0.813 && d < 0.814); + d = sd.GetDistance("fvie", "ten"); + Assert.IsTrue(d == 0f); + float d1 = sd.GetDistance("zac ephron", "zac efron"); + float d2 = sd.GetDistance("zac ephron", "kai ephron"); + Assert.IsTrue(d1 > d2); + d1 = sd.GetDistance("brittney spears", "britney spears"); + d2 = sd.GetDistance("brittney spears", "brittney startzman"); + Assert.IsTrue(d1 > d2); + } + } +} Added: lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestLevenshteinDistance.cs URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/SpellChecker.Net/Test/Test/TestLevenshteinDistance.cs?rev=949519&view=auto ============================================================================== --- lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestLevenshteinDistance.cs (added) +++ lucene/lucene.net/trunk/C#/contrib/SpellChecker.Net/Test/Test/TestLevenshteinDistance.cs Sun May 30 14:20:28 2010 @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Text; +using SpellChecker.Net.Search.Spell; +using NUnit.Framework; + +namespace SpellChecker.Net.Test.Search.Spell +{ + [TestFixture] + public class TestLevenshteinDistance + { + private StringDistance sd = new LevenshteinDistance(); + + [Test] + public void TestGetDistance() + { + float d = sd.GetDistance("al", "al"); + Assert.AreEqual(d, 1.0f, 0.001); + d = sd.GetDistance("martha", "marhta"); + Assert.AreEqual(d, 0.6666, 0.001); + d = sd.GetDistance("jones", "johnson"); + Assert.AreEqual(d, 0.4285, 0.001); + d = sd.GetDistance("abcvwxyz", "cabvwxyz"); + Assert.AreEqual(d, 0.75, 0.001); + d = sd.GetDistance("dwayne", "duane"); + Assert.AreEqual(d, 0.666, 0.001); + d = sd.GetDistance("dixon", "dicksonx"); + Assert.AreEqual(d, 0.5, 0.001); + d = sd.GetDistance("six", "ten"); + Assert.AreEqual(d, 0, 0.001); + float d1 = sd.GetDistance("zac ephron", "zac efron"); + float d2 = sd.GetDistance("zac ephron", "kai ephron"); + Assert.AreEqual(d1, d2, 0.001); + d1 = sd.GetDistance("brittney spears", "britney spears"); + d2 = sd.GetDistance("brittney spears", "brittney startzman"); + Assert.True(d1 > d2); + } + + [Test] + public void TestEmpty() + { + float d = sd.GetDistance("", "al"); + Assert.AreEqual(d, 0.0f, 0.001); + } + + } +}