Return-Path: X-Original-To: apmail-lucene-lucene-net-commits-archive@www.apache.org Delivered-To: apmail-lucene-lucene-net-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 0F72F91A5 for ; Fri, 23 Mar 2012 02:12:12 +0000 (UTC) Received: (qmail 13015 invoked by uid 500); 23 Mar 2012 02:12:12 -0000 Delivered-To: apmail-lucene-lucene-net-commits-archive@lucene.apache.org Received: (qmail 12990 invoked by uid 500); 23 Mar 2012 02:12:11 -0000 Mailing-List: contact lucene-net-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucene.apache.org Delivered-To: mailing list lucene-net-commits@lucene.apache.org Received: (qmail 12978 invoked by uid 99); 23 Mar 2012 02:12:11 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 23 Mar 2012 02:12:11 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 23 Mar 2012 02:12:06 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id CEE5423888EA; Fri, 23 Mar 2012 02:11:44 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r1304164 - in /incubator/lucene.net/trunk: src/contrib/Analyzers/ src/contrib/Analyzers/De/ test/contrib/Analyzers/ test/contrib/Analyzers/De/ Date: Fri, 23 Mar 2012 02:11:44 -0000 To: lucene-net-commits@lucene.apache.org From: ccurrens@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120323021144.CEE5423888EA@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: ccurrens Date: Fri Mar 23 02:11:43 2012 New Revision: 1304164 URL: 
http://svn.apache.org/viewvc?rev=1304164&view=rev Log: [LUCENENET-466] - added a DIN-5007-2 stemmer to GermanAnalyzer, as well as new constructors to specify its use if desired. TestGermanStemFilter's TestStemming is now renamed to TestDin1Stemming, and TestDin2Stemming has been added for GermanStemmerDIN2 Added: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original) +++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Fri Mar 23 02:11:43 2012 @@ -100,6 +100,7 @@ + Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs (original) +++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs Fri Mar 23 02:11:43 2012 @@ -88,6 +88,7 @@ namespace Lucene.Net.Analysis.De private ISet 
exclusionSet; private Version matchVersion; + private readonly bool _useDin2Stemmer; /// /// Builds an analyzer with the default stop words: @@ -95,7 +96,7 @@ namespace Lucene.Net.Analysis.De /// [Obsolete("Use GermanAnalyzer(Version) instead")] public GermanAnalyzer() - : this(Version.LUCENE_23) + : this(Version.LUCENE_CURRENT) { } @@ -108,7 +109,15 @@ namespace Lucene.Net.Analysis.De { } /// - /// Builds an analyzer with the given stop words. + /// Builds an analyzer with the default stop words: + /// + /// + public GermanAnalyzer(Version matchVersion, bool useDin2Stemmer) + : this(matchVersion, DefaultSetHolder.DEFAULT_SET, useDin2Stemmer) + { } + + /// + /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer /// /// Lucene compatibility version /// a stopword set @@ -120,15 +129,41 @@ namespace Lucene.Net.Analysis.De /// /// Builds an analyzer with the given stop words /// + /// Lucene compatibility version + /// a stopword set + /// Specifies if the DIN-5007-2 style stemmer should be used. Commonly referred to as + /// phone book sorting, since it was defined to be used with names, rather than words + public GermanAnalyzer(Version matchVersion, ISet stopwords, bool useDin2Stemmer) + : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, useDin2Stemmer) + { + } + + /// + /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer + /// /// lucene compatibility version /// a stopword set /// a stemming exclusion set public GermanAnalyzer(Version matchVersion, ISet stopwords, ISet stemExclusionSet) + : this(matchVersion, stopwords, stemExclusionSet, false) + { } + + + /// + /// Builds an analyzer with the given stop words + /// + /// lucene compatibility version + /// a stopword set + /// a stemming exclusion set + /// Specifies if the DIN-5007-2 style stemmer should be used. 
Commonly referred to as + /// phone book sorting, since it was defined to be used with names, rather than words + public GermanAnalyzer(Version matchVersion, ISet stopwords, ISet stemExclusionSet, bool useDin2Stemmer) { stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords)); exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet)); - SetOverridesTokenStreamMethod(); this.matchVersion = matchVersion; + _useDin2Stemmer = useDin2Stemmer; + SetOverridesTokenStreamMethod(); } /// @@ -202,7 +237,7 @@ namespace Lucene.Net.Analysis.De result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet); - result = new GermanStemFilter(result, exclusionSet); + result = new GermanStemFilter(result, exclusionSet, _useDin2Stemmer); return result; } } Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs (original) +++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs Fri Mar 23 02:11:43 2012 @@ -43,11 +43,12 @@ namespace Lucene.Net.Analysis.De private TermAttribute termAtt; public GermanStemFilter(TokenStream _in) - : base(_in) - { - stemmer = new GermanStemmer(); - termAtt = AddAttribute(); - } + : this(_in, false) + { } + + public GermanStemFilter(TokenStream _in, bool useDin2Stemmer) + : this(_in, null, useDin2Stemmer) + { } /// /// Builds a GermanStemFilter that uses an exclusiontable. 
@@ -55,9 +56,22 @@ namespace Lucene.Net.Analysis.De /// /// public GermanStemFilter(TokenStream _in, ISet exclusiontable) - : this(_in) + : this(_in, exclusiontable, false) + { } + + /// + /// Builds a GermanStemFilter that uses an exclusiontable. + /// + /// + /// + /// Specifies whether to use the DIN-5007-2 (names) + /// stemmer instead of the default DIN-5007-1 (words) stemmer + public GermanStemFilter(TokenStream _in, ISet exclusiontable, bool useDin2Stemmer) + : base(_in) { exclusionSet = exclusiontable; + stemmer = useDin2Stemmer ? new GermanStemmerDIN2() : new GermanStemmer(); + termAtt = AddAttribute(); } /// Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs (original) +++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs Fri Mar 23 02:11:43 2012 @@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.De /// /// Amount of characters that are removed with Substitute() while stemming. /// - private int substCount = 0; + protected int substCount = 0; /// /// Stemms the given term to an unique discriminator. @@ -187,32 +187,16 @@ namespace Lucene.Net.Analysis.De for ( int c = 0; c < buffer.Length; c++ ) { // Replace the second char of a pair of the equal characters with an asterisk - if ( c > 0 && buffer[c] == buffer[c - 1]) - { - buffer[c] = '*'; - } - // Substitute Umlauts. - else if ( buffer[c] == 'ä' ) - { - buffer[c] = 'a'; - } - else if ( buffer[c] == 'ö' ) - { - buffer[c] = 'o'; - } - else if ( buffer[c] == 'ü' ) - { - buffer[c] = 'u'; - } - // Fix bug so that 'ß' at the end of a word is replaced. 
- else if ( buffer[c] == 'ß' ) - { - - buffer[c] = 's'; - buffer.Insert(c + 1, 's'); - substCount++; - } - // Take care that at least one character is left left side from the current one + if (c > 0 && buffer[c] == buffer[c - 1]) + { + buffer[c] = '*'; + } + // Substitute Umlauts. + else + { + SubstituteUmlauts(buffer, c); + } + // Take care that at least one character is left left side from the current one if ( c < buffer.Length - 1 ) { // Masking several common character combinations with an token @@ -257,7 +241,30 @@ namespace Lucene.Net.Analysis.De } } - /// + protected virtual void SubstituteUmlauts(StringBuilder buffer, int c) + { + if (buffer[c] == 'ä') + { + buffer[c] = 'a'; + } + else if (buffer[c] == 'ö') + { + buffer[c] = 'o'; + } + else if (buffer[c] == 'ü') + { + buffer[c] = 'u'; + } + // Fix bug so that 'ß' at the end of a word is replaced. + else if (buffer[c] == 'ß') + { + buffer[c] = 's'; + buffer.Insert(c + 1, 's'); + substCount++; + } + } + + /// /// Undoes the changes made by Substitute(). That are character pairs and /// character combinations. Umlauts will remain as their corresponding vowel, /// as "?" remains as "ss". Added: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs?rev=1304164&view=auto ============================================================================== --- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs (added) +++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs Fri Mar 23 02:11:43 2012 @@ -0,0 +1,41 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Analysis.De +{ + /// + /// A stemmer for the german language that uses the + /// DIN-5007-2 "Phone Book" rules for handling + /// umlaut characters. 
+ /// + public sealed class GermanStemmerDIN2 : GermanStemmer + { + protected override void SubstituteUmlauts(StringBuilder buffer, int c) + { + if (buffer[c] == 'ä') + { + buffer[c] = 'a'; + buffer.Insert(c + 1, 'e'); + } + else if (buffer[c] == 'ö') + { + buffer[c] = 'o'; + buffer.Insert(c + 1, 'e'); + } + else if (buffer[c] == 'ü') + { + buffer[c] = 'u'; + buffer.Insert(c + 1, 'e'); + } + // Fix bug so that 'ß' at the end of a word is replaced. + else if (buffer[c] == 'ß') + { + buffer[c] = 's'; + buffer.Insert(c + 1, 's'); + substCount++; + } + } + } +} Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original) +++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Fri Mar 23 02:11:43 2012 @@ -170,6 +170,9 @@ PreserveNewest + + PreserveNewest + PreserveNewest Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff ============================================================================== --- incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs (original) +++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Fri Mar 23 02:11:43 2012 @@ -39,12 +39,12 @@ namespace Lucene.Net.Analyzers.De public class TestGermanStemFilter : BaseTokenStreamTestCase { [Test] - public void TestStemming() + public void TestDin1Stemming() { // read test cases from external file: - string testFile = @"De\data.txt"; - using (FileStream fis = new FileStream(testFile, 
FileMode.Open, FileAccess.Read, FileShare.Read)) - using (StreamReader breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1"))) + const string testFile = @"De\data.txt"; + using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read)) + using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1"))) { while (true) { @@ -56,7 +56,28 @@ namespace Lucene.Net.Analyzers.De continue; // ignore comments and empty lines String[] parts = line.Split(';'); //System.out.println(parts[0] + " -- " + parts[1]); - Check(parts[0], parts[1]); + Check(parts[0], parts[1], false); + } + } + } + + [Test] + public void TestDin2Stemming() + { + // read test cases from external file: + const string testFile = @"De\data_din2.txt"; + using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read)) + using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1"))) + { + string line; + while ((line = breader.ReadLine()) != null) + { + line = line.Trim(); + if (line.StartsWith("#") || string.IsNullOrEmpty(line)) + continue; // ignore comments and empty lines + + var parts = line.Split(';'); + Check(parts[0], parts[1], true); } } } @@ -73,7 +94,7 @@ namespace Lucene.Net.Analyzers.De /** * subclass that acts just like whitespace analyzer for testing */ - private class GermanSubclassAnalyzer : GermanAnalyzer + private sealed class GermanSubclassAnalyzer : GermanAnalyzer { public GermanSubclassAnalyzer(Version matchVersion) : base(matchVersion) @@ -99,15 +120,15 @@ namespace Lucene.Net.Analyzers.De [Test] public void TestExclusionTableReuse() { - GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); + var a = new GermanAnalyzer(Version.LUCENE_CURRENT); CheckReuse(a, "tischen", "tisch"); - a.SetStemExclusionTable(new String[] { "tischen" }); + a.SetStemExclusionTable(new[] { "tischen" }); CheckReuse(a, "tischen", "tischen"); } - private void Check(String input, String expected) + private void 
Check(String input, String expected, bool useDin2) { - CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected); + CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT, useDin2), input, expected); } private void CheckReuse(Analyzer a, String input, String expected) Added: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt?rev=1304164&view=auto ============================================================================== --- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt (added) +++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Fri Mar 23 02:11:43 2012 @@ -0,0 +1,50 @@ +# German special characters are replaced: +häufig;haeufig +üor;ueor +björk;bjoerk + +# here the stemmer works okay, it maps related words to the same stem: +abschließen;abschliess +abschließender;abschliess +abschließendes;abschliess +abschließenden;abschliess + +Tisch;tisch +Tische;tisch +Tischen;tisch + +Haus;hau +Hauses;hau +Häuser;haeu +Häusern;haeu +# here's a case where overstemming occurs, i.e. a word is +# mapped to the same stem as unrelated words: +hauen;hau + +# here's a case where understemming occurs, i.e. two related words +# are not mapped to the same stem. This is the case with basically +# all irregular forms: +Drama;drama +Dramen;dram + +# replace "ß" with 'ss': +Ausmaß;ausmass + +# fake words to test if suffixes are cut off: +xxxxxe;xxxxx +xxxxxs;xxxxx +xxxxxn;xxxxx +xxxxxt;xxxxx +xxxxxem;xxxxx +xxxxxer;xxxxx +xxxxxnd;xxxxx +# the suffixes are also removed when combined: +xxxxxetende;xxxxx + +# words that are shorter than four characters are not changed: +xxe;xxe +# -em and -er are not removed from words shorter than five characters: +xxem;xxem +xxer;xxer +# -nd is not removed from words shorter than six characters: +xxxnd;xxxnd