Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id AC359200CB4 for ; Tue, 27 Jun 2017 22:33:51 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id AB22D160BD8; Tue, 27 Jun 2017 20:33:51 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 954ED160BF9 for ; Tue, 27 Jun 2017 22:33:48 +0200 (CEST) Received: (qmail 74993 invoked by uid 500); 27 Jun 2017 20:33:47 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 74847 invoked by uid 99); 27 Jun 2017 20:33:47 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 27 Jun 2017 20:33:47 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id CD676E967B; Tue, 27 Jun 2017 20:33:46 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: nightowl888@apache.org To: commits@lucenenet.apache.org Date: Tue, 27 Jun 2017 20:33:51 -0000 Message-Id: <2b2592e370dd4cd78099fa2807a138b7@git.apache.org> In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [06/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic + tests. Rather than porting over the entire commons-codec library, only the language features were ported and added to this library. archived-at: Tue, 27 Jun 2017 20:33:51 -0000 http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs new file mode 100644 index 0000000..07e7f66 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs @@ -0,0 +1,111 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class DoubleMetaphoneFilterTest : BaseTokenStreamTestCase + { + [Test] + public void TestSize4FalseInject() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); + AssertTokenStreamContents(filter, new String[] { "ANTR" }); + } + + [Test] + public void TestSize4TrueInject() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true); + AssertTokenStreamContents(filter, new String[] { "international", "ANTR" }); + } + [Test] + public void TestAlternateInjectFalse() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false); + AssertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" }); + } + [Test] + public void TestSize8FalseInject() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); + AssertTokenStreamContents(filter, new String[] { "ANTRNXNL" }); + } + [Test] + public void TestNonConvertableStringsWithInject() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true); + AssertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); + } + + [Test] + public void TestNonConvertableStringsWithoutInject() + { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&")); + TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false); + AssertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" }); + + // should have something after the stream + stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello")); + filter = new DoubleMetaphoneFilter(stream, 8, false); + AssertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" }); + } + + [Test] + public void TestRandom() + { + int codeLen = TestUtil.NextInt(Random(), 1, 8); + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, false)); + }); + + CheckRandomData(Random(), a, 1000 * RANDOM_MULTIPLIER); + + Analyzer b = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, true)); + }); + + CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER); + } + + [Test] + public void TestEmptyTerm() + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, 8, Random().nextBoolean())); + }); + + CheckOneTerm(a, "", ""); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs new file mode 100644 index 0000000..bd3681b --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs @@ -0,0 +1,255 @@ +using NUnit.Framework; +using System; +using System.Globalization; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Tests BeiderMorseEncoder. + /// + public class BeiderMorseEncoderTest : StringEncoderAbstractTest + { + private static readonly char[] TEST_CHARS = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u' }; + + private void AssertNotEmpty(BeiderMorseEncoder bmpm, string value) + { + Assert.False(bmpm.Encode(value).Equals(""), value); + } + + private BeiderMorseEncoder CreateGenericApproxEncoder() + { + BeiderMorseEncoder encoder = new BeiderMorseEncoder(); + encoder.NameType=(NameType.GENERIC); + encoder.RuleType=(RuleType.APPROX); + return encoder; + } + + protected override BeiderMorseEncoder CreateStringEncoder() + { + return new BeiderMorseEncoder(); + } + + /** + * Tests we do not blow up. + * + * @throws EncoderException + */ + [Test] + public void TestAllChars() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + for (char c = char.MinValue; c < char.MaxValue; c++) + { + bmpm.Encode(c.ToString()); + } + } + + [Test] + public void TestAsciiEncodeNotEmpty1Letter() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + for (char c = 'a'; c <= 'z'; c++) + { + string value = c.ToString(); + string valueU = value.ToUpperInvariant(); + AssertNotEmpty(bmpm, value); + AssertNotEmpty(bmpm, valueU); + } + } + + [Test] + public void TestAsciiEncodeNotEmpty2Letters() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + for (char c1 = 'a'; c1 <= 'z'; c1++) + { + for (char c2 = 'a'; c2 <= 'z'; c2++) + { + String value = new String(new char[] { c1, c2 }); + String valueU = value.ToUpperInvariant(); + AssertNotEmpty(bmpm, value); + AssertNotEmpty(bmpm, valueU); + } + } + } + + [Test] + public void TestEncodeAtzNotEmpty() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + //String[] names = { "ácz", "átz", "Ignácz", "Ignátz", "Ignác" }; + String[] + names = { "\u00e1cz", "\u00e1tz", "Ign\u00e1cz", "Ign\u00e1tz", "Ign\u00e1c" }; + foreach (String name in names) + { + AssertNotEmpty(bmpm, name); + } + } + + /** + * Tests https://issues.apache.org/jira/browse/CODEC-125?focusedCommentId=13071566&page=com.atlassian.jira.plugin.system.issuetabpanels: + * comment-tabpanel#comment-13071566 + * + * @throws EncoderException + */ + [Test] + public void TestEncodeGna() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + bmpm.Encode("gna"); + } + + [Test]//@Test(expected = IllegalArgumentException.class) + public void TestInvalidLangIllegalArgumentException() + { + Assert.Throws(() => Rule.GetInstance(NameType.GENERIC, RuleType.APPROX, "noSuchLanguage")); + } + + [Test]//@Test(expected = IllegalStateException.class) + public void TestInvalidLangIllegalStateException() + { + Assert.Throws(() => Lang.LoadFromResource("thisIsAMadeUpResourceName", Languages.GetInstance(NameType.GENERIC))); + } + + [Test]//@Test(expected = IllegalArgumentException.class) + public void TestInvalidLanguageIllegalArgumentException() + { + Assert.Throws(() => Languages.GetInstance("thereIsNoSuchLanguage")); + } + + [Test]//@Test(timeout = 10000L) + public void TestLongestEnglishSurname() + { + BeiderMorseEncoder bmpm = CreateGenericApproxEncoder(); + bmpm.Encode("MacGhilleseatheanaich"); + } + + [Test]//@Test(expected = IndexOutOfBoundsException.class) + public void TestNegativeIndexForRuleMatchIndexOutOfBoundsException() + { + Assert.Throws(() => + { + Rule r = new Rule("a", "", "", new Phoneme("", Languages.ANY_LANGUAGE)); + r.PatternAndContextMatches("bob", -1); + }); + } + + [Test] + public void TestOOM() + { + String phrase = "200697900'-->�aadaabcf\"aedfbff?>cae" + + "cfaaa>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" />" + + "