Return-Path: X-Original-To: apmail-commons-commits-archive@minotaur.apache.org Delivered-To: apmail-commons-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 98D0D9CD1 for ; Thu, 8 Mar 2012 20:57:01 +0000 (UTC) Received: (qmail 27373 invoked by uid 500); 8 Mar 2012 20:57:01 -0000 Delivered-To: apmail-commons-commits-archive@commons.apache.org Received: (qmail 27258 invoked by uid 500); 8 Mar 2012 20:57:01 -0000 Mailing-List: contact commits-help@commons.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@commons.apache.org Delivered-To: mailing list commits@commons.apache.org Received: (qmail 27250 invoked by uid 99); 8 Mar 2012 20:57:01 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 08 Mar 2012 20:57:01 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 08 Mar 2012 20:56:57 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 6A066238890A for ; Thu, 8 Mar 2012 20:56:36 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: svn commit: r1298576 - /commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Date: Thu, 08 Mar 2012 20:56:36 -0000 To: commits@commons.apache.org From: tn@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120308205636.6A066238890A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: tn Date: Thu Mar 8 20:56:35 2012 New Revision: 1298576 URL: http://svn.apache.org/viewvc?rev=1298576&view=rev Log: [CODEC-63] Added explanation for different results to dropby.com, Raised CC to 100/100 
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298576&r1=1298575&r2=1298576&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java (original) +++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Thu Mar 8 20:56:35 2012 @@ -49,6 +49,15 @@ public class NysiisTest extends StringEn } @Test + public void testTrueVariant() { + Nysiis encoder = new Nysiis(true); + + String encoded = encoder.encode("WESTERLUND"); + Assert.assertTrue(encoded.length() <= 6); + Assert.assertEquals("WASTAR", encoded); + } + + @Test public void testBran() throws EncoderException { encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN"); } @@ -71,6 +80,17 @@ public class NysiisTest extends StringEn } @Test + public void testSpecialBranches() throws EncoderException { + this.encodeAll(new String[] { "Kobwick" }, "CABWAC"); + this.encodeAll(new String[] { "Kocher" }, "CACAR"); + this.encodeAll(new String[] { "Fesca" }, "FASC"); + this.encodeAll(new String[] { "Shom" }, "SAN"); + this.encodeAll(new String[] { "Ohlo" }, "OL"); + this.encodeAll(new String[] { "Uhu" }, "UH"); + this.encodeAll(new String[] { "Um" }, "UN"); + } + + @Test public void testDropBy() throws EncoderException { List testValues = Arrays.asList( @@ -112,16 +132,62 @@ public class NysiisTest extends StringEn */ @Test public void testDropBy2() throws EncoderException { + // Explanation of differences between this implementation and the one at dropby.com. + // + // Algorithm (taken from www.dropby.com/NYSIIS.html): + // + // 1. 
Transcode first characters of name: + // MAC » MCC + // KN » NN + // K » C + // PH » FF + // PF » FF + // SCH » SSS + // + // 2. Transcode last characters of name: + // EE, IE » Y + // DT,RT,RD,NT,ND » D + // + // 3. First character of key = first character of name. + // + // 4. Transcode remaining characters by following these rules, incrementing by one character each time: + // 4a. EV » AF else A,E,I,O,U » A + // 4b. Q » G + // 4c. Z » S + // 4d. M » N + // 4e. KN » N else K » C + // 4f. SCH » SSS + // 4g. PH » FF + // 4h. H » If previous or next is nonvowel, previous + // 4i. W » If previous is vowel, previous + // 4j. Add current to key if current != last key character + // + // 5. If last character is S, remove it + // 6. If last characters are AY, replace with Y + // 7. If last character is A, remove it + // 8. Collapse all strings of repeated characters + // 9. Add original first character of name as first character of key + List testValues = Arrays.asList( // http://www.dropby.com/indexLF.html?content=/NYSIIS.html // 1. Transcode first characters of name new String[] { "MACINTOSH", "MCANT" }, - //new String[] { "KNUTH", "NNATH" }, // Original: NNAT; modified: NATH - //new String[] { "KOEHN", "C" }, - //new String[] { "PHILLIPSON", "FFALAP" }, - //new String[] { "PFEISTER", "FFASTA" }, - //new String[] { "SCHOENHOEFT", "SSANAF" }, + // violates 4j: the second N should not be added, as the first + // key char is already a N + new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH + // O and E are transcoded to A because of rule 4a + // H also to A because of rule 4h + // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h + // that skips the next char in such a case? 
+ // the remaining A is removed because of rule 7 + new String[] { "KOEHN", "CAN" }, // Original: C + // violates 4j: see also KNUTH + new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN] + // violates 4j: see also KNUTH + new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R] + // violates 4j: see also KNUTH + new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T] // http://www.dropby.com/indexLF.html?content=/NYSIIS.html // 2.Transcode last characters of name: new String[] { "MCKEE", "MCY" }, @@ -139,14 +205,21 @@ public class NysiisTest extends StringEn new String[] { "BOWMAN", "BANAN" }, new String[] { "MCKNIGHT", "MCNAGT" }, new String[] { "RICKERT", "RACAD" }, - //new String[] { "DEUTSCH", "DATS" }, + // violates 5: the last S is not removed + // when comparing to DEUTS, which is phonetically similar + // the result is also DAT, which is correct for DEUTSCH too imo + new String[] { "DEUTSCH", "DAT" }, // Original: DATS new String[] { "WESTPHAL", "WASTFAL" }, - //new String[] { "SHRIVER", "SHRAVA" }, - //new String[] { "KUHL", "C" }, + // violates 4h: the H should be transcoded to S and thus ignored as + // the first key character is also S + new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R] + // same as KOEHN, the L gets mysteriously lost, the correct one + new String[] { "KUHL", "CAL" }, // Original: C new String[] { "RAWSON", "RASAN" }, // If last character is S, remove it new String[] { "JILES", "JAL" }, - //new String[] { "CARRAWAY", "CARAY" }, + // violates 6: if the last two characters are AY, remove A + new String[] { "CARRAWAY", "CARY" }, // Original: CARAY new String[] { "YAMADA", "YANAD" }); for (String[] arr : testValues) {