commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t.@apache.org
Subject svn commit: r1298576 - /commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
Date Thu, 08 Mar 2012 20:56:36 GMT
Author: tn
Date: Thu Mar  8 20:56:35 2012
New Revision: 1298576

URL: http://svn.apache.org/viewvc?rev=1298576&view=rev
Log:
[CODEC-63] Added explanation for different results to dropby.com, Raised CC to 100/100

Modified:
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298576&r1=1298575&r2=1298576&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Thu Mar  8 20:56:35 2012
@@ -49,6 +49,15 @@ public class NysiisTest extends StringEn
     }
 
     @Test
+    public void testTrueVariant() {
+        Nysiis encoder = new Nysiis(true);
+
+        String encoded = encoder.encode("WESTERLUND");
+        Assert.assertTrue(encoded.length() <= 6);
+        Assert.assertEquals("WASTAR", encoded);
+    }
+
+    @Test
     public void testBran() throws EncoderException {
         encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
     }
@@ -71,6 +80,17 @@ public class NysiisTest extends StringEn
     }
 
     @Test
+    public void testSpecialBranches() throws EncoderException {
+        this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
+        this.encodeAll(new String[] { "Kocher" }, "CACAR");
+        this.encodeAll(new String[] { "Fesca" }, "FASC");
+        this.encodeAll(new String[] { "Shom" }, "SAN");
+        this.encodeAll(new String[] { "Ohlo" }, "OL");
+        this.encodeAll(new String[] { "Uhu" }, "UH");
+        this.encodeAll(new String[] { "Um" }, "UN");
+    }
+
+    @Test
     public void testDropBy() throws EncoderException {
         List<String[]> testValues =
                 Arrays.asList(
@@ -112,16 +132,62 @@ public class NysiisTest extends StringEn
      */
     @Test
     public void testDropBy2() throws EncoderException {
+        // Explanation of differences between this implementation and the one at dropby.com.
+        //
+        // Algorithm (taken from www.dropby.com/NYSIIS.html):
+        //
+        // 1.  Transcode first characters of name:
+        //    MAC »   MCC
+        //    KN  »   NN
+        //    K   »   C
+        //    PH  »   FF
+        //    PF  »   FF
+        //    SCH »   SSS
+        //
+        // 2.  Transcode last characters of name:
+        //    EE, IE  »   Y
+        //    DT,RT,RD,NT,ND  »   D
+        //
+        // 3.  First character of key = first character of name.
+        //
+        // 4.  Transcode remaining characters by following these rules, incrementing by one character each time:
+        //   4a.   EV  »   AF  else A,E,I,O,U » A
+        //   4b.   Q   »   G
+        //   4c.   Z   »   S
+        //   4d.   M   »   N
+        //   4e.   KN  »   N   else K » C
+        //   4f.   SCH     »   SSS
+        //   4g.   PH  »   FF
+        //   4h.   H   »   If previous or next is nonvowel, previous
+        //   4i.   W   »   If previous is vowel, previous
+        //   4j.   Add current to key if current != last key character
+        //
+        // 5.  If last character is S, remove it
+        // 6.  If last characters are AY, replace with Y
+        // 7.  If last character is A, remove it
+        // 8.  Collapse all strings of repeated characters
+        // 9.  Add original first character of name as first character of key
+
         List<String[]> testValues =
                 Arrays.asList(
                         // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
                         // 1. Transcode first characters of name
                         new String[] { "MACINTOSH", "MCANT" },
-                        //new String[] { "KNUTH", "NNATH" }, // Original: NNAT; modified: NATH
-                        //new String[] { "KOEHN", "C" },
-                        //new String[] { "PHILLIPSON", "FFALAP" },
-                        //new String[] { "PFEISTER", "FFASTA" },
-                        //new String[] { "SCHOENHOEFT", "SSANAF" },
+                        // violates 4j: the second N should not be added, as the first
+                        //              key char is already a N
+                        new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
+                        // O and E are transcoded to A because of rule 4a
+                        // H also to A because of rule 4h
+                        // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
+                        // that skips the next char in such a case?
+                        // the remaining A is removed because of rule 7
+                        new String[] { "KOEHN", "CAN" }, // Original: C
+                        // violates 4j: see also KNUTH
+                        new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
+                        // violates 4j: see also KNUTH
+                        new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
+                        // violates 4j: see also KNUTH
+                        new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
                         // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
                         // 2.Transcode last characters of name: 
                         new String[] { "MCKEE", "MCY" },
@@ -139,14 +205,21 @@ public class NysiisTest extends StringEn
                         new String[] { "BOWMAN", "BANAN" },
                         new String[] { "MCKNIGHT", "MCNAGT" },
                         new String[] { "RICKERT", "RACAD" },
-                        //new String[] { "DEUTSCH", "DATS" },
+                        // violates 5: the last S is not removed
+                        // when comparing to DEUTS, which is phonetically similar
+                        // the result is also DAT, which is correct for DEUTSCH too imo
+                        new String[] { "DEUTSCH", "DAT" }, // Original: DATS
                         new String[] { "WESTPHAL", "WASTFAL" },
-                        //new String[] { "SHRIVER", "SHRAVA" },
-                        //new String[] { "KUHL", "C" },
+                        // violates 4h: the H should be transcoded to S and thus ignored as
+                        // the first key character is also S
+                        new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
+                        // same as KOEHN: the L gets mysteriously lost in the original; CAL is the correct one
+                        new String[] { "KUHL", "CAL" }, // Original: C
                         new String[] { "RAWSON", "RASAN" },
                         // If last character is S, remove it
                         new String[] { "JILES", "JAL" },
-                        //new String[] { "CARRAWAY", "CARAY" },
+                        // violates 6: if the last two characters are AY, remove A
+                        new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
                         new String[] { "YAMADA", "YANAD" });
 
         for (String[] arr : testValues) {



Mime
View raw message