commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t.@apache.org
Subject svn commit: r1298958 - in /commons/proper/codec/trunk/src: main/java/org/apache/commons/codec/language/Nysiis.java test/java/org/apache/commons/codec/language/NysiisTest.java
Date Fri, 09 Mar 2012 18:22:10 GMT
Author: tn
Date: Fri Mar  9 18:22:09 2012
New Revision: 1298958

URL: http://svn.apache.org/viewvc?rev=1298958&view=rev
Log:
[CODEC-63] Merged duplicate unit tests, added algorithm outline to class description

Modified:
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java?rev=1298958&r1=1298957&r2=1298958&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Nysiis.java Fri Mar  9 18:22:09 2012
@@ -27,11 +27,42 @@ import org.apache.commons.codec.StringEn
  * 
  * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
  * general purpose scheme to find words with similar phonemes.
- * 
+ *
  * <p>
  * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
  * </p>
- * 
+ *
+ * <p>Algorithm description:
+ * <pre>
+ * 1. Transcode first characters of name
+ *   1a. MAC ->   MCC
+ *   1b. KN  ->   NN
+ *   1c. K   ->   C
+ *   1d. PH  ->   FF
+ *   1e. PF  ->   FF
+ *   1f. SCH ->   SSS
+ * 2. Transcode last characters of name
+ *   2a. EE, IE          ->   Y
+ *   2b. DT,RT,RD,NT,ND  ->   D
+ * 3. First character of key = first character of name
+ * 4. Transcode remaining characters by following these rules, incrementing by one character each time
+ *   4a. EV  ->   AF  else A,E,I,O,U -> A
+ *   4b. Q   ->   G
+ *   4c. Z   ->   S
+ *   4d. M   ->   N
+ *   4e. KN  ->   N   else K -> C
+ *   4f. SCH ->   SSS
+ *   4g. PH  ->   FF
+ *   4h. H   ->   If previous or next is nonvowel, previous
+ *   4i. W   ->   If previous is vowel, previous
+ *   4j. Add current to key if current != last key character
+ * 5. If last character is S, remove it
+ * 6. If last characters are AY, replace with Y
+ * 7. If last character is A, remove it
+ * 8. Collapse all strings of repeated characters
+ * 9. Add original first character of name as first character of key
+ * </pre></p>
+ *
  * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">http://en.wikipedia.org/wiki/NYSIIS</a>
  * @see <a href="http://www.dropby.com/NYSIIS.html">http://www.dropby.com/NYSIIS.html</a>
  * @see Soundex
@@ -39,24 +70,24 @@ import org.apache.commons.codec.StringEn
  */
 public class Nysiis implements StringEncoder {
 
-    private static final char[] CHARS_A = new char[] { 'A' };
-    private static final char[] CHARS_AF = new char[] { 'A', 'F' };
-    private static final char[] CHARS_C = new char[] { 'C' };
-    private static final char[] CHARS_FF = new char[] { 'F', 'F' };
-    private static final char[] CHARS_G = new char[] { 'G' };
-    private static final char[] CHARS_N = new char[] { 'N' };
-    private static final char[] CHARS_NN = new char[] { 'N', 'N' };
-    private static final char[] CHARS_S = new char[] { 'S' };
+    private static final char[] CHARS_A   = new char[] { 'A' };
+    private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
+    private static final char[] CHARS_C   = new char[] { 'C' };
+    private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
+    private static final char[] CHARS_G   = new char[] { 'G' };
+    private static final char[] CHARS_N   = new char[] { 'N' };
+    private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
+    private static final char[] CHARS_S   = new char[] { 'S' };
     private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
-    
-    private static final Pattern PAT_MAC = Pattern.compile("^MAC");
-    private static final Pattern PAT_KN = Pattern.compile("^KN");
-    private static final Pattern PAT_K = Pattern.compile("^K");
-    private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
-    private static final Pattern PAT_SCH = Pattern.compile("^SCH");
-    private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
+
+    private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
+    private static final Pattern PAT_KN     = Pattern.compile("^KN");
+    private static final Pattern PAT_K      = Pattern.compile("^K");
+    private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
+    private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
+    private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
     private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
-    
+
     private static final char SPACE = ' ';
     private static final int TRUE_LENGTH = 6;
 

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java?rev=1298958&r1=1298957&r2=1298958&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/NysiisTest.java Fri Mar  9 18:22:09 2012
@@ -17,9 +17,6 @@
 
 package org.apache.commons.codec.language;
 
-import java.util.Arrays;
-import java.util.List;
-
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;
 import org.apache.commons.codec.StringEncoderAbstractTest;
@@ -83,100 +80,37 @@ public class NysiisTest extends StringEn
         this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
     }
 
-    @Test
-    public void testDropBy() throws EncoderException {
-        this.assertEncodings(
-                new String[] { "MACINTOSH", "MCANT" },
-                new String[] { "KNUTH", "NAT"   },
-                new String[] { "KOEHN", "CAN" },
-                new String[] { "PHILLIPSON", "FALAPSAN" },
-                new String[] { "PFEISTER", "FASTAR" },
-                new String[] { "MCKEE", "MCY" },
-                new String[] { "MACKIE", "MCY" },
-                new String[] { "HEITSCHMIDT", "HATSNAD" },
-                new String[] { "BART", "BAD" },
-                new String[] { "HURD", "HAD" },
-                new String[] { "HUNT", "HAD" },
-                new String[] { "WESTERLUND", "WASTARLAD" },
-                new String[] { "CASSTEVENS", "CASTAFAN" },
-                new String[] { "VASQUEZ", "VASG" },
-                new String[] { "FRAZIER", "FRASAR" },
-                new String[] { "BOWMAN", "BANAN" },
-                new String[] { "RICKERT", "RACAD" },
-                new String[] { "DEUTSCH", "DAT" },
-                new String[] { "WESTPHAL", "WASTFAL" },
-                new String[] { "SHRIVER", "SRAVAR" },
-                new String[] { "KUHL", "CAL" },
-                new String[] { "RAWSON", "RASAN" },
-                new String[] { "JILES", "JAL" },
-                new String[] { "CARRAWAY", "CARY" },
-                new String[] { "YAMADA", "YANAD" });
-    }
-
     /**
-     * Tests data gathered from around the internets.
+     * Tests data gathered from around the internet.
      * 
+     * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
      * @throws EncoderException
      */
     @Test
-    public void testDropBy2() throws EncoderException {
-        // Explanation of differences between this implementation and the one at dropby.com.
-        //
-        // Algorithm (taken from www.dropby.com/NYSIIS.html):
-        //
-        // 1.  Transcode first characters of name:
-        //    MAC >   MCC
-        //    KN  >   NN
-        //    K   >   C
-        //    PH  >   FF
-        //    PF  >   FF
-        //    SCH >   SSS
-        //
-        // 2.  Transcode last characters of name:
-        //    EE, IE  >   Y
-        //    DT,RT,RD,NT,ND  >   D
-        //
-        // 3.  First character of key = first character of name.
-        //
-        // 4.  Transcode remaining characters by following these rules, incrementing by one character each time:
-        //   4a.   EV  >   AF  else A,E,I,O,U > A
-        //   4b.   Q   >   G
-        //   4c.   Z   >   S
-        //   4d.   M   >   N
-        //   4e.   KN  >   N   else K > C
-        //   4f.   SCH >   SSS
-        //   4g.   PH  >   FF
-        //   4h.   H   >   If previous or next is nonvowel, previous
-        //   4i.   W   >   If previous is vowel, previous
-        //   4j.   Add current to key if current != last key character
-        //
-        // 5.  If last character is S, remove it
-        // 6.  If last characters are AY, replace with Y
-        // 7.  If last character is A, remove it
-        // 8.  Collapse all strings of repeated characters
-        // 9.  Add original first character of name as first character of key
+    public void testDropBy() throws EncoderException {
+        // Explanation of differences between this implementation and the one at dropby.com is
+        // prepended to the test string. The referenced rules refer to the outlined steps in the
+        // class description for Nysiis.
 
         this.assertEncodings(
-                // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
                 // 1. Transcode first characters of name
                 new String[] { "MACINTOSH", "MCANT" },
                 // violates 4j: the second N should not be added, as the first
                 //              key char is already a N
-                new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
+                new String[] { "KNUTH", "NAT" },           // Original: NNAT; modified: NATH
                 // O and E are transcoded to A because of rule 4a
                 // H also to A because of rule 4h
                 // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
                 // that skips the next char in such a case?
                 // the remaining A is removed because of rule 7
-                new String[] { "KOEHN", "CAN" }, // Original: C
+                new String[] { "KOEHN", "CAN" },           // Original: C
                 // violates 4j: see also KNUTH
                 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
                 // violates 4j: see also KNUTH
-                new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
+                new String[] { "PFEISTER", "FASTAR" },     // Original: FFASTA[R]
                 // violates 4j: see also KNUTH
-                new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
-                // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
-                // 2.Transcode last characters of name: 
+                new String[] { "SCHOENHOEFT", "SANAFT" },  // Original: SSANAF[T]
+                // 2. Transcode last characters of name:
                 new String[] { "MCKEE", "MCY" },
                 new String[] { "MACKIE", "MCY" },
                 new String[] { "HEITSCHMIDT", "HATSNAD" },
@@ -184,8 +118,8 @@ public class NysiisTest extends StringEn
                 new String[] { "HURD", "HAD" },
                 new String[] { "HUNT", "HAD" },
                 new String[] { "WESTERLUND", "WASTARLAD" },
-                // http://www.dropby.com/indexLF.html?content=/NYSIIS.html
-                // 4. Transcode remaining characters by following these rules, incrementing by one character each time:
+                // 4. Transcode remaining characters by following these rules,
+                //    incrementing by one character each time:
                 new String[] { "CASSTEVENS", "CASTAFAN" },
                 new String[] { "VASQUEZ", "VASG" },
                 new String[] { "FRAZIER", "FRASAR" },
@@ -195,18 +129,18 @@ public class NysiisTest extends StringEn
                 // violates 5: the last S is not removed
                 // when comparing to DEUTS, which is phonetically similar
                 // the result is also DAT, which is correct for DEUTSCH too imo
-                new String[] { "DEUTSCH", "DAT" }, // Original: DATS
+                new String[] { "DEUTSCH", "DAT" },         // Original: DATS
                 new String[] { "WESTPHAL", "WASTFAL" },
                 // violates 4h: the H should be transcoded to S and thus ignored as
                 // the first key character is also S
-                new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
+                new String[] { "SHRIVER", "SRAVAR" },      // Original: SHRAVA[R]
                 // same as KOEHN, the L gets mysteriously lost
-                new String[] { "KUHL", "CAL" }, // Original: C
+                new String[] { "KUHL", "CAL" },            // Original: C
                 new String[] { "RAWSON", "RASAN" },
                 // If last character is S, remove it
                 new String[] { "JILES", "JAL" },
                 // violates 6: if the last two characters are AY, remove A
-                new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
+                new String[] { "CARRAWAY", "CARY" },       // Original: CARAY
                 new String[] { "YAMADA", "YANAD" });
     }
 



Mime
View raw message