commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject svn commit: r1789911 - in /commons/proper/codec/trunk/src: changes/changes.xml main/java/org/apache/commons/codec/language/Soundex.java test/java/org/apache/commons/codec/language/SoundexTest.java
Date Sun, 02 Apr 2017 20:41:29 GMT
Author: sebb
Date: Sun Apr  2 20:41:29 2017
New Revision: 1789911

URL: http://svn.apache.org/viewvc?rev=1789911&view=rev
Log:
CODEC-233 Soundex should support more algorithm variants

Modified:
    commons/proper/codec/trunk/src/changes/changes.xml
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Sun Apr  2 20:41:29 2017
@@ -45,6 +45,7 @@ The <action> type attribute can be add,u
     <release version="1.11" date="2017-MM-DD" description="Feature and fix release.">
       <!-- The first attribute below should be the issue id; makes it easier to navigate
in the IDE outline -->
 
+      <action issue="CODEC-233" dev="sebb" type="update" due-to="Yossi Tamari">Soundex
should support more algorithm variants</action>
       <action issue="CODEC-145" dev="sebb" type="fix" due-to="Jesse Glick">Base64.encodeBase64String
could better use newStringUsAscii (ditto encodeBase64URLSafeString)</action>
       <action issue="CODEC-144" dev="sebb" type="fix">BaseNCodec: encodeToString and
encodeAsString methods are identical</action>
       <action issue="CODEC-232" dev="sebb" type="fix">URLCodec is neither immutable
nor threadsafe</action>

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
(original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/Soundex.java
Sun Apr  2 20:41:29 2017
@@ -32,15 +32,31 @@ import org.apache.commons.codec.StringEn
 public class Soundex implements StringEncoder {
 
     /**
+     * The marker character used to indicate a silent (ignored) character.
+     * These are ignored except when they appear as the first character.
+     * <p>
+     * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
+     * because changing it might break existing code. Mappings that don't contain
+     * a silent marker code are treated as though H and W are silent.
+     * <p>
+     * To override this, use the {@link #Soundex(String, boolean)} constructor.
+     * @since 1.11
+     */
+    public static final char SILENT_MARKER = '-';
+
+    /**
      * This is a default mapping of the 26 letters used in US English. A value of <code>0</code>
for a letter position
-     * means do not encode.
+     * means do not encode, but treat as a separator when it occurs between consonants with
the same code.
      * <p>
      * (This constant is provided as both an implementation convenience and to allow Javadoc
to pick
      * up the value for the constant values page.)
-     * </p>
-     *
+     * <p>
+     * <b>Note that letters H and W are treated specially.</b>
+     * They are ignored (after the first letter) and don't act as separators
+     * between consonants with the same code.
      * @see #US_ENGLISH_MAPPING
      */
+    //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
     public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
 
     /**
@@ -53,12 +69,45 @@ public class Soundex implements StringEn
 
     /**
      * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
+     * This treats H and W as silent letters.
+     * Apart from when they appear as the first letter, they are ignored.
+     * They don't act as separators between duplicate codes.
      *
      * @see #US_ENGLISH_MAPPING
+     * @see #US_ENGLISH_MAPPING_STRING
      */
     public static final Soundex US_ENGLISH = new Soundex();
 
     /**
+     * An instance of Soundex using the Simplified Soundex mapping, as described here:
+     * http://west-penwith.org.uk/misc/soundex.htm
+     * <p>
+     * This treats H and W the same as vowels (AEIOUY).
+     * Such letters aren't encoded (after the first), but they do
+     * act as separators when dropping duplicate codes.
+     * The mapping is otherwise the same as for {@link #US_ENGLISH}.
+     * <p>
+     * @since 1.11
+     */
+    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING,
false);
+
+    /**
+     * An instance of Soundex using the mapping as per the Genealogy site:
+     * http://www.genealogy.com/articles/research/00000060.html
+     * <p>
+     * This treats vowels (AEIOUY), H and W as silent letters.
+     * Such letters are ignored (after the first) and do not
+     * act as separators when dropping duplicate codes.
+     * <p>
+     * The codes for consonants are otherwise the same as for 
+     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
+     *
+     * @since 1.11
+     */
+    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
+    //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+    /**
      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
      *
      * @deprecated This feature is not needed since the encoding size must be constant. Will
be removed in 2.0.
@@ -73,6 +122,15 @@ public class Soundex implements StringEn
     private final char[] soundexMapping;
 
     /**
+     * Should H and W be treated specially?
+     * <p>
+     * In versions of the code prior to 1.11,
+     * the code always treated H and W as silent (ignored) letters.
+     * If this field is false, H and W are no longer special-cased.
+     */
+    private final boolean specialCaseHW;
+
+    /**
      * Creates an instance using US_ENGLISH_MAPPING
      *
      * @see Soundex#Soundex(char[])
@@ -80,6 +138,7 @@ public class Soundex implements StringEn
      */
     public Soundex() {
         this.soundexMapping = US_ENGLISH_MAPPING;
+        this.specialCaseHW = true;
     }
 
     /**
@@ -88,6 +147,8 @@ public class Soundex implements StringEn
      *
      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds
the values to which each
      * letter is mapped. This implementation contains a default map for US_ENGLISH
+     * <p>
+     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not
given special treatment
      *
      * @param mapping
      *                  Mapping array to use when finding the corresponding code for a given
character
@@ -95,11 +156,23 @@ public class Soundex implements StringEn
     public Soundex(final char[] mapping) {
         this.soundexMapping = new char[mapping.length];
         System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
+        this.specialCaseHW = !hasMarker(this.soundexMapping);
+    }
+
+    private boolean hasMarker(char[] mapping) {
+        for(char ch : mapping) {
+            if (ch == SILENT_MARKER) {
+                return true;
+            }
+        }
+        return false;
     }
 
     /**
      * Creates a refined soundex instance using a custom mapping. This constructor can be
used to customize the mapping,
      * and/or possibly provide an internationalized mapping for a non-Western character set.
+     * <p>
+     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not
given special treatment
      *
      * @param mapping
      *            Mapping string to use when finding the corresponding code for a given character
@@ -107,6 +180,21 @@ public class Soundex implements StringEn
      */
     public Soundex(final String mapping) {
         this.soundexMapping = mapping.toCharArray();
+        this.specialCaseHW = !hasMarker(this.soundexMapping);
+    }
+
+    /**
+     * Creates a refined soundex instance using a custom mapping. This constructor can be
used to customize the mapping,
+     * and/or possibly provide an internationalized mapping for a non-Western character set.
+     *
+     * @param mapping
+     *            Mapping string to use when finding the corresponding code for a given character
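+     * @param specialCaseHW if true, then H and W are treated as silent letters (ignored after the first letter); if false, they are handled according to the mapping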
+     * @since 1.11
+     */
+    public Soundex(final String mapping, boolean specialCaseHW) {
+        this.soundexMapping = mapping.toCharArray();
+        this.specialCaseHW = specialCaseHW;
     }
 
     /**
@@ -190,7 +278,7 @@ public class Soundex implements StringEn
     private char map(final char ch) {
         final int index = ch - 'A';
         if (index < 0 || index >= this.soundexMapping.length) {
-            throw new IllegalArgumentException("The character is not mapped: " + ch);
+            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
         }
         return this.soundexMapping[index];
     }
@@ -231,10 +319,13 @@ public class Soundex implements StringEn
         char lastDigit = map(first); // previous digit
         for(int i = 1; i < str.length() && count < out.length ; i++) {
             char ch = str.charAt(i);
-            if (ch == 'H' || ch == 'W') { // these are ignored completely
+            if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
                 continue;
             }
             char digit = map(ch);
+            if (digit == SILENT_MARKER) {
+                continue;
+            }
             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
                 out[count++] = digit;
             }

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java?rev=1789911&r1=1789910&r2=1789911&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
(original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/SoundexTest.java
Sun Apr  2 20:41:29 2017
@@ -403,4 +403,33 @@ public class SoundexTest extends StringE
         Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
         Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
     }
+
+    @Test
+// examples and algorithm rules from:  http://www.genealogy.com/articles/research/00000060.html
+    public void testGenealogy() { // treat vowels and HW as silent
+        Soundex s = Soundex.US_ENGLISH_GENEALOGY;
+        Assert.assertEquals("H251", s.encode("Heggenburger"));
+        Assert.assertEquals("B425", s.encode("Blackman"));
+        Assert.assertEquals("S530", s.encode("Schmidt"));
+        Assert.assertEquals("L150", s.encode("Lippmann"));
+        // Additional local example
+        Assert.assertEquals("D200", s.encode("Dodds")); // 'o' is not a separator here - it is silent
+        Assert.assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
+        Assert.assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
+    }
+
+    @Test
+// examples and algorithm rules from:  http://west-penwith.org.uk/misc/soundex.htm
+    public void testSimplifiedSoundex() { // treat vowels and HW as separators
+        Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
+        Assert.assertEquals("W452", s.encode("WILLIAMS"));
+        Assert.assertEquals("B625", s.encode("BARAGWANATH"));
+        Assert.assertEquals("D540", s.encode("DONNELL"));
+        Assert.assertEquals("L300", s.encode("LLOYD"));
+        Assert.assertEquals("W422", s.encode("WOOLCOCK"));
+        // Additional local examples
+        Assert.assertEquals("D320", s.encode("Dodds"));
+        Assert.assertEquals("D320", s.encode("Dwdds")); // w is a separator
+        Assert.assertEquals("D320", s.encode("Dhdds")); // h is a separator
+    }
 }



Mime
View raw message