commons-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ggreg...@apache.org
Subject cvs commit: jakarta-commons/codec/src/java/org/apache/commons/codec/language Soundex.java
Date Thu, 06 Nov 2003 16:31:47 GMT
ggregory    2003/11/06 08:31:47

  Modified:    codec/src/test/org/apache/commons/codec/language
                        SoundexTest.java
               codec/src/java/org/apache/commons/codec/language
                        Soundex.java
  Added:       codec/src/test/org/apache/commons/codec/language
                        AllTests.java
  Log:
  Soundex encoding bugs.
  http://issues.apache.org/bugzilla/show_bug.cgi?id=24471
  
  Revision  Changes    Path
  1.6       +250 -98   jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java
  
  Index: SoundexTest.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/test/org/apache/commons/codec/language/SoundexTest.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- SoundexTest.java	4 Nov 2003 02:43:09 -0000	1.5
  +++ SoundexTest.java	6 Nov 2003 16:31:47 -0000	1.6
  @@ -2,68 +2,57 @@
    * ====================================================================
    * 
    * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  + * 
  + * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
  + * 
    * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer. 
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgement:  
  - *       "This product includes software developed by the 
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgement may appear in the software itself,
  - *    if and wherever such third-party acknowledgements normally appear.
  - *
  - * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
  - *    Foundation" must not be used to endorse or promote products derived
  - *    from this software without prior written permission. For written 
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache" nor may "Apache" appear in their name without prior 
  - *    written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  + * modification, are permitted provided that the following conditions are met: 1.
  + * Redistributions of source code must retain the above copyright notice, this
  + * list of conditions and the following disclaimer. 2. Redistributions in
  + * binary form must reproduce the above copyright notice, this list of
  + * conditions and the following disclaimer in the documentation and/or other
  + * materials provided with the distribution. 3. The end-user documentation
  + * included with the redistribution, if any, must include the following
  + * acknowledgement: "This product includes software developed by the Apache
  + * Software Foundation (http://www.apache.org/)." Alternately, this
  + * acknowledgement may appear in the software itself, if and wherever such
  + * third-party acknowledgements normally appear. 4. The names "Apache", "The
  + * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
  + * used to endorse or promote products derived from this software without prior
  + * written permission. For written permission, please contact
  + * apache@apache.org. 5. Products derived from this software may not be called
  + * "Apache", "Apache" nor may "Apache" appear in their name without prior
  + * written permission of the Apache Software Foundation.
  + * 
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
  + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  + * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - *
  - */ 
  + * 
  + * This software consists of voluntary contributions made by many individuals
  + * on behalf of the Apache Software Foundation. For more information on the
  + * Apache Software Foundation, please see <http://www.apache.org/> .
  + *  
  + */
   
  +// (FYI: Formatted and sorted with Eclipse)
   package org.apache.commons.codec.language;
   
   import junit.framework.Test;
   import junit.framework.TestSuite;
  -
   import org.apache.commons.codec.StringEncoder;
   import org.apache.commons.codec.StringEncoderAbstractTest;
   
   /**
  + * Tests {@link Soundex}
  + * 
    * @version $Revision$ $Date$
    * @author Rodney Waldhoff
    * @author Gary Gregory
  @@ -74,16 +63,17 @@
           return (new TestSuite(SoundexTest.class));
       }
   
  -    private Soundex _encoder = null;
  +    private Soundex encoder = null;
   
       public SoundexTest(String name) {
           super(name);
       }
  +    
       /**
  -     * @return Returns the _encoder.
  -     */
  +	 * @return Returns the _encoder.
  +	 */
       public Soundex getEncoder() {
  -        return this._encoder;
  +        return this.encoder;
       }
   
       protected StringEncoder makeEncoder() {
  @@ -91,13 +81,14 @@
       }
   
       /**
  -     * @param _encoder The _encoder to set.
  -     */
  +	 * @param encoder
  +	 *                  The encoder to set.
  +	 */
       public void setEncoder(Soundex encoder) {
  -        this._encoder = encoder;
  +        this.encoder = encoder;
       }
   
  -    public void setUp() throws Exception {        
  +    public void setUp() throws Exception {
           super.setUp();
           this.setEncoder(new Soundex());
       }
  @@ -107,51 +98,212 @@
           this.setEncoder(null);
       }
   
  -    // ------------------------------------------------------------------------
  +    void encodeAll(String[] strings, String expectedEncoding) {
  +        for (int i = 0; i < strings.length; i++) {
  +            assertEquals(expectedEncoding, this.getEncoder().encode(strings[i]));
  +        }
  +    }
  +
  +    public void testB650() {
  +        this.encodeAll(
  +            new String[] {
  +                "BARHAM",
  +                "BARONE",
  +                "BARRON",
  +                "BERNA",
  +                "BIRNEY",
  +                "BIRNIE",
  +                "BOOROM",
  +                "BOREN",
  +                "BORN",
  +                "BOURN",
  +                "BOURNE",
  +                "BOWRON",
  +                "BRAIN",
  +                "BRAME",
  +                "BRANN",
  +                "BRAUN",
  +                "BREEN",
  +                "BRIEN",
  +                "BRIM",
  +                "BRIMM",
  +                "BRINN",
  +                "BRION",
  +                "BROOM",
  +                "BROOME",
  +                "BROWN",
  +                "BROWNE",
  +                "BRUEN",
  +                "BRUHN",
  +                "BRUIN",
  +                "BRUMM",
  +                "BRUN",
  +                "BRUNO",
  +                "BRYAN",
  +                "BURIAN",
  +                "BURN",
  +                "BURNEY",
  +                "BYRAM",
  +                "BYRNE",
  +                "BYRON",
  +                "BYRUM" },
  +            "B650");
  +    }
   
  -    public void testEncode() throws Exception {
  -        assertEquals("T235",this.getEncoder().encode("testing"));
  -        assertEquals("T000",this.getEncoder().encode("The"));
  -        assertEquals("Q200",this.getEncoder().encode("quick"));
  -        assertEquals("B650",this.getEncoder().encode("brown"));
  -        assertEquals("F200",this.getEncoder().encode("fox"));
  -        assertEquals("J513",this.getEncoder().encode("jumped"));
  -        assertEquals("O160",this.getEncoder().encode("over"));
  -        assertEquals("T000",this.getEncoder().encode("the"));
  -        assertEquals("L200",this.getEncoder().encode("lazy"));
  -        assertEquals("D200",this.getEncoder().encode("dogs"));
  +    public void testEncodeBasic() {
  +        assertEquals("T235", this.getEncoder().encode("testing"));
  +        assertEquals("T000", this.getEncoder().encode("The"));
  +        assertEquals("Q200", this.getEncoder().encode("quick"));
  +        assertEquals("B650", this.getEncoder().encode("brown"));
  +        assertEquals("F200", this.getEncoder().encode("fox"));
  +        assertEquals("J513", this.getEncoder().encode("jumped"));
  +        assertEquals("O160", this.getEncoder().encode("over"));
  +        assertEquals("T000", this.getEncoder().encode("the"));
  +        assertEquals("L200", this.getEncoder().encode("lazy"));
  +        assertEquals("D200", this.getEncoder().encode("dogs"));
       }
   
       /**
  -     * Examples from
  -     * http://www.bradandkathy.com/genealogy/overviewofsoundex.html
  -     */
  -    public void testEncode2() throws Exception {
  -        assertEquals("A462",this.getEncoder().encode("Allricht"));
  -        assertEquals("E166",this.getEncoder().encode("Eberhard"));
  -        assertEquals("E521",this.getEncoder().encode("Engebrethson"));
  -        assertEquals("H512",this.getEncoder().encode("Heimbach"));
  -        assertEquals("H524",this.getEncoder().encode("Hanselmann"));
  -        assertEquals("H431",this.getEncoder().encode("Hildebrand"));
  -        assertEquals("K152",this.getEncoder().encode("Kavanagh"));
  -        assertEquals("L530",this.getEncoder().encode("Lind, Van"));
  -        assertEquals("L222",this.getEncoder().encode("Lukaschowsky"));
  -        assertEquals("M235",this.getEncoder().encode("McDonnell"));
  -        assertEquals("M200",this.getEncoder().encode("McGee"));
  -        // Fix me?
  -        //assertEquals("O165",this.getEncoder().encode("O'Brien"));
  -        assertEquals("O155",this.getEncoder().encode("Opnian"));
  -        assertEquals("O155",this.getEncoder().encode("Oppenheimer"));
  -        // Fix me?
  -        //assertEquals("S460",this.getEncoder().encode("Swhgler"));
  -        assertEquals("R355",this.getEncoder().encode("Riedemanas"));
  -        assertEquals("Z300",this.getEncoder().encode("Zita"));
  -        assertEquals("Z325",this.getEncoder().encode("Zitzmeinn"));    
  +	 * Examples from
  +	 * http://www.bradandkathy.com/genealogy/overviewofsoundex.html
  +	 */
  +    public void testEncodeBatch2() {
  +        assertEquals("A462", this.getEncoder().encode("Allricht"));
  +        assertEquals("E166", this.getEncoder().encode("Eberhard"));
  +        assertEquals("E521", this.getEncoder().encode("Engebrethson"));
  +        assertEquals("H512", this.getEncoder().encode("Heimbach"));
  +        assertEquals("H524", this.getEncoder().encode("Hanselmann"));
  +        assertEquals("H431", this.getEncoder().encode("Hildebrand"));
  +        assertEquals("K152", this.getEncoder().encode("Kavanagh"));
  +        assertEquals("L530", this.getEncoder().encode("Lind"));
  +        assertEquals("L222", this.getEncoder().encode("Lukaschowsky"));
  +        assertEquals("M235", this.getEncoder().encode("McDonnell"));
  +        assertEquals("M200", this.getEncoder().encode("McGee"));
  +        assertEquals("O155", this.getEncoder().encode("Opnian"));
  +        assertEquals("O155", this.getEncoder().encode("Oppenheimer"));
  +        assertEquals("R355", this.getEncoder().encode("Riedemanas"));
  +        assertEquals("Z300", this.getEncoder().encode("Zita"));
  +        assertEquals("Z325", this.getEncoder().encode("Zitzmeinn"));
       }
  -    
  +
  +    /**
  +	 * Examples from
  +	 * http://www.archives.gov/research_room/genealogy/census/soundex.html
  +	 */
  +    public void testEncodeBatch3() {
  +        assertEquals("W252", this.getEncoder().encode("Washington"));
  +        assertEquals("L000", this.getEncoder().encode("Lee"));
  +        assertEquals("G362", this.getEncoder().encode("Gutierrez"));
  +        assertEquals("P236", this.getEncoder().encode("Pfister"));
  +        assertEquals("J250", this.getEncoder().encode("Jackson"));
  +        assertEquals("T522", this.getEncoder().encode("Tymczak"));
  +        // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
  +        // possible.
  +        assertEquals("V532", this.getEncoder().encode("VanDeusen"));
  +    }
  +
  +    /**
  +	 * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
  +	 */
  +    public void testEncodeBatch4() {
  +        assertEquals("H452", this.getEncoder().encode("HOLMES"));
  +        assertEquals("A355", this.getEncoder().encode("ADOMOMI"));
  +        assertEquals("V536", this.getEncoder().encode("VONDERLEHR"));
  +        assertEquals("B400", this.getEncoder().encode("BALL"));
  +        assertEquals("S000", this.getEncoder().encode("SHAW"));
  +        assertEquals("J250", this.getEncoder().encode("JACKSON"));
  +        assertEquals("S545", this.getEncoder().encode("SCANLON"));
  +        assertEquals("S532", this.getEncoder().encode("SAINTJOHN"));
  +
  +    }
  +
  +    public void testEncodeIgnoreApostrophes() {
  +        this.encodeAll(new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'" }, "O165");
  +    }
  +
  +    /**
  +	 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
  +	 * 
  +	 * @throws EncoderException
  +	 */
  +    public void testEncodeIgnoreHyphens() {
  +        this.encodeAll(
  +            new String[] {
  +                "KINGSMITH",
  +                "-KINGSMITH",
  +                "K-INGSMITH",
  +                "KI-NGSMITH",
  +                "KIN-GSMITH",
  +                "KING-SMITH",
  +                "KINGS-MITH",
  +                "KINGSM-ITH",
  +                "KINGSMI-TH",
  +                "KINGSMIT-H",
  +                "KINGSMITH-" },
  +            "K525");
  +    }
  +
  +    public void testEncodeIgnoreTrimmable() {
  +        assertEquals("W252", this.getEncoder().encode(" \t\n\r Washington \t\n\r "));
  +    }
  +
  +    /**
  +	 * Consonants from the same code group separated by W or H are treated as
  +	 * one.
  +	 */
  +    public void testHWRuleEx1() {
  +        // From
  +        // http://www.archives.gov/research_room/genealogy/census/soundex.html:
  +        // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
  +        // for the F). It is not coded A-226.
  +        assertEquals("A261", this.getEncoder().encode("Ashcraft"));
  +    }
  +
  +    /**
  +	 * Consonants from the same code group separated by W or H are treated as
  +	 * one.
  +	 * 
  +	 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
  +	 */
  +    public void testHWRuleEx2() {
  +        assertEquals("B312", this.getEncoder().encode("BOOTHDAVIS"));
  +        assertEquals("B312", this.getEncoder().encode("BOOTH-DAVIS"));
  +    }
  +
  +    /**
  +	 * Consonants from the same code group separated by W or H are treated as
  +	 * one.
  +	 * 
  +	 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
  +	 */
  +    public void testHWRuleEx3() {
  +        assertEquals("S460", this.getEncoder().encode("Sgler"));
  +        assertEquals("S460", this.getEncoder().encode("Swhgler"));
  +        // Also S460:
  +        this.encodeAll(
  +            new String[] {
  +                "SAILOR",
  +                "SALYER",
  +                "SAYLOR",
  +                "SCHALLER",
  +                "SCHELLER",
  +                "SCHILLER",
  +                "SCHOOLER",
  +                "SCHULER",
  +                "SCHUYLER",
  +                "SEILER",
  +                "SEYLER",
  +                "SHOLAR",
  +                "SHULER",
  +                "SILAR",
  +                "SILER",
  +                "SILLER" },
  +            "S460");
  +    }
  +
       public void testMaxLength() throws Exception {
           Soundex soundex = new Soundex();
  -        soundex.setMaxLength( soundex.getMaxLength() );
  +        soundex.setMaxLength(soundex.getMaxLength());
       }
   
   }
  
  
  
  1.1                  jakarta-commons/codec/src/test/org/apache/commons/codec/language/AllTests.java
  
  Index: AllTests.java
  ===================================================================
  /*
   * Copyright (C) 1993-2003 SEAGULL
   * 
   * AllTests.java
   * Created on Nov 5, 2003, 8:25:55 PM
   * 
   */
   
  package org.apache.commons.codec.language;
  
  import junit.framework.Test;
  import junit.framework.TestSuite;
  
  /**
   * Tests all test cases in this package.
   * 
   * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
   * @version $Id: AllTests.java,v 1.1 2003/11/06 16:31:47 ggregory Exp $
   */
  public class AllTests {
  
      /**
       * Builds one aggregate suite for the language package by adding the
       * suites of MetaphoneTest, SoundexTest, RefinedSoundexTest and
       * DoubleMetaphoneTest, in that order.
       *
       * @return the aggregate suite, named
       *         "Test for org.apache.commons.codec.language"
       */
      public static Test suite() {
          TestSuite suite = new TestSuite("Test for org.apache.commons.codec.language");
          // NOTE(review): the BEGIN/END markers below look like tool-generated
          // region markers (presumably from the Eclipse JUnit suite wizard) --
          // confirm before hand-editing or renaming anything between them.
          //$JUnit-BEGIN$
          suite.addTest(MetaphoneTest.suite());
          suite.addTest(SoundexTest.suite());
          suite.addTest(RefinedSoundexTest.suite());
          suite.addTest(DoubleMetaphoneTest.suite());
          //$JUnit-END$
          return suite;
      }
  }
  
  
  
  1.11      +173 -126  jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java
  
  Index: Soundex.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Soundex.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- Soundex.java	4 Nov 2003 02:43:09 -0000	1.10
  +++ Soundex.java	6 Nov 2003 16:31:47 -0000	1.11
  @@ -2,58 +2,45 @@
    * ====================================================================
    * 
    * The Apache Software License, Version 1.1
  - *
  - * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  + * 
  + * Copyright (c) 2001-2003 The Apache Software Foundation. All rights reserved.
  + * 
    * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer. 
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgement:  
  - *       "This product includes software developed by the 
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgement may appear in the software itself,
  - *    if and wherever such third-party acknowledgements normally appear.
  - *
  - * 4. The names "Apache", "The Jakarta Project", "Commons", and "Apache Software
  - *    Foundation" must not be used to endorse or promote products derived
  - *    from this software without prior written permission. For written 
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    "Apache" nor may "Apache" appear in their name without prior 
  - *    written permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  + * modification, are permitted provided that the following conditions are met: 1.
  + * Redistributions of source code must retain the above copyright notice, this
  + * list of conditions and the following disclaimer. 2. Redistributions in
  + * binary form must reproduce the above copyright notice, this list of
  + * conditions and the following disclaimer in the documentation and/or other
  + * materials provided with the distribution. 3. The end-user documentation
  + * included with the redistribution, if any, must include the following
  + * acknowledgement: "This product includes software developed by the Apache
  + * Software Foundation (http://www.apache.org/)." Alternately, this
  + * acknowledgement may appear in the software itself, if and wherever such
  + * third-party acknowledgements normally appear. 4. The names "Apache", "The
  + * Jakarta Project", "Commons", and "Apache Software Foundation" must not be
  + * used to endorse or promote products derived from this software without prior
  + * written permission. For written permission, please contact
  + * apache@apache.org. 5. Products derived from this software may not be called
  + * "Apache", "Apache" nor may "Apache" appear in their name without prior
  + * written permission of the Apache Software Foundation.
  + * 
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
  + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  + * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - *
  - */ 
  + * 
  + * This software consists of voluntary contributions made by many individuals
  + * on behalf of the Apache Software Foundation. For more information on the
  + * Apache Software Foundation, please see <http://www.apache.org/> .
  + *  
  + */
   
   package org.apache.commons.codec.language;
   
  @@ -61,9 +48,9 @@
   import org.apache.commons.codec.StringEncoder;
   
   /**
  - * Encodes a string into a soundex value.  Soundex is an encoding used to
  - * relate similar names, but can also be used as a general purpose
  - * scheme to find word with similar phonemes. 
  + * Encodes a string into a soundex value. Soundex is an encoding used to relate
  + * similar names, but can also be used as a general purpose scheme to find word
  + * with similar phonemes.
    * 
    * @author bayard@generationjava.com
    * @author Tim O'Brien
  @@ -73,71 +60,92 @@
   public class Soundex implements StringEncoder {
   
       /**
  -     * This static variable contains an instance of the
  -     * Soundex using the US_ENGLISH mapping.
  -     */
  +	 * This static variable contains an instance of the Soundex using the
  +	 * US_ENGLISH mapping.
  +	 */
       public static final Soundex US_ENGLISH = new Soundex();
   
       /**
  -     * This is a default mapping of the 26 letters used
  -     * in US english.
  -     */
  -    public static final char[] US_ENGLISH_MAPPING =
  -        "01230120022455012623010202".toCharArray();
  +	 * This is a default mapping of the 26 letters used in US english.
  +     * A value of <code>0</code> for a letter position means do not encode.
  +	 */
  +    public static final char[] US_ENGLISH_MAPPING = "01230120022455012623010202".toCharArray();
   
       /**
  -     * The maximum length of a Soundex code - Soundex codes are
  -     * only four characters by definition.
  -     */
  +	 * The maximum length of a Soundex code - Soundex codes are only four
  +	 * characters by definition.
  +	 */
       private int maxLength = 4;
  -    
  +
       /**
  -     * Every letter of the alphabet is "mapped" to a numerical 
  -     * value.  This char array holds the values to which each
  -     * letter is mapped.  This implementation contains a default
  -     * map for US_ENGLISH
  -     */
  +	 * Every letter of the alphabet is "mapped" to a numerical value. This char
  +	 * array holds the values to which each letter is mapped. This
  +	 * implementation contains a default map for US_ENGLISH
  +	 */
       private char[] soundexMapping;
   
       /**
  -     * Creates an instance of the Soundex object using the default
  -     * US_ENGLISH mapping.
  -     */
  +	 * Creates an instance of the Soundex object using the default US_ENGLISH
  +	 * mapping.
  +	 */
       public Soundex() {
           this(US_ENGLISH_MAPPING);
       }
   
       /**
  -     * Creates a soundex instance using a custom mapping.  This
  -     * constructor can be used to customize the mapping, and/or possibly
  -     * provide an internationalized mapping for a non-Western character
  -     * set.
  -     *
  -     * @param mapping Mapping array to use when finding the corresponding
  -     *                code for a given character
  -     */
  +	 * Creates a soundex instance using a custom mapping. This constructor can
  +	 * be used to customize the mapping, and/or possibly provide an
  +	 * internationalized mapping for a non-Western character set.
  +	 * 
  +	 * @param mapping
  +	 *                  Mapping array to use when finding the corresponding code for
  +	 *                  a given character
  +	 */
       public Soundex(char[] mapping) {
           this.setSoundexMapping(mapping);
       }
   
       /**
  -     * Encodes an Object using the soundex algorithm.  This method
  -     * is provided in order to satisfy the requirements of the
  -     * Encoder interface, and will throw an EncoderException if the
  -     * supplied object is not of type java.lang.String.
  -     *
  -     * @param pObject Object to encode
  -     * @return An object (or type java.lang.String) containing the 
  -     *         soundex code which corresponds to the String supplied.
  -     * @throws EncoderException if the parameter supplied is not
  -     *                          of type java.lang.String
  -     */
  +	 * Cleans up the input string before Soundex processing by trimming and
  +	 * removing punctuation characters. The string is returned in upper-case.
  +	 */
  +    private String clean(String str) {
  +        if (str == null || str.length() == 0) {
  +            return str;
  +        }
  +        int len = str.length();
  +        char[] chars = new char[len];
  +        int count = 0;
  +        for (int i = 0; i < len; i++) {
  +            if (Character.isLetter(str.charAt(i))) {
  +                chars[count++] = str.charAt(i);
  +            }
  +        }
  +        if (count == len) {
  +            return str.toUpperCase();
  +        }
  +        return new String(chars, 0, count).toUpperCase();
  +    }
  +
  +    /**
  +	 * Encodes an Object using the soundex algorithm. This method is provided
  +	 * in order to satisfy the requirements of the Encoder interface, and will
  +	 * throw an EncoderException if the supplied object is not of type
  +	 * java.lang.String.
  +	 * 
  +	 * @param pObject
  +	 *                  Object to encode
  +	 * @return An object (or type java.lang.String) containing the soundex code
  +	 *             which corresponds to the String supplied.
  +	 * @throws EncoderException
  +	 *                  if the parameter supplied is not of type java.lang.String
  +	 */
       public Object encode(Object pObject) throws EncoderException {
   
           Object result;
   
           if (!(pObject instanceof java.lang.String)) {
  -            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 
  +            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
           } else {
               result = soundex((String) pObject);
           }
  @@ -147,79 +155,118 @@
       }
   
       /**
  -     * Encodes a String using the soundex algorithm. 
  -     *
  -     * @param pString A String object to encode
  -     * @return A Soundex code corresponding to the String supplied
  -     */
  +	 * Encodes a String using the soundex algorithm.
  +	 * 
  +	 * @param pString
  +	 *                  A String object to encode
  +	 * @return A Soundex code corresponding to the String supplied
  +	 */
       public String encode(String pString) {
  -        return (soundex(pString));   
  +        return soundex(pString);
       }
   
       /**
  -     * Used internally by the SoundEx algorithm.
  -     *
  -     * @param c character to use to retrieve mapping code
  -     * @return Mapping code for a particular character
  -     */
  -    private char getMappingCode(char c) {
  +	 * Used internally by the SoundEx algorithm.
  +	 * 
  +	 * Consonants from the same code group separated by W or H are treated as one.
  +	 * 
  +	 * @param str
  +	 *                  the whole string
  +	 * @param index
  +	 *                  the character position to encode
  +	 * @return Mapping code for a particular character
  +	 */
  +    private char getMappingCode(String str, int index) {
  +        char c = str.charAt(index);
           if (!Character.isLetter(c)) {
               return 0;
           } else {
  -            return this.getSoundexMapping()[Character.toUpperCase(c) - 'A'];
  +            char mappedChar = this.map(c);
  +            // HW rule check
  +            if (index > 1 && mappedChar != '0') {
  +                char hwChar = str.charAt(index-1);
  +                if ('H' == hwChar || 'W' == hwChar) {
  +                    char preHWChar = str.charAt(index - 2);
  +                    char firstCode = this.map(preHWChar);
  +                    if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
  +                        return 0;
  +                    }
  +                }              
  +            }
  +            return mappedChar;
           }
       }
   
       /**
  -     * Returns the maxLength.  Standard Soundex
  -     * @return int
  -     */
  +	 * Returns the maxLength. Standard Soundex
  +	 * 
  +	 * @return int
  +	 */
       public int getMaxLength() {
           return this.maxLength;
       }
   
       /**
  -     * @return Returns the soundexMapping.
  -     */
  +	 * @return Returns the soundexMapping.
  +	 */
       private char[] getSoundexMapping() {
           return this.soundexMapping;
       }
   
       /**
  -     * Sets the maxLength.
  -     * @param maxLength The maxLength to set
  +     * Maps the given upper-case character to its Soundex code.
        */
  +    private char map(char c) {
  +       return this.getSoundexMapping()[c - 'A'];
  +    }
  +
  +    /**
  +	 * Sets the maxLength.
  +	 * 
  +	 * @param maxLength
  +	 *                  The maxLength to set
  +	 */
       public void setMaxLength(int maxLength) {
           this.maxLength = maxLength;
       }
   
       /**
  -     * @param soundexMapping The soundexMapping to set.
  -     */
  +	 * @param soundexMapping
  +	 *                  The soundexMapping to set.
  +	 */
       private void setSoundexMapping(char[] soundexMapping) {
           this.soundexMapping = soundexMapping;
       }
   
       /**
  -     * Retreives the Soundex code for a given String object.
  -     *
  -     * @param str String to encode using the Soundex algorithm
  -     * @return A soundex code for the String supplied
  -     */
  +	 * Retrieves the Soundex code for a given String object.
  +	 * 
  +	 * @param str
  +	 *                  String to encode using the Soundex algorithm
  +	 * @return A soundex code for the String supplied
  +	 */
       public String soundex(String str) {
  -        if (null == str || str.length() == 0) { return str; }
  -        
  +        if (str == null) {
  +            return null;
  +        }
  +        str = this.clean(str);
  +        if (str.length() == 0) {
  +            return str;
  +        }
  +
           char out[] = { '0', '0', '0', '0' };
           char last, mapped;
           int incount = 1, count = 1;
  -        out[0] = Character.toUpperCase(str.charAt(0));
  -        last = getMappingCode(str.charAt(0));
  -        while ((incount < str.length()) && (mapped = getMappingCode(str.charAt(incount++))) != 0 && (count < this.getMaxLength())) {
  +        out[0] = str.charAt(0);
  +        last = getMappingCode(str, 0);
  +        while ((incount < str.length()) && (count < this.getMaxLength())) {
  +            if ((mapped = getMappingCode(str, incount++)) != 0) {
                   if ((mapped != '0') && (mapped != last)) {
                       out[count++] = mapped;
                   }
                   last = mapped;
               }
  +        }
           return new String(out);
       }
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


Mime
View raw message