commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ggreg...@apache.org
Subject svn commit: r1414916 - in /commons/proper/codec/trunk: pom.xml src/changes/changes.xml src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
Date Wed, 28 Nov 2012 20:57:08 GMT
Author: ggregory
Date: Wed Nov 28 20:57:07 2012
New Revision: 1414916

URL: http://svn.apache.org/viewvc?rev=1414916&view=rev
Log:
<action dev="ggregory" type="add" issue="CODEC-161" due-to="crice">Add Match Rating
Approach (MRA) phonetic algorithm encoder.</action>

Added:
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
  (with props)
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
  (with props)
Modified:
    commons/proper/codec/trunk/pom.xml
    commons/proper/codec/trunk/src/changes/changes.xml

Modified: commons/proper/codec/trunk/pom.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/pom.xml?rev=1414916&r1=1414915&r2=1414916&view=diff
==============================================================================
--- commons/proper/codec/trunk/pom.xml (original)
+++ commons/proper/codec/trunk/pom.xml Wed Nov 28 20:57:07 2012
@@ -190,6 +190,13 @@ limitations under the License.
         <role>Beider-Morse phonetic matching</role>
       </roles>
     </contributor>
+    <contributor>
+      <name>Colm Rice</name>
+      <email>colm_rice at hotmail dot com</email>
+      <roles>
+        <role>Submitted Match Rating Approach (MRA) phonetic encoder and tests [CODEC-161]</role>
+      </roles>
+    </contributor>
   </contributors>
   <!-- Codec should depend on very little -->
   <dependencies>

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1414916&r1=1414915&r2=1414916&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Wed Nov 28 20:57:07 2012
@@ -48,6 +48,7 @@ The <action> type attribute can be add,u
     </release>
     -->
     <release version="1.8" date="TBA" description="Feature and fix release."> 
+      <action dev="ggregory" type="add" issue="CODEC-161" due-to="crice">Add Match
Rating Approach (MRA) phonetic algorithm encoder.</action>   
       <action dev="ggregory" type="fix" issue="CODEC-163" due-to="leo141">ColognePhonetic
encoder unneccessarily creates many char arrays on every loop run.</action>   
       <action dev="sebb" type="fix" issue="CODEC-160">Base64.encodeBase64URLSafeString
doesn't add padding characters at the end.</action>   
     </release>

Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java?rev=1414916&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
(added)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
Wed Nov 28 20:57:07 2012
@@ -0,0 +1,421 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import java.util.Locale;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE>
in 1977.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match
Rating Approach</a>
+ * @since 1.8
+ */
+public class MatchRatingApproachEncoder implements StringEncoder {
+
+    private static final String SPACE = " ";
+
+    private static final String EMPTY = "";
+
+    /**
+     * Constants used mainly for the min rating value.
+     */
+    private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN
= 7, EIGHT = 8, ELEVEN = 11, TWELVE = 12;
+
+    /**
+     * The plain letter equivalent of the accented letters.
+     */
+    private static final String PLAIN_ASCII = "AaEeIiOoUu" // grave
+            + "AaEeIiOoUuYy" // acute
+            + "AaEeIiOoUuYy" // circumflex
+            + "AaOoNn" // tilde
+            + "AaEeIiOoUuYy" // umlaut
+            + "Aa" // ring
+            + "Cc" // cedilla
+            + "OoUu"; // double acute
+
+    /**
+     * Unicode characters corresponding to various accented letters. For example: \u00DA
is U acute etc...
+     */
+    private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9"
+            + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD"
+            + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177"
+ "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1"
+            + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF"
+ "\u00C5\u00E5" + "\u00C7\u00E7"
+            + "\u0150\u0151\u0170\u0171";
+
+    /**
+     * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3.
Removes accents 4. Removes any
+     * spaces.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param name
+     *            The name to be cleaned
+     * @return The cleaned name
+     */
+    String cleanName(final String name) {
+        String upperName = name.toUpperCase(Locale.ENGLISH);
+
+        String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
+        for (String str : charsToTrim) {
+            upperName = upperName.replaceAll(str, EMPTY);
+        }
+
+        upperName = removeAccents(upperName);
+        upperName = upperName.replaceAll("\\s+", EMPTY);
+
+        return upperName;
+    }
+
+    /**
+     * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy
the requirements of the
+     * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
+     *
+     * @param pObject
+     *            Object to encode
+     * @return An object (or type java.lang.String) containing the Match Rating Approach
code which corresponds to the
+     *         String supplied.
+     * @throws EncoderException
+     *             if the parameter supplied is not of type java.lang.String
+     */
+    @Override
+    public final Object encode(final Object pObject) throws EncoderException {
+        if (!(pObject instanceof String)) {
+            throw new EncoderException("Parameter supplied to Match Rating Approach encoder
is not of type java.lang.String");
+        }
+        return encode((String) pObject);
+    }
+
+    /**
+     * Encodes a String using the Match Rating Approach (MRA) algorithm.
+     *
+     * @param name
+     *            String object to encode
+     * @return The MRA code corresponding to the String supplied
+     */
+    @Override
+    public final String encode(String name) {
+        // Bulletproof for trivial input - NINO
+        if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name)
|| name.length() == 1) {
+            return EMPTY;
+        }
+
+        // Preprocessing
+        name = cleanName(name);
+
+        // BEGIN: Actual encoding part of the algorithm...
+        // 1. Delete all vowels unless the vowel begins the word
+        name = removeVowels(name);
+
+        // 2. Remove second consonant from any double consonant
+        name = removeDoubleConsonants(name);
+
+        // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
+        name = getFirst3Last3(name);
+
+        return name;
+    }
+
+    /**
+     * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns
the name.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param name
+     *            The string to get the substrings from
+     * @return Annexed first & last 3 letters of input word.
+     */
+    String getFirst3Last3(final String name) {
+        int nameLength = name.length();
+
+        if (nameLength > SIX) {
+            String firstThree = name.substring(0, THREE);
+            String lastThree = name.substring(nameLength - THREE, nameLength);
+            return firstThree + lastThree;
+        } else {
+            return name;
+        }
+    }
+
+    /**
+     * Obtains the min rating of the length sum of the 2 names. In essence the larger the
sum length the smaller the
+     * min rating. Values strictly from documentation.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param sumLength
+     *            The length of 2 strings sent down
+     * @return The min rating value
+     */
+    int getMinRating(final int sumLength) {
+        int minRating = 0;
+
+        if (sumLength <= FOUR) {
+            minRating = FIVE;
+        } else if ((sumLength >= FIVE) && (sumLength <= SEVEN)) {
+            minRating = FOUR;
+        } else if ((sumLength >= EIGHT) && (sumLength <= ELEVEN)) {
+            minRating = THREE;
+        } else if (sumLength == TWELVE) {
+            minRating = TWO;
+        } else {
+            minRating = ONE; // docs said little here.
+        }
+
+        return minRating;
+    }
+
+    /**
+     * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm.
It should be noted that the
+     * strings are cleaned in the same way as {@link #encode(String)}.
+     *
+     * @param name1
+     *            First of the 2 strings (names) to compare
+     * @param name2
+     *            Second of the 2 names to compare
+     * @return <code>true</code> if the encodings are identical <code>false</code>
otherwise.
+     */
+    public boolean isEncodeEquals(String name1, String name2) {
+        // Bulletproof for trivial input - NINO
+        if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1))
{
+            return false;
+        } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2))
{
+            return false;
+        } else if (name1.length() == 1 || name2.length() == 1) {
+            return false;
+        } else if (name1.equalsIgnoreCase(name2)) {
+            return true;
+        }
+
+        // Preprocessing
+        name1 = cleanName(name1);
+        name2 = cleanName(name2);
+
+        // Actual MRA Algorithm
+
+        // 1. Remove vowels
+        name1 = removeVowels(name1);
+        name2 = removeVowels(name2);
+
+        // 2. Remove double consonants
+        name1 = removeDoubleConsonants(name1);
+        name2 = removeDoubleConsonants(name2);
+
+        // 3. Reduce down to 3 letters
+        name1 = getFirst3Last3(name1);
+        name2 = getFirst3Last3(name2);
+
+        // 4. Check for length difference - if 3 or greater then no similarity
+        // comparison is done
+        if (Math.abs(name1.length() - name2.length()) >= THREE) {
+            return false;
+        }
+
+        // 5. Obtain the minimum rating value by calculating the length sum of the
+        // encoded Strings and sending it down.
+        int sumLength = Math.abs(name1.length() + name2.length());
+        int minRating = 0;
+        minRating = getMinRating(sumLength);
+
+        // 6. Process the encoded Strings from left to right and remove any
+        // identical characters found from both Strings respectively.
+        int count = leftToRightThenRightToLeftProcessing(name1, name2);
+
+        // 7. Each PNI item that has a similarity rating equal to or greater than
+        // the min is considered to be a good candidate match
+        return count >= minRating;
+
+    }
+
+    /**
+     * Determines if a letter is a vowel.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param letter
+     *            The letter under investiagtion
+     * @return True if a vowel, else false
+     */
+    boolean isVowel(String letter) {
+        return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O")
|| letter.equalsIgnoreCase("I") ||
+                letter.equalsIgnoreCase("U");
+    }
+
+    /**
+     * Processes the names from left to right (first) then right to left removing identical
letters in same positions.
+     * Then subtracts the longer string that remains from 6 and returns this.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param name1
+     *            name2
+     * @return
+     */
+    int leftToRightThenRightToLeftProcessing(String name1, String name2) {
+        char[] name1Char = name1.toCharArray();
+        char[] name2Char = name2.toCharArray();
+
+        int name1Size = name1.length() - 1;
+        int name2Size = name2.length() - 1;
+
+        String name1LtRStart = EMPTY;
+        String name1LtREnd = EMPTY;
+
+        String name2RtLStart = EMPTY;
+        String name2RtLEnd = EMPTY;
+
+        for (int i = 0; i < name1Char.length; i++) {
+            if (i > name2Size) {
+                break;
+            }
+
+            name1LtRStart = name1.substring(i, i + 1);
+            name1LtREnd = name1.substring(name1Size - i, (name1Size - i) + 1);
+
+            name2RtLStart = name2.substring(i, i + 1);
+            name2RtLEnd = name2.substring(name2Size - i, (name2Size - i) + 1);
+
+            // Left to right...
+            if (name1LtRStart.equals(name2RtLStart)) {
+                name1Char[i] = ' ';
+                name2Char[i] = ' ';
+            }
+
+            // Right to left...
+            if (name1LtREnd.equals(name2RtLEnd)) {
+                name1Char[name1Size - i] = ' ';
+                name2Char[name2Size - i] = ' ';
+            }
+        }
+
+        // Char arrays -> string & remove extraneous space
+        String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
+        String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
+
+        // Final bit - subtract longest string from 6 and return this int value
+        if (strA.length() > strB.length()) {
+            return Math.abs(SIX - strA.length());
+        } else {
+            return Math.abs(SIX - strB.length());
+        }
+    }
+
+    /**
+     * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
+     * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
+     *
+     * @param accentedWord
+     *            The word that may have accents in it.
+     * @return De-accented word
+     */
+    String removeAccents(final String accentedWord) {
+        if (accentedWord == null) {
+            return null;
+        }
+
+        StringBuilder sb = new StringBuilder();
+        int n = accentedWord.length();
+
+        for (int i = 0; i < n; i++) {
+            char c = accentedWord.charAt(i);
+            int pos = UNICODE.indexOf(c);
+            if (pos > -1) {
+                sb.append(PLAIN_ASCII.charAt(pos));
+            } else {
+                sb.append(c);
+            }
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Replaces any double consonant pair with the single letter equivalent.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param name
+     *            String to have double consonants removed
+     * @return Single consonant word
+     */
+    String removeDoubleConsonants(String name) {
+        String[] dblCnstArray = new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ",
"KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", "TT", "VV",
+                "WW", "XX", "YY", "ZZ" };
+
+        String replacedName = name.toUpperCase();
+        for (String dc : dblCnstArray) {
+            if (replacedName.contains(dc)) {
+                String singleLetter = dc.substring(0, 1);
+                replacedName = replacedName.replace(dc, singleLetter);
+            }
+        }
+
+        return replacedName;
+    }
+
+    /**
+     * Deletes all vowels unless the vowel begins the word.
+     *
+     * <h2>API Usage</h2>
+     * <p>
+     * Consider this method private, it is package protected for unit testing only.
+     * </p>
+     *
+     * @param name
+     *            The name to have vowels removed
+     * @return De-voweled word
+     */
+    String removeVowels(String name) {
+        // Extract first letter
+        String firstLetter = name.substring(0, 1);
+
+        name = name.replaceAll("A", EMPTY);
+        name = name.replaceAll("E", EMPTY);
+        name = name.replaceAll("I", EMPTY);
+        name = name.replaceAll("O", EMPTY);
+        name = name.replaceAll("U", EMPTY);
+
+        name = name.replaceAll("\\s{2,}\\b", SPACE);
+
+        // return isVowel(firstLetter) ? (firstLetter + name) : name;
+        if (isVowel(firstLetter)) {
+            return (firstLetter + name);
+        } else {
+            return name;
+        }
+    }
+}

Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/MatchRatingApproachEncoder.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java?rev=1414916&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
(added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
Wed Nov 28 20:57:07 2012
@@ -0,0 +1,426 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.commons.codec.StringEncoder;
+import org.apache.commons.codec.StringEncoderAbstractTest;
+import org.junit.Test;
+
+/**
+ * Series of tests for the Match Rating Approach algorithm.
+ * 
+ * The general naming convention for the tests is of the form:
+ * GeneralMetadataOnTheTestArea_ActualTestValues_ExpectedResult
+ * 
+ * An unusual value is indicated by the term "corner case"
+ */
+public class MatchRatingApproachEncoderTest extends StringEncoderAbstractTest {
+
+    private MatchRatingApproachEncoder getMatchRatingApproachEncoder() {
+        return (MatchRatingApproachEncoder) this.getStringEncoder();
+    }
+
+    // ********** BEGIN REGION - TEST SUPPORT METHODS
+
+    @Test
+    public final void testAccentRemoval_AllLower_SuccessfullyRemoved() {
+        assertEquals("aeiou", getMatchRatingApproachEncoder().removeAccents("áéíóú"));
+    }
+
+    @Test
+    public final void testAccentRemoval_WithSpaces_SuccessfullyRemovedAndSpacesInvariant()
{
+        assertEquals("ae io  u", getMatchRatingApproachEncoder().removeAccents("áé
íó  ú"));
+    }
+
+    @Test
+    public final void testAccentRemoval_UpperandLower_SuccessfullyRemovedAndCaseInvariant()
{
+        assertEquals("AeiOuu", getMatchRatingApproachEncoder().removeAccents("ÁeíÓuu"));
+    }
+
+    @Test
+    public final void testAccentRemoval_MixedWithUnusualChars_SuccessfullyRemovedAndUnusualcharactersInvariant()
{
+        assertEquals("A-e'i.,o&u", getMatchRatingApproachEncoder().removeAccents("Á-e'í.,ó&ú"));
+    }
+
+    @Test
+    public final void testAccentRemoval_GerSpanFrenMix_SuccessfullyRemoved() {
+        assertEquals("aeoußAEOUnNa", getMatchRatingApproachEncoder().removeAccents("äëöüßÄËÖÜñÑà"));
+    }
+
+    @Test
+    public final void testAccentRemoval_ComprehensiveAccentMix_AllSuccessfullyRemoved() {
+        assertEquals("E,E,E,E,U,U,I,I,A,A,O,e,e,e,e,u,u,i,i,a,a,o,c",
+                getMatchRatingApproachEncoder().removeAccents("È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç"));
+    }
+
+    @Test
+    public final void testAccentRemovalNormalString_NoChange() {
+        assertEquals("Colorless green ideas sleep furiously", getMatchRatingApproachEncoder().removeAccents("Colorless
green ideas sleep furiously"));
+    }
+
+    @Test
+    public final void testAccentRemoval_NINO_NoChange() {
+        assertEquals("", getMatchRatingApproachEncoder().removeAccents(""));
+    }
+
+    @Test
+    public final void testRemoveSingleDoubleConsonants_BUBLE_RemovedSuccessfully() {
+        assertEquals("BUBLE", getMatchRatingApproachEncoder().removeDoubleConsonants("BUBBLE"));
+    }
+
+    @Test
+    public final void testRemoveDoubleConsonants_MISSISSIPPI_RemovedSuccessfully() {
+        assertEquals("MISISIPI", getMatchRatingApproachEncoder().removeDoubleConsonants("MISSISSIPPI"));
+    }
+
+    @Test
+    public final void testRemoveDoubleDoubleVowel_BEETLE_NotRemoved() {
+        assertEquals("BEETLE", getMatchRatingApproachEncoder().removeDoubleConsonants("BEETLE"));
+    }
+
+    @Test
+    public final void testIsVowel_CapitalA_ReturnsTrue() {
+        assertEquals(true, getMatchRatingApproachEncoder().isVowel("A"));
+    }
+
+    @Test
+    public final void testIsVowel_SmallD_ReturnsFalse() {
+        assertFalse(getMatchRatingApproachEncoder().isVowel("d"));
+    }
+
+    @Test
+    public final void testRemoveVowel_ALESSANDRA_Returns_ALSSNDR() {
+        assertEquals("ALSSNDR", getMatchRatingApproachEncoder().removeVowels("ALESSANDRA"));
+    }
+
+    @Test
+    public final void testRemoveVowel__AIDAN_Returns_ADN() {
+        assertEquals("ADN", getMatchRatingApproachEncoder().removeVowels("AIDAN"));
+    }
+
+    @Test
+    public final void testRemoveVowel__DECLAN_Returns_DCLN() {
+        assertEquals("DCLN", getMatchRatingApproachEncoder().removeVowels("DECLAN"));
+    }
+
+    @Test
+    public final void testGetFirstLast3__ALEXANDER_Returns_Aleder() {
+        assertEquals("Aleder", getMatchRatingApproachEncoder().getFirst3Last3("Alexzander"));
+    }
+
+    @Test
+    public final void testGetFirstLast3_PETE_Returns_PETE() {
+        assertEquals("PETE", getMatchRatingApproachEncoder().getFirst3Last3("PETE"));
+    }
+
+    @Test
+    public final void testleftTorightThenRightToLeft_ALEXANDER_ALEXANDRA_Returns4() {
+        assertEquals(4, getMatchRatingApproachEncoder().leftToRightThenRightToLeftProcessing("ALEXANDER",
"ALEXANDRA"));
+    }
+
+    @Test
+    public final void testleftTorightThenRightToLeft_EINSTEIN_MICHAELA_Returns0() {
+        assertEquals(0, getMatchRatingApproachEncoder().leftToRightThenRightToLeftProcessing("EINSTEIN",
"MICHAELA"));
+    }
+
+    @Test
+    public final void testGetMinRating_7_Return4_Successfully() {
+        assertEquals(4, getMatchRatingApproachEncoder().getMinRating(7));
+    }
+
+    @Test
+    public final void testGetMinRating_2_Returns5_Successfully() {
+        assertEquals(5, getMatchRatingApproachEncoder().getMinRating(2));
+    }
+
+    @Test
+    public final void testGetMinRating_2_Return1_Successfully() {
+        assertEquals(1, getMatchRatingApproachEncoder().getMinRating(13));
+    }
+
+    @Test
+    public final void testcleanName_SuccessfullyClean() {
+        assertEquals("THISISATEST", getMatchRatingApproachEncoder().cleanName("This-ís
  a t.,es &t"));
+    }
+
+    // ***** END REGION - TEST SUPPORT METHODS
+
+    // ***** BEGIN REGION - TEST GET MRA ENCODING
+
+    @Test
+    public final void testGetEncoding_HARPER_HRPR() {
+        assertEquals("HRPR", getMatchRatingApproachEncoder().encode("HARPER"));
+    }
+
+    @Test
+    public final void testGetEncoding_SMITH_to_SMTH() {
+        assertEquals("SMTH", getMatchRatingApproachEncoder().encode("Smith"));
+    }
+
+    @Test
+    public final void testGetEncoding_SMYTH_to_SMYTH() {
+        assertEquals("SMYTH", getMatchRatingApproachEncoder().encode("Smyth"));
+    }
+
+    @Test
+    public final void testGetEncoding_Space_to_Nothing() {
+        assertEquals("", getMatchRatingApproachEncoder().encode(" "));
+    }
+
+    @Test
+    public final void testGetEncoding_NoSpace_to_Nothing() {
+        assertEquals("", getMatchRatingApproachEncoder().encode(""));
+    }
+
+    @Test
+    public final void testGetEncoding_Null_to_Nothing() {
+        assertEquals("", getMatchRatingApproachEncoder().encode(null));
+    }
+
+    @Test
+    public final void testGetEncoding_One_Letter_to_Nothing() {
+        assertEquals("", getMatchRatingApproachEncoder().encode("E"));
+    }
+
+    // ***** END REGION - TEST GET MRA ENCODING
+
+    // ***** BEGIN REGION - TEST GET MRA COMPARISONS
+
+    @Test
+    public final void testCompare_SMITH_SMYTH_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("smith", "smyth"));
+    }
+
+    @Test
+    public final void testCompare_BURNS_BOURNE_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Burns", "Bourne"));
+    }
+
+    @Test
+    public final void testCompare_ShortNames_AL_ED_WorksButNoMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Al", "Ed"));
+    }
+
+    @Test
+    public final void testCompare_CATHERINE_KATHRYN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Catherine", "Kathryn"));
+    }
+
+    @Test
+    public final void testCompare_BRIAN_BRYAN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Brian", "Bryan"));
+    }
+
+    @Test
+    public final void testCompare_SEAN_SHAUN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Séan", "Shaun"));
+    }
+
+    @Test
+    public final void testCompare_COLM_COLIN_WithAccentsAndSymbolsAndSpaces_SuccessfullyMatched()
{
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Cólm.   ", "C-olín"));
+    }
+
+    @Test
+    public final void testCompare_STEPHEN_STEVEN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Stephen", "Steven"));
+    }
+
+    @Test
+    public final void testCompare_STEVEN_STEFAN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Steven", "Stefan"));
+    }
+
+    @Test
+    public final void testCompare_STEPHEN_STEFAN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Stephen", "Stefan"));
+    }
+
+    @Test
+    public final void testCompare_SAM_SAMUEL_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sam", "Samuel"));
+    }
+
+    @Test
+    public final void testCompare_MICKY_MICHAEL_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Micky", "Michael"));
+    }
+
+    @Test
+    public final void testCompare_OONA_OONAGH_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Oona", "Oonagh"));
+    }
+
+    @Test
+    public final void testCompare_SOPHIE_SOFIA_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sophie", "Sofia"));
+    }
+
+    @Test
+    public final void testCompare_FRANCISZEK_FRANCES_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Franciszek", "Frances"));
+    }
+
+    @Test
+    public final void testCompare_TOMASZ_TOM_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Tomasz", "tom"));
+    }
+
+    @Test
+    public final void testCompare_SmallInput_CARK_Kl_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Kl", "Karl"));
+    }
+
+    @Test
+    public final void testCompareNameToSingleLetter_KARL_C_DoesNotMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Karl", "C"));
+    }
+
+    @Test
+    public final void testCompare_ZACH_ZAKARIA_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Zach", "Zacharia"));
+    }
+
+    @Test
+    public final void testCompare_KARL_ALESSANDRO_DoesNotMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Karl", "Alessandro"));
+    }
+
+    @Test
+    public final void testCompare_Forenames_UNA_OONAGH_ShouldSuccessfullyMatchButDoesNot() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Úna", "Oonagh")); // Disappointing
+    }
+
+    // ***** Begin Region - Test Get Encoding - Surnames
+
+    @Test
+    public final void testCompare_Surname_OSULLIVAN_OSUILLEABHAIN_SuccessfulMatch() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("O'Sullivan", "Ó ' Súilleabháin"));
+    }
+
+    @Test
+    public final void testCompare_LongSurnames_MORIARTY_OMUIRCHEARTAIGH_DoesNotSuccessfulMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Moriarty", "OMuircheartaigh"));
+    }
+
+    @Test
+    public final void testCompare_LongSurnames_OMUIRCHEARTAIGH_OMIREADHAIGH_SuccessfulMatch() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("o'muireadhaigh", "Ó 'Muircheartaigh "));
+    }
+
+    @Test
+    public final void testCompare_Surname_COOPERFLYNN_SUPERLYN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Cooper-Flynn", "Super-Lyn"));
+    }
+
+    @Test
+    public final void testCompare_Surname_HAILEY_HALLEY_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Hailey", "Halley"));
+    }
+
+    // **** BEGIN YIDDISH/SLAVIC SECTION ****
+
+    @Test
+    public final void testCompare_Surname_AUERBACH_UHRBACH_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Auerbach", "Uhrbach"));
+    }
+
+    @Test
+    public final void testCompare_Surname_MOSKOWITZ_MOSKOVITZ_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Moskowitz", "Moskovitz"));
+    }
+
+    @Test
+    public final void testCompare_Surname_LIPSHITZ_LIPPSZYC_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("LIPSHITZ", "LIPPSZYC"));
+    }
+
+    @Test
+    public final void testCompare_Surname_LEWINSKY_LEVINSKI_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("LEWINSKY", "LEVINSKI"));
+    }
+
+    @Test
+    public final void testCompare_Surname_SZLAMAWICZ_SHLAMOVITZ_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("SZLAMAWICZ", "SHLAMOVITZ"));
+    }
+
+    @Test
+    public final void testCompare_Surname_ROSOCHOWACIEC_ROSOKHOVATSETS_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("R o s o ch o w a c ie c", " R o s o k ho v a ts e ts"));
+    }
+
+    @Test
+    public final void testCompare_Surname_PRZEMYSL_PSHEMESHIL_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals(" P rz e m y s l", " P sh e m e sh i l"));
+    }
+
+    // **** END YIDDISH/SLAVIC SECTION ****
+
+    @Test
+    public final void testCompare_PETERSON_PETERS_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Peterson", "Peters"));
+    }
+
+    @Test
+    public final void testCompare_MCGOWAN_MCGEOGHEGAN_SuccessfullyMatched() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("McGowan", "Mc Geoghegan"));
+    }
+
+    @Test
+    public final void testCompare_SurnamesCornerCase_MURPHY_Space_NoMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", " "));
+    }
+
+    @Test
+    public final void testCompare_SurnamesCornerCase_MURPHY_NoSpace_NoMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", ""));
+    }
+
+    @Test
+    public final void testCompare_SurnameCornerCase_Nulls_NoMatch() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals(null, null));
+    }
+
+    @Test
+    public final void testCompare_Surnames_MURPHY_LYNCH_NoMatchExpected() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Murphy", "Lynch"));
+    }
+
+    @Test
+    public final void testCompare_Forenames_SEAN_JOHN_MatchExpected() {
+        assertTrue(getMatchRatingApproachEncoder().isEncodeEquals("Sean", "John"));
+    }
+
+    @Test
+    public final void testCompare_Forenames_SEAN_PETE_NoMatchExpected() {
+        assertFalse(getMatchRatingApproachEncoder().isEncodeEquals("Sean", "Pete"));
+    }
+
+    @Override
+    protected StringEncoder createStringEncoder() {
+        return new MatchRatingApproachEncoder();
+    }
+
+    // ***** END REGION - TEST GET MRA COMPARISONS
+
+}

Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/MatchRatingApproachEncoderTest.java
------------------------------------------------------------------------------
    svn:keywords = Id



Mime
View raw message