Return-Path: Delivered-To: apmail-jakarta-commons-dev-archive@www.apache.org Received: (qmail 54817 invoked from network); 18 Apr 2004 21:33:43 -0000 Received: from daedalus.apache.org (HELO mail.apache.org) (208.185.179.12) by minotaur-2.apache.org with SMTP; 18 Apr 2004 21:33:43 -0000 Received: (qmail 2511 invoked by uid 500); 18 Apr 2004 21:33:26 -0000 Delivered-To: apmail-jakarta-commons-dev-archive@jakarta.apache.org Received: (qmail 2464 invoked by uid 500); 18 Apr 2004 21:33:26 -0000 Mailing-List: contact commons-dev-help@jakarta.apache.org; run by ezmlm Precedence: bulk List-Unsubscribe: List-Subscribe: List-Help: List-Post: List-Id: "Jakarta Commons Developers List" Reply-To: "Jakarta Commons Developers List" Delivered-To: mailing list commons-dev@jakarta.apache.org Received: (qmail 2451 invoked by uid 500); 18 Apr 2004 21:33:26 -0000 Received: (qmail 2448 invoked from network); 18 Apr 2004 21:33:26 -0000 Received: from unknown (HELO minotaur.apache.org) (209.237.227.194) by daedalus.apache.org with SMTP; 18 Apr 2004 21:33:26 -0000 Received: (qmail 54786 invoked by uid 1628); 18 Apr 2004 21:33:38 -0000 Date: 18 Apr 2004 21:33:38 -0000 Message-ID: <20040418213338.54785.qmail@minotaur.apache.org> From: tobrien@apache.org To: jakarta-commons-cvs@apache.org Subject: cvs commit: jakarta-commons/codec/src/java/org/apache/commons/codec/language Metaphone.java X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N X-Spam-Rating: minotaur-2.apache.org 1.6.2 0/1000/N tobrien 2004/04/18 14:33:38 Modified: codec/src/java/org/apache/commons/codec/language Metaphone.java Log: Fixed the ending-MB bug - Bugzilla #28457 - also refactored some of the index arithmetic and content tests into functions for readability. Revision Changes Path 1.16 +98 -56 jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java Index: Metaphone.java =================================================================== RCS file: /home/cvs/jakarta-commons/codec/src/java/org/apache/commons/codec/language/Metaphone.java,v retrieving revision 1.15 retrieving revision 1.16 diff -u -r1.15 -r1.16 --- Metaphone.java 29 Feb 2004 04:08:31 -0000 1.15 +++ Metaphone.java 18 Apr 2004 21:33:38 -0000 1.16 @@ -70,7 +70,6 @@ * @return A metaphone code corresponding to the String supplied */ public String metaphone(String txt) { - int mtsz = 0 ; boolean hard = false ; if ((txt == null) || (txt.length() == 0)) { return "" ; @@ -126,99 +125,109 @@ int wdsz = local.length(); int n = 0 ; - while ((mtsz < this.getMaxCodeLen()) && (n < wdsz)) { // max code size of 4 works well + while ((code.length() < this.getMaxCodeLen()) && + (n < wdsz) ) { // max code size of 4 works well char symb = local.charAt(n) ; // remove duplicate letters except C - if ((symb != 'C') && (n > 0) && (local.charAt(n - 1) == symb)) { + if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { n++ ; } else { // not dup switch(symb) { case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : if (n == 0) { code.append(symb); - mtsz++; } break ; // only use vowel if leading char case 'B' : - if ((n > 0) && !(n + 1 == wdsz) && (local.charAt(n - 1) == 'M')) { // not MB at end of word - code.append(symb); + if ( isPreviousChar(local, n, 'M') && + isLastChar(wdsz, n) ) { // B is silent if word ends in MB + break; } else { code.append(symb); } - mtsz++; break; case 'C' : // lots of C special cases /* discard if SCI, SCE or SCY */ - if ((n > 0) && (local.charAt(n - 1) == 'S') && (n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { - break ; + if ( isPreviousChar(local, n, 'S') && + !isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { + break; } tmpS = local.toString(); - if (tmpS.indexOf("CIA", n) == n) { // "CIA" -> X - code.append('X'); mtsz++; break ; + if (regionMatch(local, n, "CIA")) { // "CIA" -> X + code.append('X'); + break; } - if ((n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { code.append('S'); - mtsz++; - break ; // CI,CE,CY -> S + break; // CI,CE,CY -> S } - if ((n > 0) && (tmpS.indexOf("SCH", n - 1) == n - 1)) { // SCH->sk + if (isPreviousChar(local, n, 'S') && + isNextChar(local, n, 'H') ) { // SCH->sk code.append('K') ; - mtsz++; break ; } - if (tmpS.indexOf("CH", n) == n) { // detect CH - if ((n == 0) && (wdsz >= 3) && (this.vowels.indexOf(local.charAt(2)) < 0)) { // CH consonant -> K consonant + if (isNextChar(local, n, 'H')) { // detect CH + if ((n == 0) && + (wdsz >= 3) && + isVowel(local,2) ) { // CH consonant -> K consonant code.append('K'); } else { code.append('X'); // CHvowel -> X } - mtsz++; } else { code.append('K'); - mtsz++; } break ; case 'D' : - if ((n + 2 < wdsz) && (local.charAt(n + 1) == 'G') && (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J + if (!isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'G') && + (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J code.append('J'); n += 2 ; } else { code.append('T'); } - mtsz++; break ; case 'G' : // GH silent at end or before consonant - if ((n + 2 == wdsz) && (local.charAt(n + 1) == 'H')) { + if (isLastChar(wdsz, n + 1) && + isNextChar(local, n, 'H')) { break; } - if ((n + 2 < wdsz) && (local.charAt(n + 1) == 'H') && (this.vowels.indexOf(local.charAt(n + 2)) < 0)) { + if (!isLastChar(wdsz, n + 1) && + isNextChar(local,n,'H') && + !isVowel(local,n+2)) { break; } tmpS = local.toString(); - if ((n > 0) && (tmpS.indexOf("GN", n) == n) || (tmpS.indexOf("GNED", n) == n)) { + if ((n > 0) && + ( regionMatch(local, n, "GN") || + regionMatch(local, n, "GNED") ) ) { break; // silent G } - if ((n > 0) && (local.charAt(n - 1) == 'G')) { + if (isPreviousChar(local, n, 'G')) { hard = true ; } else { hard = false ; } - if ((n + 1 < wdsz) && (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && (!hard)) { + if (!isLastChar(wdsz, n) && + (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && + (!hard)) { code.append('J'); } else { code.append('K'); } - mtsz++; break ; case 'H': - if (n + 1 == wdsz) { + if (isLastChar(wdsz, n)) { break ; // terminal H } - if ((n > 0) && (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { + if ((n > 0) && + (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { break; } - if (this.vowels.indexOf(local.charAt(n + 1)) >= 0) { - code.append('H'); - mtsz++;// Hvowel + if (isVowel(local,n+1)) { + code.append('H'); // Hvowel } break; case 'F': @@ -228,78 +237,111 @@ case 'N' : case 'R' : code.append(symb); - mtsz++; break; case 'K' : if (n > 0) { // not initial - if (local.charAt(n - 1) != 'C') { + if (!isPreviousChar(local, n, 'C')) { code.append(symb); } } else { code.append(symb); // initial K } - mtsz++ ; break ; case 'P' : - if ((n + 1 < wdsz) && (local.charAt(n + 1) == 'H')) { + if (isNextChar(local,n,'H')) { // PH -> F code.append('F'); } else { code.append(symb); } - mtsz++; break ; case 'Q' : code.append('K'); - mtsz++; break; case 'S' : - tmpS = local.toString(); - if ((tmpS.indexOf("SH", n) == n) || (tmpS.indexOf("SIO", n) == n) || (tmpS.indexOf("SIA", n) == n)) { + if (regionMatch(local,n,"SH") || + regionMatch(local,n,"SIO") || + regionMatch(local,n,"SIA")) { code.append('X'); } else { code.append('S'); } - mtsz++; break; case 'T' : - tmpS = local.toString(); // TIA TIO -> X - if ((tmpS.indexOf("TIA", n) == n) || (tmpS.indexOf("TIO", n) == n)) { + if (regionMatch(local,n,"TIA") || + regionMatch(local,n,"TIO")) { code.append('X'); - mtsz++; break; } - if (tmpS.indexOf("TCH", n) == n) { + if (regionMatch(local,n,"TCH")) { + // Silent if in "TCH" break; } // substitute numeral 0 for TH (resembles theta after all) - if (tmpS.indexOf("TH", n) == n) { + if (regionMatch(local,n,"TH")) { code.append('0'); } else { code.append('T'); } - mtsz++ ; break ; case 'V' : - code.append('F'); mtsz++;break ; + code.append('F'); break ; case 'W' : case 'Y' : // silent if not followed by vowel - if ((n + 1 < wdsz) && (this.vowels.indexOf(local.charAt(n + 1)) >= 0)) { + if (!isLastChar(wdsz,n) && + isVowel(local,n+1)) { code.append(symb); - mtsz++; } break ; case 'X' : - code.append('K'); code.append('S');mtsz += 2; + code.append('K'); code.append('S'); break ; case 'Z' : - code.append('S'); mtsz++; break ; + code.append('S'); break ; } // end switch n++ ; } // end else from symb != 'C' - if (mtsz > this.getMaxCodeLen()) { code.setLength(this.getMaxCodeLen()); } + if (code.length() > this.getMaxCodeLen()) { + code.setLength(this.getMaxCodeLen()); + } } return code.toString(); - } + } + + private boolean isVowel(StringBuffer string, int index) { + return (this.vowels.indexOf(string.charAt(index)) >= 0); + } + + private boolean isPreviousChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index > 0 && + index < string.length() ) { + matches = string.charAt(index - 1) == c; + } + return matches; + } + + private boolean isNextChar(StringBuffer string, int index, char c) { + boolean matches = false; + if( index >= 0 && + index < string.length() - 1 ) { + matches = string.charAt(index + 1) == c; + } + return matches; + } + + private boolean regionMatch(StringBuffer string, int index, String test) { + boolean matches = false; + if( index >= 0 && + (index + test.length() - 1) < string.length() ) { + String substring = string.substring( index, index + test.length()); + matches = substring.equals( test ); + } + return matches; + } + + private boolean isLastChar(int wdsz, int n) { + return n + 1 == wdsz; + } /** --------------------------------------------------------------------- To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org For additional commands, e-mail: commons-dev-help@jakarta.apache.org