commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ggreg...@apache.org
Subject svn commit: r1151715 - in /commons/proper/codec/trunk/src: java/org/apache/commons/codec/language/bm/ test/org/apache/commons/codec/language/bm/
Date Thu, 28 Jul 2011 03:45:15 GMT
Author: ggregory
Date: Thu Jul 28 03:45:14 2011
New Revision: 1151715

URL: http://svn.apache.org/viewvc?rev=1151715&view=rev
Log:
Fix all current issues in [CODEC-125], including the big performance issue. Thanks to a patch
from Matthew Pocock! Some cleanups and more tests are needed, but this is looking much better
now.

Modified:
    commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java
    commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java
    commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
    commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
    commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
    commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
    commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
    commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java

Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java (original)
+++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java Thu Jul 28 03:45:14 2011
@@ -198,9 +198,9 @@ public class Lang {
      * @return the language that the word originates from or {@link Languages#ANY} if there
was no unique match
      */
     public String guessLanguage(String text) {
-        Set<String> ls = guessLanguages(text);
-        if (ls.size() == 1) {
-            return ls.iterator().next();
+        Languages.LanguageSet ls = guessLanguages(text);
+        if (ls.isSingleton()) {
+            return ls.getAny();
         } else {
             return Languages.ANY;
         }
@@ -209,11 +209,11 @@ public class Lang {
     /**
      * Guesses the languages of a word.
      * 
-     * @param text
+     * @param input
      *            the word
-     * @return a Set of Strings of language names that are potential matches for the word
+     * @return a Set of Strings of language names that are potential matches for the input
word
      */
-    public Set<String> guessLanguages(String input) {
+    public Languages.LanguageSet guessLanguages(String input) {
         String text = input.toLowerCase(); // todo: locale?
         // System.out.println("Testing text: '" + text + "'");
 
@@ -234,6 +234,6 @@ public class Lang {
             }
         }
 
-        return langs;
+        return Languages.LanguageSet.from(langs);
     }
 }

Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java (original)
+++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java Thu Jul 28 03:45:14 2011
@@ -22,6 +22,7 @@ import java.util.Collections;
 import java.util.EnumMap;
 import java.util.HashSet;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.Scanner;
 import java.util.Set;
 
@@ -114,43 +115,135 @@ public class Languages {
         return this.languages;
     }
 
-    // // The original code mapped sets of languages to unique numerical codes - this doesn't
seem to be needed in this impl
-    // public static Languages instance(String languagesResourceName)
-    // {
-    // // read languages list
-    // Map<String, Integer> ls = new HashMap<String, Integer>();
-    // InputStream langIS = Languages.class.getClassLoader().getResourceAsStream(languagesResourceName);
-    //
-    // if(langIS == null)
-    // throw new IllegalArgumentException("Unable to resolve required resource: " + languagesResourceName);
-    //
-    // Scanner lsScanner = new Scanner(langIS);
-    // int i = 0;
-    // while(lsScanner.hasNextLine()) {
-    // String line = lsScanner.nextLine();
-    // i++;
-    // ls.put(line.trim(), i^2);
-    // }
-    //
-    // return new Languages(Collections.unmodifiableSet(ls.keySet()), Collections.unmodifiableMap(ls));
-    // }
-    //
-    // // todo: phoneticutils.php: LanguageIndex, LanguageName, LanguageCode, LanguageIndexFromCode
-    //
-    //
-    // private final Set<String> languages;
-    // private final Map<String, Integer> language_codes;
-    //
-    // private Languages(Set<String> languages, Map<String, Integer> language_codes)
{
-    // this.languages = languages;
-    // this.language_codes = language_codes;
-    // }
-    //
-    // public Set<String> getLanguages() {
-    // return languages;
-    // }
-    //
-    // public Map<String, Integer> getLanguage_codes() {
-    // return language_codes;
-    // }
+    /**
+     * A set of languages.
+     */
+    public static abstract class LanguageSet {
+        public abstract LanguageSet restrictTo(LanguageSet other);
+
+        public static LanguageSet from(Set<String> langs) {
+            if (langs.isEmpty()) {
+                return NO_LANGUAGES;
+            } else {
+                return new SomeLanguages(langs);
+            }
+        }
+
+        public abstract boolean contains(String language);
+
+        public abstract boolean isSingleton();
+
+        public abstract String getAny();
+
+        public abstract boolean isEmpty();
+    }
+
+    /**
+     * No languages at all.
+     */
+    public static LanguageSet NO_LANGUAGES = new LanguageSet() {
+        @Override
+        public LanguageSet restrictTo(LanguageSet other) {
+            return this;
+        }
+
+        @Override
+        public boolean contains(String language) {
+            return false;
+        }
+
+        @Override
+        public boolean isSingleton() {
+            return false;
+        }
+
+        @Override
+        public String getAny() {
+            throw new NoSuchElementException("Can't fetch any language from the empty language set.");
+        }
+
+        @Override
+        public boolean isEmpty() {
+            return true;
+        }
+    };
+
+    /**
+     * Any/all languages.
+     */
+    public static LanguageSet ANY_LANGUAGE = new LanguageSet() {
+        @Override
+        public LanguageSet restrictTo(LanguageSet other) {
+            return other;
+        }
+
+        @Override
+        public boolean contains(String language) {
+            return true;
+        }
+
+        @Override
+        public boolean isSingleton() {
+            return false;
+        }
+
+        @Override
+        public String getAny() {
+            throw new NoSuchElementException("Can't fetch any language from the any language set.");
+        }
+
+        @Override
+        public boolean isEmpty() {
+            return false;
+        }
+    };
+
+    /**
+     * Some languages, explicitly enumerated.
+     */
+    public static class SomeLanguages extends LanguageSet {
+        private final Set<String> languages;
+
+        private SomeLanguages(Set<String> languages) {
+            this.languages = Collections.unmodifiableSet(languages);
+        }
+
+        public Set<String> getLanguages() {
+            return this.languages;
+        }
+
+        @Override
+        public LanguageSet restrictTo(LanguageSet other) {
+            if (other == NO_LANGUAGES) {
+                return other;
+            } else if (other == ANY_LANGUAGE) {
+                return this;
+            } else {
+                SomeLanguages sl = (SomeLanguages) other;
+                Set<String> ls = new HashSet<String>(this.languages);
+                ls.retainAll(sl.languages);
+                return from(ls);
+            }
+        }
+
+        @Override
+        public boolean contains(String language) {
+            return this.languages.contains(language);
+        }
+
+        @Override
+        public boolean isSingleton() {
+            return this.languages.size() == 1;
+        }
+
+        @Override
+        public String getAny() {
+            return this.languages.iterator().next();
+        }
+
+        @Override
+        public boolean isEmpty() {
+            return this.languages.isEmpty();
+        }
+    }
 }

Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Thu Jul 28 03:45:14 2011
@@ -132,7 +132,7 @@ public class PhoneticEngine {
      * @return the encoding of the input
      */
     public String encode(String input) {
-        Set<String> languageSet = this.lang.guessLanguages(input);
+        Languages.LanguageSet languageSet = this.lang.guessLanguages(input);
         return phoneticUtf8(input, languageSet);
     }
 
@@ -144,7 +144,7 @@ public class PhoneticEngine {
      * @param languageSet
      * @return a phonetic representation of the input; a String containing '-'-separated
phonetic representations of the input
      */
-    public String phoneticUtf8(String input, final Set<String> languageSet) {
+    public String phoneticUtf8(String input, final Languages.LanguageSet languageSet) {
         final List<Rule> rules = Rule.instance(this.nameType, RuleType.RULES, languageSet);
         final List<Rule> finalRules1 = Rule.instance(this.nameType, this.ruleType,
"common");
         final List<Rule> finalRules2 = Rule.instance(this.nameType, this.ruleType,
languageSet);
@@ -213,208 +213,64 @@ public class PhoneticEngine {
             return result.substring(1);
         }
 
-        String phonetic = "";
+        PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet);
 
         // loop over each char in the input - we will handle the increment manually
         for (int i = 0; i < input.length();) {
-            RulesApplication rulesApplication = new RulesApplication(rules, languageSet,
input, phonetic, i).invoke();
+            RulesApplication rulesApplication = new RulesApplication(rules, languageSet,
input, phonemeBuilder, i).invoke();
             i = rulesApplication.getI();
-            phonetic = rulesApplication.getPhonetic();
+            phonemeBuilder = rulesApplication.getPhonemeBuilder();
+            // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
         }
 
-        phonetic = applyFinalRules(phonetic, finalRules1, languageSet, false);
-        phonetic = applyFinalRules(phonetic, finalRules2, languageSet, true);
+        // System.err.println("Applying general rules");
+        phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1, languageSet, false);
+        // System.err.println("Now got: " + phonemeBuilder.makeString());
+        // System.err.println("Applying language-specific rules");
+        phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2, languageSet, true);
+        // System.err.println("Now got: " + phonemeBuilder.makeString());
+        // System.err.println("Done");
 
-        return phonetic;
+        return phonemeBuilder.makeString();
     }
 
-    private String removeDuplicateAlternates(final String phonetic) {
-        List<String> altArray = splitOnPipe(phonetic);
-
-        String result = "|";
-        for (String alt : altArray) {
-            if (!result.contains("|" + alt + "|")) {
-                result += (alt + "|");
-            }
-        }
-
-        result = result.substring(1, result.length() - 1);
-        return result;
-    }
-
-    /**
-     * Applied to a single alternative at a time -- not to a parenthisized list it removes
all embedded bracketed attributes, logically-ands
-     * them together, and places them at the end.
-     * 
-     * However if strip is true, this can indeed remove embedded bracketed attributes from
a parenthesized list
-     * 
-     * @param input
-     * @param strip
-     * @return
-     */
-    private String normalizeLanguageAttributes(final String input, final boolean strip) {
-        String text = input;
-        Set<String> langs = new HashSet<String>();
-
-        int bracketStart;
-        while ((bracketStart = text.indexOf('[')) != -1) {
-            int bracketEnd = text.indexOf(']', bracketStart);
-            if (bracketEnd == -1) {
-                throw new IllegalArgumentException("no closing square bracket in: " + text);
-            }
-
-            String body = text.substring(bracketStart + 1, bracketEnd);
-            langs.addAll(Arrays.asList(body.split("[+]")));
-            text = text.substring(0, bracketStart) + text.substring(bracketEnd + 1);
-        }
-
-        if (langs.isEmpty() || strip) {
-            return text;
-        } else if (langs.contains(Languages.ANY)) {
-            return "[" + Languages.ANY + "]";
-        } else {
-            return text + "[" + join(langs, "+") + "]";
-        }
-    }
-
-    private String applyFinalRules(String phonetic, List<Rule> finalRules, Set<String>
languageArg, boolean strip) {
+    private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule>
finalRules, Languages.LanguageSet languageSet,
+            boolean strip) {
         if (finalRules == null) {
             throw new NullPointerException("finalRules can not be null");
         }
         if (finalRules.isEmpty()) {
-            return phonetic;
+            return phonemeBuilder;
         }
 
-        phonetic = expand(phonetic);
-        // must protect | in [] as split takes a regex, not a string literal
-        List<String> phoneticArray = splitOnPipe(phonetic);
-
-        for (int k = 0; k < phoneticArray.size(); k++) {
-            // log("k: " + k);
-
-            String aPhonetic = phoneticArray.get(k);
-            String phonetic2 = "";
-
-            String phoneticx = normalizeLanguageAttributes(aPhonetic, true);
-            for (int i = 0; i < aPhonetic.length();) {
-                // we will handle the increment manually
-                if (aPhonetic.substring(i, i + 1).equals("[")) {
-                    int attribStart = i;
-                    i++;
-                    while (true) {
-                        i++;
-                        String nextChar = aPhonetic.substring(i, i + 1);
-                        if (nextChar.equals("]")) {
-                            phonetic2 += aPhonetic.substring(attribStart, i);
-                            break;
-                        }
-                    }
+        Set<Rule.Phoneme> phonemes = new HashSet<Rule.Phoneme>();
 
-                    continue;
-                }
+        for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
+            PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
+            String phonemeText = phoneme.getPhonemeText();
+            // System.err.println("Expanding: " + phonemeText);
 
-                RulesApplication rulesApplication = new RulesApplication(finalRules, languageArg,
phoneticx, phonetic2, i).invoke();
+            for (int i = 0; i < phonemeText.length();) {
+                RulesApplication rulesApplication = new RulesApplication(finalRules, languageSet,
phonemeText, subBuilder, i).invoke();
                 boolean found = rulesApplication.isFound();
-                phonetic2 = rulesApplication.getPhonetic();
+                subBuilder = rulesApplication.getPhonemeBuilder();
 
                 if (!found) {
-                    phonetic2 += aPhonetic.substring(i, i + 1);
+                    // System.err.println("Not found. Appending as-is");
+                    subBuilder = subBuilder.append(phonemeText.substring(i, i + 1));
                 }
 
                 i = rulesApplication.getI();
-            }
 
-            phoneticArray.set(k, expand(phonetic2));
-        }
-
-        phonetic = join(phoneticArray, "|");
-        if (strip) {
-            phonetic = normalizeLanguageAttributes(phonetic, true);
-        }
-        if (!phonetic.contains("|")) {
-            phonetic = "(" + removeDuplicateAlternates(phonetic) + ")";
-        }
-
-        return phonetic;
-    }
-
-    private String expand(String phonetic) {
-        int altStart = phonetic.indexOf('(');
-        if (altStart == -1) {
-            return normalizeLanguageAttributes(phonetic, false);
-        }
-
-        String prefix = phonetic.substring(0, altStart);
-        altStart++;
-        int altEnd = phonetic.indexOf(')');
-
-        if (altEnd < altStart) {
-            throw new IllegalArgumentException("Phonetic string has a close-bracket before
the first open-bracket");
-        }
-        
-        String altString = phonetic.substring(altStart, altEnd);
-        altEnd++;
-        String suffix = phonetic.substring(altEnd);
-        List<String> altArray = splitOnPipe(altString);
-
-        String result = "";
-        for (String alt : altArray) {
-            String alternate = expand(prefix + alt + suffix);
-            if (alternate.length() != 0 && !alternate.equals("[any]")) {
-                if (result.length() > 0) {
-                    result += "|";
-                }
-                result += alternate;
+                // System.err.println(phonemeText + " " + i + ": " + subBuilder.makeString());
             }
-        }
-
-        return result;
-    }
-
-    /**
-     * Tests for compatible language rules to do so, apply the rule, expand the results,
and detect alternatives with incompatible
-     * attributes then drop each alternative that has incompatible attributes and keep those
that are compatible if there are no compatible
-     * alternatives left, return false otherwise return the compatible alternatives
-     * 
-     * @param phonetic
-     * @param target
-     * @param languageArg
-     * @return a String or null.
-     */
-    private String applyRuleIfCompatible(String phonetic, String target, Set<String>
languageArg) {
-        String candidate = phonetic + target;
-        if (!candidate.contains("[")) {
-            return candidate;
-        }
-
-        candidate = expand(candidate);
-        List<String> candidateArray = splitOnPipe(candidate);
-
-        candidate = "";
-        boolean found = false;
 
-        for (String thisCandidate : candidateArray) {
-            if (!languageArg.contains(Languages.ANY)) {
-                thisCandidate = normalizeLanguageAttributes(thisCandidate + "[" + languageArg
+ "]", false);
-            }
+            // System.err.println("Expanded to: " + subBuilder.makeString());
 
-            if (!thisCandidate.equals("[0]")) {
-                found = true;
-                if (candidate.length() != 0) {
-                    candidate += "|";
-                }
-                candidate += thisCandidate;
-            }
+            phonemes.addAll(subBuilder.getPhonemes());
         }
 
-        if (!found) {
-            return null; // eugh!
-        }
-        if (candidate.contains("|")) {
-            candidate = "(" + candidate + ")";
-        }
-
-        return candidate;
+        return new PhonemeBuilder(phonemes);
     }
 
     private static String join(Iterable<String> strings, String sep) {
@@ -430,45 +286,28 @@ public class PhoneticEngine {
         return sb.toString();
     }
 
-    private static List<String> splitOnPipe(String str) {
-        List<String> res = new ArrayList<String>();
-
-        while (true) {
-            int i = str.indexOf('|');
-            if (i < 0) {
-                res.add(str);
-                break;
-            }
-
-            res.add(str.substring(0, i));
-            str = str.substring(i + 1);
-        }
-
-        return res;
-    }
-
     private class RulesApplication {
         private final List<Rule> finalRules;
-        private final Set<String> languageArg;
+        private final Languages.LanguageSet languageSet;
         private final String input;
 
-        private String phonetic;
+        private PhonemeBuilder phonemeBuilder;
         private int i;
         private boolean found;
 
-        public RulesApplication(List<Rule> finalRules, Set<String> languageArg,
String input, String phonetic, int i) {
+        public RulesApplication(List<Rule> finalRules, Languages.LanguageSet languageSet,
String input, PhonemeBuilder phonemeBuilder, int i) {
             if (finalRules == null) {
                 throw new NullPointerException("The finalRules argument must not be null");
             }
             this.finalRules = finalRules;
-            this.languageArg = languageArg;
-            this.phonetic = phonetic;
+            this.languageSet = languageSet;
+            this.phonemeBuilder = phonemeBuilder;
             this.input = input;
             this.i = i;
         }
 
-        public String getPhonetic() {
-            return this.phonetic;
+        public PhonemeBuilder getPhonemeBuilder() {
+            return this.phonemeBuilder;
         }
 
         public int getI() {
@@ -487,18 +326,12 @@ public class PhoneticEngine {
                 patternLength = pattern.length();
                 // log("trying pattern: " + pattern);
 
-                if (!rule.patternAndContextMatches(this.input, this.i) || !rule.languageMatches(this.languageArg))
{
+                if (!rule.patternAndContextMatches(this.input, this.i)) {
                     // log("no match");
                     continue RULES;
                 }
 
-                String candidate = applyRuleIfCompatible(this.phonetic, rule.getPhoneme(),
this.languageArg);
-
-                if (candidate == null || candidate.length() == 0) {
-                    // log("no candidate");
-                    continue RULES;
-                }
-                this.phonetic = candidate;
+                this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme());
                 this.found = true;
                 break RULES;
             }
@@ -511,4 +344,65 @@ public class PhoneticEngine {
             return this;
         }
     }
+
+    static class PhonemeBuilder {
+
+        public static PhonemeBuilder empty(Languages.LanguageSet languages) {
+            return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
+        }
+
+        private final Set<Rule.Phoneme> phonemes;
+
+        private PhonemeBuilder(Set<Rule.Phoneme> phonemes) {
+            this.phonemes = phonemes;
+        }
+
+        public Set<Rule.Phoneme> getPhonemes() {
+            return this.phonemes;
+        }
+
+        public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
+            Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
+
+            for (Rule.Phoneme left : this.phonemes) {
+                for (Rule.Phoneme right : phonemeExpr.getPhonemes()) {
+                    Rule.Phoneme join = left.join(right);
+                    if (!join.getLanguages().isEmpty()) {
+                        newPhonemes.add(join);
+                    }
+                }
+            }
+
+            return new PhonemeBuilder(newPhonemes);
+        }
+
+        public String makeString() {
+            List<String> sorted = new ArrayList<String>();
+
+            for (Rule.Phoneme ph : this.phonemes) {
+                sorted.add(ph.getPhonemeText());
+            }
+
+            Collections.sort(sorted);
+            StringBuilder sb = new StringBuilder();
+
+            for (String ph : sorted) {
+                if (sb.length() > 0)
+                    sb.append("|");
+                sb.append(ph);
+            }
+
+            return sb.toString();
+        }
+
+        public PhonemeBuilder append(String str) {
+            Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
+
+            for (Rule.Phoneme ph : this.phonemes) {
+                newPhonemes.add(ph.append(str));
+            }
+
+            return new PhonemeBuilder(newPhonemes);
+        }
+    }
 }

Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java (original)
+++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java Thu Jul 28 03:45:14 2011
@@ -19,6 +19,7 @@ package org.apache.commons.codec.languag
 
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.EnumMap;
 import java.util.HashMap;
@@ -27,7 +28,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
-import java.util.Stack;
 import java.util.regex.Pattern;
 
 /**
@@ -150,9 +150,9 @@ public class Rule {
      *            the set of languages to consider
      * @return a list of Rules that apply
      */
-    public static List<Rule> instance(NameType nameType, RuleType rt, Set<String>
langs) {
-        if (langs.size() == 1) {
-            return instance(nameType, rt, langs.iterator().next());
+    public static List<Rule> instance(NameType nameType, RuleType rt, Languages.LanguageSet
langs) {
+        if (langs.isSingleton()) {
+            return instance(nameType, rt, langs.getAny());
         } else {
             return instance(nameType, rt, Languages.ANY);
         }
@@ -225,17 +225,16 @@ public class Rule {
                         if (parts.length != 4) {
                             System.err.println("Warning: malformed rule statement split into
" + parts.length + " parts: " + rawLine);
                         } else {
-                            String pat = stripQuotes(parts[0]);
-                            String lCon = stripQuotes(parts[1]);
-                            String rCon = stripQuotes(parts[2]);
-                            String ph = stripQuotes(parts[3]);
                             try {
-                                validatePhenome(ph);
+                                String pat = stripQuotes(parts[0]);
+                                String lCon = stripQuotes(parts[1]);
+                                String rCon = stripQuotes(parts[2]);
+                                PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
+                                Rule r = new Rule(pat, lCon, rCon, ph);
+                                lines.add(r);
                             } catch (IllegalArgumentException e) {
                                 throw new IllegalStateException("Problem parsing line " +
currentLine, e);
                             }
-                            Rule r = new Rule(pat, lCon, rCon, ph, Collections.<String>
emptySet(), ""); // guessing last 2 parameters
-                            lines.add(r);
                         }
                     }
                 }
@@ -257,49 +256,48 @@ public class Rule {
         return str;
     }
 
-    private static void validatePhenome(CharSequence ph) {
-        Stack<Character> stack = new Stack<Character>();
-        for (int i = 0; i < ph.length(); i++) {
-            switch (ph.charAt(i)) {
-            case '(':
-                stack.push('(');
-                break;
-            case '[':
-                stack.push('[');
-                break;
-            case ')': {
-                if (stack.isEmpty())
-                    throw new IllegalArgumentException("Closing ')' at " + i + " without
an opening '('" + " in " + ph);
-                char c = stack.pop();
-                if (c != '(')
-                    throw new IllegalArgumentException("Closing ')' does not pair with opening
'" + c + "' at " + i + " in " + ph);
-                break;
+    private static PhonemeExpr parsePhonemeExpr(String ph) {
+        if (ph.startsWith("(")) { // we have a bracketed list of options
+            if (!ph.endsWith(")")) {
+                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
             }
-            case ']': {
-                if (stack.isEmpty())
-                    throw new IllegalArgumentException("Closing ']' at " + i + " without
an opening '['" + " in " + ph);
-                char c = stack.pop();
-                if (c != '[')
-                    throw new IllegalArgumentException("Closing ']' does not pair with opening
'" + c + "' at " + i + " in " + ph);
-                break;
+
+            List<Phoneme> phs = new ArrayList<Phoneme>();
+            String body = ph.substring(1, ph.length() - 1);
+            for (String part : body.split("[|]")) {
+                phs.add(parsePhoneme(part));
             }
-            default:
-                break;
+            if (body.startsWith("|") || body.endsWith("|")) {
+                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
             }
+
+            return new PhonemeList(phs);
+        } else {
+            return parsePhoneme(ph);
         }
-        if (!stack.isEmpty())
-            throw new IllegalArgumentException("Bracket(s) opened without corresponding closes:
" + stack + " in " + ph);
     }
 
-    private final Set<String> languages;
+    private static Phoneme parsePhoneme(String ph) {
+        int open = ph.indexOf("[");
+        if (open >= 0) {
+            if (!ph.endsWith("]")) {
+                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
+            }
+            String before = ph.substring(0, open);
+            String in = ph.substring(open + 1, ph.length() - 1);
+            Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
 
-    private final Pattern lContext;
+            return new Phoneme(before, Languages.LanguageSet.from(langs));
+        } else {
+            return new Phoneme(ph, Languages.ANY_LANGUAGE);
+        }
+    }
 
-    private final String logical;
+    private final Pattern lContext;
 
     private final String pattern;
 
-    private final String phoneme;
+    private final PhonemeExpr phoneme;
 
     private final Pattern rContext;
 
@@ -314,27 +312,12 @@ public class Rule {
      *            the right context
      * @param phoneme
      *            the resulting phoneme
-     * @param languages
-     *            the required languages
-     * @param logical
-     *            flag to indicate if all or only some languages must be in scope
      */
-    public Rule(String pattern, String lContext, String rContext, String phoneme, Set<String> languages, String logical) {
+    public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
         this.pattern = pattern;
         this.lContext = Pattern.compile(lContext + "$");
         this.rContext = Pattern.compile("^" + rContext + ".*");
         this.phoneme = phoneme;
-        this.languages = languages;
-        this.logical = logical;
-    }
-
-    /**
-     * Gets the languages that must be in scope. Not all rules apply in every language.
-     * 
-     * @return a Set of Strings giving the relevant languages
-     */
-    public Set<String> getLanguages() {
-        return this.languages;
     }
 
     /**
@@ -347,16 +330,6 @@ public class Rule {
     }
 
     /**
-     * Gets the logical combinator for the languages. ALL means all languages must be in scope for the rule to apply. Any other value means
-     * that any one language must be in scope for the rule to apply.
-     * 
-     * @return the logical combinator String
-     */
-    public String getLogical() {
-        return this.logical;
-    }
-
-    /**
      * Gets the pattern. This is a string-literal that must exactly match.
      * 
      * @return the pattern
@@ -370,7 +343,7 @@ public class Rule {
      * 
      * @return the phoneme
      */
-    public String getPhoneme() {
+    public PhonemeExpr getPhoneme() {
         return this.phoneme;
     }
 
@@ -383,26 +356,26 @@ public class Rule {
         return this.rContext;
     }
 
-    /**
-     * Decides if the language restriction for this rule applies.
-     * 
-     * @param languageArg
-     *            a Set of Strings giving the names of the languages in scope
-     * @return true if these satistfy the language and logical restrictions on this rule, false otherwise
-     */
-    public boolean languageMatches(Set<String> languageArg) {
-        if (!languageArg.contains(Languages.ANY) && !this.languages.isEmpty()) {
-            if (ALL.equals(this.logical) && !languageArg.containsAll(this.languages)) {
-                return false;
-            } else {
-                Set<String> isect = new HashSet<String>(languageArg);
-                isect.retainAll(this.languages);
-                return !isect.isEmpty();
-            }
-        } else {
-            return true;
-        }
-    }
+    // /**
+    // * Decides if the language restriction for this rule applies.
+    // *
+    // * @param languageArg
+    // * a Set of Strings giving the names of the languages in scope
+    // * @return true if these satistfy the language and logical restrictions on this rule, false otherwise
+    // */
+    // public boolean languageMatches(Set<String> languageArg) {
+    // if (!languageArg.contains(Languages.ANY) && !this.languages.isEmpty()) {
+    // if (ALL.equals(this.logical) && !languageArg.containsAll(this.languages)) {
+    // return false;
+    // } else {
+    // Set<String> isect = new HashSet<String>(languageArg);
+    // isect.retainAll(this.languages);
+    // return !isect.isEmpty();
+    // }
+    // } else {
+    // return true;
+    // }
+    // }
 
     /**
      * Decides if the pattern and context match the input starting at a position.
@@ -432,4 +405,49 @@ public class Rule {
         return patternMatches && rContextMatches && lContextMatches;
     }
 
+    public interface PhonemeExpr {
+        Iterable<Phoneme> getPhonemes();
+    }
+
+    public static class Phoneme implements PhonemeExpr {
+        private final String phonemeText;
+        private final Languages.LanguageSet languages;
+
+        public Phoneme(String phonemeText, Languages.LanguageSet languages) {
+            this.phonemeText = phonemeText;
+            this.languages = languages;
+        }
+
+        public String getPhonemeText() {
+            return this.phonemeText;
+        }
+
+        public Languages.LanguageSet getLanguages() {
+            return this.languages;
+        }
+
+        public Iterable<Phoneme> getPhonemes() {
+            return Collections.singleton(this);
+        }
+
+        public Phoneme join(Phoneme right) {
+            return new Phoneme(this.phonemeText + right.phonemeText, this.languages.restrictTo(right.languages));
+        }
+
+        public Phoneme append(String str) {
+            return new Phoneme(this.phonemeText + str, this.languages);
+        }
+    }
+
+    public static class PhonemeList implements PhonemeExpr {
+        private final List<Phoneme> phonemes;
+
+        public PhonemeList(List<Phoneme> phonemes) {
+            this.phonemes = phonemes;
+        }
+
+        public List<Phoneme> getPhonemes() {
+            return this.phonemes;
+        }
+    }
 }

Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original)
+++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Thu Jul 28 03:45:14 2011
@@ -19,13 +19,11 @@ package org.apache.commons.codec.languag
 
 import static org.junit.Assert.assertEquals;
 
-import java.util.Collections;
 import java.util.Random;
 
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;
 import org.apache.commons.codec.StringEncoderAbstractTest;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -69,7 +67,6 @@ public class BeiderMorseEncoderTest exte
         Languages.instance("thereIsNoSuchLanguage");
     }
 
-    @Ignore
     @Test(timeout = 10000L)
     public void testLongestEnglishSurname() throws EncoderException {
         BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
@@ -80,7 +77,7 @@ public class BeiderMorseEncoderTest exte
 
     @Test(expected = IndexOutOfBoundsException.class)
     public void testNegativeIndexForRuleMatchIndexOutOfBoundsException() {
-        Rule r = new Rule("a", "", "", "", Collections.<String> emptySet(), "bob");
+        Rule r = new Rule("a", "", "", new Rule.Phoneme("", Languages.ANY_LANGUAGE));
         r.patternAndContextMatches("bob", -1);
     }
 
@@ -111,7 +108,6 @@ public class BeiderMorseEncoderTest exte
         bmpm.setRuleType(RuleType.RULES);
     }
 
-    @Ignore
     @Test(timeout = 10000L)
     public void testSpeedCheck() throws EncoderException {
         char[] chars = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u' };
@@ -122,12 +118,12 @@ public class BeiderMorseEncoderTest exte
         Random rand = new Random();
         stringBuffer.append(chars[rand.nextInt(chars.length)]);
         long start;
-        for (int i = 0; i < 20; i++) {
+        for (int i = 0; i < 40; i++) {
             start = System.currentTimeMillis();
-            System.out.println(i + " String to encode:" + stringBuffer.toString());
+            // System.out.println(i + " String to encode:" + stringBuffer.toString());
             bmpm.encode(stringBuffer.toString());
             stringBuffer.append(chars[rand.nextInt(chars.length)]);
-            System.out.println(i + " Elapsed time in ms:" + (System.currentTimeMillis() - start));
+            // System.out.println(i + " Elapsed time in ms:" + (System.currentTimeMillis() - start));
         }
     }
 }

Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java (original)
+++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java Thu Jul 28 03:45:14 2011
@@ -17,12 +17,10 @@
 
 package org.apache.commons.codec.language.bm;
 
-import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.util.Arrays;
 import java.util.List;
-import java.util.Set;
 
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -67,17 +65,10 @@ public class LanguageGuessingTest {
 
     @Test
     public void testLanguageGuessing() {
-        Set<String> guesses = this.lang.guessLanguages(this.name);
-        String guess = this.lang.guessLanguage(this.name);
+        Languages.LanguageSet guesses = this.lang.guessLanguages(this.name);
 
         assertTrue("language predicted for name '" + this.name + "' is wrong: " + guesses + " should contain '" + this.language + "'",
                 guesses.contains(this.language));
 
-        if (this.exactness.equals(EXACT)) {
-            assertEquals("language predicted for name '" + this.name + "' is wrong", this.language, guess);
-        } else {
-            // System.out.println("warning: test case that maps to multiple languages: '" +
-            // name + "':" + language + " ~> " + guesses);
-        }
     }
 }

Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original)
+++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Thu Jul 28 03:45:14 2011
@@ -38,16 +38,16 @@ public class PhoneticEngineTest {
     @Parameterized.Parameters
     public static List<Object[]> data() {
         return Arrays
-                .asList(new Object[] { "Renault", "rinolt|rino|rinDlt|rinalt|rinult|rinD|rina|rinu", NameType.GENERIC, RuleType.APPROX,
+                .asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX,
                         true },
                         new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true },
-                        new Object[] { "Renault", "(rinDlt)", NameType.SEPHARDIC, RuleType.APPROX, true },
-                        new Object[] { "SntJohn-Smith", "(sntjonsmit)", NameType.GENERIC, RuleType.EXACT, true },
-                        new Object[] { "d'ortley", "ortlaj|ortlej|ortlaj|ortlej-dortlaj|dortlej|dortlaj|dortlej", NameType.GENERIC,
+                        new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true },
+                        new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true },
+                        new Object[] { "d'ortley", "ortlaj|ortlaj|ortlej|ortlej-dortlaj|dortlaj|dortlej|dortlej", NameType.GENERIC,
                                 RuleType.EXACT, true },
                         new Object[] {
                                 "van helsing",
-                                "helSink|helsink|helzink|xelSink|xelsink|xelzink|HelSink|Helsink|Helzink-vanhelSink|vanhelsink|vanhelzink|vanjelSink|vanjelsink|vanjelzink|fanhelSink|fanhelsink|fanhelzink|fanjelSink|fanjelsink|fanjelzink|banhelSink|banhelsink|banhelzink|banjelSink|banjelsink|banjelzink",
+                                "elSink|elsink|helSink|helsink|helzink|xelsink-banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink",
                                 NameType.GENERIC, RuleType.EXACT, false });
     }
 
@@ -71,6 +71,8 @@ public class PhoneticEngineTest {
 
         String phoneticActual = engine.encode(this.name);
 
+        System.err.println("expecting: " + this.phoneticExpected);
+        System.err.println("actual: " + phoneticActual);
         assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
     }
 }

Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java?rev=1151715&r1=1151714&r2=1151715&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java (original)
+++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java Thu Jul 28 03:45:14 2011
@@ -17,15 +17,11 @@
 
 package org.apache.commons.codec.language.bm;
 
-import static org.junit.Assert.assertEquals;
-
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
-import org.junit.Test;
-import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
 /**
@@ -34,17 +30,19 @@ import org.junit.runners.Parameterized;
  * @author Apache Software Foundation
  * @since 2.0
  */
-@RunWith(Parameterized.class)
+//@RunWith(Parameterized.class)
 public class RuleTest {
 
     @Parameterized.Parameters
     public static List<Object[]> data() {
         return Arrays.asList(
                 new Object[] { "matching language sets with ALL",
-                        new Rule("e", "", "", "o", new HashSet<String>(Arrays.asList("english", "french")), Rule.ALL),
+                        new Rule("e", "", "", new Rule.Phoneme("o", Languages.LanguageSet.from(
+                                new HashSet<String>(Arrays.asList("english", "french"))))),
                         new HashSet<String>(Arrays.asList("english", "french")), true },
                 new Object[] { "non-matching language sets with ALL",
-                        new Rule("e", "", "", "o", new HashSet<String>(Arrays.asList("english", "french")), Rule.ALL),
+                        new Rule("e", "", "", new Rule.Phoneme("o", Languages.LanguageSet.from(
+                                new HashSet<String>(Arrays.asList("english", "french"))))),
                         new HashSet<String>(Arrays.asList("english")), false });
     }
 
@@ -60,9 +58,9 @@ public class RuleTest {
         this.expected = expected;
     }
 
-    @Test
-    public void testRuleLanguageMatches() {
-        assertEquals(this.caseName, this.expected, this.rule.languageMatches(this.langs));
-    }
+//    @Test
+//    public void testRuleLanguageMatches() {
+//        assertEquals(this.caseName, this.expected, this.rule.languageMatches(this.langs));
+//    }
 
 }



Mime
View raw message