lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [09/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic + tests. Rather than porting over the entire commons-codec library, only the language features were ported and added to this library.
Date Tue, 27 Jun 2017 20:33:54 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
new file mode 100644
index 0000000..de636f8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Sephardic
+
+// CONSONANTS
+"kh" "" "" "x" // foreign
+"ph" "" "" "f"
+
+"ç" "" "" "s"
+"x" "" "" "ks"
+"ch" "" "" "S"
+"c" "" "[eiyéèê]" "s"
+"c" "" "" "k"
+"gn" "" "" "(n|gn)"
+"g" "" "[eiy]" "Z" 
+"gue" "" "$" "k"     
+"gu" "" "[eiy]" "g" 
+//"aill" "" "e" "aj" // non Jewish
+//"ll" "" "e" "(l|j)" // non Jewish
+"que" "" "$" "k"
+"qu" "" "" "k"
+"q" "" "" "k"
+"s" "[aeiouyéèê]" "[aeiouyéèê]" "z"
+"h" "[bdgt]" "" "" // translit from Arabic
+"h" "" "$" "" // foreign
+"j" "" "" "Z"
+"w" "" "" "v"
+"ouh" "" "[aioe]" "(v|uh)"
+"ou" "" "[aeio]" "v" 
+"uo" "" "" "(vo|o)"
+"u" "" "[aeio]" "v" 
+
+// VOWELS
+"aue" "" "" "aue" 
+"eau" "" "" "o" 
+//"au" "" "" "(o|au)" // non Jewish
+"ai" "" "" "aj" // [e] is non Jewish
+"ay" "" "" "aj" // [e] is non Jewish
+"é" "" "" "e"
+"ê" "" "" "e"
+"è" "" "" "e"
+"à" "" "" "a"
+"â" "" "" "a"
+"où" "" "" "u"
+"ou" "" "" "u"
+"oi" "" "" "oj" // [ua] is non Jewish
+"ei" "" "" "ej" // [e] is non Jewish, in Ashk should be aj
+"ey" "" "" "ej" // [e] non Jewish, in Ashk should be aj
+//"eu" "" "" "(e|o)" // non Jewish
+"y" "[ou]" "" "j"
+"e" "" "$" "(e|)"
+"i" "" "[aou]" "j"
+"y" "" "[aoeu]" "j"
+"y" "" "" "i"
+
+// TRIVIAL      
+"a" "" "" "a"
+"b" "" "" "b"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"z" "" "" "z"

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
new file mode 100644
index 0000000..91cf5ba
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Sephardic
+
+"אי" "" "" "i"
+"עי" "" "" "i"
+"עו" "" "" "VV"
+"או" "" "" "VV"
+
+"ג׳" "" "" "Z"
+"ד׳" "" "" "dZ"
+
+"א" "" "" "L"
+"ב" "" "" "b"
+"ג" "" "" "g"
+"ד" "" "" "d"
+
+"ה" "^" "" "1"
+"ה" "" "$" "1"
+"ה" "" "" ""
+
+"וו" "" "" "V" 
+"וי" "" "" "WW"
+"ו" "" "" "W"
+"ז" "" "" "z"
+"ח" "" "" "X"
+"ט" "" "" "T"
+"יי" "" "" "i"
+"י" "" "" "i"
+"ך" "" "" "X"
+"כ" "^" "" "K"
+"כ" "" "" "k"
+"ל" "" "" "l"
+"ם" "" "" "m"
+"מ" "" "" "m"
+"ן" "" "" "n"
+"נ" "" "" "n"
+"ס" "" "" "s"
+"ע" "" "" "L"
+"ף" "" "" "f"
+"פ" "" "" "f"
+"ץ" "" "" "C"
+"צ" "" "" "C"
+"ק" "" "" "K"
+"ר" "" "" "r"
+"ש" "" "" "s"
+"ת" "" "" "T"   // Special for Sephardim

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
new file mode 100644
index 0000000..76cf14b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"kh" "" "" "x" // foreign
+
+"gli" "" "" "(l|gli)"
+"gn" "" "[aeou]" "(n|nj|gn)"
+"gni" "" "" "(ni|gni)"
+
+"gi" "" "[aeou]" "dZ"
+"gg" "" "[ei]" "dZ"
+"g" "" "[ei]" "dZ"
+"h" "[bdgt]" "" "g" // gh is It; others from Arabic translit
+
+"ci" "" "[aeou]" "tS"
+"ch" "" "[ei]" "k"
+"sc" "" "[ei]" "S" 
+"cc" "" "[ei]" "tS"
+"c" "" "[ei]" "tS"
+"s" "[aeiou]" "[aeiou]" "z"
+
+"i" "[aeou]" "" "j"
+"i" "" "[aeou]" "j"
+"y" "[aeou]" "" "j" // foreign
+"y" "" "[aeou]" "j" // foreign
+
+"qu" "" "" "k"    
+"uo" "" "" "(vo|o)"
+"u" "" "[aei]" "v" 
+
+"è" "" "" "e" 
+"é" "" "" "e" 
+"ò" "" "" "o"  
+"ó" "" "" "o" 
+
+// LATIN ALPHABET    
+"a" "" "" "a"
+"b" "" "" "b"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "(Z|dZ|j)" // foreign
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"q" "" "" "k"    
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"w" "" "" "v"    // foreign
+"x" "" "" "ks"    // foreign
+"y" "" "" "i"    // foreign
+"z" "" "" "(ts|dz)"

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
new file mode 100644
index 0000000..67cbd9b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"kh" "" "" "x" // foreign
+"ch" "" "" "S"
+"ss" "" "" "s"
+"sc" "" "[ei]" "s"
+"sç" "" "[aou]" "s"
+"ç" "" "" "s"
+"c" "" "[ei]" "s"
+//  "c" "" "[aou]" "(k|C)"
+
+"s" "^" "" "s"
+"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "z"
+"s" "" "[dglmnrv]" "(Z|S)" // Z is Brazil
+
+"z" "" "$" "(Z|s|S)" // s and S in Brazil
+"z" "" "[bdgv]" "(Z|z)" // Z in Brazil
+"z" "" "[ptckf]" "(s|S|z)" // s and S in Brazil
+
+"gu" "" "[eiu]" "g"    
+"gu" "" "[ao]" "gv"    
+"g" "" "[ei]" "Z"
+"qu" "" "[eiu]" "k"    
+"qu" "" "[ao]" "kv"    
+
+"uo" "" "" "(vo|o|u)"
+"u" "" "[aei]" "v" 
+
+"lh" "" "" "l"
+"nh" "" "" "nj"
+"h" "[bdgt]" "" "" // translit. from Arabic
+
+"ex" "" "[aáuiíoóeéêy]" "(ez|eS|eks)" // ez in Brazil
+"ex" "" "[cs]" "e" 
+
+"y" "[aáuiíoóeéê]" "" "j"
+"y" "" "[aeiíou]" "j"
+"m" "" "[bcdfglnprstv]" "(m|n)" // maybe to add a rule for m/n before a consonant that disappears [preceding vowel becomes nasalized]
+"m" "" "$" "(m|n)" // maybe to add a rule for final m/n that disappears [preceding vowel becomes nasalized]
+
+"ão" "" "" "(au|an|on)"
+"ãe" "" "" "(aj|an)"
+"ãi" "" "" "(aj|an)"
+"õe" "" "" "(oj|on)"
+"i" "[aáuoóeéê]" "" "j"
+"i" "" "[aeou]" "j"
+
+"â" "" "" "a"
+"à" "" "" "a"
+"á" "" "" "a"
+"ã" "" "" "(a|an|on)"
+"é" "" "" "e"
+"ê" "" "" "e"
+"í" "" "" "i"
+"ô" "" "" "o"
+"ó" "" "" "o"
+"õ" "" "" "(o|on)"
+"ú" "" "" "u"
+"ü" "" "" "u"
+
+"aue" "" "" "aue"
+
+// LATIN ALPHABET
+"a" "" "" "a"
+"b" "" "" "b"
+"c" "" "" "k"
+"d" "" "" "d"
+"e" "" "" "(e|i)"
+"f" "" "" "f"
+"g" "" "" "g"
+"h" "" "" "h"
+"i" "" "" "i"
+"j" "" "" "Z" 
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "(o|u)"
+"p" "" "" "p"
+"q" "" "" "k"    
+"r" "" "" "r"
+"s" "" "" "S"
+"t" "" "" "t"
+"u" "" "" "u"
+"v" "" "" "v"
+"w" "" "" "v"    
+"x" "" "" "(S|ks)"   
+"y" "" "" "i"   
+"z" "" "" "z"

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
new file mode 100644
index 0000000..b900e7e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//Sephardic
+
+// Includes both Spanish (Castilian) & Catalan
+
+// CONSONANTS
+"ñ" "" "" "(n|nj)"
+"ny" "" "" "nj" // Catalan
+"ç" "" "" "s" // Catalan
+
+"ig" "[aeiou]" "" "(tS|ig)" // tS is Catalan
+"ix" "[aeiou]" "" "S" // Catalan
+"tx" "" "" "tS" // Catalan
+"tj" "" "$" "tS" // Catalan
+"tj" "" "" "dZ" // Catalan
+"tg" "" "" "(tg|dZ)" // dZ is Catalan
+"ch" "" "" "(tS|dZ)" // dZ is typical for Argentina
+"bh" "" "" "b" // translit. from Arabic
+"h" "[dgt]" "" "" // translit. from Arabic
+
+"j" "" "" "(x|Z)" // Z is Catalan
+"x" "" "" "(ks|gz|S)" // ks is Spanish, all are Catalan
+
+//"ll" "" "" "(l|Z)" // Z is typical for Argentina, only Ashkenazic
+"w" "" "" "v" // foreign words
+
+"v" "^" "" "(B|v)"
+"b" "^" "" "(b|V)"
+"v" "" "" "(b|v)"
+"b" "" "" "(b|v)"
+"m" "" "[bpvf]" "(m|n)"
+
+"c" "" "[ei]" "s" 
+//  "c" "" "[aou]" "(k|C)"
+"c" "" "" "k"
+
+"z" "" "" "(z|s)" // as "c" before "e" or "i", in Spain it is like unvoiced English "th"
+
+"gu" "" "[ei]" "(g|gv)" // "gv" because "u" can actually be "ü"
+"g" "" "[ei]" "(x|g|dZ)"  // "g" only for foreign words; dZ is Catalan
+
+"qu" "" "" "k"
+"q" "" "" "k"
+
+"uo" "" "" "(vo|o)"    
+"u" "" "[aei]" "v"
+
+//  "y" "" "" "(i|j|S|Z)" // S or Z are peculiar to South America; only Ashkenazic
+"y" "" "" "(i|j)"
+
+// VOWELS
+"ü" "" "" "v"
+"á" "" "" "a"
+"é" "" "" "e"
+"í" "" "" "i"
+"ó" "" "" "o"
+"ú" "" "" "u"
+"à" "" "" "a"  // Catalan
+"è" "" "" "e" // Catalan
+"ò" "" "" "o"  // Catalan
+
+// TRIVIAL      
+"a" "" "" "a"
+"d" "" "" "d"
+"e" "" "" "e"
+"f" "" "" "f"
+"g" "" "" "g" 
+"h" "" "" "h"
+"i" "" "" "i"
+"k" "" "" "k"
+"l" "" "" "l"
+"m" "" "" "m"
+"n" "" "" "n"
+"o" "" "" "o"
+"p" "" "" "p"
+"r" "" "" "r"
+"s" "" "" "s"
+"t" "" "" "t"
+"u" "" "" "u"

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
new file mode 100644
index 0000000..1abfcd1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs
@@ -0,0 +1,131 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Caverphone 1.0 value.
+    /// <para/>
+    /// This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 1.0
+    /// algorithm:
+    /// <para/>
+    /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>
+    /// <para/>
+    /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp060902.pdf">Caverphone 1.0 specification</a>
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// <para/>
+    /// since 1.5
+    /// </summary>
+    public class Caverphone1 : AbstractCaverphone
+    {
+        /// <summary>
+        /// Padding appended to every intermediate result, and the value returned for
+        /// <c>null</c> or empty input. Its length (six) is also the length of the returned code.
+        /// </summary>
+        private static readonly string SIX_1 = "111111";
+
+        /// <summary>
+        /// Encodes the given string into a Caverphone 1.0 value.
+        /// </summary>
+        /// <param name="source">The source string; may be <c>null</c> or empty.</param>
+        /// <returns>
+        /// A six-character caverphone code for the given string, or <c>"111111"</c> when
+        /// <paramref name="source"/> is <c>null</c> or empty.
+        /// </returns>
+        public override string Encode(string source)
+        {
+            string txt = source;
+            if (string.IsNullOrEmpty(txt))
+            {
+                return SIX_1;
+            }
+
+            // 1. Convert to lowercase
+            txt = txt.ToLowerInvariant(); // LUCENENET NOTE: This doesn't work right under "en" language, but does under invariant
+
+            // 2. Remove anything that is not a-z (the text is already lowercase at this point)
+            txt = Regex.Replace(txt, "[^a-z]", "");
+
+            // 3. Handle various start options
+            // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in.
+            txt = Regex.Replace(txt, "^cough", "cou2f");
+            txt = Regex.Replace(txt, "^rough", "rou2f");
+            txt = Regex.Replace(txt, "^tough", "tou2f");
+            txt = Regex.Replace(txt, "^enough", "enou2f");
+            txt = Regex.Replace(txt, "^gn", "2n");
+
+            // End
+            txt = Regex.Replace(txt, "mb$", "m2");
+
+            // 4. Handle replacements
+            // The order of these statements matters: each one operates on the output of the previous one.
+            txt = Regex.Replace(txt, "cq", "2q");
+            txt = Regex.Replace(txt, "ci", "si");
+            txt = Regex.Replace(txt, "ce", "se");
+            txt = Regex.Replace(txt, "cy", "sy");
+            txt = Regex.Replace(txt, "tch", "2ch");
+            txt = Regex.Replace(txt, "c", "k");
+            txt = Regex.Replace(txt, "q", "k");
+            txt = Regex.Replace(txt, "x", "k");
+            txt = Regex.Replace(txt, "v", "f");
+            txt = Regex.Replace(txt, "dg", "2g");
+            txt = Regex.Replace(txt, "tio", "sio");
+            txt = Regex.Replace(txt, "tia", "sia");
+            txt = Regex.Replace(txt, "d", "t");
+            txt = Regex.Replace(txt, "ph", "fh");
+            txt = Regex.Replace(txt, "b", "p");
+            txt = Regex.Replace(txt, "sh", "s2");
+            txt = Regex.Replace(txt, "z", "s");
+            txt = Regex.Replace(txt, "^[aeiou]", "A");
+            // 3 is a temporary placeholder marking a vowel
+            txt = Regex.Replace(txt, "[aeiou]", "3");
+            txt = Regex.Replace(txt, "3gh3", "3kh3");
+            txt = Regex.Replace(txt, "gh", "22");
+            txt = Regex.Replace(txt, "g", "k");
+            // Runs of the same consonant collapse into a single upper-case letter
+            txt = Regex.Replace(txt, "s+", "S");
+            txt = Regex.Replace(txt, "t+", "T");
+            txt = Regex.Replace(txt, "p+", "P");
+            txt = Regex.Replace(txt, "k+", "K");
+            txt = Regex.Replace(txt, "f+", "F");
+            txt = Regex.Replace(txt, "m+", "M");
+            txt = Regex.Replace(txt, "n+", "N");
+            txt = Regex.Replace(txt, "w3", "W3");
+            txt = Regex.Replace(txt, "wy", "Wy"); // 1.0 only
+            txt = Regex.Replace(txt, "wh3", "Wh3");
+            txt = Regex.Replace(txt, "why", "Why"); // 1.0 only
+            txt = Regex.Replace(txt, "w", "2");
+            txt = Regex.Replace(txt, "^h", "A");
+            txt = Regex.Replace(txt, "h", "2");
+            txt = Regex.Replace(txt, "r3", "R3");
+            txt = Regex.Replace(txt, "ry", "Ry"); // 1.0 only
+            txt = Regex.Replace(txt, "r", "2");
+            txt = Regex.Replace(txt, "l3", "L3");
+            txt = Regex.Replace(txt, "ly", "Ly"); // 1.0 only
+            txt = Regex.Replace(txt, "l", "2");
+            txt = Regex.Replace(txt, "j", "y"); // 1.0 only
+            txt = Regex.Replace(txt, "y3", "Y3"); // 1.0 only
+            txt = Regex.Replace(txt, "y", "2"); // 1.0 only
+
+            // 5. Handle removals (drop the temporary 2 and 3 placeholders)
+            txt = Regex.Replace(txt, "2", "");
+            txt = Regex.Replace(txt, "3", "");
+
+            // 6. put six 1s on the end
+            txt = txt + SIX_1;
+
+            // 7. take the first six characters as the code
+            return txt.Substring(0, SIX_1.Length);
+        }
+    }
+}
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
new file mode 100644
index 0000000..cec7388
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs
@@ -0,0 +1,133 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Caverphone 2.0 value.
+    /// <para/>
+    /// This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 2.0
+    /// algorithm:
+    /// <para/>
+    /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>
+    /// <para/>
+    /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp150804.pdf">Caverphone 2.0 specification</a>
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// </summary>
+    public class Caverphone2 : AbstractCaverphone
+    {
+        /// <summary>
+        /// Padding appended to every intermediate result, and the value returned for
+        /// <c>null</c> or empty input. Its length (ten) is also the length of the returned code.
+        /// </summary>
+        private static readonly string TEN_1 = "1111111111";
+
+        /// <summary>
+        /// Encodes the given string into a Caverphone 2.0 value.
+        /// </summary>
+        /// <param name="source">The source string; may be <c>null</c> or empty.</param>
+        /// <returns>
+        /// A ten-character caverphone code for the given string, or <c>"1111111111"</c> when
+        /// <paramref name="source"/> is <c>null</c> or empty.
+        /// </returns>
+        public override string Encode(string source)
+        {
+            string txt = source;
+            if (string.IsNullOrEmpty(txt))
+            {
+                return TEN_1;
+            }
+
+            // 1. Convert to lowercase
+            // GetCultureInfo hands back a cached culture object, so no new CultureInfo is allocated per call.
+            txt = CultureInfo.GetCultureInfo("en").TextInfo.ToLower(txt);
+
+            // 2. Remove anything that is not a-z (the text is already lowercase at this point)
+            txt = Regex.Replace(txt, "[^a-z]", "");
+
+            // 2.5. Remove final e
+            txt = Regex.Replace(txt, "e$", ""); // 2.0 only
+
+            // 3. Handle various start options
+            // 2 is used as a temporary placeholder; every 2 is removed again in step 5.
+            txt = Regex.Replace(txt, "^cough", "cou2f");
+            txt = Regex.Replace(txt, "^rough", "rou2f");
+            txt = Regex.Replace(txt, "^tough", "tou2f");
+            txt = Regex.Replace(txt, "^enough", "enou2f"); // 2.0 only
+            // note the spec says ^enough here again, c+p error I assume
+            txt = Regex.Replace(txt, "^trough", "trou2f"); // 2.0 only
+            txt = Regex.Replace(txt, "^gn", "2n");
+
+            // End
+            txt = Regex.Replace(txt, "mb$", "m2");
+
+            // 4. Handle replacements
+            // The order of these statements matters: each one operates on the output of the previous one.
+            txt = Regex.Replace(txt, "cq", "2q");
+            txt = Regex.Replace(txt, "ci", "si");
+            txt = Regex.Replace(txt, "ce", "se");
+            txt = Regex.Replace(txt, "cy", "sy");
+            txt = Regex.Replace(txt, "tch", "2ch");
+            txt = Regex.Replace(txt, "c", "k");
+            txt = Regex.Replace(txt, "q", "k");
+            txt = Regex.Replace(txt, "x", "k");
+            txt = Regex.Replace(txt, "v", "f");
+            txt = Regex.Replace(txt, "dg", "2g");
+            txt = Regex.Replace(txt, "tio", "sio");
+            txt = Regex.Replace(txt, "tia", "sia");
+            txt = Regex.Replace(txt, "d", "t");
+            txt = Regex.Replace(txt, "ph", "fh");
+            txt = Regex.Replace(txt, "b", "p");
+            txt = Regex.Replace(txt, "sh", "s2");
+            txt = Regex.Replace(txt, "z", "s");
+            txt = Regex.Replace(txt, "^[aeiou]", "A");
+            // 3 is used as a temporary placeholder for a vowel; see step 5.
+            txt = Regex.Replace(txt, "[aeiou]", "3");
+            txt = Regex.Replace(txt, "j", "y"); // 2.0 only
+            txt = Regex.Replace(txt, "^y3", "Y3"); // 2.0 only
+            txt = Regex.Replace(txt, "^y", "A"); // 2.0 only
+            txt = Regex.Replace(txt, "y", "3"); // 2.0 only
+            txt = Regex.Replace(txt, "3gh3", "3kh3");
+            txt = Regex.Replace(txt, "gh", "22");
+            txt = Regex.Replace(txt, "g", "k");
+            // Runs of the same consonant collapse into a single upper-case letter
+            txt = Regex.Replace(txt, "s+", "S");
+            txt = Regex.Replace(txt, "t+", "T");
+            txt = Regex.Replace(txt, "p+", "P");
+            txt = Regex.Replace(txt, "k+", "K");
+            txt = Regex.Replace(txt, "f+", "F");
+            txt = Regex.Replace(txt, "m+", "M");
+            txt = Regex.Replace(txt, "n+", "N");
+            txt = Regex.Replace(txt, "w3", "W3");
+            txt = Regex.Replace(txt, "wh3", "Wh3");
+            txt = Regex.Replace(txt, "w$", "3"); // 2.0 only
+            txt = Regex.Replace(txt, "w", "2");
+            txt = Regex.Replace(txt, "^h", "A");
+            txt = Regex.Replace(txt, "h", "2");
+            txt = Regex.Replace(txt, "r3", "R3");
+            txt = Regex.Replace(txt, "r$", "3"); // 2.0 only
+            txt = Regex.Replace(txt, "r", "2");
+            txt = Regex.Replace(txt, "l3", "L3");
+            txt = Regex.Replace(txt, "l$", "3"); // 2.0 only
+            txt = Regex.Replace(txt, "l", "2");
+
+            // 5. Handle removals (a trailing 3 becomes A; all other 2 and 3 placeholders are dropped)
+            txt = Regex.Replace(txt, "2", "");
+            txt = Regex.Replace(txt, "3$", "A"); // 2.0 only
+            txt = Regex.Replace(txt, "3", "");
+
+            // 6. put ten 1s on the end
+            txt = txt + TEN_1;
+
+            // 7. take the first ten characters as the code
+            return txt.Substring(0, TEN_1.Length);
+        }
+    }
+}
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
new file mode 100644
index 0000000..a4824b3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs
@@ -0,0 +1,501 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Cologne Phonetic value.
+    /// </summary>
+    /// <remarks>
+    /// Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&#246;lner Phonetik</a>
+    /// (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
+    /// <para/>
+    /// The <i>K&#246;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
+    /// It is related to the well-known soundex algorithm.
+    /// <para/>
+    /// <h2>Algorithm</h2>
+    /// <list type="bullet">
+    ///     <item>
+    ///         <term>Step 1:</term>
+    ///         <description>
+    ///             After preprocessing (conversion to upper case, transcription of <a
+    ///             href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
+    ///             letters of the supplied text are replaced by their phonetic code according to the following table.
+    ///             <list type="table">
+    ///                 <listheader>
+    ///                     <term>Letter</term>
+    ///                     <term>Context</term>
+    ///                     <term>Code</term>
+    ///                 </listheader>
+    ///                 <item>
+    ///                     <term>A, E, I, J, O, U, Y</term>
+    ///                     <term></term>
+    ///                     <term>0</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>H</term>
+    ///                     <term></term>
+    ///                     <term>-</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>B</term>
+    ///                     <term></term>
+    ///                     <term>1</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>P</term>
+    ///                     <term>not before H</term>
+    ///                     <term>1</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>D, T</term>
+    ///                     <term>not before C, S, Z</term>
+    ///                     <term>2</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>F, V, W</term>
+    ///                     <term></term>
+    ///                     <term>3</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>P</term>
+    ///                     <term>before H</term>
+    ///                     <term>3</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>G, K, Q</term>
+    ///                     <term></term>
+    ///                     <term>4</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>C</term>
+    ///                     <term>at onset before A, H, K, L, O, Q, R, U, X <para>OR</para>
+    ///                     before A, H, K, O, Q, U, X except after S, Z</term>
+    ///                     <term>4</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>X</term>
+    ///                     <term>not after C, K, Q</term>
+    ///                     <term>48</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>L</term>
+    ///                     <term></term>
+    ///                     <term>5</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>M, N</term>
+    ///                     <term></term>
+    ///                     <term>6</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>R</term>
+    ///                     <term></term>
+    ///                     <term>7</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>S, Z</term>
+    ///                     <term></term>
+    ///                     <term>8</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>C</term>
+    ///                     <term>after S, Z <para>OR</para>
+    ///                     at onset except before A, H, K, L, O, Q, R, U, X <para>OR</para>
+    ///                     not before A, H, K, O, Q, U, X
+    ///                     </term>
+    ///                     <term>8</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>D, T</term>
+    ///                     <term>before C, S, Z</term>
+    ///                     <term>8</term>
+    ///                 </item>
+    ///                 <item>
+    ///                     <term>X</term>
+    ///                     <term>after C, K, Q</term>
+    ///                     <term>8</term>
+    ///                 </item>
+    ///             </list>
+    ///             <para>
+    ///                 <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
+    ///                 K&#246;lner Phonetik -- Buchstabencodes</a>)</i></small>
+    ///             </para>
+    ///             <h4>Example:</h4>
+    ///             <c>"M&#252;ller-L&#252;denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"</c>
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>Step 2:</term>
+    ///         <description>
+    ///             Collapse of all multiple consecutive code digits.
+    ///             <h4>Example:</h4>
+    ///             <c>"6005507500206880022" => "6050750206802"</c>
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>Step 3:</term>
+    ///         <description>
+    ///             Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
+    ///             if they occur after removing the "0" digits.
+    ///             <h4>Example:</h4>
+    ///             <c>"6050750206802" => "65752682"</c>
+    ///         </description>
+    ///     </item>
+    /// </list>
+    /// <para/>
+    /// This class is thread-safe.
+    /// <para/>
+    /// See: <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&#246;lner Phonetik (in German)</a>
+    /// <para/>
+    /// since 1.5
+    /// </remarks>
+    public class ColognePhonetic : IStringEncoder
+    {
+        // Predefined char arrays for better performance and less GC load
+        private static readonly char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' };
+        private static readonly char[] SCZ = new char[] { 'S', 'C', 'Z' };
+        private static readonly char[] WFPV = new char[] { 'W', 'F', 'P', 'V' };
+        private static readonly char[] GKQ = new char[] { 'G', 'K', 'Q' };
+        private static readonly char[] CKQ = new char[] { 'C', 'K', 'Q' };
+        private static readonly char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' };
+        private static readonly char[] SZ = new char[] { 'S', 'Z' };
+        private static readonly char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' };
+        private static readonly char[] TDX = new char[] { 'T', 'D', 'X' };
+
+        /// <summary>
+        /// Common base of the input and output buffers: a fixed character array plus a count of
+        /// the characters currently in use. The mutable <see cref="length"/> field makes instances
+        /// unsafe to share between threads; <see cref="ColognePhonetic.GetColognePhonetic(string)"/>
+        /// creates its own buffers on every call, so none are shared.
+        /// </summary>
+        private abstract class CologneBuffer
+        {
+            /// <summary>Backing storage; its size never changes after construction.</summary>
+            protected readonly char[] data;
+
+            /// <summary>Number of characters of <see cref="data"/> that are currently in use.</summary>
+            protected int length;
+
+            /// <summary>Wraps an existing array and treats every element of it as in use.</summary>
+            public CologneBuffer(char[] data)
+            {
+                this.data = data;
+                length = data.Length;
+            }
+
+            /// <summary>Allocates an empty buffer with room for <paramref name="buffSize"/> characters.</summary>
+            public CologneBuffer(int buffSize)
+            {
+                data = new char[buffSize];
+                length = 0;
+            }
+
+            /// <summary>Returns a copy of <paramref name="length"/> in-use characters beginning at logical offset <paramref name="start"/>.</summary>
+            protected abstract char[] CopyData(int start, int length);
+
+            /// <summary>Number of characters currently held by the buffer.</summary>
+            public virtual int Length
+            {
+                get { return length; }
+            }
+
+            /// <summary>Builds a string from all characters currently in use.</summary>
+            public override string ToString()
+            {
+                char[] snapshot = CopyData(0, length);
+                return new string(snapshot);
+            }
+        }
+
+        /// <summary>
+        /// Buffer that is filled from the left: each added character lands directly behind the
+        /// ones already stored.
+        /// </summary>
+        private class CologneOutputBuffer : CologneBuffer
+        {
+            public CologneOutputBuffer(int buffSize)
+                : base(buffSize)
+            {
+            }
+
+            /// <summary>Stores <paramref name="chr"/> at the first unused position and grows the buffer by one.</summary>
+            public void AddRight(char chr)
+            {
+                data[length] = chr;
+                length = length + 1;
+            }
+
+            /// <summary>Copies <paramref name="length"/> characters starting at array index <paramref name="start"/>.</summary>
+            protected override char[] CopyData(int start, int length)
+            {
+                char[] copy = new char[length];
+                System.Array.Copy(data, start, copy, 0, length);
+                return copy;
+            }
+        }
+
+        /// <summary>
+        /// Buffer that is consumed from the left. The characters still in use always occupy the
+        /// last <c>length</c> slots of the backing array, so the next character sits at index
+        /// <c>data.Length - length</c>.
+        /// </summary>
+        private class CologneInputBuffer : CologneBuffer
+        {
+            public CologneInputBuffer(char[] data)
+                : base(data)
+            {
+            }
+
+            /// <summary>Grows the buffer by one and writes <paramref name="ch"/> into the slot that thereby becomes the next position.</summary>
+            public virtual void AddLeft(char ch)
+            {
+                length++;
+                int slot = GetNextPos();
+                data[slot] = ch;
+            }
+
+            /// <summary>Copies <paramref name="length"/> characters, where <paramref name="start"/> is counted from the next unread position.</summary>
+            protected override char[] CopyData(int start, int length)
+            {
+                int offset = GetNextPos() + start;
+                char[] copy = new char[length];
+                System.Array.Copy(data, offset, copy, 0, length);
+                return copy;
+            }
+
+            /// <summary>Returns the next character without consuming it.</summary>
+            public virtual char GetNextChar()
+            {
+                int slot = GetNextPos();
+                return data[slot];
+            }
+
+            /// <summary>Array index of the next character to be read.</summary>
+            protected virtual int GetNextPos()
+            {
+                return data.Length - length;
+            }
+
+            /// <summary>Returns the next character and shrinks the buffer by one.</summary>
+            public virtual char RemoveNext()
+            {
+                char head = GetNextChar();
+                length--;
+                return head;
+            }
+        }
+
+        /// <summary>
+        /// Maps some Germanic characters to plain for internal processing. The following characters are mapped:
+        /// <list type="bullet">
+        ///     <item><description>capital a, umlaut mark</description></item>
+        ///     <item><description>capital u, umlaut mark</description></item>
+        ///     <item><description>capital o, umlaut mark</description></item>
+        ///     <item><description>small sharp s, German</description></item>
+        /// </list>
+        /// </summary>
+        private static readonly char[][] PREPROCESS_MAP = {
+            new char[] {'\u00C4', 'A'}, // capital a, umlaut mark
+            new char[] {'\u00DC', 'U'}, // capital u, umlaut mark
+            new char[] {'\u00D6', 'O'}, // capital o, umlaut mark
+            new char[] {'\u00DF', 'S'} // small sharp s, German
+        };
+
+        /// <summary>
+        /// Linear search: reports whether <paramref name="key"/> occurs anywhere in <paramref name="arr"/>.
+        /// </summary>
+        private static bool ArrayContains(char[] arr, char key)
+        {
+            bool found = false;
+            for (int i = 0; i < arr.Length && !found; i++)
+            {
+                found = arr[i] == key;
+            }
+            return found;
+        }
+
+        /// <summary>
+        /// <para>
+        /// Implements the <i>K&#246;lner Phonetik</i> algorithm.
+        /// </para>
+        /// <para>
+        /// In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
+        /// </para>
+        /// A <c>null</c> input yields <c>null</c>; any other input is first normalized by <see cref="Preprocess(string)"/>.
+        /// </summary>
+        /// <param name="text">The text to encode; may be <c>null</c>.</param>
+        /// <returns>The corresponding encoding according to the <i>K&#246;lner Phonetik</i> algorithm, or <c>null</c> if <paramref name="text"/> is <c>null</c>.</returns>
+        public virtual string GetColognePhonetic(string text)
+        {
+            if (text == null)
+            {
+                return null;
+            }
+
+            text = Preprocess(text);
+
+            CologneOutputBuffer output = new CologneOutputBuffer(text.Length * 2); // sized at twice the input; presumably headroom for the 'S' that the 'X' branch pushes back onto the input
+            CologneInputBuffer input = new CologneInputBuffer(text.ToCharArray());
+
+            char nextChar;
+
+            char lastChar = '-'; // '-' is used as the "no character" placeholder (also for nextChar at the end of input)
+            char lastCode = '/'; // '/' means nothing has been coded yet, i.e. we are still at the onset
+            char code;
+            char chr;
+
+            int rightLength = input.Length;
+
+            while (rightLength > 0)
+            {
+                chr = input.RemoveNext();
+
+                if ((rightLength = input.Length) > 0) // refresh the remaining count, then peek at the following character if there is one
+                {
+                    nextChar = input.GetNextChar();
+                }
+                else
+                {
+                    nextChar = '-';
+                }
+
+                if (ArrayContains(AEIJOUY, chr))
+                {
+                    code = '0';
+                }
+                else if (chr == 'H' || chr < 'A' || chr > 'Z')
+                {
+                    if (lastCode == '/')
+                    {
+                        continue; // ignored character before anything was coded: lastChar and lastCode stay untouched
+                    }
+                    code = '-'; // '-' is never written to the output (see the emit condition below)
+                }
+                else if (chr == 'B' || (chr == 'P' && nextChar != 'H'))
+                {
+                    code = '1';
+                }
+                else if ((chr == 'D' || chr == 'T') && !ArrayContains(SCZ, nextChar))
+                {
+                    code = '2';
+                }
+                else if (ArrayContains(WFPV, chr)) // a 'P' only gets here when it is followed by 'H'
+                {
+                    code = '3';
+                }
+                else if (ArrayContains(GKQ, chr))
+                {
+                    code = '4';
+                }
+                else if (chr == 'X' && !ArrayContains(CKQ, lastChar))
+                {
+                    code = '4';
+                    input.AddLeft('S'); // re-queue an 'S' so the next iteration yields the '8' of the "48" code
+                    rightLength++;
+                }
+                else if (chr == 'S' || chr == 'Z')
+                {
+                    code = '8';
+                }
+                else if (chr == 'C')
+                {
+                    if (lastCode == '/') // 'C' at the onset
+                    {
+                        if (ArrayContains(AHKLOQRUX, nextChar))
+                        {
+                            code = '4';
+                        }
+                        else
+                        {
+                            code = '8';
+                        }
+                    }
+                    else
+                    {
+                        if (ArrayContains(SZ, lastChar) || !ArrayContains(AHOUKQX, nextChar))
+                        {
+                            code = '8';
+                        }
+                        else
+                        {
+                            code = '4';
+                        }
+                    }
+                }
+                else if (ArrayContains(TDX, chr)) // only reached by D/T before S, C, Z and by X after C, K, Q
+                {
+                    code = '8';
+                }
+                else if (chr == 'R')
+                {
+                    code = '7';
+                }
+                else if (chr == 'L')
+                {
+                    code = '5';
+                }
+                else if (chr == 'M' || chr == 'N')
+                {
+                    code = '6';
+                }
+                else
+                {
+                    code = chr; // NOTE(review): appears unreachable - every character in 'A'..'Z' is matched by an earlier branch
+                }
+                // Emit unless ignored ('-'): skip a repeat of the previous code, keep '0' only at the onset, and always emit codes outside '0'..'8'.
+                if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8'))
+                {
+                    output.AddRight(code);
+                }
+
+                lastChar = chr;
+                lastCode = code;
+            }
+            return output.ToString();
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //@Override
+        //    public Object encode(final Object object) throws EncoderException
+        //{
+        //        if (!(object instanceof String)) {
+        //        throw new EncoderException("This method's parameter was expected to be of the type " +
+        //            String.class.getName() +
+        //                ". But actually it was of the type " +
+        //                object.getClass().getName() +
+        //                ".");
+        //        }
+        //        return encode((String) object);
+        //    }
+
+
+        /// <summary>
+        /// Encodes the given text; simply forwards to <see cref="GetColognePhonetic(string)"/>.
+        /// </summary>
+        /// <param name="text">The text to encode.</param>
+        /// <returns>Whatever <see cref="GetColognePhonetic(string)"/> returns for <paramref name="text"/>.</returns>
+        public virtual string Encode(string text)
+        {
+            string phoneticCode = GetColognePhonetic(text);
+            return phoneticCode;
+        }
+
+        /// <summary>
+        /// Compares the Cologne Phonetic codes of two strings.
+        /// </summary>
+        /// <param name="text1">The first text to encode; may be <c>null</c>.</param>
+        /// <param name="text2">The second text to encode; may be <c>null</c>.</param>
+        /// <returns>
+        /// <c>true</c> if both texts encode to the same code (two <c>null</c> inputs count as equal);
+        /// otherwise <c>false</c>.
+        /// </returns>
+        public virtual bool IsEncodeEqual(string text1, string text2)
+        {
+            // GetColognePhonetic returns null for null input, so calling Equals on its result
+            // would throw for a null text1. The static string.Equals is null-safe and performs
+            // the same ordinal comparison.
+            return string.Equals(GetColognePhonetic(text1), GetColognePhonetic(text2));
+        }
+
+        /// <summary>
+        /// Upper-cases the text using the "de" culture and then swaps every character listed in
+        /// <see cref="PREPROCESS_MAP"/> for its plain replacement.
+        /// </summary>
+        private string Preprocess(string text)
+        {
+            TextInfo germanCasing = new CultureInfo("de").TextInfo;
+            char[] upper = germanCasing.ToUpper(text).ToCharArray();
+
+            for (int i = 0; i < upper.Length; i++)
+            {
+                char current = upper[i];
+                if (current <= 'Z')
+                {
+                    // nothing at or below 'Z' is ever remapped
+                    continue;
+                }
+
+                foreach (char[] mapping in PREPROCESS_MAP)
+                {
+                    if (mapping[0] == current)
+                    {
+                        upper[i] = mapping[1];
+                        break;
+                    }
+                }
+            }
+
+            return new string(upper);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
new file mode 100644
index 0000000..e72bc38
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs
@@ -0,0 +1,620 @@
+// commons-codec version compatibility level: 1.10
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Reflection;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Daitch-Mokotoff Soundex value.
+    /// </summary>
+    /// <remarks>
+    /// The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater
+    /// accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling.
+    /// <para/>
+    /// The main differences compared to the other soundex variants are:
+    /// <list type="bullet">
+    ///     <item><description>coded names are 6 digits long</description></item>
+    ///     <item><description>the initial character of the name is coded</description></item>
+    ///     <item><description>rules to encode multi-character n-grams</description></item>
+    ///     <item><description>multiple possible encodings for the same name (branching)</description></item>
+    /// </list>
+    /// <para/>
+    /// This implementation supports branching, depending on the used method:
+    /// <list type="bullet">
+    ///     <item><term><see cref="Encode(string)"/></term><description>branching disabled, only the first code will be returned</description></item>
+    ///     <item><term><see cref="GetSoundex(string)"/></term><description>branching enabled, all codes will be returned, separated by '|'</description></item>
+    /// </list>
+    /// <para/>
+    /// Note: this implementation has additional branching rules compared to the original description of the algorithm. The
+    /// rules can be customized by overriding the default rules contained in the resource file
+    /// <c>Lucene.Net.Analysis.Phonetic.Language.dmrules.txt</c>.
+    /// <para/>
+    /// This class is thread-safe.
+    /// <para/>
+    /// See: <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+    /// <para/>
+    /// See: <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+    /// <para/>
+    /// since 1.10
+    /// </remarks>
+    /// <seealso cref="Soundex"/>
+    public class DaitchMokotoffSoundex : IStringEncoder
+    {
+        /// <summary>
+        /// One candidate encoding that is built up while a name is processed. Branches are
+        /// compared and hashed by the text accumulated so far.
+        /// </summary>
+        private sealed class Branch
+        {
+            private readonly StringBuilder builder;
+            private string cachedString;
+            private string lastReplacement;
+
+            internal Branch()
+            {
+                builder = new StringBuilder();
+                cachedString = null;
+                lastReplacement = null;
+            }
+
+            /// <summary>
+            /// Clones this branch: same accumulated code, same last replacement.
+            /// </summary>
+            /// <returns>A new, independent branch with identical state.</returns>
+            public Branch CreateBranch()
+            {
+                Branch copy = new Branch();
+                copy.builder.Append(ToString());
+                copy.lastReplacement = lastReplacement;
+                return copy;
+            }
+
+            /// <summary>
+            /// Two branches are equal when their accumulated codes are equal.
+            /// </summary>
+            public override bool Equals(object other)
+            {
+                if (ReferenceEquals(this, other))
+                {
+                    return true;
+                }
+
+                Branch that = other as Branch;
+                return that != null && ToString().Equals(that.ToString());
+            }
+
+            /// <summary>
+            /// Pads the code with '0' characters up to the maximum code length.
+            /// </summary>
+            public void Finish()
+            {
+                int missing = MAX_LENGTH - builder.Length;
+                if (missing > 0)
+                {
+                    builder.Append('0', missing);
+                    cachedString = null; // the builder changed, so the cached text is stale
+                }
+            }
+
+            /// <summary>
+            /// Hash of the accumulated code, consistent with <see cref="Equals(object)"/>.
+            /// </summary>
+            public override int GetHashCode()
+            {
+                string code = ToString();
+                return code.GetHashCode();
+            }
+
+            /// <summary>
+            /// Feeds the next replacement into this branch. It is appended unless the previous
+            /// replacement already ends with it, and only while the code is shorter than the maximum.
+            /// </summary>
+            /// <param name="replacement">The next replacement to append.</param>
+            /// <param name="forceAppend">When <c>true</c>, the "previous replacement ends with it" check does not suppress the append.</param>
+            public void ProcessNextReplacement(string replacement, bool forceAppend)
+            {
+                bool repeatsLast = lastReplacement != null
+                    && lastReplacement.EndsWith(replacement, StringComparison.Ordinal);
+                bool append = forceAppend || !repeatsLast;
+
+                if (append && builder.Length < MAX_LENGTH)
+                {
+                    builder.Append(replacement);
+                    if (builder.Length > MAX_LENGTH)
+                    {
+                        // cut off everything beyond the maximum code length
+                        builder.Length = MAX_LENGTH;
+                    }
+                    cachedString = null;
+                }
+
+                lastReplacement = replacement;
+            }
+
+            /// <summary>
+            /// Returns the accumulated code; the string is cached until the builder changes again.
+            /// </summary>
+            public override string ToString()
+            {
+                return cachedString ?? (cachedString = builder.ToString());
+            }
+        }
+
+        /// <summary>
+        /// A single transformation rule: a literal pattern plus the replacement alternatives to
+        /// use at the start, before a vowel, and in every other position. Alternatives are given
+        /// as one string separated by '|'.
+        /// </summary>
+        private sealed class Rule
+        {
+            private readonly string pattern;
+            private readonly string[] replacementAtStart;
+            private readonly string[] replacementBeforeVowel;
+            private readonly string[] replacementDefault;
+
+            internal Rule(string pattern, string replacementAtStart, string replacementBeforeVowel,
+                    string replacementDefault)
+            {
+                this.pattern = pattern;
+                this.replacementAtStart = SplitAlternatives(replacementAtStart);
+                this.replacementBeforeVowel = SplitAlternatives(replacementBeforeVowel);
+                this.replacementDefault = SplitAlternatives(replacementDefault);
+            }
+
+            /// <summary>Splits a '|'-separated replacement string into its alternatives.</summary>
+            private static string[] SplitAlternatives(string replacements)
+            {
+                return Regex.Split(replacements, "\\|");
+            }
+
+            // LUCENENET specific - need read access to pattern
+            public string Pattern
+            {
+                get { return pattern; }
+            }
+
+            /// <summary>Number of characters in the pattern.</summary>
+            public int PatternLength
+            {
+                get { return pattern.Length; }
+            }
+
+            /// <summary>
+            /// Picks the replacement alternatives that apply to <paramref name="context"/>.
+            /// </summary>
+            /// <param name="context">The text being examined; the character at index <see cref="PatternLength"/> decides the vowel case.</param>
+            /// <param name="atStart">When <c>true</c>, the at-start alternatives are returned unconditionally.</param>
+            /// <returns>The at-start, before-vowel or default alternatives.</returns>
+            public string[] GetReplacements(string context, bool atStart)
+            {
+                if (atStart)
+                {
+                    return replacementAtStart;
+                }
+
+                int followingIndex = PatternLength;
+                if (followingIndex < context.Length && IsVowel(context[followingIndex]))
+                {
+                    return replacementBeforeVowel;
+                }
+
+                return replacementDefault;
+            }
+
+            /// <summary>True for the lower-case letters a, e, i, o and u only.</summary>
+            private bool IsVowel(char ch)
+            {
+                switch (ch)
+                {
+                    case 'a':
+                    case 'e':
+                    case 'i':
+                    case 'o':
+                    case 'u':
+                        return true;
+                    default:
+                        return false;
+                }
+            }
+
+            /// <summary>Reports whether <paramref name="context"/> begins with the pattern (ordinal comparison).</summary>
+            public bool Matches(string context)
+            {
+                return context.StartsWith(pattern, StringComparison.Ordinal);
+            }
+
+            public override string ToString()
+            {
+                string atStart = Collections.ToString(replacementAtStart);
+                string beforeVowel = Collections.ToString(replacementBeforeVowel);
+                string otherwise = Collections.ToString(replacementDefault);
+                return string.Format("{0}=({1},{2},{3})", pattern, atStart, beforeVowel, otherwise);
+            }
+        }
+
+        private static readonly string COMMENT = "//";
+        private static readonly string DOUBLE_QUOTE = "\"";
+
+        private static readonly string MULTILINE_COMMENT_END = "*/";
+
+        private static readonly string MULTILINE_COMMENT_START = "/*";
+
+        /// <summary>The resource file containing the replacement and folding rules</summary>
+        private static readonly string RESOURCE_FILE = "dmrules.txt";
+
+        /// <summary>The code length of a DM soundex value.</summary>
+        private static readonly int MAX_LENGTH = 6;
+
+        /// <summary>Transformation rules indexed by the first character of their pattern.</summary>
+        private static readonly IDictionary<char, IList<Rule>> RULES = new Dictionary<char, IList<Rule>>();
+
+        /// <summary>Folding rules.</summary>
+        private static readonly IDictionary<char, char> FOLDINGS = new Dictionary<char, char>();
+
+        /// <summary>
+        /// Orders rules by pattern length, longest first: the result is positive when
+        /// <c>rule2</c> has the longer pattern.
+        /// </summary>
+        private class DaitchMokotoffRuleComparer : IComparer<Rule>
+        {
+            public int Compare(Rule rule1, Rule rule2)
+            {
+                int secondLength = rule2.PatternLength;
+                int firstLength = rule1.PatternLength;
+                return secondLength - firstLength;
+            }
+        }
+
+        /// <summary>
+        /// Loads the rules and foldings from the embedded resource named by <c>RESOURCE_FILE</c>
+        /// and then orders every rule list so that longer patterns come first.
+        /// </summary>
+        static DaitchMokotoffSoundex()
+        {
+            Type owner = typeof(DaitchMokotoffSoundex);
+            Stream rulesIS = owner.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(owner, RESOURCE_FILE);
+            if (rulesIS == null)
+            {
+                throw new ArgumentException("Unable to load resource: " + RESOURCE_FILE);
+            }
+
+            using (TextReader scanner = new StreamReader(rulesIS, Encoding.UTF8))
+            {
+                ParseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+            }
+
+            // sort each rule list by pattern length, descending
+            DaitchMokotoffRuleComparer longestPatternFirst = new DaitchMokotoffRuleComparer();
+            foreach (IList<Rule> ruleList in RULES.Values)
+            {
+                ruleList.Sort(longestPatternFirst);
+            }
+        }
+
+        /// <summary>
+        /// Reads rule and folding definitions line by line from <paramref name="scanner"/> and
+        /// stores them in the supplied dictionaries.
+        /// <para/>
+        /// Multi-line comments, trailing single-line comments and blank lines are ignored. A line
+        /// containing <c>=</c> is treated as a single-character folding (<c>x=y</c>); any other
+        /// line must consist of exactly four whitespace-separated, optionally quoted parts which
+        /// are turned into a <see cref="Rule"/>.
+        /// </summary>
+        /// <param name="scanner">Reader providing the definitions; consumed until <c>ReadLine()</c> returns <c>null</c>.</param>
+        /// <param name="location">Name of the source being parsed; only used in error messages.</param>
+        /// <param name="ruleMapping">Receives the parsed rules, grouped by the first character of their pattern.</param>
+        /// <param name="asciiFoldings">Receives the parsed character foldings.</param>
+        /// <exception cref="ArgumentException">If a folding or rule line does not have the expected shape.</exception>
+        /// <exception cref="InvalidOperationException">If creating a rule from a well-shaped line fails.</exception>
+        private static void ParseRules(TextReader scanner, string location,
+            IDictionary<char, IList<Rule>> ruleMapping, IDictionary<char, char> asciiFoldings)
+        {
+            int currentLine = 0;
+            bool inMultilineComment = false;
+
+            string rawLine;
+            while ((rawLine = scanner.ReadLine()) != null)
+            {
+                currentLine++;
+                string line = rawLine;
+
+                if (inMultilineComment)
+                {
+                    if (line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal))
+                    {
+                        inMultilineComment = false;
+                    }
+                    continue;
+                }
+
+                if (line.StartsWith(MULTILINE_COMMENT_START, StringComparison.Ordinal))
+                {
+                    // A block comment that is opened and closed on the same line must not
+                    // swallow the lines that follow it. The length check keeps the start and
+                    // end markers from overlapping (e.g. a line consisting of "/*/").
+                    bool closedOnSameLine =
+                        line.Length >= MULTILINE_COMMENT_START.Length + MULTILINE_COMMENT_END.Length
+                        && line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal);
+                    inMultilineComment = !closedOnSameLine;
+                    continue;
+                }
+
+                // discard comments; the marker is a syntactic token, so search ordinally
+                // instead of using the culture-sensitive default of IndexOf(string)
+                int cmtI = line.IndexOf(COMMENT, StringComparison.Ordinal);
+                if (cmtI >= 0)
+                {
+                    line = line.Substring(0, cmtI);
+                }
+
+                // trim leading-trailing whitespace
+                line = line.Trim();
+
+                if (line.Length == 0)
+                {
+                    continue; // empty lines can be safely skipped
+                }
+
+                if (line.Contains("="))
+                {
+                    // folding
+                    string[] foldingParts = line.Split(new string[] { "=" }, StringSplitOptions.RemoveEmptyEntries);
+                    if (foldingParts.Length != 2)
+                    {
+                        throw new ArgumentException("Malformed folding statement split into " + foldingParts.Length +
+                                " parts: " + rawLine + " in " + location);
+                    }
+
+                    string leftCharacter = foldingParts[0];
+                    string rightCharacter = foldingParts[1];
+
+                    if (leftCharacter.Length != 1 || rightCharacter.Length != 1)
+                    {
+                        throw new ArgumentException("Malformed folding statement - " +
+                                "patterns are not single characters: " + rawLine + " in " + location);
+                    }
+
+                    asciiFoldings[leftCharacter[0]] = rightCharacter[0];
+                    continue;
+                }
+
+                // rule
+                string[] ruleParts = Regex.Split(line, "\\s+");
+                if (ruleParts.Length != 4)
+                {
+                    throw new ArgumentException("Malformed rule statement split into " + ruleParts.Length +
+                            " parts: " + rawLine + " in " + location);
+                }
+
+                try
+                {
+                    string pattern = StripQuotes(ruleParts[0]);
+                    string replacement1 = StripQuotes(ruleParts[1]);
+                    string replacement2 = StripQuotes(ruleParts[2]);
+                    string replacement3 = StripQuotes(ruleParts[3]);
+
+                    Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+                    if (string.IsNullOrEmpty(r.Pattern))
+                    {
+                        // Without this guard the indexer below would fail with an index error
+                        // that bypasses the catch block and loses the line/location context.
+                        throw new ArgumentException("Rule pattern must not be empty");
+                    }
+
+                    char patternKey = r.Pattern[0];
+                    IList<Rule> rules;
+                    if (!ruleMapping.TryGetValue(patternKey, out rules) || rules == null)
+                    {
+                        rules = new List<Rule>();
+                        ruleMapping[patternKey] = rules;
+                    }
+                    rules.Add(r);
+                }
+                catch (ArgumentException e)
+                {
+                    throw new InvalidOperationException(
+                            "Problem parsing line '" + currentLine + "' in " + location, e);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Removes one leading and one trailing <c>DOUBLE_QUOTE</c> from <paramref name="str"/>, if present.
+        /// </summary>
+        /// <param name="str">The possibly quoted token.</param>
+        /// <returns>The token without its surrounding quote characters.</returns>
+        private static string StripQuotes(string str)
+        {
+            // Drop the opening quote first so that a lone quote character ends up as the empty string.
+            string withoutLeading = str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+                ? str.Substring(1)
+                : str;
+
+            return withoutLeading.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal)
+                ? withoutLeading.Substring(0, withoutLeading.Length - 1)
+                : withoutLeading;
+        }
+
+        /// <summary>
+        /// <c>true</c> if input characters are ASCII-folded before they are encoded.
+        /// </summary>
+        private readonly bool folding;
+
+        /// <summary>
+        /// Initializes an encoder that has ASCII-folding switched on.
+        /// </summary>
+        public DaitchMokotoffSoundex() : this(true) { }
+
+        /// <summary>
+        /// Initializes an encoder, letting the caller decide whether ASCII-folding is applied.
+        /// <para/>
+        /// When folding is on, certain accented characters are replaced by an equivalent ASCII
+        /// character before encoding, e.g. è -&gt; e.
+        /// </summary>
+        /// <param name="folding"><c>true</c> to perform ASCII-folding before encoding; <c>false</c> to encode the input as is.</param>
+        public DaitchMokotoffSoundex(bool folding)
+        {
+            this.folding = folding;
+        }
+
+        /// <summary>
+        /// Performs a cleanup of the input string before the actual soundex transformation.
+        /// <para/>
+        /// Removes all whitespace characters, lower-cases the remaining characters using the
+        /// invariant culture and performs ASCII folding if enabled.
+        /// </summary>
+        /// <param name="input">The input string to cleanup.</param>
+        /// <returns>A cleaned up string.</returns>
+        private string Cleanup(string input)
+        {
+            // The result can never be longer than the input, so size the buffer up front.
+            StringBuilder sb = new StringBuilder(input.Length);
+            foreach (char c in input)
+            {
+                if (char.IsWhiteSpace(c))
+                {
+                    continue;
+                }
+
+                char ch = char.ToLowerInvariant(c);
+
+                // Single lookup instead of ContainsKey followed by the indexer.
+                char folded;
+                if (folding && FOLDINGS.TryGetValue(ch, out folded))
+                {
+                    ch = folded;
+                }
+                sb.Append(ch);
+            }
+            return sb.ToString();
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //**
+        // * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+        // * <p>
+        // * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
+        // * EncoderException if the supplied object is not of type java.lang.String.
+        // * </p>
+        // *
+        // * @see #soundex(String)
+        // *
+        // * @param obj
+        // *            Object to encode
+        // * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+        // *         supplied.
+        // * @throws EncoderException
+        // *             if the parameter supplied is not of type java.lang.String
+        // * @throws IllegalArgumentException
+        // *             if a character is not mapped
+        // */
+        //@Override
+        //    public Object encode(object obj) 
+        //{
+        //        if (!(obj instanceof String)) {
+        //        throw new EncoderException(
+        //                "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+        //    }
+        //        return encode((String) obj);
+        //}
+
+        /// <summary>
+        /// Computes the Daitch-Mokotoff soundex code of a string with branching disabled,
+        /// so exactly one code is produced.
+        /// </summary>
+        /// <param name="source">The string to encode; may be <c>null</c>.</param>
+        /// <returns>The DM Soundex code for <paramref name="source"/>, or <c>null</c> if <paramref name="source"/> is <c>null</c>.</returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        /// <seealso cref="GetSoundex(string)"/>
+        public virtual string Encode(string source)
+        {
+            // Only the first element of the non-branching result is returned.
+            return source == null ? null : GetSoundex(source, false)[0];
+        }
+
+        /// <summary>
+        /// Encodes a string using the Daitch-Mokotoff soundex algorithm with branching.
+        /// <para/>
+        /// In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+        /// separated by '|'.
+        /// <para/>
+        /// Example: the name "AUERBACH" is encoded as both
+        /// <list type="bullet">
+        ///     <item><description>097400</description></item>
+        ///     <item><description>097500</description></item>
+        /// </list>
+        /// <para/>
+        /// Thus the result will be "097400|097500".
+        /// </summary>
+        /// <param name="source">A string to encode; may be <c>null</c>.</param>
+        /// <returns>
+        /// A string containing a set of DM Soundex codes corresponding to the string supplied,
+        /// or <c>null</c> if <paramref name="source"/> is <c>null</c>.
+        /// </returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        public virtual string GetSoundex(string source)
+        {
+            string[] branches = GetSoundex(source, true);
+
+            // The private overload returns null for null input. Pass that through, as
+            // Encode(string) does, instead of failing while iterating a null array.
+            if (branches == null)
+            {
+                return null;
+            }
+
+            return string.Join("|", branches);
+        }
+
+        /// <summary>
+        /// Perform the actual DM Soundex algorithm on the input string.
+        /// </summary>
+        /// <remarks>
+        /// The cleaned-up input is scanned from left to right. For every character that has an entry
+        /// in <c>RULES</c>, the first rule in that entry's list that matches the remaining input is applied
+        /// to all current branches. Characters without an entry are skipped.
+        /// </remarks>
+        /// <param name="source">A string to encode.</param>
+        /// <param name="branching">If branching shall be performed.</param>
+        /// <returns>
+        /// A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode,
+        /// or <c>null</c> if <paramref name="source"/> is <c>null</c>.
+        /// </returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        private string[] GetSoundex(string source, bool branching)
+        {
+            if (source == null)
+            {
+                return null;
+            }
+
+            string input = Cleanup(source);
+
+            // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
+            // The encoding starts out with one empty branch.
+            IList<Branch> currentBranches = new List<Branch>();
+            currentBranches.Add(new Branch());
+
+            // '\0' means that no character with rules has been handled yet.
+            char lastChar = '\0';
+            for (int index = 0; index < input.Length; index++)
+            {
+                char ch = input[index];
+
+                // ignore whitespace inside a name
+                if (char.IsWhiteSpace(ch))
+                {
+                    continue;
+                }
+
+                // The rules are matched against the rest of the input, starting at the current character.
+                string inputContext = input.Substring(index);
+                IList<Rule> rules;
+                if (!RULES.TryGetValue(ch, out rules) || rules == null)
+                {
+                    // No rules for this character: skip it. Note that lastChar is left unchanged here.
+                    continue;
+                }
+
+                // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+                // (in non-branching mode nextBranches is never modified below)
+                IList<Branch> nextBranches = branching ? new List<Branch>() : Collections.EmptyList<Branch>();
+
+                foreach (Rule rule in rules)
+                {
+                    if (rule.Matches(inputContext))
+                    {
+                        if (branching)
+                        {
+                            nextBranches.Clear();
+                        }
+                        // The second argument is true only while lastChar still holds its initial value
+                        // NOTE(review): presumably this selects the "start of word" replacement - confirm against Rule.GetReplacements
+                        string[] replacements = rule.GetReplacements(inputContext, lastChar == '\0');
+                        bool branchingRequired = replacements.Length > 1 && branching;
+
+                        foreach (Branch branch in currentBranches)
+                        {
+                            foreach (string nextReplacement in replacements)
+                            {
+                                // if we have multiple replacements, always create a new branch
+                                Branch nextBranch = branchingRequired ? branch.CreateBranch() : branch;
+
+                                // special rule: occurrences of mn or nm are treated differently
+                                bool force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+
+                                nextBranch.ProcessNextReplacement(nextReplacement, force);
+
+                                if (branching)
+                                {
+                                    // Emulates set semantics on a list: a branch is only added once.
+                                    // NOTE(review): relies on Branch equality, which is not visible here - confirm
+                                    if (!nextBranches.Contains(nextBranch))
+                                    {
+                                        nextBranches.Add(nextBranch);
+                                    }
+                                }
+                                else
+                                {
+                                    // Without branching only the first replacement is applied to the branch.
+                                    break;
+                                }
+                            }
+                        }
+
+                        if (branching)
+                        {
+                            // The branches produced by this rule become the starting point for the next character.
+                            currentBranches.Clear();
+                            currentBranches.AddRange(nextBranches);
+                        }
+                        // Skip the remaining characters of the matched pattern; the loop increment accounts for the "- 1".
+                        index += rule.PatternLength - 1;
+                        // Only the first matching rule in the list is applied.
+                        break;
+                    }
+                }
+
+                // Remember the character that started this step (the first character of a matched pattern).
+                lastChar = ch;
+            }
+
+            // Finalize every branch and collect its string form, preserving branch order.
+            string[] result = new string[currentBranches.Count];
+            int idx = 0;
+            foreach (Branch branch in currentBranches)
+            {
+                branch.Finish();
+                result[idx++] = branch.ToString();
+            }
+
+            return result;
+        }
+    }
+}


Mime
View raw message