lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [07/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic + tests. Rather than porting over the entire commons-codec library, only the language features were ported and added to this library.
Date Tue, 27 Jun 2017 20:33:52 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
new file mode 100644
index 0000000..a80d4f4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
@@ -0,0 +1,370 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
+    /// general purpose scheme to find words with similar phonemes.
+    /// </summary>
+    /// <remarks>
+    /// NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
+    /// <para/>
+    /// Algorithm description:
+    /// <list type="number">
+    ///     <item>
+    ///         <term>Transcode first characters of name</term>
+    ///         <description>
+    ///             <list type="number">
+    ///                 <item><description>MAC ->   MCC</description></item>
+    ///                 <item><description>KN  ->   NN</description></item>
+    ///                 <item><description>K   ->   C</description></item>
+    ///                 <item><description>PH  ->   FF</description></item>
+    ///                 <item><description>PF  ->   FF</description></item>
+    ///                 <item><description>SCH ->   SSS</description></item>
+    ///             </list>
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>Transcode last characters of name</term>
+    ///         <description>
+    ///             <list type="number">
+    ///                 <item><description>EE, IE          ->   Y</description></item>
+    ///                 <item><description>DT,RT,RD,NT,ND  ->   D</description></item>
+    ///             </list>
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>First character of key = first character of name</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>Transcode remaining characters by following these rules, incrementing by one character each time</term>
+    ///         <description>
+    ///             <list type="number">
+    ///                 <item><description>EV  ->   AF  else A,E,I,O,U -> A</description></item>
+    ///                 <item><description>Q   ->   G</description></item>
+    ///                 <item><description>Z   ->   S</description></item>
+    ///                 <item><description>M   ->   N</description></item>
+    ///                 <item><description>KN  ->   N   else K -> C</description></item>
+    ///                 <item><description>SCH ->   SSS</description></item>
+    ///                 <item><description>PH  ->   FF</description></item>
+    ///                 <item><description>H   ->   If previous or next is nonvowel, previous</description></item>
+    ///                 <item><description>W   ->   If previous is vowel, previous</description></item>
+    ///                 <item><description>Add current to key if current != last key character</description></item>
+    ///             </list>
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>If last character is S, remove it</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>If last characters are AY, replace with Y</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>If last character is A, remove it</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>Collapse all strings of repeated characters</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>Add original first character of name as first character of key</term>
+    ///     </item>
+    /// </list>
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// <para/>
+    /// See: <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
+    /// <para/>
+    /// See: <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
+    /// <para/>
+    /// since 1.7
+    /// </remarks>
+    /// <seealso cref="Soundex"/>
+    public class Nysiis : IStringEncoder
+    {
+        private static readonly char[] CHARS_A = new char[] { 'A' };
+        private static readonly char[] CHARS_AF = new char[] { 'A', 'F' };
+        private static readonly char[] CHARS_C = new char[] { 'C' };
+        private static readonly char[] CHARS_FF = new char[] { 'F', 'F' };
+        private static readonly char[] CHARS_G = new char[] { 'G' };
+        private static readonly char[] CHARS_N = new char[] { 'N' };
+        private static readonly char[] CHARS_NN = new char[] { 'N', 'N' };
+        private static readonly char[] CHARS_S = new char[] { 'S' };
+        private static readonly char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
+
+        private static readonly Regex PAT_MAC = new Regex("^MAC", RegexOptions.Compiled);
+        private static readonly Regex PAT_KN = new Regex("^KN", RegexOptions.Compiled);
+        private static readonly Regex PAT_K = new Regex("^K", RegexOptions.Compiled);
+        private static readonly Regex PAT_PH_PF = new Regex("^(PH|PF)", RegexOptions.Compiled);
+        private static readonly Regex PAT_SCH = new Regex("^SCH", RegexOptions.Compiled);
+        private static readonly Regex PAT_EE_IE = new Regex("(EE|IE)$", RegexOptions.Compiled);
+        private static readonly Regex PAT_DT_ETC = new Regex("(DT|RT|RD|NT|ND)$", RegexOptions.Compiled);
+
+        private static readonly char SPACE = ' ';
+        private static readonly int TRUE_LENGTH = 6;
+
+        /// <summary>
+        /// Determines whether the given character is one of the upper-case vowels A, E, I, O or U.
+        /// </summary>
+        /// <param name="c">The character to classify.</param>
+        /// <returns><c>true</c> for 'A', 'E', 'I', 'O' and 'U'; <c>false</c> for any other character.</returns>
+        private static bool IsVowel(char c)
+        {
+            switch (c)
+            {
+                case 'A':
+                case 'E':
+                case 'I':
+                case 'O':
+                case 'U':
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>
+        /// Applies the per-character transcoding rules to one position of the name. The caller slides a
+        /// window of four characters over the name: [i-1, i, i+1, i+2].
+        /// </summary>
+        /// <param name="prev">The character before the current one.</param>
+        /// <param name="curr">The character being transcoded.</param>
+        /// <param name="next">The character following the current one.</param>
+        /// <param name="aNext">The character two positions after the current one.</param>
+        /// <returns>The replacement characters, starting at the current position.</returns>
+        private static char[] TranscodeRemaining(char prev, char curr, char next, char aNext)
+        {
+            switch (curr)
+            {
+                case 'E':
+                    // 1. EV -> AF, otherwise E is handled like every other vowel
+                    return next == 'V' ? CHARS_AF : CHARS_A;
+
+                case 'A':
+                case 'I':
+                case 'O':
+                case 'U':
+                    // A, E, I, O, U -> A
+                    return CHARS_A;
+
+                // 2. Q -> G, Z -> S, M -> N
+                case 'Q':
+                    return CHARS_G;
+                case 'Z':
+                    return CHARS_S;
+                case 'M':
+                    return CHARS_N;
+
+                case 'K':
+                    // 3. KN -> NN else K -> C
+                    return next == 'N' ? CHARS_NN : CHARS_C;
+
+                case 'S':
+                    // 4. SCH -> SSS
+                    if (next == 'C' && aNext == 'H')
+                    {
+                        return CHARS_SSS;
+                    }
+                    break;
+
+                case 'P':
+                    // PH -> FF
+                    if (next == 'H')
+                    {
+                        return CHARS_FF;
+                    }
+                    break;
+
+                case 'H':
+                    // 5. H -> previous, unless both neighbours are vowels
+                    if (!(IsVowel(prev) && IsVowel(next)))
+                    {
+                        return new char[] { prev };
+                    }
+                    break;
+
+                case 'W':
+                    // 6. W -> previous, if previous is a vowel
+                    if (IsVowel(prev))
+                    {
+                        return new char[] { prev };
+                    }
+                    break;
+            }
+
+            // No rule applied: the character stands for itself.
+            return new char[] { curr };
+        }
+
+        /// <summary>Indicates the strict mode.</summary>
+        private readonly bool strict;
+
+        /// <summary>
+        /// Creates an instance of the <see cref="Nysiis"/> encoder with strict mode (original form),
+        /// i.e. encoded strings have a maximum length of 6. Equivalent to <c>new Nysiis(true)</c>.
+        /// </summary>
+        public Nysiis()
+            : this(true)
+        {
+        }
+
+        /// <summary>
+        /// Creates an instance of the <see cref="Nysiis"/> encoder with the specified strict mode:
+        /// <list type="bullet">
+        ///     <item><term><c>true</c>:</term><description>encoded strings have a maximum length of 6</description></item>
+        ///     <item><term><c>false</c>:</term><description>encoded strings may have arbitrary length</description></item>
+        /// </list>
+        /// </summary>
+        /// <param name="strict">The strict mode.</param>
+        public Nysiis(bool strict)
+        {
+            this.strict = strict;
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //    /**
+        //     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
+        //     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
+        //     * {@link String}.
+        //     *
+        //     * @param obj
+        //     *            Object to encode
+        //     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
+        //     * @throws EncoderException
+        //     *            if the parameter supplied is not of a {@link String}
+        //     * @throws IllegalArgumentException
+        //     *            if a character is not mapped
+        //     */
+        //    @Override
+        //public object Encode(object obj) 
+        //    {
+        //    if (!(obj is String)) {
+        //            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
+        //        }
+        //    return this.nysiis((String) obj);
+        //    }
+
+        /// <summary>
+        /// Encodes a string using the NYSIIS algorithm by delegating to <see cref="GetNysiis(string)"/>.
+        /// </summary>
+        /// <param name="str">A string object to encode.</param>
+        /// <returns>The NYSIIS code corresponding to the string supplied.</returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        public virtual string Encode(string str)
+        {
+            return this.GetNysiis(str);
+        }
+
+        /// <summary>
+        /// Gets a value indicating whether this <see cref="Nysiis"/> encoder is in strict mode:
+        /// <c>true</c> if the encoder is configured for strict mode, <c>false</c> otherwise.
+        /// </summary>
+        public virtual bool IsStrict
+        {
+            get { return this.strict; }
+        }
+
+        /// <summary>
+        /// Retrieves the NYSIIS code for a given string. In strict mode the code is cut to at most TRUE_LENGTH characters.
+        /// </summary>
+        /// <param name="str">String to encode using the NYSIIS algorithm.</param>
+        /// <returns>A NYSIIS code for the string supplied; <c>null</c> if <paramref name="str"/> is <c>null</c>, or the cleaned (empty) string if nothing is left after cleaning.</returns>
+        public virtual string GetNysiis(string str)
+        {
+            if (str == null)
+            {
+                return null;
+            }
+
+            // Use the same clean rules as Soundex
+            str = SoundexUtils.Clean(str);
+
+            if (str.Length == 0)
+            {
+                return str;
+            }
+
+            // Translate first characters of name (each pattern is anchored at the start and applied at most once):
+            // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
+            str = PAT_MAC.Replace(str, "MCC", 1);
+            str = PAT_KN.Replace(str, "NN", 1);
+            str = PAT_K.Replace(str, "C", 1);
+            str = PAT_PH_PF.Replace(str, "FF", 1);
+            str = PAT_SCH.Replace(str, "SSS", 1);
+
+            // Translate last characters of name (each pattern is anchored at the end and applied at most once):
+            // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
+            str = PAT_EE_IE.Replace(str, "Y", 1);
+            str = PAT_DT_ETC.Replace(str, "D", 1);
+
+            // First character of key = first character of name (after the translations above).
+            StringBuilder key = new StringBuilder(str.Length);
+            key.Append(str[0]);
+
+            // Transcode remaining characters, incrementing by one character each time. SPACE stands in for positions past the end, and each result is written back into chars, so later windows see already-transcoded characters.
+            char[] chars = str.ToCharArray();
+            int len = chars.Length;
+
+            for (int i = 1; i < len; i++)
+            {
+                char next = i < len - 1 ? chars[i + 1] : SPACE;
+                char aNext = i < len - 2 ? chars[i + 2] : SPACE;
+                char[] transcoded = TranscodeRemaining(chars[i - 1], chars[i], next, aNext);
+                System.Array.Copy(transcoded, 0, chars, i, transcoded.Length);
+
+                // only append the current char to the key if it is different from the previous (already transcoded) one
+                if (chars[i] != chars[i - 1])
+                {
+                    key.Append(chars[i]);
+                }
+            }
+
+            if (key.Length > 1)
+            {
+                char lastChar = key[key.Length - 1];
+
+                // If last character is S, remove it.
+                if (lastChar == 'S')
+                {
+                    //key.deleteCharAt(key.length() - 1);
+                    key.Remove(key.Length - 1, 1);
+                    lastChar = key[key.Length - 1];
+                }
+
+                if (key.Length > 2)
+                {
+                    char last2Char = key[key.Length - 2];
+                    // If last characters are AY, replace with Y (drop the A, keep the Y).
+                    if (last2Char == 'A' && lastChar == 'Y')
+                    {
+                        //key.deleteCharAt(key.length() - 2);
+                        key.Remove(key.Length - 2, 1);
+                    }
+                }
+
+                // If last character is A, remove it.
+                if (lastChar == 'A')
+                {
+                    //key.deleteCharAt(key.length() - 1);
+                    key.Remove(key.Length - 1, 1);
+                }
+            }
+
+            string result = key.ToString();
+            return this.IsStrict ? result.Substring(0, Math.Min(TRUE_LENGTH, result.Length) - 0) : result;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
new file mode 100644
index 0000000..e0f9071
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
@@ -0,0 +1,202 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Refined Soundex value. A refined soundex code is
+    /// optimized for spell checking words. Soundex method originally developed by
+    /// <c>Margaret Odell</c> and <c>Robert Russell</c>.
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// </summary>
+    public class RefinedSoundex : IStringEncoder
+    {
+        /// <summary>
+        /// since 1.4
+        /// </summary>
+        public static readonly string US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
+
+        /// <summary>
+        /// RefinedSoundex is *refined* for a number of reasons one being that the
+        /// mappings have been altered. This implementation contains default
+        /// mappings for US English.
+        /// </summary>
+        private static readonly char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.ToCharArray();
+
+        /// <summary>
+        /// Every letter of the alphabet is "mapped" to a numerical value. This char
+        /// array holds the values to which each letter is mapped. This
+        /// implementation contains a default map for US_ENGLISH.
+        /// </summary>
+        private readonly char[] soundexMapping;
+
+        /// <summary>
+        /// This static variable contains an instance of the RefinedSoundex using
+        /// the US_ENGLISH mapping.
+        /// </summary>
+        public static readonly RefinedSoundex US_ENGLISH = new RefinedSoundex();
+
+        /// <summary>
+        /// Creates an instance of the <see cref="RefinedSoundex"/> object using the default US
+        /// English mapping. The static US_ENGLISH_MAPPING array is referenced directly, not copied.
+        /// </summary>
+        public RefinedSoundex()
+        {
+            this.soundexMapping = US_ENGLISH_MAPPING;
+        }
+
+        /// <summary>
+        /// Creates a refined soundex instance using a custom mapping. This
+        /// constructor can be used to customize the mapping, and/or possibly
+        /// provide an internationalized mapping for a non-Western character set.
+        /// </summary>
+        /// <param name="mapping">
+        /// Mapping array to use when finding the corresponding code for a given character.
+        /// The array is copied, so later changes to it do not affect this instance.
+        /// </param>
+        /// <exception cref="System.ArgumentNullException"><paramref name="mapping"/> is <c>null</c>.</exception>
+        public RefinedSoundex(char[] mapping)
+        {
+            // Fail with a descriptive argument exception instead of a NullReferenceException on mapping.Length.
+            if (mapping == null)
+            {
+                throw new System.ArgumentNullException("mapping");
+            }
+
+            this.soundexMapping = new char[mapping.Length];
+            System.Array.Copy(mapping, 0, this.soundexMapping, 0, mapping.Length);
+        }
+
+        /// <summary>
+        /// Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
+        /// and/or possibly provide an internationalized mapping for a non-Western character set.
+        /// </summary>
+        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
+        /// <exception cref="System.ArgumentNullException"><paramref name="mapping"/> is <c>null</c>.</exception>
+        public RefinedSoundex(string mapping)
+        {
+            // Fail with a descriptive argument exception instead of a NullReferenceException on mapping.ToCharArray().
+            if (mapping == null)
+            {
+                throw new System.ArgumentNullException("mapping");
+            }
+
+            this.soundexMapping = mapping.ToCharArray();
+        }
+
+        /// <summary>
+        /// Returns the number of characters in the two encoded strings that are the
+        /// same. This return value ranges from 0 to the length of the shortest
+        /// encoded string: 0 indicates little or no similarity, and 4 out of 4 (for
+        /// example) indicates strong similarity or identical values. For refined
+        /// Soundex, the return value can be greater than 4.
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+        ///     MS T-SQL DIFFERENCE</a>
+        /// <para/>
+        /// since 1.3
+        /// </summary>
+        /// <param name="s1">A string that will be encoded and compared.</param>
+        /// <param name="s2">A string that will be encoded and compared.</param>
+        /// <returns>The number of characters in the two encoded strings that are the same, from 0 to the length of the shortest encoded string.</returns>
+        /// <seealso cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>
+        public virtual int Difference(string s1, string s2)
+        {
+            return SoundexUtils.Difference(this, s1, s2);
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //    /**
+        //     * Encodes an Object using the refined soundex algorithm. This method is
+        //     * provided in order to satisfy the requirements of the Encoder interface,
+        //     * and will throw an EncoderException if the supplied object is not of type
+        //     * java.lang.String.
+        //     *
+        //     * @param obj
+        //     *                  Object to encode
+        //     * @return An object (or type java.lang.String) containing the refined
+        //     *             soundex code which corresponds to the String supplied.
+        //     * @throws EncoderException
+        //     *                  if the parameter supplied is not of type java.lang.String
+        //     */
+        //    @Override
+        //public virtual object Encode(object obj) 
+        //    {
+        //    if (!(obj is String)) {
+        //            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
+        //        }
+        //    return soundex((String) obj);
+        //    }
+
+        /// <summary>
+        /// Encodes a string using the refined soundex algorithm by delegating to <see cref="GetSoundex(string)"/>.
+        /// </summary>
+        /// <param name="str">A string object to encode.</param>
+        /// <returns>A refined Soundex code corresponding to the string supplied.</returns>
+        public virtual string Encode(string str)
+        {
+            return GetSoundex(str);
+        }
+
+        /// <summary>
+        /// Returns the mapping code for a given character. The code is read from the
+        /// internal <c>soundexMapping</c> array at the offset of the upper-cased character from 'A';
+        /// the default values of these mappings are US English.
+        /// </summary>
+        /// <param name="c"><see cref="char"/> to get mapping for.</param>
+        /// <returns>
+        /// The mapped character (really a numeral), or <c>(char)0</c> when <paramref name="c"/> is not a
+        /// letter or has no entry in the mapping array.
+        /// </returns>
+        internal char GetMappingCode(char c)
+        {
+            if (!char.IsLetter(c))
+            {
+                return (char)0;
+            }
+
+            // char.IsLetter also accepts letters outside A-Z (e.g. accented or non-Latin letters), and a
+            // custom mapping may hold fewer than 26 entries, so the offset must be range-checked before it
+            // is used as an index. Unmapped letters get the same "no code" result as non-letters.
+            int index = char.ToUpperInvariant(c) - 'A';
+            if (index < 0 || index >= this.soundexMapping.Length)
+            {
+                return (char)0;
+            }
+
+            return this.soundexMapping[index];
+        }
+
+        /// <summary>
+        /// Computes the Refined Soundex code of a string: the first cleaned character followed by the
+        /// mapping codes of all characters, with runs of the same code collapsed into one.
+        /// </summary>
+        /// <param name="str">String to encode using the Refined Soundex algorithm.</param>
+        /// <returns>
+        /// The soundex code for the string supplied; <c>null</c> if <paramref name="str"/> is <c>null</c>,
+        /// or the cleaned (empty) string if nothing is left after cleaning.
+        /// </returns>
+        public virtual string GetSoundex(string str)
+        {
+            if (str == null)
+            {
+                return null;
+            }
+
+            str = SoundexUtils.Clean(str);
+            if (str.Length == 0)
+            {
+                return str;
+            }
+
+            StringBuilder code = new StringBuilder();
+            code.Append(str[0]);
+
+            // '*' is the initial "previous code" before any character has been mapped.
+            char previous = '*';
+
+            foreach (char letter in str)
+            {
+                char mapped = GetMappingCode(letter);
+                if (mapped != previous)
+                {
+                    // A code of 0 means "no mapping": nothing is emitted, but it still ends the current run.
+                    if (mapped != '\0')
+                    {
+                        code.Append(mapped);
+                    }
+
+                    previous = mapped;
+                }
+            }
+
+            return code.ToString();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
new file mode 100644
index 0000000..abb70c3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
@@ -0,0 +1,318 @@
+// commons-codec version compatibility level: 1.10
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
+    /// general purpose scheme to find word with similar phonemes.
+    /// <para/>
+    /// This class is thread-safe.
+    /// Although not strictly immutable, the <see cref="maxLength"/> field is not actually used.
+    /// </summary>
+    public class Soundex : IStringEncoder
+    {
+        /// <summary>
+        /// The marker character used to indicate a silent (ignored) character.
+        /// These are ignored except when they appear as the first character.
+        /// <para/>
+        /// Note: the <see cref="US_ENGLISH_MAPPING_STRING"/> does not use this mechanism
+        /// because changing it might break existing code. Mappings that don't contain
+        /// a silent marker code are treated as though H and W are silent.
+        /// <para/>
+        /// To override this, use the <see cref="Soundex(string, bool)"/> constructor.
+        /// <para/>
+        /// since 1.11
+        /// </summary>
+        public static readonly char SILENT_MARKER = '-';
+
+        /// <summary>
+        /// This is a default mapping of the 26 letters used in US English. A value of <c>0</c> for a letter position
+        /// means do not encode, but treat as a separator when it occurs between consonants with the same code.
+        /// <para/>
+        /// (This constant is provided as both an implementation convenience and to allow documentation to pick
+        /// up the value for the constant values page.)
+        /// <para/>
+        /// <b>Note that letters H and W are treated specially.</b>
+        /// They are ignored (after the first letter) and don't act as separators
+        /// between consonants with the same code.
+        /// </summary>
+        /// <seealso cref="US_ENGLISH_MAPPING"/>
+        //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        public static readonly string US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
+
+        /// <summary>
+        /// This is a default mapping of the 26 letters used in US English. A value of <c>0</c> for a letter position
+        /// means do not encode.
+        /// </summary>
+        /// <seealso cref="Soundex.Soundex(char[])"/>
+        private static readonly char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.ToCharArray();
+
+        /// <summary>
+        /// An instance of Soundex using the US_ENGLISH_MAPPING mapping.
+        /// This treats H and W as silent letters.
+        /// Apart from when they appear as the first letter, they are ignored.
+        /// They don't act as separators between duplicate codes.
+        /// </summary>
+        /// <seealso cref="US_ENGLISH_MAPPING"/>
+        /// <seealso cref="US_ENGLISH_MAPPING_STRING"/>
+        public static readonly Soundex US_ENGLISH = new Soundex();
+
+        /// <summary>
+        /// An instance of Soundex using the Simplified Soundex mapping, as described here:
+        /// http://west-penwith.org.uk/misc/soundex.htm
+        /// <para/>
+        /// This treats H and W the same as vowels (AEIOUY).
+        /// Such letters aren't encoded (after the first), but they do
+        /// act as separators when dropping duplicate codes.
+        /// The mapping is otherwise the same as for <see cref="US_ENGLISH"/>.
+        /// <para/>
+        /// since 1.11
+        /// </summary>
+        public static readonly Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
+
+        /// <summary>
+        /// An instance of Soundex using the mapping as per the Genealogy site:
+        /// http://www.genealogy.com/articles/research/00000060.html
+        /// <para/>
+        /// This treats vowels (AEIOUY), H and W as silent letters.
+        /// Such letters are ignored (after the first) and do not
+        /// act as separators when dropping duplicate codes.
+        /// <para/>
+        /// The codes for consonants are otherwise the same as for 
+        /// <see cref="US_ENGLISH_MAPPING_STRING"/> and <see cref="US_ENGLISH_SIMPLIFIED"/>.
+        /// <para/>
+        /// since 1.11
+        /// </summary>
+        public static readonly Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
+        //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+        /// <summary>
+        /// The maximum length of a Soundex code - Soundex codes are only four characters by definition.
+        /// </summary>
+        [Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
+        private int maxLength = 4;
+
+        /// <summary>
+        /// Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
+        /// letter is mapped. This implementation contains a default map for US_ENGLISH
+        /// </summary>
+        private readonly char[] soundexMapping;
+
+        /// <summary>
+        /// Should H and W be treated specially?
+        /// <para/>
+        /// In versions of the code prior to 1.11,
+        /// the code always treated H and W as silent (ignored) letters.
+        /// If this field is false, H and W are no longer special-cased.
+        /// </summary>
+        private readonly bool specialCaseHW;
+
+        /// <summary>
+        /// Creates an instance using <see cref="US_ENGLISH_MAPPING"/>.
+        /// </summary>
+        /// <seealso cref="Soundex.Soundex(char[])"/>
+        /// <seealso cref="US_ENGLISH_MAPPING"/>
+        public Soundex()
+        {
+            // With this flag set, GetSoundex skips H and W entirely after the first letter.
+            specialCaseHW = true;
+            // The static default table is referenced directly rather than copied.
+            soundexMapping = US_ENGLISH_MAPPING;
+        }
+
+        /// <summary>
+        /// Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
+        /// mapping for a non-Western character set.
+        /// <para/>
+        /// Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
+        /// letter is mapped. This implementation contains a default map for <see cref="US_ENGLISH"/>.
+        /// <para/>
+        /// If the mapping contains an instance of <see cref="SILENT_MARKER"/> then H and W are not given special treatment.
+        /// </summary>
+        /// <param name="mapping"> Mapping array to use when finding the corresponding code for a given character.</param>
+        public Soundex(char[] mapping)
+        {
+            // Work on a private copy so later changes to the caller's array cannot
+            // alter this encoder.
+            char[] copy = (char[])mapping.Clone();
+            soundexMapping = copy;
+            // A mapping that uses the silent marker opts out of the built-in H/W handling.
+            specialCaseHW = !HasMarker(copy);
+        }
+
+        private bool HasMarker(char[] mapping)
+        {
+            // Advance until the silent marker is found or the mapping is exhausted.
+            int position = 0;
+            while (position < mapping.Length && mapping[position] != SILENT_MARKER)
+            {
+                position++;
+            }
+            // Stopping before the end means a marker was found.
+            return position < mapping.Length;
+        }
+
+        /// <summary>
+        /// Creates a soundex instance using a custom mapping. This constructor can be used to customize the mapping,
+        /// and/or possibly provide an internationalized mapping for a non-Western character set.
+        /// <para/>
+        /// The character at position <c>i</c> of the mapping is the code used for the letter <c>'A' + i</c>.
+        /// <para/>
+        /// If the mapping contains an instance of <see cref="SILENT_MARKER"/> then H and W are not given special treatment.
+        /// <para/>
+        /// since 1.4
+        /// </summary>
+        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
+        public Soundex(string mapping)
+        {
+            this.soundexMapping = mapping.ToCharArray();
+            // H/W special-casing stays on only when the mapping has no silent marker.
+            this.specialCaseHW = !HasMarker(this.soundexMapping);
+        }
+
+        /// <summary>
+        /// Creates a soundex instance using a custom mapping and an explicit choice of H/W handling.
+        /// This constructor can be used to customize the mapping,
+        /// and/or possibly provide an internationalized mapping for a non-Western character set.
+        /// <para/>
+        /// since 1.11
+        /// </summary>
+        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
+        /// <param name="specialCaseHW">if true, then H and W (after the first letter) are skipped entirely and do not
+        /// separate duplicate codes; if false, H and W are handled through the mapping like any other letter.</param>
+        public Soundex(string mapping, bool specialCaseHW)
+        {
+            this.soundexMapping = mapping.ToCharArray();
+            // Unlike the single-argument constructors, the flag is taken from the caller
+            // and is not derived from the presence of a silent marker in the mapping.
+            this.specialCaseHW = specialCaseHW;
+        }
+
+        /// <summary>
+        /// Encodes the strings and returns the number of characters in the two encoded strings that are the same. This
+        /// return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
+        /// identical values.
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
+        /// T-SQL DIFFERENCE </a>
+        /// <para/>
+        /// since 1.3
+        /// </summary>
+        /// <param name="s1">A string that will be encoded and compared.</param>
+        /// <param name="s2">A string that will be encoded and compared.</param>
+        /// <returns>The number of characters in the two encoded strings that are the same from 0 to 4.</returns>
+        /// <seealso cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>
+        public virtual int Difference(string s1, string s2)
+        {
+            // Delegates to the shared helper, which encodes both strings with this
+            // instance and counts the positions at which the two codes agree.
+            return SoundexUtils.Difference(this, s1, s2);
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //    /**
+        //     * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
+        //     * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
+        //     *
+        //     * @param obj
+        //     *                  Object to encode
+        //     * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
+        //     *             supplied.
+        //     * @throws EncoderException
+        //     *                  if the parameter supplied is not of type java.lang.String
+        //     * @throws IllegalArgumentException
+        //     *                  if a character is not mapped
+        //     */
+        //public virtual Object encode(object obj) 
+        //    {
+        //    if (!(obj is string)) {
+        //            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
+        //        }
+        //    return soundex((string) obj);
+        //    }
+
+        /// <summary>
+        /// Encodes a string using the soundex algorithm.
+        /// </summary>
+        /// <param name="str">A string to encode.</param>
+        /// <returns>A Soundex code corresponding to the string supplied.</returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        public virtual string Encode(string str)
+        {
+            // IStringEncoder entry point; simply forwards to GetSoundex.
+            return GetSoundex(str);
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum code length. The value is only stored:
+        /// <see cref="GetSoundex(string)"/> does not read it, and any non-empty code it returns
+        /// is always four characters long.
+        /// </summary>
+        [Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
+        public virtual int MaxLength
+        {
+            get { return this.maxLength; }
+            set { this.maxLength = value; }
+        }
+
+        /// <summary>
+        ///  Maps the given upper-case character to its Soundex code.
+        /// </summary>
+        /// <param name="ch">An upper-case character.</param>
+        /// <returns>A Soundex code.</returns>
+        /// <exception cref="ArgumentException">Thrown if <paramref name="ch"/> is not mapped.</exception>
+        private char Map(char ch)
+        {
+            // Position of the letter in the mapping table: 'A' is 0, 'B' is 1, and so on.
+            int offset = ch - 'A';
+            bool isMapped = offset >= 0 && offset < soundexMapping.Length;
+            if (isMapped)
+            {
+                return soundexMapping[offset];
+            }
+            // Anything outside the table (including non-A-Z letters) has no code.
+            throw new ArgumentException("The character is not mapped: " + ch + " (index=" + offset + ")");
+        }
+
+        /// <summary>
+        /// Retrieves the Soundex code for a given string.
+        /// </summary>
+        /// <param name="str">String to encode using the Soundex algorithm.</param>
+        /// <returns>A soundex code for the string supplied.</returns>
+        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
+        public virtual string GetSoundex(string str)
+        {
+            // Null in, null out: there is nothing to encode.
+            if (str == null)
+            {
+                return null;
+            }
+
+            // Keep only the letters, upper-cased; input without letters yields an empty code.
+            string name = SoundexUtils.Clean(str);
+            if (name.Length == 0)
+            {
+                return name;
+            }
+
+            // The result is always four characters: the first letter followed by up to
+            // three digits, with '0' filling any unused positions.
+            char[] code = { '0', '0', '0', '0' };
+            char initial = name[0];
+            code[0] = initial;
+            int filled = 1;
+
+            // The code of the first letter takes part in duplicate suppression
+            // even though the letter itself is stored verbatim.
+            char previous = Map(initial);
+
+            int position = 1;
+            while (position < name.Length && filled < code.Length)
+            {
+                char letter = name[position];
+                position++;
+
+                // With special-casing on, H and W are invisible: they neither produce
+                // a digit nor reset the previous digit.
+                bool ignoredHW = specialCaseHW && (letter == 'H' || letter == 'W');
+                if (ignoredHW)
+                {
+                    continue;
+                }
+
+                char digit = Map(letter);
+                // Letters mapped to the silent marker are skipped in the same way.
+                if (digit == SILENT_MARKER)
+                {
+                    continue;
+                }
+
+                // '0' marks a letter that is not encoded; a repeat of the previous digit is dropped.
+                bool emit = digit != '0' && digit != previous;
+                if (emit)
+                {
+                    code[filled] = digit;
+                    filled++;
+                }
+                previous = digit;
+            }
+
+            return new string(code);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs b/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
new file mode 100644
index 0000000..e6079c2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
@@ -0,0 +1,123 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Globalization;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Utility methods for <see cref="Soundex"/> and <see cref="RefinedSoundex"/> classes.
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// <para/>
+    /// @since 1.3
+    /// </summary>
+    internal sealed class SoundexUtils
+    {
+        /// <summary>
+        /// Cleans up the input string before Soundex processing by only returning
+        /// upper case letters.
+        /// </summary>
+        /// <param name="str">The string to clean.</param>
+        /// <returns>A clean string.</returns>
+        public static string Clean(string str)
+        {
+            // Null and empty input are handed back unchanged.
+            if (string.IsNullOrEmpty(str))
+            {
+                return str;
+            }
+
+            // Collect the letters of the input, in order, into a scratch buffer.
+            char[] letters = new char[str.Length];
+            int kept = 0;
+            foreach (char candidate in str)
+            {
+                if (char.IsLetter(candidate))
+                {
+                    letters[kept] = candidate;
+                    kept++;
+                }
+            }
+
+            // When nothing was dropped the input itself can be upper-cased directly.
+            string lettersOnly = kept == str.Length ? str : new string(letters, 0, kept);
+            return new CultureInfo("en").TextInfo.ToUpper(lettersOnly);
+        }
+
+        /// <summary>
+        /// Encodes the Strings and returns the number of characters in the two
+        /// encoded Strings that are the same.
+        /// <list type="bullet">
+        ///     <item><description>
+        ///         For Soundex, this return value ranges from 0 through 4: 0 indicates
+        ///         little or no similarity, and 4 indicates strong similarity or identical
+        ///         values.
+        ///     </description></item>
+        ///     <item><description>For refined Soundex, the return value can be greater than 4.</description></item>
+        /// </list>
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+        /// MS T-SQL DIFFERENCE</a>
+        /// </summary>
+        /// <param name="encoder">The encoder to use to encode the strings.</param>
+        /// <param name="s1">A string that will be encoded and compared.</param>
+        /// <param name="s2">A string that will be encoded and compared.</param>
+        /// <returns>The number of characters in the two Soundex encoded strings that are the same.</returns>
+        /// <seealso cref="DifferenceEncoded(string, string)"/>
+        public static int Difference(IStringEncoder encoder, string s1, string s2)
+        {
+            // Encode each input with the supplied encoder, first s1 and then s2,
+            // and compare the resulting codes position by position.
+            string encoded1 = encoder.Encode(s1);
+            string encoded2 = encoder.Encode(s2);
+            return DifferenceEncoded(encoded1, encoded2);
+        }
+
+        /// <summary>
+        /// Returns the number of characters in the two Soundex encoded strings that
+        /// are the same.
+        /// <list type="bullet">
+        ///     <item><description>
+        ///         For Soundex, this return value ranges from 0 through 4: 0 indicates
+        ///         little or no similarity, and 4 indicates strong similarity or identical
+        ///         values.
+        ///     </description></item>
+        ///     <item><description>For refined Soundex, the return value can be greater than 4.</description></item>
+        /// </list>
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+        /// MS T-SQL DIFFERENCE</a>
+        /// </summary>
+        /// <param name="es1">An encoded string.</param>
+        /// <param name="es2">An encoded string.</param>
+        /// <returns>The number of characters in the two Soundex encoded strings that are the same.</returns>
+        public static int DifferenceEncoded(string es1, string es2)
+        {
+            // A missing code matches nothing.
+            if (es1 == null || es2 == null)
+            {
+                return 0;
+            }
+
+            // Only the overlapping prefix of the two codes is compared.
+            int matches = 0;
+            for (int pos = 0; pos < es1.Length && pos < es2.Length; pos++)
+            {
+                if (es1[pos] == es2[pos])
+                {
+                    matches++;
+                }
+            }
+            return matches;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs b/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
new file mode 100644
index 0000000..b4137a4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
@@ -0,0 +1,35 @@
+// commons-codec version compatibility level: 1.9
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Defines common encoding methods for <see cref="string"/> encoders.
+    /// </summary>
+    public interface IStringEncoder
+    {
+        /// <summary>
+        /// Encodes a <see cref="string"/> and returns a <see cref="string"/>.
+        /// </summary>
+        /// <param name="source">the <see cref="string"/> to encode</param>
+        /// <returns>the encoded <see cref="string"/></returns>
+        // LUCENENET specific - EncoderException not ported, as it was only thrown on a conversion from object to string type
+        // <exception cref="EncoderException">thrown if there is an error condition during the encoding process.</exception>
+        string Encode(string source);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt b/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
new file mode 100644
index 0000000..db8367d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Format
+// "pattern" "replacement at start of word" "replacement before a vowel" "replacement in other cases"
+
+// Vowels
+
+"a" "0" "" ""
+"e" "0" "" ""
+"i" "0" "" ""
+"o" "0" "" ""
+"u" "0" "" ""
+
+// Consonants
+
+"b" "7" "7" "7"
+"d" "3" "3" "3"
+"f" "7" "7" "7"
+"g" "5" "5" "5"
+"h" "5" "5" ""
+"k" "5" "5" "5"
+"l" "8" "8" "8"
+"m" "6" "6" "6"
+"n" "6" "6" "6"
+"p" "7" "7" "7"
+"q" "5" "5" "5"
+"r" "9" "9" "9"
+"s" "4" "4" "4"
+"t" "3" "3" "3"
+"v" "7" "7" "7"
+"w" "7" "7" "7"
+"x" "5" "54" "54"
+"y" "1" "" ""
+"z" "4" "4" "4"
+
+// Romanian t-cedilla and t-comma should be equivalent
+"ţ" "3|4" "3|4" "3|4"
+"ț" "3|4" "3|4" "3|4"
+
+// Polish characters (e-ogonek and a-ogonek): default case branch either not coded or 6
+"ę" "" "" "|6"
+"ą" "" "" "|6"
+
+// Other terms
+
+"schtsch" "2" "4" "4"
+"schtsh" "2" "4" "4"
+"schtch" "2" "4" "4"
+"shtch" "2" "4" "4"
+"shtsh" "2" "4" "4"
+"stsch" "2" "4" "4"
+"ttsch" "4" "4" "4"
+"zhdzh" "2" "4" "4"
+"shch" "2" "4" "4"
+"scht" "2" "43" "43"
+"schd" "2" "43" "43"
+"stch" "2" "4" "4"
+"strz" "2" "4" "4"
+"strs" "2" "4" "4"
+"stsh" "2" "4" "4"
+"szcz" "2" "4" "4"
+"szcs" "2" "4" "4"
+"ttch" "4" "4" "4"
+"tsch" "4" "4" "4"
+"ttsz" "4" "4" "4"
+"zdzh" "2" "4" "4"
+"zsch" "4" "4" "4"
+"chs" "5" "54" "54"
+"csz" "4" "4" "4"
+"czs" "4" "4" "4"
+"drz" "4" "4" "4"
+"drs" "4" "4" "4"
+"dsh" "4" "4" "4"
+"dsz" "4" "4" "4"
+"dzh" "4" "4" "4"
+"dzs" "4" "4" "4"
+"sch" "4" "4" "4"
+"sht" "2" "43" "43"
+"szt" "2" "43" "43"
+"shd" "2" "43" "43"
+"szd" "2" "43" "43"
+"tch" "4" "4" "4"
+"trz" "4" "4" "4"
+"trs" "4" "4" "4"
+"tsh" "4" "4" "4"
+"tts" "4" "4" "4"
+"ttz" "4" "4" "4"
+"tzs" "4" "4" "4"
+"tsz" "4" "4" "4"
+"zdz" "2" "4" "4"
+"zhd" "2" "43" "43"
+"zsh" "4" "4" "4"
+"ai" "0" "1" ""
+"aj" "0" "1" ""
+"ay" "0" "1" ""
+"au" "0" "7" ""
+"cz" "4" "4" "4"
+"cs" "4" "4" "4"
+"ds" "4" "4" "4"
+"dz" "4" "4" "4"
+"dt" "3" "3" "3"
+"ei" "0" "1" ""
+"ej" "0" "1" ""
+"ey" "0" "1" ""
+"eu" "1" "1" ""
+"fb" "7" "7" "7"
+"ia" "1" "" ""
+"ie" "1" "" ""
+"io" "1" "" ""
+"iu" "1" "" ""
+"ks" "5" "54" "54"
+"kh" "5" "5" "5"
+"mn" "66" "66" "66"
+"nm" "66" "66" "66"
+"oi" "0" "1" ""
+"oj" "0" "1" ""
+"oy" "0" "1" ""
+"pf" "7" "7" "7"
+"ph" "7" "7" "7"
+"sh" "4" "4" "4"
+"sc" "2" "4" "4"
+"st" "2" "43" "43"
+"sd" "2" "43" "43"
+"sz" "4" "4" "4"
+"th" "3" "3" "3"
+"ts" "4" "4" "4"
+"tc" "4" "4" "4"
+"tz" "4" "4" "4"
+"ui" "0" "1" ""
+"uj" "0" "1" ""
+"uy" "0" "1" ""
+"ue" "0" "1" ""
+"zd" "2" "43" "43"
+"zh" "4" "4" "4"
+"zs" "4" "4" "4"
+
+// Branching cases
+
+"c" "4|5" "4|5" "4|5"
+"ch" "4|5" "4|5" "4|5"
+"ck" "5|45" "5|45" "5|45"
+"rs" "4|94" "4|94" "4|94"
+"rz" "4|94" "4|94" "4|94"
+"j" "1|4" "|4" "|4"
+
+
+// ASCII foldings
+
+ß=s
+à=a
+á=a
+â=a
+ã=a
+ä=a
+å=a
+æ=a
+ç=c
+è=e
+é=e
+ê=e
+ë=e
+ì=i
+í=i
+î=i
+ï=i
+ð=d
+ñ=n
+ò=o
+ó=o
+ô=o
+õ=o
+ö=o
+ø=o
+ù=u
+ú=u
+û=u
+ý=y
+ý=y
+þ=b
+ÿ=y
+ć=c
+ł=l
+ś=s
+ż=z
+ź=z

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj
new file mode 100644
index 0000000..2a60aff
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj
@@ -0,0 +1,225 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{DAFE3B64-616A-4A2F-90E5-1F135E8A9AF5}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Analysis.Phonetic</RootNamespace>
+    <AssemblyName>Lucene.Net.Analysis.Phonetic</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Net.Http" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="BeiderMorseFilter.cs" />
+    <Compile Include="BeiderMorseFilterFactory.cs" />
+    <Compile Include="DoubleMetaphoneFilter.cs" />
+    <Compile Include="DoubleMetaphoneFilterFactory.cs" />
+    <Compile Include="Language\AbstractCaverphone .cs" />
+    <Compile Include="Language\Bm\BeiderMorseEncoder.cs" />
+    <Compile Include="Language\Bm\Lang.cs" />
+    <Compile Include="Language\Bm\Languages.cs" />
+    <Compile Include="Language\Bm\NameType.cs" />
+    <Compile Include="Language\Bm\PhoneticEngine.cs" />
+    <Compile Include="Language\Bm\ResourceConstants.cs" />
+    <Compile Include="Language\Bm\Rule.cs" />
+    <Compile Include="Language\Bm\RuleType.cs" />
+    <Compile Include="Language\Caverphone1.cs" />
+    <Compile Include="Language\Caverphone2.cs" />
+    <Compile Include="Language\ColognePhonetic.cs" />
+    <Compile Include="Language\DaitchMokotoffSoundex.cs" />
+    <Compile Include="Language\DoubleMetaphone.cs" />
+    <Compile Include="Language\MatchRatingApproachEncoder.cs" />
+    <Compile Include="Language\Metaphone.cs" />
+    <Compile Include="Language\Nysiis.cs" />
+    <Compile Include="Language\RefinedSoundex.cs" />
+    <Compile Include="Language\Soundex.cs" />
+    <Compile Include="Language\SoundexUtils.cs" />
+    <Compile Include="Language\StringEncoder.cs" />
+    <Compile Include="PhoneticFilter.cs" />
+    <Compile Include="PhoneticFilterFactory.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="..\CommonAssemblyInfo.cs">
+      <Link>Properties\CommonAssemblyInfo.cs</Link>
+    </Compile>
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj">
+      <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project>
+      <Name>Lucene.Net.Analysis.Common</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj">
+      <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="Language\Bm\ash_approx_any.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_english.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_french.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_german.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_approx_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_any.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_common.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_english.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_french.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_german.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_exact_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_hebrew_common.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_languages.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_any.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_english.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_french.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_german.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\ash_rules_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_any.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_arabic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_czech.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_dutch.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_english.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_french.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_german.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_greek.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_greeklatin.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_approx_turkish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_any.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_arabic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_common.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_czech.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_dutch.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_english.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_french.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_german.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_greek.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_greeklatin.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_exact_turkish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_hebrew_common.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_languages.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_any.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_arabic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_cyrillic.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_czech.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_dutch.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_english.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_french.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_german.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_greek.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_greeklatin.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_hungarian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_polish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_romanian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_russian.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\gen_rules_turkish.txt" />
+    <EmbeddedResource Include="Language\Bm\lang.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_any.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_french.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_approx_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_any.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_approx_common.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_common.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_french.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_exact_spanish.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_hebrew_common.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_languages.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_any.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_french.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_hebrew.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_italian.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_portuguese.txt" />
+    <EmbeddedResource Include="Language\Bm\sep_rules_spanish.txt" />
+  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="Language\dmrules.txt" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json
new file mode 100644
index 0000000..86d1c12
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json
@@ -0,0 +1,8 @@
+{
+  "runtimes": {
+    "win": {}
+  },
+  "frameworks": {
+    "net451": {}
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj
new file mode 100644
index 0000000..321b9b2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion>
+    <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
+  </PropertyGroup>
+  <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" />
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>56b2ffb7-6870-4420-8bc7-187adf5341d9</ProjectGuid>
+    <RootNamespace>Lucene.Net.Analysis.Phonetic</RootNamespace>
+    <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath>
+    <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <SchemaVersion>2.0</SchemaVersion>
+  </PropertyGroup>
+  <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" />
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs
new file mode 100644
index 0000000..c5d2886
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs
@@ -0,0 +1,109 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Analysis.Phonetic.Language;
+using Lucene.Net.Analysis.TokenAttributes;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Token filter that produces phonetic encodings of the terms it receives.
+    /// Depending on <c>inject</c>, the encoded form is either emitted in addition to
+    /// the incoming term or replaces it. The encoders live in the Language namespace.
+    /// </summary>
+    public sealed class PhoneticFilter : TokenFilter
+    {
+        /// <summary>phonetic encoder applied to every non-empty term</summary>
+        private readonly IStringEncoder encoder;
+        /// <summary>true: the encoded form is added next to the original term; false: it replaces the original term</summary>
+        private readonly bool inject;
+        private readonly ICharTermAttribute termAtt;
+        private readonly IPositionIncrementAttribute posAtt;
+        /// <summary>original token waiting to be emitted; only non-null when <c>inject=true</c> and the phonetic form was just returned</summary>
+        private State save;
+
+        /// <summary>
+        /// Creates a <see cref="PhoneticFilter"/> over <paramref name="input"/>.
+        /// </summary>
+        /// <param name="input">the token stream to read terms from</param>
+        /// <param name="encoder">the encoder used to compute the phonetic form of each term</param>
+        /// <param name="inject"><c>true</c> to add encoded forms as synonyms, <c>false</c> to replace the original terms</param>
+        public PhoneticFilter(TokenStream input, IStringEncoder encoder, bool inject)
+            : base(input)
+        {
+            this.inject = inject;
+            this.encoder = encoder;
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.posAtt = AddAttribute<IPositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Advances to the next token: a previously buffered original term if there is one,
+        /// otherwise the next input token (encoded when an encoding is available).
+        /// </summary>
+        /// <returns><c>false</c> once the input is exhausted, otherwise <c>true</c></returns>
+        public override bool IncrementToken()
+        {
+            // The previous call returned a phonetic form and parked the original token: emit it now.
+            if (save != null)
+            {
+                RestoreState(save);
+                save = null;
+                return true;
+            }
+
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+
+            // Zero-length terms, and terms without a usable encoding, pass through untouched.
+            string phonetic = termAtt.Length == 0 ? null : EncodeOrNull(termAtt.ToString());
+            if (phonetic == null)
+            {
+                return true;
+            }
+
+            if (inject)
+            {
+                // Both forms have to be returned. The phonetic form goes out first so that one
+                // captured state is enough: the original is captured with a position increment
+                // of 0, while the token returned now keeps the incoming increment.
+                int incomingIncrement = posAtt.PositionIncrement;
+                posAtt.PositionIncrement = 0;
+                save = CaptureState();
+                posAtt.PositionIncrement = incomingIncrement;
+            }
+
+            // In both modes the current token now carries the phonetic form.
+            termAtt.SetEmpty().Append(phonetic);
+            return true;
+        }
+
+        /// <summary>
+        /// Encodes <paramref name="text"/> and returns the result only when it is usable,
+        /// i.e. non-empty and different from the input.
+        /// </summary>
+        /// <returns>the phonetic form, or <c>null</c> when the direct text should be used</returns>
+        private string EncodeOrNull(string text)
+        {
+            try
+            {
+                string code = encoder.Encode(text);
+                if (code.Length == 0 || text.Equals(code))
+                {
+                    return null;
+                }
+                return code;
+            }
+            catch (Exception)
+            {
+                // Best effort: any encoder failure means "just use the direct text".
+                return null;
+            }
+        }
+
+        /// <summary>
+        /// Resets the wrapped input stream and discards any buffered original token.
+        /// </summary>
+        public override void Reset()
+        {
+            m_input.Reset();
+            save = null;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
new file mode 100644
index 0000000..8af2e5f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
@@ -0,0 +1,187 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Analysis.Phonetic.Language;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Reflection;
+
+namespace Lucene.Net.Analysis.Phonetic
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="PhoneticFilter"/>.
+    /// <para/>
+    /// Create tokens based on phonetic encoders from the Language namespace.
+    /// <para/>
+    /// This takes one required argument, "encoder", and the rest are optional:
+    /// <list type="bullet">
+    ///     <item>
+    ///         <term>encoder</term>
+    ///         <description>
+    ///         required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
+    ///         or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a type name:
+    ///         used as-is if it already contains a '.', otherwise looked up in the same namespace as these others.
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>inject</term>
+    ///         <description>
+    ///         (default=true) emit the encoded token in addition to the original token, at the same position;
+    ///         when false the encoded token replaces the original
+    ///         </description>
+    ///     </item>
+    ///     <item>
+    ///         <term>maxCodeLength</term>
+    ///         <description>
+    ///         The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
+    ///         support this then specifying this is an error.
+    ///         </description>
+    ///     </item>
+    /// </list>
+    ///
+    /// <code>
+    /// &lt;fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    /// <seealso cref="PhoneticFilter"/>
+    public class PhoneticFilterFactory : TokenFilterFactory, IResourceLoaderAware
+    {
+        /// <summary>parameter name: either a short name or a full class name</summary>
+        public static readonly string ENCODER = "encoder";
+        /// <summary>parameter name: true if encoded tokens should be added as synonyms</summary>
+        public static readonly string INJECT = "inject"; // boolean
+        /// <summary>parameter name: restricts the length of the phonetic code</summary>
+        public static readonly string MAX_CODE_LENGTH = "maxCodeLength";
+        private static readonly string PACKAGE_CONTAINING_ENCODERS = "Lucene.Net.Analysis.Phonetic.Language.";
+
+        //Effectively constants; uppercase keys
+        private static readonly IDictionary<string, Type> registry = new Dictionary<string, Type>(6);
+
+        static PhoneticFilterFactory()
+        {
+            registry["DoubleMetaphone".ToUpperInvariant()] = typeof(DoubleMetaphone);
+            registry["Metaphone".ToUpperInvariant()] = typeof(Metaphone);
+            registry["Soundex".ToUpperInvariant()] = typeof(Soundex);
+            registry["RefinedSoundex".ToUpperInvariant()] = typeof(RefinedSoundex);
+            registry["Caverphone".ToUpperInvariant()] = typeof(Caverphone2);
+            registry["ColognePhonetic".ToUpperInvariant()] = typeof(ColognePhonetic);
+        }
+
+        internal bool inject; //accessed by the test
+        private readonly string name;
+        private readonly int? maxCodeLength;
+        private Type clazz = null;
+        private MethodInfo setMaxCodeLenMethod = null;
+
+        /// <summary>Creates a new <see cref="PhoneticFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// factory arguments; "encoder" is required, "inject" and "maxCodeLength" are optional
+        /// </param>
+        /// <exception cref="ArgumentException">if any argument is left over after the known ones are read</exception>
+        public PhoneticFilterFactory(IDictionary<string, string> args)
+                : base(args)
+        {
+            inject = GetBoolean(args, INJECT, true);
+            name = Require(args, ENCODER);
+            string v = Get(args, MAX_CODE_LENGTH);
+            if (v != null)
+            {
+                maxCodeLength = int.Parse(v, CultureInfo.InvariantCulture);
+            }
+            else
+            {
+                maxCodeLength = null;
+            }
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name,
+                // so list the names of the parameters that were not recognized.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Resolves the configured encoder type (short registry name first, then by type name
+        /// through <paramref name="loader"/>), looks up the <c>MaxCodeLen</c> setter when
+        /// "maxCodeLength" was specified, and creates one encoder so that configuration
+        /// problems surface immediately.
+        /// </summary>
+        /// <param name="loader">resource loader used to instantiate encoders that are not in the registry</param>
+        /// <exception cref="ArgumentException">
+        /// if the encoder cannot be resolved or created, or "maxCodeLength" was specified
+        /// for an encoder that has no <c>MaxCodeLen</c> setter
+        /// </exception>
+        public virtual void Inform(IResourceLoader loader)
+        {
+            registry.TryGetValue(name.ToUpperInvariant(), out clazz);
+            if (clazz == null)
+            {
+                clazz = ResolveEncoder(name, loader);
+            }
+
+            if (maxCodeLength != null)
+            {
+                string unsupported = "Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH;
+                try
+                {
+                    setMaxCodeLenMethod = clazz.GetMethod("set_MaxCodeLen");
+                }
+                catch (Exception e)
+                {
+                    throw new ArgumentException(unsupported, e);
+                }
+                // Type.GetMethod reports a missing method by returning null, not by throwing,
+                // so the unsupported case has to be detected explicitly.
+                if (setMaxCodeLenMethod == null)
+                {
+                    throw new ArgumentException(unsupported);
+                }
+            }
+
+            GetEncoder();//trigger initialization for potential problems to be thrown now
+        }
+
+        /// <summary>
+        /// Resolves an encoder type from a type name. A name without a '.' is looked up in the
+        /// namespace that contains the built-in encoders.
+        /// </summary>
+        /// <param name="name">the configured encoder name</param>
+        /// <param name="loader">resource loader used to instantiate the encoder</param>
+        /// <returns>the runtime type of the instantiated encoder</returns>
+        /// <exception cref="ArgumentException">if the encoder cannot be loaded</exception>
+        private Type ResolveEncoder(string name, IResourceLoader loader)
+        {
+            string lookupName = name;
+            if (name.IndexOf('.') == -1)
+            {
+                lookupName = PACKAGE_CONTAINING_ENCODERS + name;
+            }
+            try
+            {
+                return loader.NewInstance<IStringEncoder>(lookupName).GetType();
+            }
+            catch (Exception e)
+            {
+                throw new ArgumentException("Error loading encoder '" + name + "': must be full class name or one of " + Collections.ToString(registry.Keys), e);
+            }
+        }
+
+        /// <summary>
+        /// Creates a new encoder instance and applies "maxCodeLength" when it was configured.
+        /// Must be thread-safe.
+        /// </summary>
+        /// <returns>a freshly created encoder</returns>
+        /// <exception cref="ArgumentException">if the encoder cannot be created or configured</exception>
+        protected internal virtual IStringEncoder GetEncoder()
+        {
+            // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate
+            // every time.  A simple benchmark showed this as negligible.
+            try
+            {
+                IStringEncoder encoder = (IStringEncoder)Activator.CreateInstance(clazz);
+                // Try to set the maxCodeLength
+                if (maxCodeLength != null && setMaxCodeLenMethod != null)
+                {
+                    setMaxCodeLenMethod.Invoke(encoder, new object[] { maxCodeLength });
+                }
+                return encoder;
+            }
+            catch (Exception e)
+            {
+                // Report the underlying failure rather than the reflection wrapper, but never lose
+                // the cause when the wrapper carries no inner exception.
+                Exception t = (e is TargetInvocationException && e.InnerException != null) ? e.InnerException : e;
+                throw new ArgumentException("Error initializing encoder: " + name + " / " + clazz, t);
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="PhoneticFilter"/> that uses a new encoder instance.
+        /// </summary>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new PhoneticFilter(input, GetEncoder(), inject);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..b7cd03f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs
@@ -0,0 +1,48 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General information about this assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with the assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analysis.Phonetic")]
+[assembly: AssemblyDescription(
+    "Analyzer for indexing phonetic signatures (for sounds-alike search) " +
+    "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.Phonetic")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// ID of the typelib if this project is exposed to COM (same GUID as ProjectGuid in the .csproj)
+[assembly: Guid("dafe3b64-616a-4a2f-90e5-1f135e8a9af5")]
+
+// Lets the test assembly access internal members (e.g. PhoneticFilterFactory.inject)
+[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.Phonetic")]
+
+// NOTE: Version information is in CommonAssemblyInfo.cs

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/project.json b/src/Lucene.Net.Analysis.Phonetic/project.json
new file mode 100644
index 0000000..460721b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/project.json
@@ -0,0 +1,54 @@
+{
+  "version": "4.8.0",
+  "title": "Lucene.Net.Analysis.Phonetic",
+  "description": "Analyzer for indexing phonetic signatures (for sounds-alike search) for the Lucene.Net full-text search engine library from The Apache Software Foundation.",
+  "authors": [ "The Apache Software Foundation" ],
+  "packOptions": {
+    "projectUrl": "http://lucenenet.apache.org/",
+    "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt",
+    "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true",
+    "owners": [ "The Apache Software Foundation" ],
+    "repository": { "url": "https://github.com/apache/lucenenet" },
+    "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query", "soundex", "double", "metaphone", "sounds", "like", "beider", "morse", "cologne", "caverphone", "nysiis", "match", "rating" ]
+  },
+  "buildOptions": {
+    "compile": {
+      "includeFiles": [ "../CommonAssemblyInfo.cs" ]
+    },
+    "embed": {
+      "include": [
+        "Language/Bm/ash_*.txt",
+        "Language/Bm/gen_*.txt",
+        "Language/Bm/sep_*.txt"
+      ],
+      "includeFiles": [
+        "Language/Bm/lang.txt",
+        "Language/dmrules.txt"
+      ]
+    },
+    "nowarn": [ "1591", "1573" ]
+  },
+  "dependencies": {
+    "icu.net": "54.1.1-alpha",
+    "Lucene.Net": "4.8.0",
+    "Lucene.Net.Analysis.Common": "4.8.0"
+  },
+  "frameworks": {
+    "netstandard1.5": {
+      "imports": "dnxcore50",
+      "buildOptions": {
+        "debugType": "portable",
+        "define": [ "NETSTANDARD" ]
+      },
+      "dependencies": {
+        "NETStandard.Library": "1.6.0"
+      }
+    },
+    "net451": {
+      "buildOptions": {
+        "debugType": "full",
+        "define": [ "FEATURE_SERIALIZABLE" ]
+      }
+    }
+  }
+}


Mime
View raw message