lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [38/50] [abbrv] lucenenet git commit: Lucene.Net.Analysis.SmartCn: Renamed HHMM namespace to Hhmm to follow .NET conventions better
Date Sat, 09 Sep 2017 00:31:56 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs
deleted file mode 100644
index c0cd331..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs
+++ /dev/null
@@ -1,779 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-using Lucene.Net.Support.IO;
-using System;
-using System.IO;
-using System.Reflection;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// SmartChineseAnalyzer Word Dictionary
-    /// <para/>
-    /// @lucene.experimental
-    /// </summary>
-    internal class WordDictionary : AbstractDictionary
-    {
-        private WordDictionary()
-        {
-        }
-
-        private static WordDictionary singleInstance;
-
-        /// <summary>
-        /// Large prime number for hash function
-        /// </summary>
-        public static readonly int PRIME_INDEX_LENGTH = 12071;
-
-        /// <summary>
-        /// wordIndexTable guarantees to hash all Chinese characters in Unicode into 
-        /// PRIME_INDEX_LENGTH array. There will be conflict, but in reality this 
-        /// program only handles the 6768 characters found in GB2312 plus some 
-        /// ASCII characters. Therefore in order to guarantee better precision, it is
-        /// necessary to retain the original symbol in the charIndexTable.
-        /// </summary>
-        private short[] wordIndexTable;
-
-        private char[] charIndexTable;
-
-        /// <summary>
-        /// To avoid taking too much space, the data structure needed to store the 
-        /// lexicon requires two multidimensional arrays to store word and frequency.
-        /// Each word is placed in a char[]. Each char represents a Chinese char or 
-        /// other symbol.  Each frequency is put into an int. These two arrays 
-        /// correspond to each other one-to-one. Therefore, one can use 
-        /// wordItem_charArrayTable[i][j] to look up word from lexicon, and 
-        /// wordItem_frequencyTable[i][j] to look up the corresponding frequency. 
-        /// </summary>
-        private char[][][] wordItem_charArrayTable;
-
-        private int[][] wordItem_frequencyTable;
-
-        // static Logger log = Logger.getLogger(WordDictionary.class);
-
-        private static object syncLock = new object();
-
-        /// <summary>
-        /// Get the singleton dictionary instance.
-        /// </summary>
-        /// <returns>singleton</returns>
-        public static WordDictionary GetInstance()
-        {
-            lock (syncLock)
-            {
-                if (singleInstance == null)
-                {
-                    singleInstance = new WordDictionary();
-
-                    // LUCENENET specific
-                    // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
-                    // This issue still existed as of 4.8.0. Here is the fix - we only
-                    // load from a directory if the actual directory exists (AnalyzerProfile
-                    // ensures it is an empty string if it is not available).
-                    string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
-                    if (string.IsNullOrEmpty(dictRoot))
-                    {
-                        singleInstance.Load();
-                    }
-                    else
-                    {
-                        singleInstance.Load(dictRoot);
-                    }
-
-
-                    //try
-                    //{
-                    //    singleInstance.Load();
-                    //}
-                    //catch (IOException e)
-                    //{
-                    //    string wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
-                    //    singleInstance.Load(wordDictRoot);
-                    //}
-                    //catch (TypeLoadException e)
-                    //{
-                    //    throw new Exception(e.ToString(), e);
-                    //}
-                }
-                return singleInstance;
-            }
-        }
-
-        /// <summary>
-        /// Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
-        /// </summary>
-        /// <param name="dctFileRoot">path to dictionary directory</param>
-        public virtual void Load(string dctFileRoot)
-        {
-            string dctFilePath = System.IO.Path.Combine(dctFileRoot, "coredict.dct");
-            FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dctFileRoot, "coredict.mem"));
-
-            if (serialObj.Exists && LoadFromObj(serialObj))
-            {
-
-            }
-            else
-            {
-                try
-                {
-                    wordIndexTable = new short[PRIME_INDEX_LENGTH];
-                    charIndexTable = new char[PRIME_INDEX_LENGTH];
-                    for (int i = 0; i < PRIME_INDEX_LENGTH; i++)
-                    {
-                        charIndexTable[i] = (char)0;
-                        wordIndexTable[i] = -1;
-                    }
-                    wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
-                    wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
-                    // int total =
-                    LoadMainDataFromFile(dctFilePath);
-                    ExpandDelimiterData();
-                    MergeSameWords();
-                    SortEachItems();
-                    // log.info("load dictionary: " + dctFilePath + " total:" + total);
-                }
-                catch (IOException e)
-                {
-                    throw new Exception(e.ToString(), e);
-                }
-
-                SaveToObj(serialObj);
-            }
-
-        }
-
-        /// <summary>
-        /// Load coredict.mem internally from the jar file.
-        /// </summary>
-        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
-        public virtual void Load()
-        {
-            using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "coredict.mem"))
-            {
-                LoadFromObjectInputStream(input);
-            }
-        }
-
-        private bool LoadFromObj(FileInfo serialObj)
-        {
-            try
-            {
-                using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
-                    LoadFromObjectInputStream(input);
-                return true;
-            }
-            catch (Exception e)
-            {
-                throw new Exception(e.ToString(), e);
-            }
-        }
-
-        // LUCENENET conversion note:
-        // The data in Lucene is stored in a proprietary binary format (similar to
-        // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the
-        // data was extracted using Java's DataOutputStream using the following Java code.
-        // It can then be read in using the LoadFromInputStream method below 
-        // (using a DataInputStream instead of a BinaryReader), and saved
-        // in the correct (BinaryWriter) format by calling the SaveToObj method.
-        // Alternatively, the data can be loaded from disk using the files
-        // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, 
-        // which will automatically produce the .mem files.
-
-        //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException
-        //{
-        //    // save bigramHashTable
-        //    int bhLen = bigramHashTable.length;
-        //    stream.writeInt(bhLen);
-        //    for (int i = 0; i<bhLen; i++)
-        //    {
-        //        stream.writeLong(bigramHashTable[i]);
-        //    }
-
-        //    // save frequencyTable
-        //    int fLen = frequencyTable.length;
-        //    stream.writeInt(fLen);
-        //    for (int i = 0; i<fLen; i++)
-        //    {
-        //        stream.writeInt(frequencyTable[i]);
-        //    }
-        //}
-
-        private void LoadFromObjectInputStream(Stream serialObjectInputStream)
-        {
-            //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
-            //wordIndexTable = (short[])input.ReadObject();
-            //charIndexTable = (char[])input.ReadObject();
-            //wordItem_charArrayTable = (char[][][])input.ReadObject();
-            //wordItem_frequencyTable = (int[][])input.ReadObject();
-            //// log.info("load core dict from serialization.");
-            //input.close();
-
-            using (var reader = new BinaryReader(serialObjectInputStream))
-            //using (var reader = new DataInputStream(serialObjectInputStream))
-            {
-
-                // Read wordIndexTable
-                int wiLen = reader.ReadInt32();
-                wordIndexTable = new short[wiLen];
-                for (int i = 0; i < wiLen; i++)
-                {
-                    wordIndexTable[i] = reader.ReadInt16();
-                }
-
-                // Read charIndexTable
-                int ciLen = reader.ReadInt32();
-                charIndexTable = new char[ciLen];
-                for (int i = 0; i < ciLen; i++)
-                {
-                    charIndexTable[i] = reader.ReadChar();
-                }
-
-                // Read wordItem_charArrayTable
-                int caDim1 = reader.ReadInt32();
-                if (caDim1 > -1)
-                {
-                    wordItem_charArrayTable = new char[caDim1][][];
-                    for (int i = 0; i < caDim1; i++)
-                    {
-                        int caDim2 = reader.ReadInt32();
-                        if (caDim2 > -1)
-                        {
-                            wordItem_charArrayTable[i] = new char[caDim2][];
-                            for (int j = 0; j < caDim2; j++)
-                            {
-                                int caDim3 = reader.ReadInt32();
-                                if (caDim3 > -1)
-                                {
-                                    wordItem_charArrayTable[i][j] = new char[caDim3];
-                                    for (int k = 0; k < caDim3; k++)
-                                    {
-                                        wordItem_charArrayTable[i][j][k] = reader.ReadChar();
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                // Read wordItem_frequencyTable
-                int fDim1 = reader.ReadInt32();
-                if (fDim1 > -1)
-                {
-                    wordItem_frequencyTable = new int[fDim1][];
-                    for (int i = 0; i < fDim1; i++)
-                    {
-                        int fDim2 = reader.ReadInt32();
-                        if (fDim2 > -1)
-                        {
-                            wordItem_frequencyTable[i] = new int[fDim2];
-                            for (int j = 0; j < fDim2; j++)
-                            {
-                                wordItem_frequencyTable[i][j] = reader.ReadInt32();
-                            }
-                        }
-                    }
-                }
-            }
-
-            // log.info("load core dict from serialization.");
-        }
-
-        private void SaveToObj(FileInfo serialObj)
-        {
-            try
-            {
-                //ObjectOutputStream output = new ObjectOutputStream(new FileStream(
-                //    serialObj.FullName, FileMode.Create, FileAccess.Write));
-                //output.writeObject(wordIndexTable);
-                //output.writeObject(charIndexTable);
-                //output.writeObject(wordItem_charArrayTable);
-                //output.writeObject(wordItem_frequencyTable);
-                //output.close();
-                //// log.info("serialize core dict.");
-
-                using (Stream stream = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
-                {
-                    using (var writer = new BinaryWriter(stream))
-                    {
-                        // Write wordIndexTable
-                        int wiLen = wordIndexTable.Length;
-                        writer.Write(wiLen);
-                        for (int i = 0; i < wiLen; i++)
-                        {
-                            writer.Write(wordIndexTable[i]);
-                        }
-
-                        // Write charIndexTable
-                        int ciLen = charIndexTable.Length;
-                        writer.Write(ciLen);
-                        for (int i = 0; i < ciLen; i++)
-                        {
-                            writer.Write(charIndexTable[i]);
-                        }
-
-                        // Write wordItem_charArrayTable
-                        int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.Length;
-                        writer.Write(caDim1);
-                        for (int i = 0; i < caDim1; i++)
-                        {
-                            int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].Length;
-                            writer.Write(caDim2);
-                            for (int j = 0; j < caDim2; j++)
-                            {
-                                int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].Length;
-                                writer.Write(caDim3);
-                                for (int k = 0; k < caDim3; k++)
-                                {
-                                    writer.Write(wordItem_charArrayTable[i][j][k]);
-                                }
-                            }
-                        }
-
-                        // Write wordItem_frequencyTable
-                        int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.Length;
-                        writer.Write(fDim1);
-                        for (int i = 0; i < fDim1; i++)
-                        {
-                            int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].Length;
-                            writer.Write(fDim2);
-                            for (int j = 0; j < fDim2; j++)
-                            {
-                                writer.Write(wordItem_frequencyTable[i][j]);
-                            }
-                        }
-                    }
-                }
-
-                // log.info("serialize core dict.");
-            }
-#pragma warning disable 168
-            catch (Exception e)
-#pragma warning restore 168
-            {
-                // log.warn(e.getMessage());
-            }
-        }
-
-        /// <summary>
-        /// Load the datafile into this <see cref="WordDictionary"/>
-        /// </summary>
-        /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
-        /// <returns>number of words read</returns>
-        /// <exception cref="IOException">If there is a low-level I/O error.</exception>
-        private int LoadMainDataFromFile(string dctFilePath)
-        {
-            int i, cnt, length, total = 0;
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            int[]
-            buffer = new int[3];
-            byte[] intBuffer = new byte[4];
-            string tmpword;
-            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
-            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
-            {
-
-                // GB2312 characters 0 - 6768
-                for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
-                {
-                    // if (i == 5231)
-                    // System.out.println(i);
-
-                    dctFile.Read(intBuffer, 0, intBuffer.Length);
-                    // the dictionary was developed for C, and byte order must be converted to work with Java
-                    cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32();
-                    if (cnt <= 0)
-                    {
-                        wordItem_charArrayTable[i] = null;
-                        wordItem_frequencyTable[i] = null;
-                        continue;
-                    }
-                    wordItem_charArrayTable[i] = new char[cnt][];
-                    wordItem_frequencyTable[i] = new int[cnt];
-                    total += cnt;
-                    int j = 0;
-                    while (j < cnt)
-                    {
-                        // wordItemTable[i][j] = new WordItem();
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
-                            .GetInt32();// frequency
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
-                            .GetInt32();// length
-                        dctFile.Read(intBuffer, 0, intBuffer.Length);
-                        buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
-                            .GetInt32();// handle
-
-                        // wordItemTable[i][j].frequency = buffer[0];
-                        wordItem_frequencyTable[i][j] = buffer[0];
-
-                        length = buffer[1];
-                        if (length > 0)
-                        {
-                            byte[] lchBuffer = new byte[length];
-                            dctFile.Read(lchBuffer, 0, lchBuffer.Length);
-                            //tmpword = new String(lchBuffer, "GB2312");
-                            tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
-                            //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
-                            // indexTable[i].wordItems[j].word = tmpword;
-                            // wordItemTable[i][j].charArray = tmpword.toCharArray();
-                            wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
-                        }
-                        else
-                        {
-                            // wordItemTable[i][j].charArray = null;
-                            wordItem_charArrayTable[i][j] = null;
-                        }
-                        // System.out.println(indexTable[i].wordItems[j]);
-                        j++;
-                    }
-
-                    string str = GetCCByGB2312Id(i);
-                    SetTableIndex(str[0], i);
-                }
-            }
-            return total;
-        }
-
-        /// <summary>
-        /// The original lexicon puts all information with punctuation into a 
-        /// chart (from 1 to 3755). Here it then gets expanded, separately being
-        /// placed into the chart that has the corresponding symbol.
-        /// </summary>
-        private void ExpandDelimiterData()
-        {
-            int i;
-            int cnt;
-            // Punctuation then treating index 3755 as 1, 
-            // distribute the original punctuation corresponding dictionary into 
-            int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
-            i = 0;
-            while (i < wordItem_charArrayTable[delimiterIndex].Length)
-            {
-                char c = wordItem_charArrayTable[delimiterIndex][i][0];
-                int j = GetGB2312Id(c);// the id value of the punctuation
-                if (wordItem_charArrayTable[j] == null)
-                {
-
-                    int k = i;
-                    // Starting from i, count the number of the following worditem symbol from j
-                    while (k < wordItem_charArrayTable[delimiterIndex].Length
-                        && wordItem_charArrayTable[delimiterIndex][k][0] == c)
-                    {
-                        k++;
-                    }
-                    // c is the punctuation character, j is the id value of c
-                    // k-1 represents the index of the last punctuation character
-                    cnt = k - i;
-                    if (cnt != 0)
-                    {
-                        wordItem_charArrayTable[j] = new char[cnt][];
-                        wordItem_frequencyTable[j] = new int[cnt];
-                    }
-
-                    // Assign value for each wordItem.
-                    for (k = 0; k < cnt; k++, i++)
-                    {
-                        // wordItemTable[j][k] = new WordItem();
-                        wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
-                        wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].Length - 1];
-                        System.Array.Copy(wordItem_charArrayTable[delimiterIndex][i], 1,
-                            wordItem_charArrayTable[j][k], 0,
-                            wordItem_charArrayTable[j][k].Length);
-                    }
-                    SetTableIndex(c, j);
-                }
-            }
-            // Delete the original corresponding symbol array.
-            wordItem_charArrayTable[delimiterIndex] = null;
-            wordItem_frequencyTable[delimiterIndex] = null;
-        }
-
-        /// <summary>
-        /// since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
-        /// </summary>
-        private void MergeSameWords()
-        {
-            int i;
-            for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
-            {
-                if (wordItem_charArrayTable[i] == null)
-                    continue;
-                int len = 1;
-                for (int j = 1; j < wordItem_charArrayTable[i].Length; j++)
-                {
-                    if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0,
-                        wordItem_charArrayTable[i][j - 1], 0) != 0)
-                        len++;
-
-                }
-                if (len < wordItem_charArrayTable[i].Length)
-                {
-                    char[][] tempArray = new char[len][];
-                    int[] tempFreq = new int[len];
-                    int k = 0;
-                    tempArray[0] = wordItem_charArrayTable[i][0];
-                    tempFreq[0] = wordItem_frequencyTable[i][0];
-                    for (int j = 1; j < wordItem_charArrayTable[i].Length; j++)
-                    {
-                        if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0,
-                            tempArray[k], 0) != 0)
-                        {
-                            k++;
-                            // temp[k] = wordItemTable[i][j];
-                            tempArray[k] = wordItem_charArrayTable[i][j];
-                            tempFreq[k] = wordItem_frequencyTable[i][j];
-                        }
-                        else
-                        {
-                            // temp[k].frequency += wordItemTable[i][j].frequency;
-                            tempFreq[k] += wordItem_frequencyTable[i][j];
-                        }
-                    }
-                    // wordItemTable[i] = temp;
-                    wordItem_charArrayTable[i] = tempArray;
-                    wordItem_frequencyTable[i] = tempFreq;
-                }
-            }
-        }
-
-        private void SortEachItems()
-        {
-            char[] tmpArray;
-            int tmpFreq;
-            for (int i = 0; i < wordItem_charArrayTable.Length; i++)
-            {
-                if (wordItem_charArrayTable[i] != null
-                    && wordItem_charArrayTable[i].Length > 1)
-                {
-                    for (int j = 0; j < wordItem_charArrayTable[i].Length - 1; j++)
-                    {
-                        for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].Length; j2++)
-                        {
-                            if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0,
-                                wordItem_charArrayTable[i][j2], 0) > 0)
-                            {
-                                tmpArray = wordItem_charArrayTable[i][j];
-                                tmpFreq = wordItem_frequencyTable[i][j];
-                                wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
-                                wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
-                                wordItem_charArrayTable[i][j2] = tmpArray;
-                                wordItem_frequencyTable[i][j2] = tmpFreq;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        /// <summary>
-        /// Calculate character <paramref name="c"/>'s position in hash table, 
-        /// then initialize the value of that position in the address table.
-        /// </summary>
-        private bool SetTableIndex(char c, int j)
-        {
-            int index = GetAvaliableTableIndex(c);
-            if (index != -1)
-            {
-                charIndexTable[index] = c;
-                wordIndexTable[index] = (short)j;
-                return true;
-            }
-            else
-                return false;
-        }
-
-        private short GetAvaliableTableIndex(char c)
-        {
-            int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH);
-            int hash2 = Hash2(c) % PRIME_INDEX_LENGTH;
-            if (hash1 < 0)
-                hash1 = PRIME_INDEX_LENGTH + hash1;
-            if (hash2 < 0)
-                hash2 = PRIME_INDEX_LENGTH + hash2;
-            int index = hash1;
-            int i = 1;
-            while (charIndexTable[index] != 0 && charIndexTable[index] != c
-                && i < PRIME_INDEX_LENGTH)
-            {
-                index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
-                i++;
-            }
-            // System.out.println(i - 1);
-
-            if (i < PRIME_INDEX_LENGTH
-                && (charIndexTable[index] == 0 || charIndexTable[index] == c))
-            {
-                return (short)index;
-            }
-            else
-            {
-                return -1;
-            }
-        }
-
-        private short GetWordItemTableIndex(char c)
-        {
-            int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH);
-            int hash2 = Hash2(c) % PRIME_INDEX_LENGTH;
-            if (hash1 < 0)
-                hash1 = PRIME_INDEX_LENGTH + hash1;
-            if (hash2 < 0)
-                hash2 = PRIME_INDEX_LENGTH + hash2;
-            int index = hash1;
-            int i = 1;
-            while (charIndexTable[index] != 0 && charIndexTable[index] != c
-                && i < PRIME_INDEX_LENGTH)
-            {
-                index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
-                i++;
-            }
-
-            if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c)
-            {
-                return (short)index;
-            }
-            else
-                return -1;
-        }
-
-        /// <summary>
-        /// Look up the text string corresponding with the word char array,
-        /// and return the position of the word list.
-        /// </summary>
-        /// <param name="knownHashIndex">
-        /// already figure out position of the first word
-        /// symbol charArray[0] in hash table. If not calculated yet, can be
-        /// replaced with function int findInTable(char[] charArray).
-        /// </param>
-        /// <param name="charArray">look up the char array corresponding with the word.</param>
-        /// <returns>word location in word array.  If not found, then return -1.</returns>
-        private int FindInTable(short knownHashIndex, char[] charArray)
-        {
-            if (charArray == null || charArray.Length == 0)
-                return -1;
-
-            char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
-            int start = 0, end = items.Length - 1;
-            int mid = (start + end) / 2, cmpResult;
-
-            // Binary search for the index of idArray
-            while (start <= end)
-            {
-                cmpResult = Utility.CompareArray(items[mid], 0, charArray, 1);
-
-                if (cmpResult == 0)
-                    return mid;// find it
-                else if (cmpResult < 0)
-                    start = mid + 1;
-                else if (cmpResult > 0)
-                    end = mid - 1;
-
-                mid = (start + end) / 2;
-            }
-            return -1;
-        }
-
-        /// <summary>
-        /// Find the first word in the dictionary that starts with the supplied prefix
-        /// </summary>
-        /// <param name="charArray">input prefix</param>
-        /// <returns>index of word, or -1 if not found</returns>
-        /// <seealso cref="GetPrefixMatch(char[], int)"/>
-        public virtual int GetPrefixMatch(char[] charArray)
-        {
-            return GetPrefixMatch(charArray, 0);
-        }
-
-        /// <summary>
-        /// Find the nth word in the dictionary that starts with the supplied prefix
-        /// </summary>
-        /// <param name="charArray">input prefix</param>
-        /// <param name="knownStart">relative position in the dictionary to start</param>
-        /// <returns>index of word, or -1 if not found</returns>
-        /// <seealso cref="GetPrefixMatch(char[])"/>
-        public virtual int GetPrefixMatch(char[] charArray, int knownStart)
-        {
-            short index = GetWordItemTableIndex(charArray[0]);
-            if (index == -1)
-                return -1;
-            char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
-            int start = knownStart, end = items.Length - 1;
-
-            int mid = (start + end) / 2, cmpResult;
-
-            // Binary search for the index of idArray
-            while (start <= end)
-            {
-                cmpResult = Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0);
-                if (cmpResult == 0)
-                {
-                    // Get the first item which match the current word
-                    while (mid >= 0
-                        && Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
-                        mid--;
-                    mid++;
-                    return mid;// Find the first word that uses charArray as prefix.
-                }
-                else if (cmpResult < 0)
-                    end = mid - 1;
-                else
-                    start = mid + 1;
-                mid = (start + end) / 2;
-            }
-            return -1;
-        }
-
-        /// <summary>
-        /// Get the frequency of a word from the dictionary
-        /// </summary>
-        /// <param name="charArray">input word</param>
-        /// <returns>word frequency, or zero if the word is not found</returns>
-        public virtual int GetFrequency(char[] charArray)
-        {
-            short hashIndex = GetWordItemTableIndex(charArray[0]);
-            if (hashIndex == -1)
-            {
-                return 0;
-            }
-            int itemIndex = FindInTable(hashIndex, charArray);
-            if (itemIndex != -1)
-            {
-                return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
-            }
-            return 0;
-        }
-
-        /// <summary>
-        /// Return <c>true</c> if the dictionary entry at itemIndex for table charArray[0] is charArray
-        /// </summary>
-        /// <param name="charArray">input word</param>
-        /// <param name="itemIndex">item index for table charArray[0]</param>
-        /// <returns><c>true</c> if the entry exists</returns>
-        public virtual bool IsEqual(char[] charArray, int itemIndex)
-        {
-            short hashIndex = GetWordItemTableIndex(charArray[0]);
-            return Utility.CompareArray(charArray, 1,
-                wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs
index 27ca17c..61ca00b 100644
--- a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs
+++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs
@@ -1,5 +1,5 @@
 // lucene version compatibility level: 4.8.1
-using Lucene.Net.Analysis.Cn.Smart.HHMM;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support;

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
new file mode 100644
index 0000000..83b4614
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
@@ -0,0 +1,225 @@
+// lucene version compatibility level: 4.8.1
+using System;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// <para>
+    /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation.
+    /// </para>
+    /// <para>
+    /// Contains methods for dealing with GB2312 encoding.
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    internal abstract class AbstractDictionary
+    {
+        /// <summary>
+        /// First Chinese Character in GB2312 (15 * 94)
+        /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
+        /// </summary>
+        public static readonly int GB2312_FIRST_CHAR = 1410;
+
+        /// <summary>
+        /// Last Chinese Character in GB2312 (87 * 94). 
+        /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
+        /// </summary>
+        public static readonly int GB2312_CHAR_NUM = 87 * 94;
+
+        /// <summary>
+        /// Dictionary data contains 6768 Chinese characters with frequency statistics.
+        /// </summary>
+        public static readonly int CHAR_NUM_IN_FILE = 6768;
+
+        // =====================================================
+        // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
+        // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
+        // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
+        // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
+        // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
+        // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
+        // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
+        // =====================================================
+        //
+        // GB2312 character set:
+        // 01 94 Symbols
+        // 02 72 Numbers
+        // 03 94 Latin
+        // 04 83 Kana
+        // 05 86 Katakana
+        // 06 48 Greek
+        // 07 66 Cyrillic
+        // 08 63 Phonetic Symbols
+        // 09 76 Drawing Symbols
+        // 10-15 Unassigned
+        // 16-55 3755 Plane 1, in pinyin order
+        // 56-87 3008 Plane 2, in radical/stroke order
+        // 88-94 Unassigned
+        // ======================================================
+
+        /// <summary>
+        /// <para>
+        /// Transcode from GB2312 ID to Unicode
+        /// </para>
+        /// <para>
+        /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
+        /// Some regions are unassigned (reserved).
+        /// </para>
+        /// </summary>
+        /// <param name="ccid">GB2312 id</param>
+        /// <returns>unicode String</returns>
+        public virtual string GetCCByGB2312Id(int ccid)
+        {
+            if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM)
+                return "";
+            int cc1 = ccid / 94 + 161;
+            int cc2 = ccid % 94 + 161;
+            byte[] buffer = new byte[2];
+            buffer[0] = (byte)cc1;
+            buffer[1] = (byte)cc2;
+            try
+            {
+                //String cchar = new String(buffer, "GB2312");
+                string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
+                return cchar;
+            }
+            catch (ArgumentException) // Encoding is not supported by the platform
+            {
+                return "";
+            }
+        }
+
+        /// <summary>
+        /// Transcode from Unicode to GB2312
+        /// </summary>
+        /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param>
+        /// <returns>position in GB2312</returns>
+        public virtual short GetGB2312Id(char ch)
+        {
+            try
+            {
+                //byte[] buffer = Character.ToString(ch).getBytes("GB2312");
+                byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
+                //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
+                if (buffer.Length != 2)
+                {
+                    // Should be a two-byte character
+                    return -1;
+                }
+                int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
+                int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. 
+                                                    // Therefore, each code page only has 16*6-2=94 characters.
+                return (short)(b0 * 94 + b1);
+            }
+            catch (ArgumentException e) // Encoding is not supported by the platform
+            {
+                throw new Exception(e.ToString(), e);
+            }
+        }
+
+        /// <summary>
+        /// 32-bit FNV Hash Function
+        /// </summary>
+        /// <param name="c">input character</param>
+        /// <returns>hashcode</returns>
+        public virtual long Hash1(char c)
+        {
+            long p = 1099511628211L;
+            long hash = unchecked((long)0xcbf29ce484222325L);
+            hash = (hash ^ (c & 0x00FF)) * p;
+            hash = (hash ^ (c >> 8)) * p;
+            hash += hash << 13;
+            hash ^= hash >> 7;
+            hash += hash << 3;
+            hash ^= hash >> 17;
+            hash += hash << 5;
+            return hash;
+        }
+
+        /// <summary>
+        /// 32-bit FNV Hash Function
+        /// </summary>
+        /// <param name="carray">character array</param>
+        /// <returns>hashcode</returns>
+        public virtual long Hash1(char[] carray)
+        {
+            long p = 1099511628211L;
+            long hash = unchecked((long)0xcbf29ce484222325L);
+            for (int i = 0; i < carray.Length; i++)
+            {
+                char d = carray[i];
+                hash = (hash ^ (d & 0x00FF)) * p;
+                hash = (hash ^ (d >> 8)) * p;
+            }
+
+            // hash += hash << 13;
+            // hash ^= hash >> 7;
+            // hash += hash << 3;
+            // hash ^= hash >> 17;
+            // hash += hash << 5;
+            return hash;
+        }
+
+        /// <summary>
+        /// djb2 hash algorithm,this algorithm (k=33) was first reported by dan
+        /// bernstein many years ago in comp.lang.c. another version of this algorithm
+        /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
+        /// the magic of number 33 (why it works better than many other constants,
+        /// prime or not) has never been adequately explained.
+        /// </summary>
+        /// <param name="c">character</param>
+        /// <returns>hashcode</returns>
+        public virtual int Hash2(char c)
+        {
+            int hash = 5381;
+
+            /* hash 33 + c */
+            hash = ((hash << 5) + hash) + c & 0x00FF;
+            hash = ((hash << 5) + hash) + c >> 8;
+
+            return hash;
+        }
+
+        /// <summary>
+        /// djb2 hash algorithm,this algorithm (k=33) was first reported by dan
+        /// bernstein many years ago in comp.lang.c. another version of this algorithm
+        /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
+        /// the magic of number 33 (why it works better than many other constants,
+        /// prime or not) has never been adequately explained.
+        /// </summary>
+        /// <param name="carray">character array</param>
+        /// <returns>hashcode</returns>
+        public virtual int Hash2(char[] carray)
+        {
+            int hash = 5381;
+
+            /* hash 33 + c */
+            for (int i = 0; i < carray.Length; i++)
+            {
+                char d = carray[i];
+                hash = ((hash << 5) + hash) + d & 0x00FF;
+                hash = ((hash << 5) + hash) + d >> 8;
+            }
+
+            return hash;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs
new file mode 100644
index 0000000..6c2923e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs
@@ -0,0 +1,257 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
+    /// <para>
+    /// For each start offset, a list of possible token pairs is stored.
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    internal class BiSegGraph
+    {
+        private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>();
+
+        private IList<SegToken> segTokenList;
+
+        private static BigramDictionary bigramDict = BigramDictionary.GetInstance();
+
+        public BiSegGraph(SegGraph segGraph)
+        {
+            segTokenList = segGraph.MakeIndex();
+            GenerateBiSegGraph(segGraph);
+        }
+
+        /// <summary>
+        /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>
+        /// </summary>
+        private void GenerateBiSegGraph(SegGraph segGraph)
+        {
+            double smooth = 0.1;
+            int wordPairFreq = 0;
+            int maxStart = segGraph.MaxStart;
+            double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
+
+            int next;
+            char[] idBuffer;
+            // get the list of tokens ordered and indexed
+            segTokenList = segGraph.MakeIndex();
+            // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
+            int key = -1;
+            IList<SegToken> nextTokens = null;
+            while (key < maxStart)
+            {
+                if (segGraph.IsStartExist(key))
+                {
+
+                    IList<SegToken> tokenList = segGraph.GetStartList(key);
+
+                    // Calculate all tokens for a given key.
+                    foreach (SegToken t1 in tokenList)
+                    {
+                        oneWordFreq = t1.Weight;
+                        next = t1.EndOffset;
+                        nextTokens = null;
+                        // Find the next corresponding Token.
+                        // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
+                        // If we cannot find the next Token, then go to the end and repeat the same cycle.
+                        while (next <= maxStart)
+                        {
+                            // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
+                            if (segGraph.IsStartExist(next))
+                            {
+                                nextTokens = segGraph.GetStartList(next);
+                                break;
+                            }
+                            next++;
+                        }
+                        if (nextTokens == null)
+                        {
+                            break;
+                        }
+                        foreach (SegToken t2 in nextTokens)
+                        {
+                            idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
+                            System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
+                            idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
+                            System.Array.Copy(t2.CharArray, 0, idBuffer,
+                                t1.CharArray.Length + 1, t2.CharArray.Length);
+
+                            // Two linked Words frequency
+                            wordPairFreq = bigramDict.GetFrequency(idBuffer);
+
+                            // Smoothing
+
+                            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
+                            weight = -Math
+                                .Log(smooth
+                                    * (1.0 + oneWordFreq)
+                                    / (Utility.MAX_FREQUENCE + 0.0)
+                                    + (1.0 - smooth)
+                                    * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
+
+                            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
+                                t2.Index, weight);
+                            this.AddSegTokenPair(tokenPair);
+                        }
+                    }
+                }
+                key++;
+            }
+
+        }
+
+        /// <summary>
+        /// Returns <c>true</c> if their is a list of token pairs at this offset (index of the second token)
+        /// </summary>
+        /// <param name="to">index of the second token in the token pair</param>
+        /// <returns><c>true</c> if a token pair exists</returns>
+        public virtual bool IsToExist(int to)
+        {
+            //return tokenPairListTable.get(Integer.valueOf(to)) != null;
+            //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null;
+            IList<SegTokenPair> result;
+            return tokenPairListTable.TryGetValue(to, out result) && result != null;
+        }
+
+        /// <summary>
+        /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token)
+        /// </summary>
+        /// <param name="to">index of the second token in the token pair</param>
+        /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns>
+        public virtual IList<SegTokenPair> GetToList(int to)
+        {
+            IList<SegTokenPair> result;
+            tokenPairListTable.TryGetValue(to, out result);
+            return result;
+        }
+
+        /// <summary>
+        /// Add a <see cref="SegTokenPair"/>
+        /// </summary>
+        /// <param name="tokenPair"><see cref="SegTokenPair"/></param>
+        public virtual void AddSegTokenPair(SegTokenPair tokenPair)
+        {
+            int to = tokenPair.To;
+            if (!IsToExist(to))
+            {
+                List<SegTokenPair> newlist = new List<SegTokenPair>();
+                newlist.Add(tokenPair);
+                tokenPairListTable[to] = newlist;
+            }
+            else
+            {
+                IList<SegTokenPair> tokenPairList = tokenPairListTable[to];
+                tokenPairList.Add(tokenPair);
+            }
+        }
+
+        /// <summary>
+        /// Get the number of <see cref="SegTokenPair"/> entries in the table.
+        /// </summary>
+        /// <returns>number of <see cref="SegTokenPair"/> entries</returns>
+        public virtual int ToCount
+        {
+            get { return tokenPairListTable.Count; }
+        }
+
+        /// <summary>
+        /// Find the shortest path with the Viterbi algorithm.
+        /// </summary>
+        /// <returns><see cref="T:IList{SegToken}"/></returns>
+        [ExceptionToNetNumericConvention]
+        public virtual IList<SegToken> GetShortPath()
+        {
+            int current;
+            int nodeCount = ToCount;
+            IList<PathNode> path = new List<PathNode>();
+            PathNode zeroPath = new PathNode();
+            zeroPath.Weight = 0;
+            zeroPath.PreNode = 0;
+            path.Add(zeroPath);
+            for (current = 1; current <= nodeCount; current++)
+            {
+                double weight;
+                IList<SegTokenPair> edges = GetToList(current);
+
+                double minWeight = double.MaxValue;
+                SegTokenPair minEdge = null;
+                foreach (SegTokenPair edge in edges)
+                {
+                    weight = edge.Weight;
+                    PathNode preNode2 = path[edge.From];
+                    if (preNode2.Weight + weight < minWeight)
+                    {
+                        minWeight = preNode2.Weight + weight;
+                        minEdge = edge;
+                    }
+                }
+                PathNode newNode = new PathNode();
+                newNode.Weight = minWeight;
+                newNode.PreNode = minEdge.From;
+                path.Add(newNode);
+            }
+
+            // Calculate PathNodes
+            int preNode, lastNode;
+            lastNode = path.Count - 1;
+            current = lastNode;
+            IList<int> rpath = new List<int>();
+            IList<SegToken> resultPath = new List<SegToken>();
+
+            rpath.Add(current);
+            while (current != 0)
+            {
+                PathNode currentPathNode = path[current];
+                preNode = currentPathNode.PreNode;
+                rpath.Add(preNode);
+                current = preNode;
+            }
+            for (int j = rpath.Count - 1; j >= 0; j--)
+            {
+                //int idInteger = rpath.get(j);
+                //int id = idInteger.intValue();
+                int id = rpath[j];
+                SegToken t = segTokenList[id];
+                resultPath.Add(t);
+            }
+            return resultPath;
+        }
+
+        public override string ToString()
+        {
+            StringBuilder sb = new StringBuilder();
+            ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
+            foreach (IList<SegTokenPair> segList in values)
+            {
+                foreach (SegTokenPair pair in segList)
+                {
+                    sb.Append(pair + "\n");
+                }
+            }
+            return sb.ToString();
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
new file mode 100644
index 0000000..b21925f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
@@ -0,0 +1,432 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Support;
+using Lucene.Net.Support.IO;
+using System;
+using System.IO;
+using System.Reflection;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// SmartChineseAnalyzer Bigram dictionary.
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    internal class BigramDictionary : AbstractDictionary
+    {
+        private BigramDictionary()
+        {
+        }
+
+        public static readonly char WORD_SEGMENT_CHAR = '@';
+
+        private static BigramDictionary singleInstance;
+
+        public static readonly int PRIME_BIGRAM_LENGTH = 402137;
+
+        /// <summary>
+        /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.  
+        /// </summary>
+        private long[] bigramHashTable;
+
+        private int[] frequencyTable;
+
+        private int max = 0;
+
+        private int repeat = 0;
+
+        // static Logger log = Logger.getLogger(BigramDictionary.class);
+
+        private static object syncLock = new object();
+
+        public static BigramDictionary GetInstance()
+        {
+            lock (syncLock)
+            {
+                if (singleInstance == null)
+                {
+                    singleInstance = new BigramDictionary();
+
+                    // LUCENENET specific
+                    // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
+                    // This issue still existed as of 4.8.0. Here is the fix - we only
+                    // load from a directory if the actual directory exists (AnalyzerProfile
+                    // ensures it is an empty string if it is not available).
+                    string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+                    if (string.IsNullOrEmpty(dictRoot))
+                    {
+                        singleInstance.Load();
+                    }
+                    else
+                    {
+                        singleInstance.Load(dictRoot);
+                    }
+
+
+                    //try
+                    //{
+                    //    singleInstance.Load();
+                    //}
+                    //catch (IOException e)
+                    //{
+                    //    string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+                    //    singleInstance.Load(dictRoot);
+                    //}
+                    //catch (TypeLoadException e)
+                    //{
+                    //    throw new Exception(e.ToString(), e);
+                    //}
+                }
+                return singleInstance;
+            }
+        }
+
+        private bool LoadFromObj(FileInfo serialObj)
+        {
+            try
+            {
+                using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
+                    LoadFromInputStream(input);
+                return true;
+            }
+            catch (Exception e)
+            {
+                throw new Exception(e.ToString(), e);
+            }
+        }
+
+        // LUCENENET conversion note:
+        // The data in Lucene is stored in a proprietary binary format (similar to
+        // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the
+        // data was extracted using Java's DataOutputStream using the following Java code.
+        // It can then be read in using the LoadFromInputStream method below 
+        // (using a DataInputStream instead of a BinaryReader), and saved
+        // in the correct (BinaryWriter) format by calling the SaveToObj method.
+        // Alternatively, the data can be loaded from disk using the files
+        // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, 
+        // which will automatically produce the .mem files.
+
+        //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException
+        //{
+        //    // save wordIndexTable
+        //    int wiLen = wordIndexTable.length;
+        //    stream.writeInt(wiLen);
+        //    for (int i = 0; i<wiLen; i++)
+        //    {
+        //        stream.writeShort(wordIndexTable[i]);
+        //    }
+
+        //    // save charIndexTable
+        //    int ciLen = charIndexTable.length;
+        //    stream.writeInt(ciLen);
+        //    for (int i = 0; i<ciLen; i++)
+        //    {
+        //        stream.writeChar(charIndexTable[i]);
+        //    }
+
+        //    int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length;
+        //    stream.writeInt(caDim1);
+        //    for (int i = 0; i<caDim1; i++)
+        //    {
+        //        int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length;
+        //        stream.writeInt(caDim2);
+        //        for (int j = 0; j<caDim2; j++)
+        //        {
+        //            int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].length;
+        //            stream.writeInt(caDim3);
+        //            for (int k = 0; k<caDim3; k++)
+        //            {
+        //                stream.writeChar(wordItem_charArrayTable[i][j][k]);
+        //            }
+        //        }
+        //    }
+
+        //    int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length;
+        //    stream.writeInt(fDim1);
+        //    for (int i = 0; i<fDim1; i++)
+        //    {
+        //        int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length;
+        //        stream.writeInt(fDim2);
+        //        for (int j = 0; j<fDim2; j++)
+        //        {
+        //            stream.writeInt(wordItem_frequencyTable[i][j]);
+        //        }
+        //    }
+        //}
+
+        private void LoadFromInputStream(Stream serialObjectInputStream)
+        {
+            //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
+            //bigramHashTable = (long[])input.readObject();
+            //frequencyTable = (int[])input.readObject();
+            //// log.info("load bigram dict from serialization.");
+            //input.close();
+
+            using (var reader = new BinaryReader(serialObjectInputStream))
+            //using (var reader = new DataInputStream(serialObjectInputStream))
+            {
+                // Read bigramHashTable
+                int bhLen = reader.ReadInt32();
+                bigramHashTable = new long[bhLen];
+                for (int i = 0; i < bhLen; i++)
+                {
+                    bigramHashTable[i] = reader.ReadInt64();
+                }
+
+                // Read frequencyTable
+                int fLen = reader.ReadInt32();
+                frequencyTable = new int[fLen];
+                for (int i = 0; i < fLen; i++)
+                {
+                    frequencyTable[i] = reader.ReadInt32();
+                }
+            }
+
+            // log.info("load bigram dict from serialization.");
+        }
+
+        private void SaveToObj(FileInfo serialObj)
+        {
+            try
+            {
+                //ObjectOutputStream output = new ObjectOutputStream(new FileStream(
+                //    serialObj.FullName, FileMode.Create, FileAccess.Write));
+                //output.writeObject(bigramHashTable);
+                //output.writeObject(frequencyTable);
+                //output.close();
+                
+                using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
+                {
+                    using (BinaryWriter writer = new BinaryWriter(output))
+                    {
+                        int bhLen = bigramHashTable.Length;
+                        writer.Write(bhLen);
+                        for (int i = 0; i < bhLen; i++)
+                        {
+                            writer.Write(bigramHashTable[i]);
+                        }
+
+                        int fLen = frequencyTable.Length;
+                        writer.Write(fLen);
+                        for (int i = 0; i < fLen; i++)
+                        {
+                            writer.Write(frequencyTable[i]);
+                        }
+                    }
+                }
+                // log.info("serialize bigram dict.");
+            }
+#pragma warning disable 168
+            catch (Exception e)
+#pragma warning restore 168
+            {
+                // log.warn(e.getMessage());
+            }
+        }
+
+        private void Load()
+        {
+            using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem"))
+            {
+                LoadFromInputStream(input);
+            }
+        }
+
+        private void Load(string dictRoot)
+        {
+            string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct");
+
+            FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem"));
+
+            if (serialObj.Exists && LoadFromObj(serialObj))
+            {
+
+            }
+            else
+            {
+                try
+                {
+                    bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
+                    frequencyTable = new int[PRIME_BIGRAM_LENGTH];
+                    for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++)
+                    {
+                        // it is possible for a value to hash to 0, but the probability is extremely low
+                        bigramHashTable[i] = 0;
+                        frequencyTable[i] = 0;
+                    }
+                    LoadFromFile(bigramDictPath);
+                }
+                catch (IOException e)
+                {
+                    throw new Exception(e.ToString(), e);
+                }
+                SaveToObj(serialObj);
+            }
+        }
+
+        /// <summary>
+        /// Load the datafile into this <see cref="BigramDictionary"/>
+        /// </summary>
+        /// <param name="dctFilePath">path to the Bigramdictionary (bigramdict.dct)</param>
+        /// <exception cref="IOException">If there is a low-level I/O error</exception>
+        public virtual void LoadFromFile(string dctFilePath)
+        {
+            int i, cnt, length, total = 0;
+            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
+            // The 3756th is used (as a header) to store information.
+            int[] buffer = new int[3];
+            byte[] intBuffer = new byte[4];
+            string tmpword;
+            //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
+            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
+            {
+                // GB2312 characters 0 - 6768
+                for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+                {
+                    string currentStr = GetCCByGB2312Id(i);
+
+                    // Entry count for this character. The dictionary was developed for C,
+                    // so multi-byte values are little-endian and byte order must be converted.
+                    ReadFully(dctFile, intBuffer);
+                    cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32();
+                    if (cnt <= 0)
+                    {
+                        continue;
+                    }
+                    total += cnt;
+                    int j = 0;
+                    while (j < cnt)
+                    {
+                        ReadFully(dctFile, intBuffer);
+                        buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
+                            .GetInt32();// frequency
+                        ReadFully(dctFile, intBuffer);
+                        buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
+                            .GetInt32();// length
+                        ReadFully(dctFile, intBuffer);
+                        // buffer[2] = ByteBuffer.wrap(intBuffer).order(
+                        // ByteOrder.LITTLE_ENDIAN).getInt();// handle (unused, but must be consumed)
+
+                        length = buffer[1];
+                        if (length > 0)
+                        {
+                            byte[] lchBuffer = new byte[length];
+                            ReadFully(dctFile, lchBuffer);
+                            //tmpword = new String(lchBuffer, "GB2312");
+                            tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+                            // Entries in the reserved slot (3755) are stored as full words;
+                            // all other entries are stored as suffixes of the current character.
+                            if (i != 3755 + GB2312_FIRST_CHAR)
+                            {
+                                tmpword = currentStr + tmpword;
+                            }
+                            char[] carray = tmpword.ToCharArray();
+                            long hashId = Hash1(carray);
+                            int index = GetAvaliableIndex(hashId, carray);
+                            if (index != -1)
+                            {
+                                if (bigramHashTable[index] == 0)
+                                {
+                                    bigramHashTable[index] = hashId;
+                                    // bigramStringTable[index] = tmpword;
+                                }
+                                frequencyTable[index] += buffer[0];
+                            }
+                        }
+                        j++;
+                    }
+                }
+            }
+            // log.info("load dictionary done! " + dctFilePath + " total:" + total);
+        }
+
+        /// <summary>
+        /// Fills <paramref name="buffer"/> completely from <paramref name="stream"/>.
+        /// <see cref="Stream.Read(byte[], int, int)"/> may legally return fewer bytes
+        /// than requested, so a single call is not guaranteed to fill the buffer.
+        /// </summary>
+        /// <exception cref="EndOfStreamException">If the stream ends before the buffer is filled.</exception>
+        private static void ReadFully(Stream stream, byte[] buffer)
+        {
+            int offset = 0;
+            while (offset < buffer.Length)
+            {
+                int read = stream.Read(buffer, offset, buffer.Length - offset);
+                if (read <= 0)
+                    throw new EndOfStreamException("Unexpectedly reached end of dictionary file.");
+                offset += read;
+            }
+        }
+
+        /// <summary>
+        /// Finds the slot in the bigram hash table where <paramref name="hashId"/>
+        /// belongs: either the slot already holding it or the first empty slot along
+        /// its double-hashing probe sequence. Returns -1 when the table is exhausted.
+        /// </summary>
+        private int GetAvaliableIndex(long hashId, char[] carray)
+        {
+            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
+            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
+            // C#'s % can yield a negative remainder; shift into [0, PRIME_BIGRAM_LENGTH).
+            if (hash1 < 0)
+                hash1 += PRIME_BIGRAM_LENGTH;
+            if (hash2 < 0)
+                hash2 += PRIME_BIGRAM_LENGTH;
+
+            // Double hashing: start at hash1 and step by hash2 on each collision.
+            int index = hash1;
+            for (int step = 1; step < PRIME_BIGRAM_LENGTH; step++)
+            {
+                long slot = bigramHashTable[index];
+                if (slot == 0 || slot == hashId)
+                    return index;
+                index = (hash1 + step * hash2) % PRIME_BIGRAM_LENGTH;
+            }
+            return -1;
+        }
+
+        /// <summary>
+        /// Looks up the index into the frequency array for the bigram in
+        /// <paramref name="carray"/> using the same double-hashing probe sequence as
+        /// <c>GetAvaliableIndex</c>; returns -1 when the bigram is not in the table.
+        /// </summary>
+        private int GetBigramItemIndex(char[] carray)
+        {
+            long hashId = Hash1(carray);
+            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
+            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
+            // C#'s % can yield a negative remainder; shift into [0, PRIME_BIGRAM_LENGTH).
+            if (hash1 < 0)
+                hash1 = PRIME_BIGRAM_LENGTH + hash1;
+            if (hash2 < 0)
+                hash2 = PRIME_BIGRAM_LENGTH + hash2;
+            int index = hash1;
+            int i = 1;
+            repeat++; // NOTE(review): repeat/max look like probe-count diagnostics (fields declared elsewhere) — confirm.
+            while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+                && i < PRIME_BIGRAM_LENGTH)
+            {
+                // Collision: advance along the double-hashing probe sequence.
+                index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+                i++;
+                repeat++;
+                if (i > max)
+                    max = i;
+            }
+            // System.out.println(i - 1);
+
+            // Only a slot that actually holds hashId is a hit (0 marks an empty slot).
+            if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId)
+            {
+                return index;
+            }
+            else
+                return -1;
+        }
+
+        /// <summary>
+        /// Returns the recorded frequency of the given bigram, or 0 when the
+        /// bigram is not present in the dictionary.
+        /// </summary>
+        public int GetFrequency(char[] carray)
+        {
+            int index = GetBigramItemIndex(carray);
+            return index == -1 ? 0 : frequencyTable[index];
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/HHMMSegmenter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/HHMMSegmenter.cs
new file mode 100644
index 0000000..4940dba
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/HHMMSegmenter.cs
@@ -0,0 +1,253 @@
+// lucene version compatibility level: 4.8.1
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Finds the optimal segmentation of a sentence into Chinese words
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    public class HHMMSegmenter
+    {
+        private static readonly WordDictionary wordDict = WordDictionary.GetInstance();
+
+        /// <summary>
+        /// Create the <see cref="SegGraph"/> for a sentence.
+        /// </summary>
+        /// <param name="sentence">input sentence, without start and end markers</param>
+        /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
+        private SegGraph CreateSegGraph(string sentence)
+        {
+            int i = 0, j;
+            int length = sentence.Length;
+            int foundIndex;
+            CharType[] charTypeArray = GetCharTypes(sentence);
+            StringBuilder wordBuf = new StringBuilder();
+            SegToken token;
+            int frequency = 0; // the number of times word appears.
+            bool hasFullWidth;
+            WordType wordType;
+            char[] charArray;
+
+            SegGraph segGraph = new SegGraph();
+            while (i < length)
+            {
+                hasFullWidth = false;
+                switch (charTypeArray[i])
+                {
+                    case CharType.SPACE_LIKE:
+                        i++;
+                        break;
+                    case CharType.HANZI:
+                        j = i + 1;
+                        //wordBuf.delete(0, wordBuf.length());
+                        wordBuf.Remove(0, wordBuf.Length);
+                        // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
+                        // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will 
+                        // cause word division.
+                        wordBuf.Append(sentence[i]);
+                        charArray = new char[] { sentence[i] };
+                        frequency = wordDict.GetFrequency(charArray);
+                        token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+                            frequency);
+                        segGraph.AddToken(token);
+
+                        foundIndex = wordDict.GetPrefixMatch(charArray);
+                        while (j <= length && foundIndex != -1)
+                        {
+                            if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
+                            {
+                                // It is the phrase we are looking for; In other words, we have found a phrase SegToken
+                                // from i to j.  It is not a monosyllabic word (single word).
+                                frequency = wordDict.GetFrequency(charArray);
+                                token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+                                    frequency);
+                                segGraph.AddToken(token);
+                            }
+
+                            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
+                                j++;
+
+                            if (j < length && charTypeArray[j] == CharType.HANZI)
+                            {
+                                wordBuf.Append(sentence[j]);
+                                charArray = new char[wordBuf.Length];
+                                //wordBuf.GetChars(0, charArray.Length, charArray, 0);
+                                wordBuf.CopyTo(0, charArray, 0, charArray.Length);
+                                // idArray has been found (foundWordIndex!=-1) as a prefix before.  
+                                // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.  
+                                // So start searching after foundWordIndex.
+                                foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
+                                j++;
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                        i++;
+                        break;
+                    case CharType.FULLWIDTH_LETTER:
+                        hasFullWidth = true;
+                        // C# switch sections cannot fall through; goto case restores the
+                        // intended Java-style fallthrough into the LETTER handling below
+                        // (the previous code duplicated the entire LETTER body here).
+                        goto case CharType.LETTER;
+                    case CharType.LETTER:
+                        j = i + 1;
+                        while (j < length
+                            && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
+                        {
+                            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
+                                hasFullWidth = true;
+                            j++;
+                        }
+                        // Found a Token from i to j. Type is LETTER char string.
+                        // NOTE(review): the token text is the shared STRING placeholder, not the
+                        // actual letters — this matches the upstream Lucene implementation.
+                        charArray = Utility.STRING_CHAR_ARRAY;
+                        frequency = wordDict.GetFrequency(charArray);
+                        wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
+                        token = new SegToken(charArray, i, j, wordType, frequency);
+                        segGraph.AddToken(token);
+                        i = j;
+                        break;
+                    case CharType.FULLWIDTH_DIGIT:
+                        hasFullWidth = true;
+                        // Same fallthrough emulation as FULLWIDTH_LETTER above.
+                        goto case CharType.DIGIT;
+                    case CharType.DIGIT:
+                        j = i + 1;
+                        while (j < length
+                            && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
+                        {
+                            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
+                                hasFullWidth = true;
+                            j++;
+                        }
+                        // Found a Token from i to j. Type is NUMBER char string.
+                        charArray = Utility.NUMBER_CHAR_ARRAY;
+                        frequency = wordDict.GetFrequency(charArray);
+                        wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
+                        token = new SegToken(charArray, i, j, wordType, frequency);
+                        segGraph.AddToken(token);
+                        i = j;
+                        break;
+                    case CharType.DELIMITER:
+                        j = i + 1;
+                        // No need to search the weight for the punctuation.  Picking the highest frequency will work.
+                        frequency = Utility.MAX_FREQUENCE;
+                        charArray = new char[] { sentence[i] };
+                        token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
+                        segGraph.AddToken(token);
+                        i = j;
+                        break;
+                    default:
+                        j = i + 1;
+                        // Treat the unrecognized char symbol as unknown string.
+                        // For example, any symbol not in GB2312 is treated as one of these.
+                        charArray = Utility.STRING_CHAR_ARRAY;
+                        frequency = wordDict.GetFrequency(charArray);
+                        token = new SegToken(charArray, i, j, WordType.STRING, frequency);
+                        segGraph.AddToken(token);
+                        i = j;
+                        break;
+                }
+            }
+
+            // Add two more Tokens: "beginning xx beginning"
+            charArray = Utility.START_CHAR_ARRAY;
+            frequency = wordDict.GetFrequency(charArray);
+            token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
+            segGraph.AddToken(token);
+
+            // "end xx end"
+            charArray = Utility.END_CHAR_ARRAY;
+            frequency = wordDict.GetFrequency(charArray);
+            token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
+                frequency);
+            segGraph.AddToken(token);
+
+            return segGraph;
+        }
+
+        /// <summary>
+        /// Get the character types for every character in a sentence.
+        /// </summary>
+        /// <param name="sentence">input sentence</param>
+        /// <returns>array of character types corresponding to character positions in the sentence</returns>
+        /// <seealso cref="Utility.GetCharType(char)"/>
+        private static CharType[] GetCharTypes(string sentence)
+        {
+            int length = sentence.Length;
+            CharType[] charTypeArray = new CharType[length];
+            // the type of each character by position
+            for (int i = 0; i < length; i++)
+            {
+                charTypeArray[i] = Utility.GetCharType(sentence[i]);
+            }
+
+            return charTypeArray;
+        }
+
+        /// <summary>
+        /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
+        /// </summary>
+        /// <param name="sentence">input sentence</param>
+        /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
+        public virtual IList<SegToken> Process(string sentence)
+        {
+            SegGraph segGraph = CreateSegGraph(sentence);
+            BiSegGraph biSegGraph = new BiSegGraph(segGraph);
+            IList<SegToken> shortPath = biSegGraph.GetShortPath();
+            return shortPath;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs
new file mode 100644
index 0000000..7295f3f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs
@@ -0,0 +1,81 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Support;
+using System;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// SmartChineseAnalyzer internal node representation
+    /// <para>
+    /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm.
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    internal class PathNode : IComparable<PathNode>
+    {
+        // Accumulated weight of the path ending at this node.
+        public double Weight { get; set; }
+
+        // Index of the predecessor node on the best path to this node.
+        public int PreNode { get; set; }
+
+        /// <summary>
+        /// Orders nodes by ascending <see cref="Weight"/>.
+        /// </summary>
+        public virtual int CompareTo(PathNode pn)
+        {
+            if (Weight < pn.Weight)
+                return -1;
+            else if (Weight == pn.Weight)
+                return 0;
+            else
+                return 1;
+        }
+
+        /// <summary>
+        /// <see cref="object.GetHashCode()"/>
+        /// </summary>
+        public override int GetHashCode()
+        {
+            int prime = 31;
+            int result = 1;
+            result = prime * result + PreNode;
+            long temp = Number.DoubleToInt64Bits(Weight);
+            // Port of Java's (int)(temp ^ (temp >>> 32)): XOR the high 32 bits of the
+            // double's bit pattern into the low 32 bits. The previous code cast temp to
+            // uint BEFORE shifting, which truncated to the low half and made the
+            // 32-bit shift a no-op, so the high half was never mixed into the hash.
+            result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32));
+            return result;
+        }
+
+        /// <summary>
+        /// <see cref="object.Equals(object)"/>
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (this == obj)
+                return true;
+            if (obj == null)
+                return false;
+            if (GetType() != obj.GetType())
+                return false;
+            PathNode other = (PathNode)obj;
+            if (PreNode != other.PreNode)
+                return false;
+            // Compare raw bit patterns (Java semantics): +0.0 != -0.0, NaN == NaN.
+            if (Number.DoubleToInt64Bits(Weight) != Number
+                .DoubleToInt64Bits(other.Weight))
+                return false;
+            return true;
+        }
+    }
+}


Mime
View raw message