lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [39/50] [abbrv] lucenenet git commit: Lucene.Net.Analysis.SmartCn: Renamed HHMM namespace to Hhmm to follow .NET conventions better
Date Sat, 09 Sep 2017 00:31:57 GMT
Lucene.Net.Analysis.SmartCn: Renamed HHMM namespace to Hhmm to follow .NET conventions better


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/056353d4
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/056353d4
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/056353d4

Branch: refs/heads/master
Commit: 056353d47fbdad6f0379f959be90752ad5081bc4
Parents: 94d0291
Author: Shad Storhaug <shad@shadstorhaug.com>
Authored: Sat Sep 9 03:40:08 2017 +0700
Committer: Shad Storhaug <shad@shadstorhaug.com>
Committed: Sat Sep 9 03:41:22 2017 +0700

----------------------------------------------------------------------
 .../HHMM/AbstractDictionary.cs                  | 225 ------
 .../HHMM/BiSegGraph.cs                          | 257 ------
 .../HHMM/BigramDictionary.cs                    | 432 ----------
 .../HHMM/HHMMSegmenter.cs                       | 253 ------
 .../HHMM/PathNode.cs                            |  81 --
 .../HHMM/SegGraph.cs                            | 161 ----
 .../HHMM/SegToken.cs                            | 124 ---
 .../HHMM/SegTokenFilter.cs                      |  76 --
 .../HHMM/SegTokenPair.cs                        |  96 ---
 .../HHMM/WordDictionary.cs                      | 779 -------------------
 .../HMMChineseTokenizer.cs                      |   2 +-
 .../Hhmm/AbstractDictionary.cs                  | 225 ++++++
 .../Hhmm/BiSegGraph.cs                          | 257 ++++++
 .../Hhmm/BigramDictionary.cs                    | 432 ++++++++++
 .../Hhmm/HHMMSegmenter.cs                       | 253 ++++++
 .../Hhmm/PathNode.cs                            |  81 ++
 .../Hhmm/SegGraph.cs                            | 161 ++++
 .../Hhmm/SegToken.cs                            | 124 +++
 .../Hhmm/SegTokenFilter.cs                      |  76 ++
 .../Hhmm/SegTokenPair.cs                        |  96 +++
 .../Hhmm/WordDictionary.cs                      | 779 +++++++++++++++++++
 .../Lucene.Net.Analysis.SmartCn.csproj          |   4 +-
 .../WordSegmenter.cs                            |   2 +-
 .../WordTokenFilter.cs                          |   2 +-
 .../TestHMMChineseTokenizerFactory.cs           |   2 +-
 25 files changed, 2491 insertions(+), 2489 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
deleted file mode 100644
index 370056a..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
+++ /dev/null
@@ -1,225 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using System;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// <para>
-    /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation.
-    /// </para>
-    /// <para>
-    /// Contains methods for dealing with GB2312 encoding.
-    /// </para>
-    /// @lucene.experimental
-    /// </summary>
-    internal abstract class AbstractDictionary
-    {
-        /// <summary>
-        /// First Chinese Character in GB2312 (15 * 94)
-        /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
-        /// </summary>
-        public static readonly int GB2312_FIRST_CHAR = 1410;
-
-        /// <summary>
-        /// Last Chinese Character in GB2312 (87 * 94). 
-        /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
-        /// </summary>
-        public static readonly int GB2312_CHAR_NUM = 87 * 94;
-
-        /// <summary>
-        /// Dictionary data contains 6768 Chinese characters with frequency statistics.
-        /// </summary>
-        public static readonly int CHAR_NUM_IN_FILE = 6768;
-
-        // =====================================================
-        // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
-        // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
-        // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
-        // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
-        // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
-        // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
-        // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
-        // =====================================================
-        //
-        // GB2312 character set:
-        // 01 94 Symbols
-        // 02 72 Numbers
-        // 03 94 Latin
-        // 04 83 Kana
-        // 05 86 Katakana
-        // 06 48 Greek
-        // 07 66 Cyrillic
-        // 08 63 Phonetic Symbols
-        // 09 76 Drawing Symbols
-        // 10-15 Unassigned
-        // 16-55 3755 Plane 1, in pinyin order
-        // 56-87 3008 Plane 2, in radical/stroke order
-        // 88-94 Unassigned
-        // ======================================================
-
-        /// <summary>
-        /// <para>
-        /// Transcode from GB2312 ID to Unicode
-        /// </para>
-        /// <para>
-        /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
-        /// Some regions are unassigned (reserved).
-        /// </para>
-        /// </summary>
-        /// <param name="ccid">GB2312 id</param>
-        /// <returns>unicode String</returns>
-        public virtual string GetCCByGB2312Id(int ccid)
-        {
-            if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM)
-                return "";
-            int cc1 = ccid / 94 + 161;
-            int cc2 = ccid % 94 + 161;
-            byte[] buffer = new byte[2];
-            buffer[0] = (byte)cc1;
-            buffer[1] = (byte)cc2;
-            try
-            {
-                //String cchar = new String(buffer, "GB2312");
-                string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
-                return cchar;
-            }
-            catch (ArgumentException) // Encoding is not supported by the platform
-            {
-                return "";
-            }
-        }
-
-        /// <summary>
-        /// Transcode from Unicode to GB2312
-        /// </summary>
-        /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param>
-        /// <returns>position in GB2312</returns>
-        public virtual short GetGB2312Id(char ch)
-        {
-            try
-            {
-                //byte[] buffer = Character.ToString(ch).getBytes("GB2312");
-                byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
-                //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
-                if (buffer.Length != 2)
-                {
-                    // Should be a two-byte character
-                    return -1;
-                }
-                int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
-                int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. 
-                                                    // Therefore, each code page only has 16*6-2=94 characters.
-                return (short)(b0 * 94 + b1);
-            }
-            catch (ArgumentException e) // Encoding is not supported by the platform
-            {
-                throw new Exception(e.ToString(), e);
-            }
-        }
-
-        /// <summary>
-        /// 64-bit FNV Hash Function
-        /// </summary>
-        /// <param name="c">input character</param>
-        /// <returns>hashcode</returns>
-        public virtual long Hash1(char c)
-        {
-            long p = 1099511628211L;
-            long hash = unchecked((long)0xcbf29ce484222325L);
-            hash = (hash ^ (c & 0x00FF)) * p;
-            hash = (hash ^ (c >> 8)) * p;
-            hash += hash << 13;
-            hash ^= hash >> 7;
-            hash += hash << 3;
-            hash ^= hash >> 17;
-            hash += hash << 5;
-            return hash;
-        }
-
-        /// <summary>
-        /// 64-bit FNV Hash Function
-        /// </summary>
-        /// <param name="carray">character array</param>
-        /// <returns>hashcode</returns>
-        public virtual long Hash1(char[] carray)
-        {
-            long p = 1099511628211L;
-            long hash = unchecked((long)0xcbf29ce484222325L);
-            for (int i = 0; i < carray.Length; i++)
-            {
-                char d = carray[i];
-                hash = (hash ^ (d & 0x00FF)) * p;
-                hash = (hash ^ (d >> 8)) * p;
-            }
-
-            // hash += hash << 13;
-            // hash ^= hash >> 7;
-            // hash += hash << 3;
-            // hash ^= hash >> 17;
-            // hash += hash << 5;
-            return hash;
-        }
-
-        /// <summary>
-        /// djb2 hash algorithm,this algorithm (k=33) was first reported by dan
-        /// bernstein many years ago in comp.lang.c. another version of this algorithm
-        /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
-        /// the magic of number 33 (why it works better than many other constants,
-        /// prime or not) has never been adequately explained.
-        /// </summary>
-        /// <param name="c">character</param>
-        /// <returns>hashcode</returns>
-        public virtual int Hash2(char c)
-        {
-            int hash = 5381;
-
-            /* hash 33 + c */
-            hash = ((hash << 5) + hash) + c & 0x00FF;
-            hash = ((hash << 5) + hash) + c >> 8;
-
-            return hash;
-        }
-
-        /// <summary>
-        /// djb2 hash algorithm,this algorithm (k=33) was first reported by dan
-        /// bernstein many years ago in comp.lang.c. another version of this algorithm
-        /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
-        /// the magic of number 33 (why it works better than many other constants,
-        /// prime or not) has never been adequately explained.
-        /// </summary>
-        /// <param name="carray">character array</param>
-        /// <returns>hashcode</returns>
-        public virtual int Hash2(char[] carray)
-        {
-            int hash = 5381;
-
-            /* hash 33 + c */
-            for (int i = 0; i < carray.Length; i++)
-            {
-                char d = carray[i];
-                hash = ((hash << 5) + hash) + d & 0x00FF;
-                hash = ((hash << 5) + hash) + d >> 8;
-            }
-
-            return hash;
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
deleted file mode 100644
index c32c8d5..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
+++ /dev/null
@@ -1,257 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-using System;
-using System.Collections.Generic;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
-    /// <para>
-    /// For each start offset, a list of possible token pairs is stored.
-    /// </para>
-    /// @lucene.experimental
-    /// </summary>
-    internal class BiSegGraph
-    {
-        private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>();
-
-        private IList<SegToken> segTokenList;
-
-        private static BigramDictionary bigramDict = BigramDictionary.GetInstance();
-
-        public BiSegGraph(SegGraph segGraph)
-        {
-            segTokenList = segGraph.MakeIndex();
-            GenerateBiSegGraph(segGraph);
-        }
-
-        /// <summary>
-        /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>
-        /// </summary>
-        private void GenerateBiSegGraph(SegGraph segGraph)
-        {
-            double smooth = 0.1;
-            int wordPairFreq = 0;
-            int maxStart = segGraph.MaxStart;
-            double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
-
-            int next;
-            char[] idBuffer;
-            // get the list of tokens ordered and indexed
-            segTokenList = segGraph.MakeIndex();
-            // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
-            int key = -1;
-            IList<SegToken> nextTokens = null;
-            while (key < maxStart)
-            {
-                if (segGraph.IsStartExist(key))
-                {
-
-                    IList<SegToken> tokenList = segGraph.GetStartList(key);
-
-                    // Calculate all tokens for a given key.
-                    foreach (SegToken t1 in tokenList)
-                    {
-                        oneWordFreq = t1.Weight;
-                        next = t1.EndOffset;
-                        nextTokens = null;
-                        // Find the next corresponding Token.
-                        // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
-                        // If we cannot find the next Token, then go to the end and repeat the same cycle.
-                        while (next <= maxStart)
-                        {
-                            // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
-                            if (segGraph.IsStartExist(next))
-                            {
-                                nextTokens = segGraph.GetStartList(next);
-                                break;
-                            }
-                            next++;
-                        }
-                        if (nextTokens == null)
-                        {
-                            break;
-                        }
-                        foreach (SegToken t2 in nextTokens)
-                        {
-                            idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
-                            System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
-                            idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
-                            System.Array.Copy(t2.CharArray, 0, idBuffer,
-                                t1.CharArray.Length + 1, t2.CharArray.Length);
-
-                            // Two linked Words frequency
-                            wordPairFreq = bigramDict.GetFrequency(idBuffer);
-
-                            // Smoothing
-
-                            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
-                            weight = -Math
-                                .Log(smooth
-                                    * (1.0 + oneWordFreq)
-                                    / (Utility.MAX_FREQUENCE + 0.0)
-                                    + (1.0 - smooth)
-                                    * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
-
-                            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
-                                t2.Index, weight);
-                            this.AddSegTokenPair(tokenPair);
-                        }
-                    }
-                }
-                key++;
-            }
-
-        }
-
-        /// <summary>
-        /// Returns <c>true</c> if there is a list of token pairs at this offset (index of the second token)
-        /// </summary>
-        /// <param name="to">index of the second token in the token pair</param>
-        /// <returns><c>true</c> if a token pair exists</returns>
-        public virtual bool IsToExist(int to)
-        {
-            //return tokenPairListTable.get(Integer.valueOf(to)) != null;
-            //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null;
-            IList<SegTokenPair> result;
-            return tokenPairListTable.TryGetValue(to, out result) && result != null;
-        }
-
-        /// <summary>
-        /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token)
-        /// </summary>
-        /// <param name="to">index of the second token in the token pair</param>
-        /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns>
-        public virtual IList<SegTokenPair> GetToList(int to)
-        {
-            IList<SegTokenPair> result;
-            tokenPairListTable.TryGetValue(to, out result);
-            return result;
-        }
-
-        /// <summary>
-        /// Add a <see cref="SegTokenPair"/>
-        /// </summary>
-        /// <param name="tokenPair"><see cref="SegTokenPair"/></param>
-        public virtual void AddSegTokenPair(SegTokenPair tokenPair)
-        {
-            int to = tokenPair.To;
-            if (!IsToExist(to))
-            {
-                List<SegTokenPair> newlist = new List<SegTokenPair>();
-                newlist.Add(tokenPair);
-                tokenPairListTable[to] = newlist;
-            }
-            else
-            {
-                IList<SegTokenPair> tokenPairList = tokenPairListTable[to];
-                tokenPairList.Add(tokenPair);
-            }
-        }
-
-        /// <summary>
-        /// Get the number of <see cref="SegTokenPair"/> entries in the table.
-        /// </summary>
-        /// <returns>number of <see cref="SegTokenPair"/> entries</returns>
-        public virtual int ToCount
-        {
-            get { return tokenPairListTable.Count; }
-        }
-
-        /// <summary>
-        /// Find the shortest path with the Viterbi algorithm.
-        /// </summary>
-        /// <returns><see cref="T:IList{SegToken}"/></returns>
-        [ExceptionToNetNumericConvention]
-        public virtual IList<SegToken> GetShortPath()
-        {
-            int current;
-            int nodeCount = ToCount;
-            IList<PathNode> path = new List<PathNode>();
-            PathNode zeroPath = new PathNode();
-            zeroPath.Weight = 0;
-            zeroPath.PreNode = 0;
-            path.Add(zeroPath);
-            for (current = 1; current <= nodeCount; current++)
-            {
-                double weight;
-                IList<SegTokenPair> edges = GetToList(current);
-
-                double minWeight = double.MaxValue;
-                SegTokenPair minEdge = null;
-                foreach (SegTokenPair edge in edges)
-                {
-                    weight = edge.Weight;
-                    PathNode preNode2 = path[edge.From];
-                    if (preNode2.Weight + weight < minWeight)
-                    {
-                        minWeight = preNode2.Weight + weight;
-                        minEdge = edge;
-                    }
-                }
-                PathNode newNode = new PathNode();
-                newNode.Weight = minWeight;
-                newNode.PreNode = minEdge.From;
-                path.Add(newNode);
-            }
-
-            // Calculate PathNodes
-            int preNode, lastNode;
-            lastNode = path.Count - 1;
-            current = lastNode;
-            IList<int> rpath = new List<int>();
-            IList<SegToken> resultPath = new List<SegToken>();
-
-            rpath.Add(current);
-            while (current != 0)
-            {
-                PathNode currentPathNode = path[current];
-                preNode = currentPathNode.PreNode;
-                rpath.Add(preNode);
-                current = preNode;
-            }
-            for (int j = rpath.Count - 1; j >= 0; j--)
-            {
-                //int idInteger = rpath.get(j);
-                //int id = idInteger.intValue();
-                int id = rpath[j];
-                SegToken t = segTokenList[id];
-                resultPath.Add(t);
-            }
-            return resultPath;
-        }
-
-        public override string ToString()
-        {
-            StringBuilder sb = new StringBuilder();
-            ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
-            foreach (IList<SegTokenPair> segList in values)
-            {
-                foreach (SegTokenPair pair in segList)
-                {
-                    sb.Append(pair + "\n");
-                }
-            }
-            return sb.ToString();
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
deleted file mode 100644
index 72e5f1f..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
+++ /dev/null
@@ -1,432 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-using Lucene.Net.Support.IO;
-using System;
-using System.IO;
-using System.Reflection;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// SmartChineseAnalyzer Bigram dictionary.
-    /// <para/>
-    /// @lucene.experimental
-    /// </summary>
-    internal class BigramDictionary : AbstractDictionary
-    {
-        private BigramDictionary()
-        {
-        }
-
-        public static readonly char WORD_SEGMENT_CHAR = '@';
-
-        private static BigramDictionary singleInstance;
-
-        public static readonly int PRIME_BIGRAM_LENGTH = 402137;
-
-        /// <summary>
-        /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.  
-        /// </summary>
-        private long[] bigramHashTable;
-
-        private int[] frequencyTable;
-
-        private int max = 0;
-
-        private int repeat = 0;
-
-        // static Logger log = Logger.getLogger(BigramDictionary.class);
-
-        private static object syncLock = new object();
-
-        public static BigramDictionary GetInstance()
-        {
-            lock (syncLock)
-            {
-                if (singleInstance == null)
-                {
-                    singleInstance = new BigramDictionary();
-
-                    // LUCENENET specific
-                    // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
-                    // This issue still existed as of 4.8.0. Here is the fix - we only
-                    // load from a directory if the actual directory exists (AnalyzerProfile
-                    // ensures it is an empty string if it is not available).
-                    string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
-                    if (string.IsNullOrEmpty(dictRoot))
-                    {
-                        singleInstance.Load();
-                    }
-                    else
-                    {
-                        singleInstance.Load(dictRoot);
-                    }
-
-
-                    //try
-                    //{
-                    //    singleInstance.Load();
-                    //}
-                    //catch (IOException e)
-                    //{
-                    //    string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
-                    //    singleInstance.Load(dictRoot);
-                    //}
-                    //catch (TypeLoadException e)
-                    //{
-                    //    throw new Exception(e.ToString(), e);
-                    //}
-                }
-                return singleInstance;
-            }
-        }
-
-        private bool LoadFromObj(FileInfo serialObj)
-        {
-            try
-            {
-                using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
-                    LoadFromInputStream(input);
-                return true;
-            }
-            catch (Exception e)
-            {
-                throw new Exception(e.ToString(), e);
-            }
-        }
-
-        // LUCENENET conversion note:
-        // The data in Lucene is stored in a proprietary binary format (similar to
-        // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the
-        // data was extracted using Java's DataOutputStream using the following Java code.
-        // It can then be read in using the LoadFromInputStream method below 
-        // (using a DataInputStream instead of a BinaryReader), and saved
-        // in the correct (BinaryWriter) format by calling the SaveToObj method.
-        // Alternatively, the data can be loaded from disk using the files
-        // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, 
-        // which will automatically produce the .mem files.
-
-        //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException
-        //{
-        //    // save wordIndexTable
-        //    int wiLen = wordIndexTable.length;
-        //    stream.writeInt(wiLen);
-        //    for (int i = 0; i<wiLen; i++)
-        //    {
-        //        stream.writeShort(wordIndexTable[i]);
-        //    }
-
-        //    // save charIndexTable
-        //    int ciLen = charIndexTable.length;
-        //    stream.writeInt(ciLen);
-        //    for (int i = 0; i<ciLen; i++)
-        //    {
-        //        stream.writeChar(charIndexTable[i]);
-        //    }
-
-        //    int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length;
-        //    stream.writeInt(caDim1);
-        //    for (int i = 0; i<caDim1; i++)
-        //    {
-        //        int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length;
-        //        stream.writeInt(caDim2);
-        //        for (int j = 0; j<caDim2; j++)
-        //        {
-        //            int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].length;
-        //            stream.writeInt(caDim3);
-        //            for (int k = 0; k<caDim3; k++)
-        //            {
-        //                stream.writeChar(wordItem_charArrayTable[i][j][k]);
-        //            }
-        //        }
-        //    }
-
-        //    int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length;
-        //    stream.writeInt(fDim1);
-        //    for (int i = 0; i<fDim1; i++)
-        //    {
-        //        int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length;
-        //        stream.writeInt(fDim2);
-        //        for (int j = 0; j<fDim2; j++)
-        //        {
-        //            stream.writeInt(wordItem_frequencyTable[i][j]);
-        //        }
-        //    }
-        //}
-
-        private void LoadFromInputStream(Stream serialObjectInputStream)
-        {
-            //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
-            //bigramHashTable = (long[])input.readObject();
-            //frequencyTable = (int[])input.readObject();
-            //// log.info("load bigram dict from serialization.");
-            //input.close();
-
-            using (var reader = new BinaryReader(serialObjectInputStream))
-            //using (var reader = new DataInputStream(serialObjectInputStream))
-            {
-                // Read bigramHashTable
-                int bhLen = reader.ReadInt32();
-                bigramHashTable = new long[bhLen];
-                for (int i = 0; i < bhLen; i++)
-                {
-                    bigramHashTable[i] = reader.ReadInt64();
-                }
-
-                // Read frequencyTable
-                int fLen = reader.ReadInt32();
-                frequencyTable = new int[fLen];
-                for (int i = 0; i < fLen; i++)
-                {
-                    frequencyTable[i] = reader.ReadInt32();
-                }
-            }
-
-            // log.info("load bigram dict from serialization.");
-        }
-
-        /// <summary>
-        /// Writes <c>bigramHashTable</c> and <c>frequencyTable</c> to the given file, each as an
-        /// Int32 element count followed by the elements (the layout read back by
-        /// <c>LoadFromInputStream</c>).
-        /// <para/>
-        /// This is a best-effort operation: any exception raised while writing is swallowed.
-        /// </summary>
-        /// <param name="serialObj">file to create (an existing file is overwritten)</param>
-        private void SaveToObj(FileInfo serialObj)
-        {
-            try
-            {
-                using (FileStream file = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
-                using (BinaryWriter output = new BinaryWriter(file))
-                {
-                    output.Write(bigramHashTable.Length);
-                    foreach (long hash in bigramHashTable)
-                    {
-                        output.Write(hash);
-                    }
-
-                    output.Write(frequencyTable.Length);
-                    foreach (int frequency in frequencyTable)
-                    {
-                        output.Write(frequency);
-                    }
-                }
-            }
-            catch (Exception)
-            {
-                // Intentionally ignored: failing to write the serialized copy is not reported to the caller.
-            }
-        }
-
-        /// <summary>
-        /// Loads the tables from the "bigramdict.mem" manifest resource that is looked up
-        /// relative to this instance's runtime type.
-        /// </summary>
-        private void Load()
-        {
-            var ownType = this.GetType();
-            Stream resource = ownType.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(ownType, "bigramdict.mem");
-            using (resource)
-            {
-                LoadFromInputStream(resource);
-            }
-        }
-
-        /// <summary>
-        /// Loads the tables from <paramref name="dictRoot"/>. The serialized "bigramdict.mem" file is
-        /// tried first; when it is missing or cannot be loaded, the tables are rebuilt from
-        /// "bigramdict.dct" and then saved back as "bigramdict.mem".
-        /// </summary>
-        /// <param name="dictRoot">directory containing the dictionary files</param>
-        private void Load(string dictRoot)
-        {
-            string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct");
-            FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem"));
-
-            // Fast path: the serialized tables exist and were read successfully.
-            if (serialObj.Exists && LoadFromObj(serialObj))
-            {
-                return;
-            }
-
-            try
-            {
-                // Freshly allocated arrays are already zero-filled, so every slot starts out as 0.
-                // It is possible for a value to hash to 0, but the probability is extremely low.
-                bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
-                frequencyTable = new int[PRIME_BIGRAM_LENGTH];
-                LoadFromFile(bigramDictPath);
-            }
-            catch (IOException e)
-            {
-                throw new Exception(e.ToString(), e);
-            }
-
-            SaveToObj(serialObj);
-        }
-
-        /// <summary>
-        /// Load the datafile into this <see cref="BigramDictionary"/>
-        /// </summary>
-        /// <remarks>
-        /// For every GB2312 character id the file stores a little-endian Int32 record count followed by
-        /// that many records. Each record is three little-endian Int32 values (frequency, byte length,
-        /// handle) followed by <c>length</c> bytes of GB2312 encoded text. The handle is read and discarded.
-        /// </remarks>
-        /// <param name="dctFilePath">path to the Bigramdictionary (bigramdict.dct)</param>
-        /// <exception cref="IOException">If there is a low-level I/O error</exception>
-        public virtual void LoadFromFile(string dctFilePath)
-        {
-            // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
-            // The 3756th is used (as a header) to store information.
-            byte[] intBuffer = new byte[4];
-            // Looked up once, on the first non-empty word, instead of once per word.
-            Encoding gb2312 = null;
-
-            using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
-            {
-                // GB2312 characters 0 - 6768
-                for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
-                {
-                    string currentStr = GetCCByGB2312Id(i);
-
-                    // the dictionary was developed for C, so all integers are stored little-endian
-                    int cnt = ReadLittleEndianInt32(dctFile, intBuffer);
-                    if (cnt <= 0)
-                    {
-                        continue;
-                    }
-
-                    for (int j = 0; j < cnt; j++)
-                    {
-                        int frequency = ReadLittleEndianInt32(dctFile, intBuffer);
-                        int length = ReadLittleEndianInt32(dctFile, intBuffer);
-                        // The third field (handle) is unused, but must still be consumed.
-                        FillFromStream(dctFile, intBuffer);
-
-                        if (length <= 0)
-                        {
-                            continue;
-                        }
-
-                        byte[] lchBuffer = new byte[length];
-                        FillFromStream(dctFile, lchBuffer);
-                        if (gb2312 == null)
-                        {
-                            gb2312 = Encoding.GetEncoding("GB2312");
-                        }
-                        string tmpword = gb2312.GetString(lchBuffer);
-                        // Every slot except the header slot stores only the remainder of the word,
-                        // so the slot's own character is prepended.
-                        if (i != 3755 + GB2312_FIRST_CHAR)
-                        {
-                            tmpword = currentStr + tmpword;
-                        }
-
-                        char[] carray = tmpword.ToCharArray();
-                        long hashId = Hash1(carray);
-                        int index = GetAvaliableIndex(hashId, carray);
-                        if (index != -1)
-                        {
-                            if (bigramHashTable[index] == 0)
-                            {
-                                bigramHashTable[index] = hashId;
-                            }
-                            frequencyTable[index] += frequency;
-                        }
-                    }
-                }
-            }
-        }
-
-        /// <summary>
-        /// Reads from <paramref name="source"/> until <paramref name="destination"/> is full or the
-        /// stream is exhausted. A single <see cref="Stream.Read(byte[], int, int)"/> call is allowed to
-        /// return fewer bytes than requested, so the call is repeated.
-        /// </summary>
-        /// <returns>The number of bytes actually placed in <paramref name="destination"/>. When the
-        /// stream ends early, the remaining bytes of the buffer keep their previous contents.</returns>
-        private static int FillFromStream(Stream source, byte[] destination)
-        {
-            int filled = 0;
-            while (filled < destination.Length)
-            {
-                int read = source.Read(destination, filled, destination.Length - filled);
-                if (read <= 0)
-                {
-                    break; // end of stream
-                }
-                filled += read;
-            }
-            return filled;
-        }
-
-        /// <summary>
-        /// Reads <c>scratch.Length</c> bytes (callers pass a 4 byte buffer) and decodes them as a
-        /// little-endian Int32.
-        /// </summary>
-        private static int ReadLittleEndianInt32(Stream source, byte[] scratch)
-        {
-            FillFromStream(source, scratch);
-            return ByteBuffer.Wrap(scratch).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32();
-        }
-
-        /// <summary>
-        /// Finds the slot of <c>bigramHashTable</c> in which <paramref name="hashId"/> is, or can be, stored.
-        /// Probing starts at <c>hash1</c> and advances in steps of <c>hash2</c> (double hashing).
-        /// </summary>
-        /// <param name="hashId">primary hash of the word (see <c>Hash1</c>)</param>
-        /// <param name="carray">characters of the word, used to derive the probe step (see <c>Hash2</c>)</param>
-        /// <returns>index of an empty slot (value 0) or of the slot already holding
-        /// <paramref name="hashId"/>; -1 when no such slot was found</returns>
-        private int GetAvaliableIndex(long hashId, char[] carray)
-        {
-            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
-            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
-            // The remainder operator keeps the sign of the dividend; shift negatives into range.
-            if (hash1 < 0)
-            {
-                hash1 = PRIME_BIGRAM_LENGTH + hash1;
-            }
-            if (hash2 < 0)
-            {
-                hash2 = PRIME_BIGRAM_LENGTH + hash2;
-            }
-
-            int index = hash1;
-            int i = 1;
-            while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
-                && i < PRIME_BIGRAM_LENGTH)
-            {
-                // Computed in 64-bit arithmetic: with 32-bit operands i * hash2 can wrap around
-                // on a long probe chain and yield a negative (out of range) index.
-                index = (int)((hash1 + (long)i * hash2) % PRIME_BIGRAM_LENGTH);
-                i++;
-            }
-
-            if (i < PRIME_BIGRAM_LENGTH
-                && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId))
-            {
-                return index;
-            }
-            return -1;
-        }
-
-        /// <summary>
-        /// lookup the index into the frequency array.
-        /// <para/>
-        /// Uses the same probe sequence as <c>GetAvaliableIndex</c>, but only a slot that already holds
-        /// the word's hash counts as a hit. As a side effect the <c>repeat</c> and <c>max</c> probe
-        /// statistics are updated.
-        /// </summary>
-        /// <param name="carray">characters of the word to look up</param>
-        /// <returns>index of the word's slot, or -1 when the word is not in the table</returns>
-        private int GetBigramItemIndex(char[] carray)
-        {
-            long hashId = Hash1(carray);
-            int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
-            int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
-            // The remainder operator keeps the sign of the dividend; shift negatives into range.
-            if (hash1 < 0)
-            {
-                hash1 = PRIME_BIGRAM_LENGTH + hash1;
-            }
-            if (hash2 < 0)
-            {
-                hash2 = PRIME_BIGRAM_LENGTH + hash2;
-            }
-
-            int index = hash1;
-            int i = 1;
-            repeat++;
-            while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
-                && i < PRIME_BIGRAM_LENGTH)
-            {
-                // Computed in 64-bit arithmetic: with 32-bit operands i * hash2 can wrap around
-                // on a long probe chain and yield a negative (out of range) index.
-                index = (int)((hash1 + (long)i * hash2) % PRIME_BIGRAM_LENGTH);
-                i++;
-                repeat++;
-                if (i > max)
-                {
-                    max = i;
-                }
-            }
-
-            if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId)
-            {
-                return index;
-            }
-            return -1;
-        }
-
-        /// <summary>
-        /// Returns the frequency stored for the given word.
-        /// </summary>
-        /// <param name="carray">characters of the word to look up</param>
-        /// <returns>the stored frequency, or 0 when the word is not in the table</returns>
-        public int GetFrequency(char[] carray)
-        {
-            int slot = GetBigramItemIndex(carray);
-            return slot == -1 ? 0 : frequencyTable[slot];
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
deleted file mode 100644
index e2ef365..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
+++ /dev/null
@@ -1,253 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using System.Collections.Generic;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Finds the optimal segmentation of a sentence into Chinese words
-    /// <para/>
-    /// @lucene.experimental
-    /// </summary>
-    public class HHMMSegmenter
-    {
-        private static readonly WordDictionary wordDict = WordDictionary.GetInstance();
-
-        /// <summary>
-        /// Create the <see cref="SegGraph"/> for a sentence.
-        /// </summary>
-        /// <param name="sentence">input sentence, without start and end markers</param>
-        /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
-        private SegGraph CreateSegGraph(string sentence)
-        {
-            int i = 0, j;
-            int length = sentence.Length;
-            int foundIndex;
-            CharType[] charTypeArray = GetCharTypes(sentence);
-            StringBuilder wordBuf = new StringBuilder();
-            SegToken token;
-            int frequency = 0; // the number of times word appears.
-            bool hasFullWidth;
-            WordType wordType;
-            char[] charArray;
-
-            SegGraph segGraph = new SegGraph();
-            while (i < length)
-            {
-                switch (charTypeArray[i])
-                {
-                    case CharType.SPACE_LIKE:
-                        i++;
-                        break;
-
-                    case CharType.HANZI:
-                        j = i + 1;
-                        wordBuf.Remove(0, wordBuf.Length);
-                        // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
-                        // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
-                        // cause word division.
-                        wordBuf.Append(sentence[i]);
-                        charArray = new char[] { sentence[i] };
-                        frequency = wordDict.GetFrequency(charArray);
-                        token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
-                            frequency);
-                        segGraph.AddToken(token);
-
-                        foundIndex = wordDict.GetPrefixMatch(charArray);
-                        while (j <= length && foundIndex != -1)
-                        {
-                            if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
-                            {
-                                // It is the phrase we are looking for; In other words, we have found a phrase SegToken
-                                // from i to j.  It is not a monosyllabic word (single word).
-                                frequency = wordDict.GetFrequency(charArray);
-                                token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
-                                    frequency);
-                                segGraph.AddToken(token);
-                            }
-
-                            // Space-like characters inside a candidate phrase are skipped.
-                            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
-                            {
-                                j++;
-                            }
-
-                            if (j < length && charTypeArray[j] == CharType.HANZI)
-                            {
-                                wordBuf.Append(sentence[j]);
-                                charArray = new char[wordBuf.Length];
-                                wordBuf.CopyTo(0, charArray, 0, charArray.Length);
-                                // idArray has been found (foundWordIndex!=-1) as a prefix before.
-                                // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.
-                                // So start searching after foundWordIndex.
-                                foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
-                                j++;
-                            }
-                            else
-                            {
-                                break;
-                            }
-                        }
-                        i++;
-                        break;
-
-                    case CharType.FULLWIDTH_LETTER:
-                    case CharType.LETTER:
-                        j = FindRunEnd(charTypeArray, i, CharType.LETTER, CharType.FULLWIDTH_LETTER, out hasFullWidth);
-                        // Found a Token from i to j. Type is LETTER char string.
-                        charArray = Utility.STRING_CHAR_ARRAY;
-                        frequency = wordDict.GetFrequency(charArray);
-                        wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
-                        token = new SegToken(charArray, i, j, wordType, frequency);
-                        segGraph.AddToken(token);
-                        i = j;
-                        break;
-
-                    case CharType.FULLWIDTH_DIGIT:
-                    case CharType.DIGIT:
-                        j = FindRunEnd(charTypeArray, i, CharType.DIGIT, CharType.FULLWIDTH_DIGIT, out hasFullWidth);
-                        // Found a Token from i to j. Type is NUMBER char string.
-                        charArray = Utility.NUMBER_CHAR_ARRAY;
-                        frequency = wordDict.GetFrequency(charArray);
-                        wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
-                        token = new SegToken(charArray, i, j, wordType, frequency);
-                        segGraph.AddToken(token);
-                        i = j;
-                        break;
-
-                    case CharType.DELIMITER:
-                        j = i + 1;
-                        // No need to search the weight for the punctuation.  Picking the highest frequency will work.
-                        frequency = Utility.MAX_FREQUENCE;
-                        charArray = new char[] { sentence[i] };
-                        token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
-                        segGraph.AddToken(token);
-                        i = j;
-                        break;
-
-                    default:
-                        j = i + 1;
-                        // Treat the unrecognized char symbol as unknown string.
-                        // For example, any symbol not in GB2312 is treated as one of these.
-                        charArray = Utility.STRING_CHAR_ARRAY;
-                        frequency = wordDict.GetFrequency(charArray);
-                        token = new SegToken(charArray, i, j, WordType.STRING, frequency);
-                        segGraph.AddToken(token);
-                        i = j;
-                        break;
-                }
-            }
-
-            // Add two more Tokens: "beginning xx beginning"
-            charArray = Utility.START_CHAR_ARRAY;
-            frequency = wordDict.GetFrequency(charArray);
-            token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
-            segGraph.AddToken(token);
-
-            // "end xx end"
-            charArray = Utility.END_CHAR_ARRAY;
-            frequency = wordDict.GetFrequency(charArray);
-            token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
-                frequency);
-            segGraph.AddToken(token);
-
-            return segGraph;
-        }
-
-        /// <summary>
-        /// Finds the end of a run of characters that are either <paramref name="halfWidth"/> or
-        /// <paramref name="fullWidth"/>, beginning at <paramref name="start"/>. The character at
-        /// <paramref name="start"/> is taken to belong to the run without being re-checked.
-        /// </summary>
-        /// <param name="charTypes">character types of the sentence, by position</param>
-        /// <param name="start">index of the first character of the run</param>
-        /// <param name="halfWidth">half-width character type accepted in the run</param>
-        /// <param name="fullWidth">full-width character type accepted in the run</param>
-        /// <param name="hasFullWidth"><c>true</c> if any character of the run, including the first,
-        /// is of type <paramref name="fullWidth"/></param>
-        /// <returns>index one past the last character of the run</returns>
-        private static int FindRunEnd(CharType[] charTypes, int start, CharType halfWidth, CharType fullWidth, out bool hasFullWidth)
-        {
-            hasFullWidth = charTypes[start] == fullWidth;
-            int end = start + 1;
-            while (end < charTypes.Length
-                && (charTypes[end] == halfWidth || charTypes[end] == fullWidth))
-            {
-                if (charTypes[end] == fullWidth)
-                {
-                    hasFullWidth = true;
-                }
-                end++;
-            }
-            return end;
-        }
-
-        /// <summary>
-        /// Get the character types for every character in a sentence.
-        /// </summary>
-        /// <param name="sentence">input sentence</param>
-        /// <returns>array of character types corresponding to character positions in the sentence</returns>
-        /// <seealso cref="Utility.GetCharType(char)"/>
-        private static CharType[] GetCharTypes(string sentence)
-        {
-            int length = sentence.Length;
-            CharType[] charTypeArray = new CharType[length];
-            // the type of each character by position
-            for (int i = 0; i < length; i++)
-            {
-                charTypeArray[i] = Utility.GetCharType(sentence[i]);
-            }
-
-            return charTypeArray;
-        }
-
-        /// <summary>
-        /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
-        /// </summary>
-        /// <param name="sentence">input sentence</param>
-        /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
-        public virtual IList<SegToken> Process(string sentence)
-        {
-            SegGraph segGraph = CreateSegGraph(sentence);
-            BiSegGraph biSegGraph = new BiSegGraph(segGraph);
-            return biSegGraph.GetShortPath();
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
deleted file mode 100644
index b8de5fb..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
+++ /dev/null
@@ -1,81 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-using System;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// SmartChineseAnalyzer internal node representation
-    /// <para>
-    /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm.
-    /// </para>
-    /// @lucene.experimental
-    /// </summary>
-    internal class PathNode : IComparable<PathNode>
-    {
-        /// <summary>
-        /// Weight of this node; the only value used by <see cref="CompareTo(PathNode)"/>.
-        /// </summary>
-        public double Weight { get; set; }
-
-        /// <summary>
-        /// Integer reference to the preceding node.
-        /// </summary>
-        public int PreNode { get; set; }
-
-        /// <summary>
-        /// Orders nodes by ascending <see cref="Weight"/>.
-        /// </summary>
-        /// <returns>-1 if this weight is smaller, 0 if both weights are equal, otherwise 1
-        /// (this includes the case where either weight is NaN)</returns>
-        public virtual int CompareTo(PathNode pn)
-        {
-            if (Weight < pn.Weight)
-                return -1;
-            else if (Weight == pn.Weight)
-                return 0;
-            else
-                return 1;
-        }
-
-        /// <summary>
-        /// <see cref="object.GetHashCode()"/>
-        /// </summary>
-        public override int GetHashCode()
-        {
-            const int prime = 31;
-            unchecked // wrap-around is intended when mixing hash values
-            {
-                int result = 1;
-                result = prime * result + PreNode;
-                long temp = Number.DoubleToInt64Bits(Weight);
-                // Fold the upper 32 bits into the lower 32 bits. The shift has to be done on a
-                // 64-bit unsigned value: shifting a 32-bit operand by 32 is a shift by 0, which made
-                // the Weight contribution cancel out to 0.
-                result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32));
-                return result;
-            }
-        }
-
-        /// <summary>
-        /// <see cref="object.Equals(object)"/>
-        /// <para/>
-        /// Two nodes are equal when they have the same runtime type, the same <see cref="PreNode"/>
-        /// and bit-identical <see cref="Weight"/> values.
-        /// </summary>
-        public override bool Equals(object obj)
-        {
-            if (ReferenceEquals(this, obj))
-                return true;
-            if (obj == null)
-                return false;
-            if (GetType() != obj.GetType())
-                return false;
-            PathNode other = (PathNode)obj;
-            if (PreNode != other.PreNode)
-                return false;
-            return Number.DoubleToInt64Bits(Weight) == Number.DoubleToInt64Bits(other.Weight);
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
deleted file mode 100644
index f3643eb..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
+++ /dev/null
@@ -1,161 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using System.Collections.Generic;
-using System.Text;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Graph representing possible tokens at each start offset in the sentence.
-    /// <para>
-    /// For each start offset, a list of possible tokens is stored.
-    /// </para>
-    /// @lucene.experimental
-    /// </summary>
-    internal class SegGraph
-    {
-        /// <summary>
-        /// Map of start offsets to <see cref="T:IList{SegToken}"/> of tokens at that position
-        /// </summary>
-        private readonly IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>();
-
-        private int maxStart = -1;
-
-        /// <summary>
-        /// Returns <c>true</c> if a mapping for the specified start offset exists
-        /// </summary>
-        /// <param name="s">startOffset</param>
-        /// <returns><c>true</c> if there are tokens for the startOffset</returns>
-        public virtual bool IsStartExist(int s)
-        {
-            IList<SegToken> result;
-            return tokenListTable.TryGetValue(s, out result) && result != null;
-        }
-
-        /// <summary>
-        ///  Get the list of tokens at the specified start offset
-        /// </summary>
-        /// <param name="s">startOffset</param>
-        /// <returns><see cref="T:IList{SegToken}"/> of tokens at the specified start offset,
-        /// or <c>null</c> if there is none.</returns>
-        public virtual IList<SegToken> GetStartList(int s)
-        {
-            IList<SegToken> result;
-            tokenListTable.TryGetValue(s, out result);
-            return result;
-        }
-
-        /// <summary>
-        /// Get the highest start offset in the map. Returns maximum start offset, or -1 if the map is empty.
-        /// </summary>
-        public virtual int MaxStart
-        {
-            get { return maxStart; }
-        }
-
-        /// <summary>
-        /// Set the <see cref="SegToken.Index"/> for each token, based upon its order by startOffset. 
-        /// </summary>
-        /// <returns>a <see cref="T:IList{SegToken}"/> of these ordered tokens.</returns>
-        public virtual IList<SegToken> MakeIndex()
-        {
-            IList<SegToken> result = CollectTokensByStartOffset();
-            for (int index = 0; index < result.Count; index++)
-            {
-                result[index].Index = index;
-            }
-            return result;
-        }
-
-        /// <summary>
-        /// Add a <see cref="SegToken"/> to the mapping, creating a new mapping at the token's startOffset if one does not exist. 
-        /// </summary>
-        /// <param name="token">token <see cref="SegToken"/>.</param>
-        public virtual void AddToken(SegToken token)
-        {
-            int s = token.StartOffset;
-            IList<SegToken> tokenList;
-            // Single lookup instead of an existence check followed by an indexer access.
-            if (!tokenListTable.TryGetValue(s, out tokenList) || tokenList == null)
-            {
-                tokenList = new List<SegToken>();
-                tokenListTable[s] = tokenList;
-            }
-            tokenList.Add(token);
-
-            if (s > maxStart)
-            {
-                maxStart = s;
-            }
-        }
-
-        /// <summary>
-        /// Return a <see cref="T:IList{SegToken}"/> of all tokens in the map, ordered by startOffset.
-        /// </summary>
-        /// <returns><see cref="T:IList{SegToken}"/> of all tokens in the map.</returns>
-        public virtual IList<SegToken> ToTokenList()
-        {
-            return CollectTokensByStartOffset();
-        }
-
-        /// <summary>
-        /// Returns one line per token, in start offset order, each terminated by "\n".
-        /// </summary>
-        public override string ToString()
-        {
-            StringBuilder sb = new StringBuilder();
-            foreach (SegToken t in this.ToTokenList())
-            {
-                sb.Append(t).Append('\n');
-            }
-            return sb.ToString();
-        }
-
-        /// <summary>
-        /// Builds a new list holding every token of the map, ordered by ascending start offset;
-        /// tokens sharing a start offset keep the order in which they were added.
-        /// </summary>
-        private IList<SegToken> CollectTokensByStartOffset()
-        {
-            // Visit only the offsets that are present, in sorted order, rather than probing
-            // every integer from -1 upwards until all entries have been seen.
-            List<int> startOffsets = new List<int>(tokenListTable.Keys);
-            startOffsets.Sort();
-
-            List<SegToken> result = new List<SegToken>();
-            foreach (int start in startOffsets)
-            {
-                IList<SegToken> tokenList = tokenListTable[start];
-                if (tokenList != null)
-                {
-                    result.AddRange(tokenList);
-                }
-            }
-            return result;
-        }
-    }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
deleted file mode 100644
index f557cbe..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
+++ /dev/null
@@ -1,124 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// SmartChineseAnalyzer internal token
-    /// <para/>
-    /// @lucene.experimental
-    /// </summary>
-    public class SegToken
-    {
-        /// <summary>
-        /// Character array containing token text
-        /// </summary>
-        [WritableArray]
-        public char[] CharArray { get; set; }
-
-        /// <summary>
-        /// start offset into original sentence
-        /// </summary>
-        public int StartOffset { get; set; }
-
-        /// <summary>
-        /// end offset into original sentence
-        /// </summary>
-        public int EndOffset { get; set; }
-
-        /// <summary>
-        /// <see cref="Smart.WordType"/> of the text
-        /// </summary>
-        public WordType WordType { get; set; }
-
-        /// <summary>
-        /// word frequency
-        /// </summary>
-        public int Weight { get; set; }
-
-        /// <summary>
-        /// during segmentation, this is used to store the index of the token in the token list table
-        /// </summary>
-        public int Index { get; set; }
-
-        /// <summary>
-        /// Create a new <see cref="SegToken"/> from a character array.
-        /// </summary>
-        /// <param name="idArray">character array containing text</param>
-        /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param>
-        /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param>
-        /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param>
-        /// <param name="weight">word frequency</param>
-        public SegToken(char[] idArray, int start, int end, WordType wordType, int weight)
-        {
-            this.CharArray = idArray;
-            this.StartOffset = start;
-            this.EndOffset = end;
-            this.WordType = wordType;
-            this.Weight = weight;
-        }
-
-        /// <summary>
-        /// <see cref="object.GetHashCode()"/>
-        /// </summary>
-        public override int GetHashCode()
-        {
-            int prime = 31;
-            int result = 1;
-            for (int i = 0; i < CharArray.Length; i++)
-            {
-                result = prime * result + CharArray[i];
-            }
-            result = prime * result + EndOffset;
-            result = prime * result + Index;
-            result = prime * result + StartOffset;
-            result = prime * result + Weight;
-            result = prime * result + (int)WordType;
-            return result;
-        }
-
-        /// <summary>
-        /// <see cref="object.Equals(object)"/>
-        /// </summary>
-        public override bool Equals(object obj)
-        {
-            if (this == obj)
-                return true;
-            if (obj == null)
-                return false;
-            if (GetType() != obj.GetType())
-                return false;
-            SegToken other = (SegToken)obj;
-            if (!Arrays.Equals(CharArray, other.CharArray))
-                return false;
-            if (EndOffset != other.EndOffset)
-                return false;
-            if (Index != other.Index)
-                return false;
-            if (StartOffset != other.StartOffset)
-                return false;
-            if (Weight != other.Weight)
-                return false;
-            if (WordType != other.WordType)
-                return false;
-            return true;
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs
deleted file mode 100644
index 5b61cff..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs
+++ /dev/null
@@ -1,76 +0,0 @@
-// lucene version compatibility level: 4.8.1
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// <para>
-    /// Filters a <see cref="SegToken"/> by converting full-width latin to half-width, then lowercasing latin.
-    /// Additionally, all punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/>
-    /// </para>
-    /// @lucene.experimental
-    /// </summary>
-    public class SegTokenFilter
-    {
-        /// <summary>
-        /// Filter an input <see cref="SegToken"/>
-        /// <para>
-        /// Full-width latin will be converted to half-width, then all latin will be lowercased.
-        /// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/>
-        /// </para>
-        /// </summary>
-        /// <param name="token">Input <see cref="SegToken"/>.</param>
-        /// <returns>Normalized <see cref="SegToken"/>.</returns>
-        public virtual SegToken Filter(SegToken token)
-        {
-            switch (token.WordType)
-            {
-                case WordType.FULLWIDTH_NUMBER:
-                case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */
-                    for (int i = 0; i < token.CharArray.Length; i++)
-                    {
-                        if (token.CharArray[i] >= 0xFF10)
-                        {
-                            token.CharArray[i] = (char)(token.CharArray[i] - 0xFEE0);
-                        }
-
-                        if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */
-                        {
-                            token.CharArray[i] = (char)(token.CharArray[i] + 0x0020);
-                        }
-                    }
-                    break;
-                case WordType.STRING:
-                    for (int i = 0; i < token.CharArray.Length; i++)
-                    {
-                        if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */
-                        {
-                            token.CharArray[i] = (char)(token.CharArray[i] + 0x0020);
-                        }
-                    }
-                    break;
-                case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
-                    token.CharArray = Utility.COMMON_DELIMITER;
-                    break;
-                default:
-                    break;
-            }
-            return token;
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs
deleted file mode 100644
index b5ceecd..0000000
--- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs
+++ /dev/null
@@ -1,96 +0,0 @@
-// lucene version compatibility level: 4.8.1
-using Lucene.Net.Support;
-
-namespace Lucene.Net.Analysis.Cn.Smart.HHMM
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// A pair of tokens in <see cref="SegGraph"/>
-    /// <para/>
-    /// @lucene.experimental
-    /// </summary>
-    internal class SegTokenPair
-    {
-        [WritableArray]
-        public char[] CharArray { get; set; }
-
-        /// <summary>
-        /// index of the first token in <see cref="SegGraph"/>
-        /// </summary>
-        public int From { get; set; }
-
-        /// <summary>
-        /// index of the second token in <see cref="SegGraph"/>
-        /// </summary>
-        public int To { get; set; }
-
-        public double Weight { get; set; }
-
-        public SegTokenPair(char[] idArray, int from, int to, double weight)
-        {
-            this.CharArray = idArray;
-            this.From = from;
-            this.To = to;
-            this.Weight = weight;
-        }
-
-        /// <summary>
-        /// <see cref="object.GetHashCode()"/>
-        /// </summary>
-        public override int GetHashCode()
-        {
-            int prime = 31;
-            int result = 1;
-            for (int i = 0; i < CharArray.Length; i++)
-            {
-                result = prime * result + CharArray[i];
-            }
-            result = prime * result + From;
-            result = prime * result + To;
-            long temp;
-            temp = Number.DoubleToInt64Bits(Weight);
-            result = prime * result + (int)(temp ^ (int)((uint)temp >> 32));
-            return result;
-        }
-
-        /// <summary>
-        /// <see cref="object.Equals(object)"/>
-        /// </summary>
-        public override bool Equals(object obj)
-        {
-            if (this == obj)
-                return true;
-            if (obj == null)
-                return false;
-            if (GetType() != obj.GetType())
-                return false;
-            SegTokenPair other = (SegTokenPair)obj;
-            if (!Arrays.Equals(CharArray, other.CharArray))
-                return false;
-            if (From != other.From)
-                return false;
-            if (To != other.To)
-                return false;
-            if (Number.DoubleToInt64Bits(Weight) != Number
-                .DoubleToInt64Bits(other.Weight))
-                return false;
-            return true;
-        }
-    }
-}


Mime
View raw message