lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [19/52] [abbrv] lucenenet git commit: Ported Analysis.Compound namespace + tests
Date Thu, 01 Sep 2016 14:39:40 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
index d3fa779..33bc310 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
@@ -1,528 +1,580 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System;
+using System;
 using System.Collections.Generic;
+using System.IO;
 using System.Text;
+using System.Xml;
 
 namespace Lucene.Net.Analysis.Compound.Hyphenation
 {
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     * 
+     *      http://www.apache.org/licenses/LICENSE-2.0
+     * 
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
     /// <summary>
 	/// This tree structure stores the hyphenation patterns in an efficient way for
 	/// fast lookup. It provides the method to hyphenate a word.
 	/// 
 	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
 	/// </summary>
-	public class HyphenationTree : TernaryTree, PatternConsumer
-	{
-
-	  /// <summary>
-	  /// value space: stores the interletter values
-	  /// </summary>
-	  protected internal ByteVector vspace;
-
-	  /// <summary>
-	  /// This map stores hyphenation exceptions
-	  /// </summary>
-	  protected internal Dictionary<string, List<object>> stoplist;
-
-	  /// <summary>
-	  /// This map stores the character classes
-	  /// </summary>
-	  protected internal TernaryTree classmap;
-
-	  /// <summary>
-	  /// Temporary map to store interletter values on pattern loading.
-	  /// </summary>
-	  [NonSerialized]
-	  private TernaryTree ivalues;
-
-	  public HyphenationTree()
-	  {
-		stoplist = new Dictionary<>(23); // usually a small table
-		classmap = new TernaryTree();
-		vspace = new ByteVector();
-		vspace.alloc(1); // this reserves index 0, which we don't use
-	  }
-
-	  /// <summary>
-	  /// Packs the values by storing them in 4 bits, two values into a byte Values
-	  /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
-	  /// value.
-	  /// </summary>
-	  /// <param name="values"> a string of digits from '0' to '9' representing the
-	  ///        interletter values. </param>
-	  /// <returns> the index into the vspace array where the packed values are stored. </returns>
-	  protected internal virtual int packValues(string values)
-	  {
-		int i , n = values.Length;
-		int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
-		int offset = vspace.alloc(m);
-		sbyte[] va = vspace.Array;
-		for (i = 0; i < n; i++)
-		{
-		  int j = i >> 1;
-		  sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
-		  if ((i & 1) == 1)
-		  {
-			va[j + offset] = (sbyte)(va[j + offset] | v);
-		  }
-		  else
-		  {
-			va[j + offset] = (sbyte)(v << 4); // big endian
-		  }
-		}
-		va[m - 1 + offset] = 0; // terminator
-		return offset;
-	  }
-
-	  protected internal virtual string unpackValues(int k)
-	  {
-		StringBuilder buf = new StringBuilder();
-		sbyte v = vspace.get(k++);
-		while (v != 0)
-		{
-		  char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
-		  buf.Append(c);
-		  c = (char)(v & 0x0f);
-		  if (c == 0)
-		  {
-			break;
-		  }
-		  c = (char)(c - 1 + '0');
-		  buf.Append(c);
-		  v = vspace.get(k++);
-		}
-		return buf.ToString();
-	  }
-
-	  /// <summary>
-	  /// Read hyphenation patterns from an XML file.
-	  /// </summary>
-	  /// <param name="f"> the filename </param>
-	  /// <exception cref="IOException"> In case the parsing fails </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void loadPatterns(java.io.File f) throws java.io.IOException
-	  public virtual void loadPatterns(File f)
-	  {
-		InputSource src = new InputSource(f.toURI().toASCIIString());
-		loadPatterns(src);
-	  }
-
-	  /// <summary>
-	  /// Read hyphenation patterns from an XML file.
-	  /// </summary>
-	  /// <param name="source"> the InputSource for the file </param>
-	  /// <exception cref="IOException"> In case the parsing fails </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void loadPatterns(org.xml.sax.InputSource source) throws java.io.IOException
-	  public virtual void loadPatterns(InputSource source)
-	  {
-		PatternParser pp = new PatternParser(this);
-		ivalues = new TernaryTree();
-
-		pp.parse(source);
-
-		// patterns/values should be now in the tree
-		// let's optimize a bit
-		trimToSize();
-		vspace.trimToSize();
-		classmap.trimToSize();
-
-		// get rid of the auxiliary map
-		ivalues = null;
-	  }
-
-	  public virtual string findPattern(string pat)
-	  {
-		int k = base.find(pat);
-		if (k >= 0)
-		{
-		  return unpackValues(k);
-		}
-		return "";
-	  }
-
-	  /// <summary>
-	  /// String compare, returns 0 if equal or t is a substring of s
-	  /// </summary>
-	  protected internal virtual int hstrcmp(char[] s, int si, char[] t, int ti)
-	  {
-		for (; s[si] == t[ti]; si++, ti++)
-		{
-		  if (s[si] == 0)
-		  {
-			return 0;
-		  }
-		}
-		if (t[ti] == 0)
-		{
-		  return 0;
-		}
-		return s[si] - t[ti];
-	  }
-
-	  protected internal virtual sbyte[] getValues(int k)
-	  {
-		StringBuilder buf = new StringBuilder();
-		sbyte v = vspace.get(k++);
-		while (v != 0)
-		{
-		  char c = (char)(((int)((uint)v >> 4)) - 1);
-		  buf.Append(c);
-		  c = (char)(v & 0x0f);
-		  if (c == 0)
-		  {
-			break;
-		  }
-		  c = (char)(c - 1);
-		  buf.Append(c);
-		  v = vspace.get(k++);
-		}
-		sbyte[] res = new sbyte[buf.Length];
-		for (int i = 0; i < res.Length; i++)
-		{
-		  res[i] = (sbyte) buf[i];
-		}
-		return res;
-	  }
-
-	  /// <summary>
-	  /// <para>
-	  /// Search for all possible partial matches of word starting at index an update
-	  /// interletter values. In other words, it does something like:
-	  /// </para>
-	  /// <code>
-	  /// for(i=0; i&lt;patterns.length; i++) {
-	  /// if ( word.substring(index).startsWidth(patterns[i]) )
-	  /// update_interletter_values(patterns[i]);
-	  /// }
-	  /// </code>
-	  /// <para>
-	  /// But it is done in an efficient way since the patterns are stored in a
-	  /// ternary tree. In fact, this is the whole purpose of having the tree: doing
-	  /// this search without having to test every single pattern. The number of
-	  /// patterns for languages such as English range from 4000 to 10000. Thus,
-	  /// doing thousands of string comparisons for each word to hyphenate would be
-	  /// really slow without the tree. The tradeoff is memory, but using a ternary
-	  /// tree instead of a trie, almost halves the the memory used by Lout or TeX.
-	  /// It's also faster than using a hash table
-	  /// </para>
-	  /// </summary>
-	  /// <param name="word"> null terminated word to match </param>
-	  /// <param name="index"> start index from word </param>
-	  /// <param name="il"> interletter values array to update </param>
-	  protected internal virtual void searchPatterns(char[] word, int index, sbyte[] il)
-	  {
-		sbyte[] values;
-		int i = index;
-		char p, q;
-		char sp = word[i];
-		p = root;
-
-		while (p > 0 && p < sc.Length)
-		{
-		  if (sc[p] == 0xFFFF)
-		  {
-			if (hstrcmp(word, i, kv.Array, lo[p]) == 0)
-			{
-			  values = getValues(eq[p]); // data pointer is in eq[]
-			  int j = index;
-			  for (int k = 0; k < values.Length; k++)
-			  {
-				if (j < il.Length && values[k] > il[j])
-				{
-				  il[j] = values[k];
-				}
-				j++;
-			  }
-			}
-			return;
-		  }
-		  int d = sp - sc[p];
-		  if (d == 0)
-		  {
-			if (sp == 0)
-			{
-			  break;
-			}
-			sp = word[++i];
-			p = eq[p];
-			q = p;
-
-			// look for a pattern ending at this position by searching for
-			// the null char ( splitchar == 0 )
-			while (q > 0 && q < sc.Length)
-			{
-			  if (sc[q] == 0xFFFF) // stop at compressed branch
-			  {
-				break;
-			  }
-			  if (sc[q] == 0)
-			  {
-				values = getValues(eq[q]);
-				int j = index;
-				for (int k = 0; k < values.Length; k++)
-				{
-				  if (j < il.Length && values[k] > il[j])
-				  {
-					il[j] = values[k];
-				  }
-				  j++;
-				}
-				break;
-			  }
-			  else
-			  {
-				q = lo[q];
-
-				/// <summary>
-				/// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
-				/// java chars are unsigned
-				/// </summary>
-			  }
-			}
-		  }
-		  else
-		  {
-			p = d < 0 ? lo[p] : hi[p];
-		  }
-		}
-	  }
-
-	  /// <summary>
-	  /// Hyphenate word and return a Hyphenation object.
-	  /// </summary>
-	  /// <param name="word"> the word to be hyphenated </param>
-	  /// <param name="remainCharCount"> Minimum number of characters allowed before the
-	  ///        hyphenation point. </param>
-	  /// <param name="pushCharCount"> Minimum number of characters allowed after the
-	  ///        hyphenation point. </param>
-	  /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
-	  ///         hyphenated word or null if word is not hyphenated. </returns>
-	  public virtual Hyphenation hyphenate(string word, int remainCharCount, int pushCharCount)
-	  {
-		char[] w = word.ToCharArray();
-		return hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
-	  }
-
-	  /// <summary>
-	  /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
-	  /// may be absent, the first n is at offset, the first l is at offset +
-	  /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
-	  /// into word. In the first part of the routine len = w.length, in the second
-	  /// part of the routine len = word.length. Three indices are used: index(w),
-	  /// the index in w, index(word), the index in word, letterindex(word), the
-	  /// index in the letter part of word. The following relations exist: index(w) =
-	  /// offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
-	  /// index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
-	  /// offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
-	  /// iIgnoreAtBeginning
-	  /// </summary>
-
-	  /// <summary>
-	  /// Hyphenate word and return an array of hyphenation points.
-	  /// </summary>
-	  /// <param name="w"> char array that contains the word </param>
-	  /// <param name="offset"> Offset to first character in word </param>
-	  /// <param name="len"> Length of word </param>
-	  /// <param name="remainCharCount"> Minimum number of characters allowed before the
-	  ///        hyphenation point. </param>
-	  /// <param name="pushCharCount"> Minimum number of characters allowed after the
-	  ///        hyphenation point. </param>
-	  /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
-	  ///         hyphenated word or null if word is not hyphenated. </returns>
-	  public virtual Hyphenation hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
-	  {
-		int i;
-		char[] word = new char[len + 3];
-
-		// normalize word
-		char[] c = new char[2];
-		int iIgnoreAtBeginning = 0;
-		int iLength = len;
-		bool bEndOfLetters = false;
-		for (i = 1; i <= len; i++)
-		{
-		  c[0] = w[offset + i - 1];
-		  int nc = classmap.find(c, 0);
-		  if (nc < 0) // found a non-letter character ...
-		  {
-			if (i == (1 + iIgnoreAtBeginning))
-			{
-			  // ... before any letter character
-			  iIgnoreAtBeginning++;
-			}
-			else
-			{
-			  // ... after a letter character
-			  bEndOfLetters = true;
-			}
-			iLength--;
-		  }
-		  else
-		  {
-			if (!bEndOfLetters)
-			{
-			  word[i - iIgnoreAtBeginning] = (char) nc;
-			}
-			else
-			{
-			  return null;
-			}
-		  }
-		}
-		len = iLength;
-		if (len < (remainCharCount + pushCharCount))
-		{
-		  // word is too short to be hyphenated
-		  return null;
-		}
-		int[] result = new int[len + 1];
-		int k = 0;
-
-		// check exception list first
-		string sw = new string(word, 1, len);
-		if (stoplist.ContainsKey(sw))
-		{
-		  // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
-		  // null)
-		  List<object> hw = stoplist[sw];
-		  int j = 0;
-		  for (i = 0; i < hw.Count; i++)
-		  {
-			object o = hw[i];
-			// j = index(sw) = letterindex(word)?
-			// result[k] = corresponding index(w)
-			if (o is string)
-			{
-			  j += ((string) o).Length;
-			  if (j >= remainCharCount && j < (len - pushCharCount))
-			  {
-				result[k++] = j + iIgnoreAtBeginning;
-			  }
-			}
-		  }
-		}
-		else
-		{
-		  // use algorithm to get hyphenation points
-		  word[0] = '.'; // word start marker
-		  word[len + 1] = '.'; // word end marker
-		  word[len + 2] = (char)0; // null terminated
-		  sbyte[] il = new sbyte[len + 3]; // initialized to zero
-		  for (i = 0; i < len + 1; i++)
-		  {
-			searchPatterns(word, i, il);
-		  }
-
-		  // hyphenation points are located where interletter value is odd
-		  // i is letterindex(word),
-		  // i + 1 is index(word),
-		  // result[k] = corresponding index(w)
-		  for (i = 0; i < len; i++)
-		  {
-			if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
-			{
-			  result[k++] = i + iIgnoreAtBeginning;
-			}
-		  }
-		}
-
-		if (k > 0)
-		{
-		  // trim result array
-		  int[] res = new int[k + 2];
-		  Array.Copy(result, 0, res, 1, k);
-		  // We add the synthetical hyphenation points
-		  // at the beginning and end of the word
-		  res[0] = 0;
-		  res[k + 1] = len;
-		  return new Hyphenation(res);
-		}
-		else
-		{
-		  return null;
-		}
-	  }
-
-	  /// <summary>
-	  /// Add a character class to the tree. It is used by
-	  /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
-	  /// Character classes define the valid word characters for hyphenation. If a
-	  /// word contains a character not defined in any of the classes, it is not
-	  /// hyphenated. It also defines a way to normalize the characters in order to
-	  /// compare them with the stored patterns. Usually pattern files use only lower
-	  /// case characters, in this case a class for letter 'a', for example, should
-	  /// be defined as "aA", the first character being the normalization char.
-	  /// </summary>
-	  public virtual void addClass(string chargroup)
-	  {
-		if (chargroup.Length > 0)
-		{
-		  char equivChar = chargroup[0];
-		  char[] key = new char[2];
-		  key[1] = (char)0;
-		  for (int i = 0; i < chargroup.Length; i++)
-		  {
-			key[0] = chargroup[i];
-			classmap.insert(key, 0, equivChar);
-		  }
-		}
-	  }
-
-	  /// <summary>
-	  /// Add an exception to the tree. It is used by
-	  /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
-	  /// hyphenation exceptions.
-	  /// </summary>
-	  /// <param name="word"> normalized word </param>
-	  /// <param name="hyphenatedword"> a vector of alternating strings and
-	  ///        <seealso cref="Hyphen hyphen"/> objects. </param>
-	  public virtual void addException(string word, List<object> hyphenatedword)
-	  {
-		stoplist[word] = hyphenatedword;
-	  }
-
-	  /// <summary>
-	  /// Add a pattern to the tree. Mainly, to be used by
-	  /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
-	  /// the tree.
-	  /// </summary>
-	  /// <param name="pattern"> the hyphenation pattern </param>
-	  /// <param name="ivalue"> interletter weight values indicating the desirability and
-	  ///        priority of hyphenating at a given point within the pattern. It
-	  ///        should contain only digit characters. (i.e. '0' to '9'). </param>
-	  public virtual void addPattern(string pattern, string ivalue)
-	  {
-		int k = ivalues.find(ivalue);
-		if (k <= 0)
-		{
-		  k = packValues(ivalue);
-		  ivalues.insert(ivalue, (char) k);
-		}
-		insert(pattern, (char) k);
-	  }
-
-	  public override void printStats(PrintStream @out)
-	  {
-		@out.println("Value space size = " + Convert.ToString(vspace.length()));
-		base.printStats(@out);
-
-	  }
-	}
-
+	public class HyphenationTree : TernaryTree, IPatternConsumer
+    {
+
+        /// <summary>
+        /// value space: stores the interletter values
+        /// </summary>
+        protected internal ByteVector vspace;
+
+        /// <summary>
+        /// This map stores hyphenation exceptions
+        /// </summary>
+        protected internal Dictionary<string, List<object>> stoplist;
+
+        /// <summary>
+        /// This map stores the character classes
+        /// </summary>
+        protected internal TernaryTree classmap;
+
+        /// <summary>
+        /// Temporary map to store interletter values on pattern loading.
+        /// </summary>
+        [NonSerialized]
+        private TernaryTree ivalues;
+
+        public HyphenationTree()
+        {
+            stoplist = new Dictionary<string, List<object>>(23); // usually a small table
+            classmap = new TernaryTree();
+            vspace = new ByteVector();
+            vspace.Alloc(1); // this reserves index 0, which we don't use
+        }
+
+        /// <summary>
+        /// Packs the values by storing them in 4 bits, two values into a byte Values
+        /// range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
+        /// value.
+        /// </summary>
+        /// <param name="values"> a string of digits from '0' to '9' representing the
+        ///        interletter values. </param>
+        /// <returns> the index into the vspace array where the packed values are stored. </returns>
+        protected internal virtual int PackValues(string values)
+        {
+            int i, n = values.Length;
+            int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
+            int offset = vspace.Alloc(m);
+            sbyte[] va = vspace.Array;
+            for (i = 0; i < n; i++)
+            {
+                int j = i >> 1;
+                sbyte v = (sbyte)((values[i] - '0' + 1) & 0x0f);
+                if ((i & 1) == 1)
+                {
+                    va[j + offset] = (sbyte)(va[j + offset] | v);
+                }
+                else
+                {
+                    va[j + offset] = (sbyte)(v << 4); // big endian
+                }
+            }
+            va[m - 1 + offset] = 0; // terminator
+            return offset;
+        }
+
+        protected internal virtual string UnpackValues(int k)
+        {
+            StringBuilder buf = new StringBuilder();
+            sbyte v = vspace[k++];
+            while (v != 0)
+            {
+                char c = (char)(((int)((uint)v >> 4)) - 1 + '0');
+                buf.Append(c);
+                c = (char)(v & 0x0f);
+                if (c == 0)
+                {
+                    break;
+                }
+                c = (char)(c - 1 + '0');
+                buf.Append(c);
+                v = vspace[k++];
+            }
+            return buf.ToString();
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file.
+        /// </summary>
+        /// <param name="filename"> the filename </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(string filename)
+        {
+            LoadPatterns(filename, Encoding.UTF8);
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file, decoding it with <paramref name="encoding"/>.
+        /// </summary>
+        /// <param name="filename"> the filename </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(string filename, Encoding encoding)
+        {
+            using (var src = new FileStream(filename, FileMode.Open, FileAccess.Read)) // XmlReader leaves src open
+                LoadPatterns(src, encoding);
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file.
+        /// </summary>
+        /// <param name="f"> the filename </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(FileInfo f)
+        {
+            LoadPatterns(f, Encoding.UTF8);
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file, decoding it with <paramref name="encoding"/>.
+        /// </summary>
+        /// <param name="f"> the file to read </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(FileInfo f, Encoding encoding)
+        {
+            using (var src = new FileStream(f.FullName, FileMode.Open, FileAccess.Read)) // XmlReader leaves src open
+                LoadPatterns(src, encoding);
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file.
+        /// </summary>
+        /// <param name="source"> the stream containing the XML pattern data </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(Stream source)
+        {
+            LoadPatterns(source, Encoding.UTF8);
+        }
+
+        /// <summary>
+        /// Read hyphenation patterns from an XML file.
+        /// </summary>
+        /// <param name="source"> the stream containing the XML pattern data </param>
+        /// <exception cref="IOException"> In case the parsing fails </exception>
+        public virtual void LoadPatterns(Stream source, Encoding encoding)
+        {
+            // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+            using (var reader = XmlReader.Create(new StreamReader(source, encoding), new XmlReaderSettings
+            {
+                DtdProcessing = DtdProcessing.Parse,
+                XmlResolver = new PatternParser.DtdResolver()
+            }))
+            {
+                LoadPatterns(reader);
+            }
+        }
+
+        public virtual void LoadPatterns(XmlReader source)
+        {
+            PatternParser pp = new PatternParser(this);
+            ivalues = new TernaryTree();
+
+            pp.Parse(source);
+
+            // patterns/values should be now in the tree
+            // let's optimize a bit
+            TrimToSize();
+            vspace.TrimToSize();
+            classmap.TrimToSize();
+
+            // get rid of the auxiliary map
+            ivalues = null;
+        }
+
+        public virtual string FindPattern(string pat)
+        {
+            int k = base.Find(pat);
+            if (k >= 0)
+            {
+                return UnpackValues(k);
+            }
+            return "";
+        }
+
+        /// <summary>
+        /// String compare, returns 0 if equal or t is a substring of s
+        /// </summary>
+        protected internal virtual int HStrCmp(char[] s, int si, char[] t, int ti)
+        {
+            for (; s[si] == t[ti]; si++, ti++)
+            {
+                if (s[si] == 0)
+                {
+                    return 0;
+                }
+            }
+            if (t[ti] == 0)
+            {
+                return 0;
+            }
+            return s[si] - t[ti];
+        }
+
+        protected internal virtual sbyte[] GetValues(int k)
+        {
+            StringBuilder buf = new StringBuilder();
+            sbyte v = vspace[k++];
+            while (v != 0)
+            {
+                char c = (char)(((int)((uint)v >> 4)) - 1);
+                buf.Append(c);
+                c = (char)(v & 0x0f);
+                if (c == 0)
+                {
+                    break;
+                }
+                c = (char)(c - 1);
+                buf.Append(c);
+                v = vspace[k++];
+            }
+            sbyte[] res = new sbyte[buf.Length];
+            for (int i = 0; i < res.Length; i++)
+            {
+                res[i] = (sbyte)buf[i];
+            }
+            return res;
+        }
+
+        /// <summary>
+        /// <para>
+        /// Search for all possible partial matches of word starting at index and update
+        /// interletter values. In other words, it does something like:
+        /// </para>
+        /// <code>
+        /// for(i=0; i&lt;patterns.length; i++) {
+        /// if ( word.substring(index).startsWith(patterns[i]) )
+        /// update_interletter_values(patterns[i]);
+        /// }
+        /// </code>
+        /// <para>
+        /// But it is done in an efficient way since the patterns are stored in a
+        /// ternary tree. In fact, this is the whole purpose of having the tree: doing
+        /// this search without having to test every single pattern. The number of
+        /// patterns for languages such as English range from 4000 to 10000. Thus,
+        /// doing thousands of string comparisons for each word to hyphenate would be
+        /// really slow without the tree. The tradeoff is memory, but using a ternary
+        /// tree instead of a trie, almost halves the memory used by Lout or TeX.
+        /// It's also faster than using a hash table
+        /// </para>
+        /// </summary>
+        /// <param name="word"> null terminated word to match </param>
+        /// <param name="index"> start index from word </param>
+        /// <param name="il"> interletter values array to update </param>
+        protected internal virtual void SearchPatterns(char[] word, int index, sbyte[] il)
+        {
+            sbyte[] values;
+            int i = index;
+            char p, q;
+            char sp = word[i];
+            p = root;
+
+            while (p > 0 && p < sc.Length)
+            {
+                if (sc[p] == 0xFFFF)
+                {
+                    if (HStrCmp(word, i, kv.Array, lo[p]) == 0)
+                    {
+                        values = GetValues(eq[p]); // data pointer is in eq[]
+                        int j = index;
+                        for (int k = 0; k < values.Length; k++)
+                        {
+                            if (j < il.Length && values[k] > il[j])
+                            {
+                                il[j] = values[k];
+                            }
+                            j++;
+                        }
+                    }
+                    return;
+                }
+                int d = sp - sc[p];
+                if (d == 0)
+                {
+                    if (sp == 0)
+                    {
+                        break;
+                    }
+                    sp = word[++i];
+                    p = eq[p];
+                    q = p;
+
+                    // look for a pattern ending at this position by searching for
+                    // the null char ( splitchar == 0 )
+                    while (q > 0 && q < sc.Length)
+                    {
+                        if (sc[q] == 0xFFFF) // stop at compressed branch
+                        {
+                            break;
+                        }
+                        if (sc[q] == 0)
+                        {
+                            values = GetValues(eq[q]);
+                            int j = index;
+                            for (int k = 0; k < values.Length; k++)
+                            {
+                                if (j < il.Length && values[k] > il[j])
+                                {
+                                    il[j] = values[k];
+                                }
+                                j++;
+                            }
+                            break;
+                        }
+                        else
+                        {
+                            q = lo[q];
+
+                            // Note: strictly speaking the code should be:
+                            //   q = sc[q] < 0 ? hi[q] : lo[q];
+                            // but chars are unsigned (in Java, and likewise in C#),
+                            // so the lo branch is the only one ever taken.
+                        }
+                    }
+                }
+                else
+                {
+                    p = d < 0 ? lo[p] : hi[p];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Hyphenate word and return a Hyphenation object.
+        /// </summary>
+        /// <param name="word"> the word to be hyphenated </param>
+        /// <param name="remainCharCount"> Minimum number of characters allowed before the
+        ///        hyphenation point. </param>
+        /// <param name="pushCharCount"> Minimum number of characters allowed after the
+        ///        hyphenation point. </param>
+        /// <returns> a <seealso cref="Hyphenation Hyphenation"/> object representing the
+        ///         hyphenated word or null if word is not hyphenated. </returns>
+        public virtual Hyphenation Hyphenate(string word, int remainCharCount, int pushCharCount)
+        {
+            char[] w = word.ToCharArray();
+            return Hyphenate(w, 0, w.Length, remainCharCount, pushCharCount);
+        }
+
+        /// <summary>
+        /// Hyphenate word and return an array of hyphenation points.
+        /// </summary>
+        /// <remarks>
+        /// w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
+        /// may be absent, the first n is at offset, the first l is at offset +
+        /// iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
+        /// into word. In the first part of the routine len = w.length, in the second
+        /// part of the routine len = word.length. Three indices are used: index(w),
+        /// the index in w, index(word), the index in word, letterindex(word), the
+        /// index in the letter part of word. The following relations exist:
+        /// <para>index(w) = offset + i - 1</para>
+        /// <para>index(word) = i - iIgnoreAtBeginning</para>
+        /// <para>letterindex(word) = index(word) - 1 (see first loop).</para>
+        /// It follows that:
+        /// <para>index(w) - index(word) = offset - 1 + iIgnoreAtBeginning</para>
+        /// <para>index(w) = letterindex(word) + offset + iIgnoreAtBeginning</para>
+        /// </remarks>
+        /// <param name="w"> char array that contains the word </param>
+        /// <param name="offset"> Offset to first character in word </param>
+        /// <param name="len"> Length of word </param>
+        /// <param name="remainCharCount"> Minimum number of characters allowed before the
+        ///        hyphenation point. </param>
+        /// <param name="pushCharCount"> Minimum number of characters allowed after the
+        ///        hyphenation point. </param>
+        /// <returns> a <see cref="Hyphenation"/> object representing the
+        ///         hyphenated word or null if word is not hyphenated. </returns>
+        public virtual Hyphenation Hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
+        {
+            int i;
+            // room for the start marker, the letters, the end marker and a terminating 0
+            char[] word = new char[len + 3];
+
+            // normalize word
+            char[] c = new char[2]; // c[1] stays 0 and terminates the one-character lookup key
+            int iIgnoreAtBeginning = 0;
+            int iLength = len;
+            bool bEndOfLetters = false;
+            for (i = 1; i <= len; i++)
+            {
+                c[0] = w[offset + i - 1];
+                int nc = classmap.Find(c, 0);
+                if (nc < 0) // found a non-letter character ...
+                {
+                    if (i == (1 + iIgnoreAtBeginning))
+                    {
+                        // ... before any letter character
+                        iIgnoreAtBeginning++;
+                    }
+                    else
+                    {
+                        // ... after a letter character
+                        bEndOfLetters = true;
+                    }
+                    iLength--;
+                }
+                else
+                {
+                    if (!bEndOfLetters)
+                    {
+                        // store the normalized character; index 0 is reserved for the start marker
+                        word[i - iIgnoreAtBeginning] = (char)nc;
+                    }
+                    else
+                    {
+                        // a letter after trailing non-letters: input is not a single word
+                        return null;
+                    }
+                }
+            }
+            len = iLength;
+            if (len < (remainCharCount + pushCharCount))
+            {
+                // word is too short to be hyphenated
+                return null;
+            }
+            int[] result = new int[len + 1];
+            int k = 0;
+
+            // check exception list first (single lookup instead of ContainsKey + indexer)
+            string sw = new string(word, 1, len);
+            List<object> hw;
+            if (stoplist.TryGetValue(sw, out hw))
+            {
+                // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
+                // null)
+                int j = 0;
+                for (i = 0; i < hw.Count; i++)
+                {
+                    // j = index(sw) = letterindex(word)?
+                    // result[k] = corresponding index(w)
+                    // only the string parts advance the position; Hyphen entries are skipped
+                    string part = hw[i] as string;
+                    if (part != null)
+                    {
+                        j += part.Length;
+                        if (j >= remainCharCount && j < (len - pushCharCount))
+                        {
+                            result[k++] = j + iIgnoreAtBeginning;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // use algorithm to get hyphenation points
+                word[0] = '.'; // word start marker
+                word[len + 1] = '.'; // word end marker
+                word[len + 2] = (char)0; // null terminated
+                sbyte[] il = new sbyte[len + 3]; // initialized to zero
+                for (i = 0; i < len + 1; i++)
+                {
+                    SearchPatterns(word, i, il);
+                }
+
+                // hyphenation points are located where interletter value is odd
+                // i is letterindex(word),
+                // i + 1 is index(word),
+                // result[k] = corresponding index(w)
+                for (i = 0; i < len; i++)
+                {
+                    if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
+                    {
+                        result[k++] = i + iIgnoreAtBeginning;
+                    }
+                }
+            }
+
+            if (k > 0)
+            {
+                // trim result array
+                int[] res = new int[k + 2];
+                Array.Copy(result, 0, res, 1, k);
+                // We add the synthetical hyphenation points
+                // at the beginning and end of the word
+                res[0] = 0;
+                res[k + 1] = len;
+                return new Hyphenation(res);
+            }
+            else
+            {
+                return null;
+            }
+        }
+
+        /// <summary>
+        /// Add a character class to the tree. It is used by
+        /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
+        /// Character classes define the valid word characters for hyphenation. If a
+        /// word contains a character not defined in any of the classes, it is not
+        /// hyphenated. It also defines a way to normalize the characters in order to
+        /// compare them with the stored patterns. Usually pattern files use only lower
+        /// case characters, in this case a class for letter 'a', for example, should
+        /// be defined as "aA", the first character being the normalization char.
+        /// </summary>
+        public virtual void AddClass(string chargroup)
+        {
+            if (chargroup.Length == 0)
+            {
+                // an empty group has no normalization character and nothing to register
+                return;
+            }
+
+            // every member of the group maps to the group's first character
+            char normalized = chargroup[0];
+            // one-character key followed by a 0 char; only slot 0 changes per member
+            char[] lookupKey = { (char)0, (char)0 };
+            foreach (char member in chargroup)
+            {
+                lookupKey[0] = member;
+                classmap.Insert(lookupKey, 0, normalized);
+            }
+        }
+
+        /// <summary>
+        /// Add an exception to the tree. It is used by
+        /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
+        /// hyphenation exceptions.
+        /// </summary>
+        /// <param name="word"> normalized word </param>
+        /// <param name="hyphenatedword"> a vector of alternating strings and
+        ///        <seealso cref="Hyphen hyphen"/> objects. </param>
+        public virtual void AddException(string word, List<object> hyphenatedword)
+        {
+            // Stored through the indexer, keyed by the word; Hyphenate(char[], ...) looks the
+            // normalized word up in this same stoplist before running the pattern algorithm.
+            stoplist[word] = hyphenatedword;
+        }
+
+        /// <summary>
+        /// Add a pattern to the tree. Mainly, to be used by
+        /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
+        /// the tree.
+        /// </summary>
+        /// <param name="pattern"> the hyphenation pattern </param>
+        /// <param name="ivalue"> interletter weight values indicating the desirability and
+        ///        priority of hyphenating at a given point within the pattern. It
+        ///        should contain only digit characters. (i.e. '0' to '9'). </param>
+        public virtual void AddPattern(string pattern, string ivalue)
+        {
+            // Look the value string up first so that it is packed only when the lookup
+            // yields no positive index for it.
+            int valueIndex = ivalues.Find(ivalue);
+            bool alreadyPacked = valueIndex > 0;
+            if (!alreadyPacked)
+            {
+                valueIndex = PackValues(ivalue);
+                ivalues.Insert(ivalue, (char)valueIndex);
+            }
+
+            // the pattern itself is stored in this tree, pointing at the packed values
+            Insert(pattern, (char)valueIndex);
+        }
+
+        // public override void printStats(PrintStream @out)
+        // {
+        //@out.println("Value space size = " + Convert.ToString(vspace.length()));
+        //base.printStats(@out);
+
+        // }
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
index 762b832..069badd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
@@ -1,31 +1,31 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System.Collections.Generic;
+using System.Collections.Generic;
 
 namespace Lucene.Net.Analysis.Compound.Hyphenation
 {
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     * 
+     *      http://www.apache.org/licenses/LICENSE-2.0
+     * 
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
 
-	/// <summary>
-	/// This interface is used to connect the XML pattern file parser to the
-	/// hyphenation tree.
-	/// 
-	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
-	/// </summary>
-	public interface PatternConsumer
+    /// <summary>
+    /// This interface is used to connect the XML pattern file parser to the
+    /// hyphenation tree.
+    /// 
+    /// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+    /// </summary>
+    public interface IPatternConsumer
 	{
 
 	  /// <summary>
@@ -34,7 +34,7 @@ namespace Lucene.Net.Analysis.Compound.Hyphenation
 	  /// usually means to ignore case.
 	  /// </summary>
 	  /// <param name="chargroup"> character group </param>
-	  void addClass(string chargroup);
+	  void AddClass(string chargroup);
 
 	  /// <summary>
 	  /// Add a hyphenation exception. An exception replaces the result obtained by
@@ -42,15 +42,13 @@ namespace Lucene.Net.Analysis.Compound.Hyphenation
 	  /// his own hyphenation. A hyphenatedword is a vector of alternating String's
 	  /// and <seealso cref="Hyphen"/> instances
 	  /// </summary>
-	  void addException(string word, List<object> hyphenatedword);
+	  void AddException(string word, List<object> hyphenatedword);
 
 	  /// <summary>
 	  /// Add hyphenation patterns.
 	  /// </summary>
 	  /// <param name="pattern"> the pattern </param>
 	  /// <param name="values"> interletter values expressed as a string of digit characters. </param>
-	  void addPattern(string pattern, string values);
-
+	  void AddPattern(string pattern, string values);
 	}
-
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87c1d606/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
index 1d012c4..e94e8cf 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
@@ -1,457 +1,484 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * 
- *      http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-using System;
-using System.Collections;
+using System;
 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using System.Text;
+using System.Xml;
 
 namespace Lucene.Net.Analysis.Compound.Hyphenation
 {
-
-	// SAX
-    
-    // Java
-
-	/// <summary>
-	/// A SAX document handler to read and parse hyphenation patterns from a XML
-	/// file.
-	/// 
-	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
-	/// </summary>
-	public class PatternParser : DefaultHandler
-	{
-
-	  internal XMLReader parser;
-
-	  internal int currElement;
-
-	  internal PatternConsumer consumer;
-
-	  internal StringBuilder token;
-
-	  internal List<object> exception;
-
-	  internal char hyphenChar;
-
-	  internal string errMsg;
-
-	  internal const int ELEM_CLASSES = 1;
-
-	  internal const int ELEM_EXCEPTIONS = 2;
-
-	  internal const int ELEM_PATTERNS = 3;
-
-	  internal const int ELEM_HYPHEN = 4;
-
-	  public PatternParser()
-	  {
-		token = new StringBuilder();
-		parser = createParser();
-		parser.ContentHandler = this;
-		parser.ErrorHandler = this;
-		parser.EntityResolver = this;
-		hyphenChar = '-'; // default
-
-	  }
-
-	  public PatternParser(PatternConsumer consumer) : this()
-	  {
-		this.consumer = consumer;
-	  }
-
-	  public virtual PatternConsumer Consumer
-	  {
-		  set
-		  {
-			this.consumer = value;
-		  }
-	  }
-
-	  /// <summary>
-	  /// Parses a hyphenation pattern file.
-	  /// </summary>
-	  /// <param name="filename"> the filename </param>
-	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(String filename) throws java.io.IOException
-	  public virtual void parse(string filename)
-	  {
-		parse(new InputSource(filename));
-	  }
-
-	  /// <summary>
-	  /// Parses a hyphenation pattern file.
-	  /// </summary>
-	  /// <param name="file"> the pattern file </param>
-	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(java.io.File file) throws java.io.IOException
-	  public virtual void parse(File file)
-	  {
-		InputSource src = new InputSource(file.toURI().toASCIIString());
-		parse(src);
-	  }
-
-	  /// <summary>
-	  /// Parses a hyphenation pattern file.
-	  /// </summary>
-	  /// <param name="source"> the InputSource for the file </param>
-	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void parse(org.xml.sax.InputSource source) throws java.io.IOException
-	  public virtual void parse(InputSource source)
-	  {
-		try
-		{
-		  parser.parse(source);
-		}
-		catch (SAXException e)
-		{
-		  throw new IOException(e);
-		}
-	  }
-
-	  /// <summary>
-	  /// Creates a SAX parser using JAXP
-	  /// </summary>
-	  /// <returns> the created SAX parser </returns>
-	  internal static XMLReader createParser()
-	  {
-		try
-		{
-		  SAXParserFactory factory = SAXParserFactory.newInstance();
-		  factory.NamespaceAware = true;
-		  return factory.newSAXParser().XMLReader;
-		}
-		catch (Exception e)
-		{
-		  throw new Exception("Couldn't create XMLReader: " + e.Message);
-		}
-	  }
-
-	  protected internal virtual string readToken(StringBuilder chars)
-	  {
-		string word;
-		bool space = false;
-		int i;
-		for (i = 0; i < chars.Length; i++)
-		{
-		  if (char.IsWhiteSpace(chars[i]))
-		  {
-			space = true;
-		  }
-		  else
-		  {
-			break;
-		  }
-		}
-		if (space)
-		{
-		  // chars.delete(0,i);
-		  for (int countr = i; countr < chars.Length; countr++)
-		  {
-			chars[countr - i] = chars[countr];
-		  }
-		  chars.Length = chars.Length - i;
-		  if (token.Length > 0)
-		  {
-			word = token.ToString();
-			token.Length = 0;
-			return word;
-		  }
-		}
-		space = false;
-		for (i = 0; i < chars.Length; i++)
-		{
-		  if (char.IsWhiteSpace(chars[i]))
-		  {
-			space = true;
-			break;
-		  }
-		}
-		token.Append(chars.ToString().Substring(0, i));
-		// chars.delete(0,i);
-		for (int countr = i; countr < chars.Length; countr++)
-		{
-		  chars[countr - i] = chars[countr];
-		}
-		chars.Length = chars.Length - i;
-		if (space)
-		{
-		  word = token.ToString();
-		  token.Length = 0;
-		  return word;
-		}
-		token.Append(chars);
-		return null;
-	  }
-
-	  protected internal static string getPattern(string word)
-	  {
-		StringBuilder pat = new StringBuilder();
-		int len = word.Length;
-		for (int i = 0; i < len; i++)
-		{
-		  if (!char.IsDigit(word[i]))
-		  {
-			pat.Append(word[i]);
-		  }
-		}
-		return pat.ToString();
-	  }
-
-	  protected internal virtual List<object> normalizeException(List<T1> ex)
-	  {
-		List<object> res = new List<object>();
-		for (int i = 0; i < ex.Count; i++)
-		{
-		  object item = ex[i];
-		  if (item is string)
-		  {
-			string str = (string) item;
-			StringBuilder buf = new StringBuilder();
-			for (int j = 0; j < str.Length; j++)
-			{
-			  char c = str[j];
-			  if (c != hyphenChar)
-			  {
-				buf.Append(c);
-			  }
-			  else
-			  {
-				res.Add(buf.ToString());
-				buf.Length = 0;
-				char[] h = new char[1];
-				h[0] = hyphenChar;
-				// we use here hyphenChar which is not necessarily
-				// the one to be printed
-				res.Add(new Hyphen(new string(h), null, null));
-			  }
-			}
-			if (buf.Length > 0)
-			{
-			  res.Add(buf.ToString());
-			}
-		  }
-		  else
-		  {
-			res.Add(item);
-		  }
-		}
-		return res;
-	  }
-
-	  protected internal virtual string getExceptionWord<T1>(List<T1> ex)
-	  {
-		StringBuilder res = new StringBuilder();
-		for (int i = 0; i < ex.Count; i++)
-		{
-		  object item = ex[i];
-		  if (item is string)
-		  {
-			res.Append((string) item);
-		  }
-		  else
-		  {
-			if (((Hyphen) item).noBreak != null)
-			{
-			  res.Append(((Hyphen) item).noBreak);
-			}
-		  }
-		}
-		return res.ToString();
-	  }
-
-	  protected internal static string getInterletterValues(string pat)
-	  {
-		StringBuilder il = new StringBuilder();
-		string word = pat + "a"; // add dummy letter to serve as sentinel
-		int len = word.Length;
-		for (int i = 0; i < len; i++)
-		{
-		  char c = word[i];
-		  if (char.IsDigit(c))
-		  {
-			il.Append(c);
-			i++;
-		  }
-		  else
-		  {
-			il.Append('0');
-		  }
-		}
-		return il.ToString();
-	  }
-
-	  //
-	  // EntityResolver methods
-	  //
-	  public override InputSource resolveEntity(string publicId, string systemId)
-	  {
-		// supply the internal hyphenation.dtd if possible
-		if ((systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) || ("hyphenation-info".Equals(publicId)))
-		{
-		  // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
-		  return new InputSource(this.GetType().getResource("hyphenation.dtd").toExternalForm());
-		}
-		return null;
-	  }
-
-	  //
-	  // ContentHandler methods
-	  //
-
-	  /// <seealso cref= org.xml.sax.ContentHandler#startElement(java.lang.String,
-	  ///      java.lang.String, java.lang.String, org.xml.sax.Attributes) </seealso>
-	  public override void startElement(string uri, string local, string raw, Attributes attrs)
-	  {
-		if (local.Equals("hyphen-char"))
-		{
-		  string h = attrs.getValue("value");
-		  if (h != null && h.Length == 1)
-		  {
-			hyphenChar = h[0];
-		  }
-		}
-		else if (local.Equals("classes"))
-		{
-		  currElement = ELEM_CLASSES;
-		}
-		else if (local.Equals("patterns"))
-		{
-		  currElement = ELEM_PATTERNS;
-		}
-		else if (local.Equals("exceptions"))
-		{
-		  currElement = ELEM_EXCEPTIONS;
-		  exception = new List<>();
-		}
-		else if (local.Equals("hyphen"))
-		{
-		  if (token.Length > 0)
-		  {
-			exception.Add(token.ToString());
-		  }
-		  exception.Add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), attrs.getValue("post")));
-		  currElement = ELEM_HYPHEN;
-		}
-		token.Length = 0;
-	  }
-
-	  /// <seealso cref= org.xml.sax.ContentHandler#endElement(java.lang.String,
-	  ///      java.lang.String, java.lang.String) </seealso>
-//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
-//ORIGINAL LINE: @Override @SuppressWarnings("unchecked") public void endElement(String uri, String local, String raw)
-	  public override void endElement(string uri, string local, string raw)
-	  {
-
-		if (token.Length > 0)
-		{
-		  string word = token.ToString();
-		  switch (currElement)
-		  {
-			case ELEM_CLASSES:
-			  consumer.addClass(word);
-			  break;
-			case ELEM_EXCEPTIONS:
-			  exception.Add(word);
-			  exception = normalizeException(exception);
-			  consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
-			  break;
-			case ELEM_PATTERNS:
-			  consumer.addPattern(getPattern(word), getInterletterValues(word));
-			  break;
-			case ELEM_HYPHEN:
-			  // nothing to do
-			  break;
-		  }
-		  if (currElement != ELEM_HYPHEN)
-		  {
-			token.Length = 0;
-		  }
-		}
-		if (currElement == ELEM_HYPHEN)
-		{
-		  currElement = ELEM_EXCEPTIONS;
-		}
-		else
-		{
-		  currElement = 0;
-		}
-
-	  }
-
-	  /// <seealso cref= org.xml.sax.ContentHandler#characters(char[], int, int) </seealso>
-//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
-//ORIGINAL LINE: @SuppressWarnings("unchecked") @Override public void characters(char ch[] , int start, int length)
-	  public override void characters(char[] ch, int start, int length)
-	  {
-		StringBuilder chars = new StringBuilder(length);
-		chars.Append(ch, start, length);
-		string word = readToken(chars);
-		while (word != null)
-		{
-		  // System.out.println("\"" + word + "\"");
-		  switch (currElement)
-		  {
-			case ELEM_CLASSES:
-			  consumer.addClass(word);
-			  break;
-			case ELEM_EXCEPTIONS:
-			  exception.Add(word);
-			  exception = normalizeException(exception);
-			  consumer.addException(getExceptionWord(exception), (ArrayList) exception.clone());
-			  exception.Clear();
-			  break;
-			case ELEM_PATTERNS:
-			  consumer.addPattern(getPattern(word), getInterletterValues(word));
-			  break;
-		  }
-		  word = readToken(chars);
-		}
-
-	  }
-
-	  /// <summary>
-	  /// Returns a string of the location.
-	  /// </summary>
-	  private string getLocationString(SAXParseException ex)
-	  {
-		StringBuilder str = new StringBuilder();
-
-		string systemId = ex.SystemId;
-		if (systemId != null)
-		{
-		  int index = systemId.LastIndexOf('/');
-		  if (index != -1)
-		  {
-			systemId = systemId.Substring(index + 1);
-		  }
-		  str.Append(systemId);
-		}
-		str.Append(':');
-		str.Append(ex.LineNumber);
-		str.Append(':');
-		str.Append(ex.ColumnNumber);
-
-		return str.ToString();
-
-	  } // getLocationString(SAXParseException):String
-	}
-
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     * 
+     *      http://www.apache.org/licenses/LICENSE-2.0
+     * 
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A XMLReader document handler to read and parse hyphenation patterns from a XML
+    /// file.
+    /// 
+    /// LUCENENET: This class has been refactored from its Java counterpart to use XmlReader rather
+    /// than a SAX parser.
+    /// </summary>
+    public class PatternParser
+    {
+        internal int currElement;
+
+        internal IPatternConsumer consumer;
+
+        internal StringBuilder token;
+
+        internal List<object> exception;
+
+        internal char hyphenChar;
+
+        internal string errMsg;
+
+        internal const int ELEM_CLASSES = 1;
+
+        internal const int ELEM_EXCEPTIONS = 2;
+
+        internal const int ELEM_PATTERNS = 3;
+
+        internal const int ELEM_HYPHEN = 4;
+
+        /// <summary>
+        /// Creates a parser with an empty token buffer and '-' as the hyphen character.
+        /// No consumer is assigned by this constructor.
+        /// </summary>
+        public PatternParser()
+        {
+            this.hyphenChar = '-'; // default
+            this.token = new StringBuilder();
+        }
+
+        /// <summary>
+        /// Creates a parser through the default constructor and stores the given consumer.
+        /// </summary>
+        /// <param name="consumer"> the consumer to store in the consumer field </param>
+        public PatternParser(IPatternConsumer consumer) : this()
+        {
+            this.consumer = consumer;
+        }
+
+        /// <summary>
+        /// Sets the consumer stored by this parser. The property is write-only.
+        /// </summary>
+        public virtual IPatternConsumer Consumer
+        {
+            set
+            {
+                this.consumer = value;
+            }
+        }
+
+        /// <summary>
+        /// Parses the hyphenation pattern file at the given location.
+        /// </summary>
+        /// <param name="filename"> the filename </param>
+        /// <exception cref="IOException"> In case of an exception while parsing </exception>
+        public virtual void Parse(string filename)
+        {
+            // LUCENENET TODO: Create overloads that allow XmlReaderSettings to be passed in.
+            // DTD processing is enabled and DTD lookups go through DtdResolver.
+            var settings = new XmlReaderSettings
+            {
+                DtdProcessing = DtdProcessing.Parse,
+                XmlResolver = new DtdResolver()
+            };
+
+            using (XmlReader src = XmlReader.Create(filename, settings))
+            {
+                Parse(src);
+            }
+        }
+
+        /// <summary>
+        /// Parses a hyphenation pattern file, reading it with <see cref="Encoding.UTF8"/>.
+        /// </summary>
+        /// <param name="file"> the pattern file </param>
+        public virtual void Parse(FileInfo file)
+        {
+            // delegate to the encoding-aware overload with UTF-8 as the encoding
+            Parse(file, Encoding.UTF8);
+        }
+
+        /// <summary>
+        /// Parses a hyphenation pattern file, decoding it with the given encoding.
+        /// </summary>
+        /// <param name="file"> the pattern file </param>
+        /// <param name="encoding"> the character encoding used to read the file </param>
+        public virtual void Parse(FileInfo file, Encoding encoding)
+        {
+            // XmlReaderSettings.CloseInput defaults to false, so disposing the XmlReader does
+            // not close the StreamReader it wraps. Dispose the StreamReader explicitly so the
+            // file handle is released as soon as parsing finishes or fails.
+            using (var input = new StreamReader(file.FullName, encoding))
+            using (var src = XmlReader.Create(input, new XmlReaderSettings
+            {
+                DtdProcessing = DtdProcessing.Parse,
+                XmlResolver = new DtdResolver()
+            }))
+            {
+                Parse(src);
+            }
+        }
+
+        /// <summary>
+        /// Parses hyphenation patterns from an XML stream. The XmlReader created over the
+        /// stream is disposed when parsing ends.
+        /// </summary>
+        /// <param name="xmlStream"> the stream containing the pattern XML </param>
+        public virtual void Parse(Stream xmlStream)
+        {
+            // DTD processing is enabled and DTD lookups go through DtdResolver.
+            var settings = new XmlReaderSettings
+            {
+                DtdProcessing = DtdProcessing.Parse,
+                XmlResolver = new DtdResolver()
+            };
+
+            using (XmlReader src = XmlReader.Create(xmlStream, settings))
+            {
+                Parse(src);
+            }
+        }
+
+        /// <summary>
+        /// Parses hyphenation patterns from an already created <see cref="XmlReader"/>,
+        /// passing each node that is read to the private node handler.
+        /// </summary>
+        /// <param name="source"> the XmlReader positioned at or before the document content </param>
+        /// <exception cref="XmlException"> Thrown by the reader if the XML cannot be parsed </exception>
+        public virtual void Parse(XmlReader source)
+        {
+            // Skip ahead to the first content node (normally the root element).
+            source.MoveToContent();
+            // Read() advances before the first ParseNode call, so the node MoveToContent
+            // stopped on is itself never dispatched. NOTE(review): confirm this is intended.
+            while (source.Read())
+            {
+                ParseNode(source);
+            }
+        }
+
+        /// <summary>
+        /// Dispatches the reader's current node to the SAX-style handlers: elements go to
+        /// StartElement (and EndElement when self-closing), text nodes to Characters, and
+        /// end tags to EndElement. All other node types are ignored.
+        /// </summary>
+        /// <param name="node"> the reader, positioned on the node to dispatch </param>
+        private void ParseNode(XmlReader node)
+        {
+            string uri, name, raw;
+            switch (node.NodeType)
+            {
+                case XmlNodeType.Element:
+
+                    // Element start
+                    uri = node.NamespaceURI;
+                    name = node.Name;
+                    // Capture this while the reader is still on the element: GetAttributes
+                    // moves the reader onto attribute nodes, where IsEmptyElement is false,
+                    // which would suppress EndElement for e.g. <hyphen pre="-" />.
+                    bool isEmptyElement = node.IsEmptyElement;
+                    var attributes = GetAttributes(node);
+                    raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+                    this.StartElement(uri, name, raw, attributes);
+                    if (isEmptyElement)
+                    {
+                        // the reader emits no EndElement node for <x/>, so close it here
+                        this.EndElement(uri, name, raw);
+                    }
+                    break;
+
+                case XmlNodeType.Text:
+
+                    this.Characters(node.Value.ToCharArray(), 0, node.Value.Length);
+                    break;
+
+                case XmlNodeType.EndElement:
+                    uri = node.NamespaceURI;
+                    name = node.Name;
+                    raw = string.Empty; // node.ReadOuterXml(); - not used, but was messing with the node pointer
+
+                    // Element end
+                    this.EndElement(uri, name, raw);
+                    break;
+            }
+        }
+
+        /// <summary>
+        /// Collects the attributes of the element the reader is positioned on into a
+        /// name/value dictionary and leaves the reader positioned on that element.
+        /// </summary>
+        /// <param name="node"> the reader, positioned on an element </param>
+        /// <returns> the attribute names mapped to their values; empty if there are none </returns>
+        private IDictionary<string, string> GetAttributes(XmlReader node)
+        {
+            var result = new Dictionary<string, string>();
+            if (node.HasAttributes)
+            {
+                for (int i = 0; i < node.AttributeCount; i++)
+                {
+                    node.MoveToAttribute(i);
+                    result.Add(node.Name, node.Value);
+                }
+
+                // Return to the owning element so element-level properties such as
+                // IsEmptyElement are still meaningful for the caller afterwards.
+                node.MoveToElement();
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Extracts the next whitespace-delimited word from <paramref name="chars"/>,
+        /// removing the consumed characters from it. A word that is not yet terminated by
+        /// whitespace is kept in the token buffer and completed by a later call.
+        /// </summary>
+        /// <param name="chars"> the pending characters; consumed characters are removed </param>
+        /// <returns> the next complete word, or null if no complete word is available yet </returns>
+        protected internal virtual string ReadToken(StringBuilder chars)
+        {
+            string word;
+
+            // count and strip leading whitespace
+            int i = 0;
+            while (i < chars.Length && char.IsWhiteSpace(chars[i]))
+            {
+                i++;
+            }
+            if (i > 0)
+            {
+                chars.Remove(0, i);
+                // whitespace terminates a word carried over from a previous call
+                if (token.Length > 0)
+                {
+                    word = token.ToString();
+                    token.Length = 0;
+                    return word;
+                }
+            }
+
+            // find the end of the next run of non-whitespace characters
+            i = 0;
+            while (i < chars.Length && !char.IsWhiteSpace(chars[i]))
+            {
+                i++;
+            }
+            bool space = i < chars.Length; // true if the run was ended by whitespace
+
+            token.Append(chars.ToString(0, i));
+            chars.Remove(0, i);
+            if (space)
+            {
+                word = token.ToString();
+                token.Length = 0;
+                return word;
+            }
+
+            // no terminating whitespace: chars is empty at this point, so this appends
+            // nothing; the partial word stays in token for the next call
+            token.Append(chars);
+            return null;
+        }
+
+        /// <summary>
+        /// Returns <paramref name="word"/> with every digit character removed.
+        /// </summary>
+        /// <param name="word">Text that may contain digit characters.</param>
+        /// <returns>The non-digit characters of <paramref name="word"/>, in order.</returns>
+        protected internal static string GetPattern(string word)
+        {
+            StringBuilder letters = new StringBuilder();
+            foreach (char c in word)
+            {
+                if (char.IsDigit(c))
+                {
+                    continue;
+                }
+                letters.Append(c);
+            }
+            return letters.ToString();
+        }
+
+        /// <summary>
+        /// Builds a new list from <paramref name="ex"/> in which every string item is
+        /// split at each occurrence of <c>hyphenChar</c>: the text before the character
+        /// is added as a string and the character itself is replaced by a
+        /// <c>Hyphen</c> built from it. Non-string items are copied over unchanged.
+        /// </summary>
+        /// <param name="ex">Items to normalize; not modified.</param>
+        /// <returns>A new list holding the normalized items.</returns>
+        protected internal virtual List<object> NormalizeException<T1>(List<T1> ex)
+        {
+            List<object> normalized = new List<object>();
+            foreach (T1 element in ex)
+            {
+                object item = element;
+                string text = item as string;
+                if (text == null)
+                {
+                    normalized.Add(item);
+                    continue;
+                }
+
+                StringBuilder segment = new StringBuilder();
+                foreach (char c in text)
+                {
+                    if (c == hyphenChar)
+                    {
+                        // The segment is emitted even when it is empty.
+                        normalized.Add(segment.ToString());
+                        segment.Length = 0;
+                        // we use here hyphenChar which is not necessarily
+                        // the one to be printed
+                        normalized.Add(new Hyphen(new string(hyphenChar, 1), null, null));
+                    }
+                    else
+                    {
+                        segment.Append(c);
+                    }
+                }
+
+                if (segment.Length > 0)
+                {
+                    normalized.Add(segment.ToString());
+                }
+            }
+            return normalized;
+        }
+
+        /// <summary>
+        /// Concatenates the items of <paramref name="ex"/> into a single word: string
+        /// items are appended as they are, and every other item is cast to
+        /// <c>Hyphen</c> and contributes its <c>noBreak</c> value when that is not null.
+        /// </summary>
+        /// <param name="ex">Items to concatenate.</param>
+        /// <returns>The concatenated text.</returns>
+        protected internal virtual string GetExceptionWord<T1>(List<T1> ex)
+        {
+            StringBuilder word = new StringBuilder();
+            foreach (T1 element in ex)
+            {
+                object item = element;
+                string text = item as string;
+                if (text != null)
+                {
+                    word.Append(text);
+                    continue;
+                }
+
+                Hyphen hyphen = (Hyphen)item;
+                if (hyphen.noBreak != null)
+                {
+                    word.Append(hyphen.noBreak);
+                }
+            }
+            return word.ToString();
+        }
+
+        /// <summary>
+        /// Scans <paramref name="pat"/> (with one dummy letter appended) and builds a
+        /// string of values: a digit is copied to the output and the character that
+        /// follows it is skipped; any other character produces '0'.
+        /// </summary>
+        /// <param name="pat">Pattern text that may contain digit characters.</param>
+        /// <returns>The string of values.</returns>
+        protected internal static string GetInterletterValues(string pat)
+        {
+            // The appended dummy letter serves as a sentinel for the scan below.
+            string padded = pat + "a";
+            StringBuilder values = new StringBuilder();
+            int pos = 0;
+            while (pos < padded.Length)
+            {
+                char current = padded[pos];
+                if (char.IsDigit(current))
+                {
+                    values.Append(current);
+                    pos += 2; // step over the digit and the character after it
+                }
+                else
+                {
+                    values.Append('0');
+                    pos++;
+                }
+            }
+            return values.ToString();
+        }
+
+        /// <summary>
+        /// LUCENENET specific resolver that serves "hyphenation.dtd" from an embedded
+        /// assembly resource instead of resolving it through the base
+        /// <see cref="XmlUrlResolver"/>. Every other entity is delegated to the base class.
+        /// </summary>
+        internal class DtdResolver : XmlUrlResolver
+        {
+            public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
+            {
+                const string dtdFilename = "hyphenation.dtd";
+
+                // Only the last path segment is compared, so the DTD is matched
+                // regardless of the directory part of the URI.
+                string lastSegment = absoluteUri.Segments.LastOrDefault();
+                if (!dtdFilename.Equals(lastSegment))
+                {
+                    return base.GetEntity(absoluteUri, role, ofObjectToReturn);
+                }
+
+                // The resource name is the namespace of this type followed by the file name.
+                Type self = GetType();
+                string qualifiedDtdFilename = self.Namespace + "." + dtdFilename;
+                return self.Assembly.GetManifestResourceStream(qualifiedDtdFilename);
+            }
+        }
+
+        //
+        // ContentHandler methods
+        //
+
+        /// <summary>
+        /// Handles the start of an element (counterpart of the SAX
+        /// <c>ContentHandler.startElement</c> callback). Depending on the element name
+        /// it updates <c>hyphenChar</c>, switches <c>currElement</c>, or records a
+        /// <c>Hyphen</c> in the current exception. The <c>token</c> buffer is always
+        /// cleared on exit.
+        /// </summary>
+        /// <param name="uri">Namespace URI of the element; not used by this method.</param>
+        /// <param name="local">Local name of the element; selects the action taken.</param>
+        /// <param name="raw">Raw (qualified) name of the element; not used by this method.</param>
+        /// <param name="attrs">Attributes of the element, keyed by attribute name.</param>
+        public void StartElement(string uri, string local, string raw, IDictionary<string, string> attrs)
+        {
+            if (local.Equals("hyphen-char"))
+            {
+                // Only a single-character value replaces the hyphen character.
+                string h;
+                if (attrs.TryGetValue("value", out h) && h != null && h.Length == 1)
+                {
+                    hyphenChar = h[0];
+                }
+            }
+            else if (local.Equals("classes"))
+            {
+                currElement = ELEM_CLASSES;
+            }
+            else if (local.Equals("patterns"))
+            {
+                currElement = ELEM_PATTERNS;
+            }
+            else if (local.Equals("exceptions"))
+            {
+                currElement = ELEM_EXCEPTIONS;
+                exception = new List<object>();
+            }
+            else if (local.Equals("hyphen"))
+            {
+                // Text collected before the hyphen belongs to the current exception.
+                if (token.Length > 0)
+                {
+                    exception.Add(token.ToString());
+                }
+
+                // An attribute that is absent from attrs yields null, matching how the
+                // "hyphen-char" branch treats a missing "value". The dictionary indexer
+                // would throw KeyNotFoundException for an omitted attribute instead.
+                string pre;
+                string no;
+                string post;
+                attrs.TryGetValue("pre", out pre);
+                attrs.TryGetValue("no", out no);
+                attrs.TryGetValue("post", out post);
+                exception.Add(new Hyphen(pre, no, post));
+                currElement = ELEM_HYPHEN;
+            }
+            token.Length = 0;
+        }
+
+        /// <summary>
+        /// Handles the end of an element (counterpart of the SAX
+        /// <c>ContentHandler.endElement</c> callback). Any text still held in the
+        /// <c>token</c> buffer is handed to the consumer according to
+        /// <c>currElement</c>, after which <c>currElement</c> falls back to
+        /// <c>ELEM_EXCEPTIONS</c> when a hyphen element ends and to 0 otherwise.
+        /// </summary>
+        /// <param name="uri">Namespace URI of the element; not used by this method.</param>
+        /// <param name="local">Local name of the element; not used by this method.</param>
+        /// <param name="raw">Raw (qualified) name of the element; not used by this method.</param>
+        public void EndElement(string uri, string local, string raw)
+        {
+            if (token.Length > 0)
+            {
+                string word = token.ToString();
+                if (currElement == ELEM_CLASSES)
+                {
+                    consumer.AddClass(word);
+                }
+                else if (currElement == ELEM_EXCEPTIONS)
+                {
+                    exception.Add(word);
+                    exception = NormalizeException(exception);
+                    consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+                }
+                else if (currElement == ELEM_PATTERNS)
+                {
+                    consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+                }
+                // ELEM_HYPHEN: nothing to do
+
+                // The buffer is kept when a hyphen element ends.
+                if (currElement != ELEM_HYPHEN)
+                {
+                    token.Length = 0;
+                }
+            }
+
+            if (currElement != ELEM_HYPHEN)
+            {
+                currElement = 0;
+            }
+            else
+            {
+                currElement = ELEM_EXCEPTIONS;
+            }
+        }
+
+        /// <summary>
+        /// Handles character data (counterpart of the SAX
+        /// <c>ContentHandler.characters</c> callback). The given slice is split into
+        /// words with <c>ReadToken</c> and each completed word is handed to the
+        /// consumer according to <c>currElement</c>.
+        /// </summary>
+        /// <param name="ch">Buffer holding the character data.</param>
+        /// <param name="start">Index of the first character to process.</param>
+        /// <param name="length">Number of characters to process.</param>
+        public void Characters(char[] ch, int start, int length)
+        {
+            StringBuilder chars = new StringBuilder(length);
+            chars.Append(ch, start, length);
+
+            for (string word = ReadToken(chars); !string.IsNullOrEmpty(word); word = ReadToken(chars))
+            {
+                if (currElement == ELEM_CLASSES)
+                {
+                    consumer.AddClass(word);
+                }
+                else if (currElement == ELEM_EXCEPTIONS)
+                {
+                    exception.Add(word);
+                    exception = NormalizeException(exception);
+                    // The consumer receives a copy, so clearing the list below does not affect it.
+                    consumer.AddException(GetExceptionWord(exception), new List<object>(exception));
+                    exception.Clear();
+                }
+                else if (currElement == ELEM_PATTERNS)
+                {
+                    consumer.AddPattern(GetPattern(word), GetInterletterValues(word));
+                }
+            }
+        }
+    }
 }
\ No newline at end of file


Mime
View raw message