lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [31/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:35 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
new file mode 100644
index 0000000..58b40a1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
@@ -0,0 +1,202 @@
+using System.Diagnostics;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Base class for decomposition token filters.
+	/// <para>
+	/// <a name="version"></a>
+	/// You must specify the required <see cref="Version"/> compatibility when creating
+	/// a CompoundWordTokenFilterBase:
+	/// <list type="bullet">
+	/// <item><description>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+	/// supplementary characters in strings and char arrays provided as compound word
+	/// dictionaries.</description></item>
+	/// <item><description>As of 4.4, CompoundWordTokenFilterBase doesn't update offsets:
+	/// every sub-token reports the offsets of the original token.</description></item>
+	/// </list>
+	/// </para>
+	/// </summary>
+	public abstract class CompoundWordTokenFilterBase : TokenFilter
+	{
+	  /// <summary>
+	  /// The default minimal word length; only terms at least this long are decomposed.
+	  /// </summary>
+	  public const int DEFAULT_MIN_WORD_SIZE = 5;
+
+	  /// <summary>
+	  /// The default for minimal length of subwords that get propagated to the output of this filter
+	  /// </summary>
+	  public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+	  /// <summary>
+	  /// The default for maximal length of subwords that get propagated to the output of this filter
+	  /// </summary>
+	  public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+	  protected internal readonly Version matchVersion;
+	  protected internal readonly CharArraySet dictionary;
+	  /// <summary>
+	  /// Sub-tokens produced by <see cref="decompose"/> that still have to be emitted.
+	  /// </summary>
+	  protected internal readonly LinkedList<CompoundToken> tokens;
+	  protected internal readonly int minWordSize;
+	  protected internal readonly int minSubwordSize;
+	  protected internal readonly int maxSubwordSize;
+	  protected internal readonly bool onlyLongestMatch;
+
+	  // Assigned in the primary constructor: a C# field initializer may not call an
+	  // instance method such as addAttribute (compiler error CS0236).
+	  protected internal readonly CharTermAttribute termAtt;
+	  protected internal readonly OffsetAttribute offsetAtt;
+	  private readonly PositionIncrementAttribute posIncAtt;
+
+	  /// <summary>
+	  /// State of the original token, captured only while sub-tokens are pending.
+	  /// </summary>
+	  private AttributeSource.State current;
+
+	  /// <summary>
+	  /// Creates a filter using the default word and subword size limits.
+	  /// </summary>
+	  protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a filter using the default size limits and <c>onlyLongestMatch = false</c>.
+	  /// </summary>
+	  protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a filter with explicit size limits.
+	  /// </summary>
+	  /// <param name="matchVersion"> version compatibility switch, consulted when computing sub-token offsets </param>
+	  /// <param name="input"> the <see cref="TokenStream"/> to process </param>
+	  /// <param name="dictionary"> the word dictionary; stored as given, not validated here </param>
+	  /// <param name="minWordSize"> only terms at least this long are decomposed </param>
+	  /// <param name="minSubwordSize"> minimal subword length, must not be negative </param>
+	  /// <param name="maxSubwordSize"> maximal subword length, must not be negative </param>
+	  /// <param name="onlyLongestMatch"> flag made available to subclasses through the field of the same name </param>
+	  /// <exception cref="System.ArgumentException"> if any of the size arguments is negative </exception>
+	  protected internal CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(input)
+	  {
+		// Register the attributes first, after base(input) has run. This mirrors the
+		// Java original, where field initializers execute after the super constructor.
+		this.termAtt = addAttribute(typeof(CharTermAttribute));
+		this.offsetAtt = addAttribute(typeof(OffsetAttribute));
+		this.posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+
+		this.matchVersion = matchVersion;
+		// C# has no diamond operator; the element type must be spelled out.
+		this.tokens = new LinkedList<CompoundToken>();
+		if (minWordSize < 0)
+		{
+		  throw new System.ArgumentException("minWordSize cannot be negative");
+		}
+		this.minWordSize = minWordSize;
+		if (minSubwordSize < 0)
+		{
+		  throw new System.ArgumentException("minSubwordSize cannot be negative");
+		}
+		this.minSubwordSize = minSubwordSize;
+		if (maxSubwordSize < 0)
+		{
+		  throw new System.ArgumentException("maxSubwordSize cannot be negative");
+		}
+		this.maxSubwordSize = maxSubwordSize;
+		this.onlyLongestMatch = onlyLongestMatch;
+		this.dictionary = dictionary;
+	  }
+
+	  /// <summary>
+	  /// Emits pending sub-tokens first (each with position increment 0); otherwise pulls
+	  /// the next token from the input, decomposes it when it is long enough and passes
+	  /// the original token through.
+	  /// </summary>
+	  /// <returns> <c>true</c> if a token was produced, <c>false</c> when the input is exhausted </returns>
+	  public override bool incrementToken()
+	  {
+		if (tokens.Count > 0)
+		{
+		  Debug.Assert(current != null);
+		  // LinkedList<T>.RemoveFirst() returns void in .NET, so read the head before removing it.
+		  CompoundToken token = tokens.First.Value;
+		  tokens.RemoveFirst();
+		  restoreState(current); // keep all other attributes untouched
+		  termAtt.setEmpty().append(token.txt);
+		  offsetAtt.setOffset(token.startOffset, token.endOffset);
+		  posIncAtt.PositionIncrement = 0;
+		  return true;
+		}
+
+		current = null; // not really needed, but for safety
+		if (input.incrementToken())
+		{
+		  // Only words at least minWordSize long get processed
+		  if (termAtt.length() >= this.minWordSize)
+		  {
+			decompose();
+			// only capture the state if we really need it for producing new tokens
+			if (tokens.Count > 0)
+			{
+			  current = captureState();
+			}
+		  }
+		  // return original token:
+		  return true;
+		}
+		else
+		{
+		  return false;
+		}
+	  }
+
+	  /// <summary>
+	  /// Decomposes the current <see cref="termAtt"/> and places <see cref="CompoundToken"/> instances in the <see cref="tokens"/> list.
+	  /// The original token may not be placed in the list, as it is automatically passed through this filter.
+	  /// </summary>
+	  protected internal abstract void decompose();
+
+	  /// <summary>
+	  /// Resets the base filter and discards pending sub-tokens and the captured state.
+	  /// </summary>
+	  public override void reset()
+	  {
+		base.reset();
+		tokens.Clear();
+		current = null;
+	  }
+
+	  /// <summary>
+	  /// Helper class to hold decompounded token information
+	  /// </summary>
+	  protected internal class CompoundToken
+	  {
+		public readonly CharSequence txt;
+		public readonly int startOffset, endOffset;
+
+		/// <summary>
+		/// Construct the compound token based on a slice of the current <see cref="CompoundWordTokenFilterBase.termAtt"/>. </summary>
+		/// <param name="outerInstance"> the filter whose current term and offsets are read; it is not retained </param>
+		/// <param name="offset"> start of the slice within the current term </param>
+		/// <param name="length"> length of the slice </param>
+		public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
+		{
+		  this.txt = outerInstance.termAtt.subSequence(offset, offset + length);
+
+		  // offsets of the original word
+		  int startOff = outerInstance.offsetAtt.startOffset();
+		  int endOff = outerInstance.offsetAtt.endOffset();
+
+		  if (outerInstance.matchVersion.onOrAfter(Version.LUCENE_44) || endOff - startOff != outerInstance.termAtt.length())
+		  {
+			// if length by start + end offsets doesn't match the term text then assume
+			// this is a synonym and don't adjust the offsets.
+			this.startOffset = startOff;
+			this.endOffset = endOff;
+		  }
+		  else
+		  {
+			int newStart = startOff + offset;
+			this.startOffset = newStart;
+			this.endOffset = newStart + length;
+		  }
+		}
+
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
new file mode 100644
index 0000000..6b875e0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilter.cs
@@ -0,0 +1,137 @@
+namespace org.apache.lucene.analysis.compound
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// A <see cref="TokenFilter"/> that decomposes compound words found in many Germanic languages.
+	/// <para>
+	/// "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
+	/// "Donaudampfschiff" even when you only enter "schiff".
+	/// Every candidate substring is looked up in the dictionary (brute force).
+	/// </para>
+	/// <para>
+	/// You must specify the required <see cref="Version"/> compatibility when creating
+	/// the filter; see <see cref="CompoundWordTokenFilterBase"/> for what it controls.
+	/// </para>
+	/// </summary>
+	public class DictionaryCompoundWordTokenFilter : CompoundWordTokenFilterBase
+	{
+
+	  /// <summary>
+	  /// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/> with the default
+	  /// size limits of <see cref="CompoundWordTokenFilterBase"/>.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version compatibility switch; see
+	  ///          <see cref="CompoundWordTokenFilterBase"/> for details. </param>
+	  /// <param name="input">
+	  ///          the <see cref="TokenStream"/> to process </param>
+	  /// <param name="dictionary">
+	  ///          the word dictionary to match against; must not be null. </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="dictionary"/> is null </exception>
+	  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) : base(matchVersion, input, dictionary)
+	  {
+		if (dictionary == null)
+		{
+		  throw new System.ArgumentException("dictionary cannot be null");
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates a new <see cref="DictionaryCompoundWordTokenFilter"/> with explicit size limits.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version compatibility switch; see
+	  ///          <see cref="CompoundWordTokenFilterBase"/> for details. </param>
+	  /// <param name="input">
+	  ///          the <see cref="TokenStream"/> to process </param>
+	  /// <param name="dictionary">
+	  ///          the word dictionary to match against; must not be null. </param>
+	  /// <param name="minWordSize">
+	  ///          only words at least this long get processed </param>
+	  /// <param name="minSubwordSize">
+	  ///          shortest subword length that is looked up (inclusive) </param>
+	  /// <param name="maxSubwordSize">
+	  ///          longest subword length that is looked up (inclusive) </param>
+	  /// <param name="onlyLongestMatch">
+	  ///          for each start position, add only the longest matching subword to the stream </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="dictionary"/> is null </exception>
+	  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
+	  {
+		if (dictionary == null)
+		{
+		  throw new System.ArgumentException("dictionary cannot be null");
+		}
+	  }
+
+	  /// <summary>
+	  /// Looks up every substring of the current term whose length lies between
+	  /// minSubwordSize and maxSubwordSize (both inclusive) and queues the dictionary hits.
+	  /// </summary>
+	  protected internal override void decompose()
+	  {
+		int termLength = termAtt.length();
+		// last start position that still leaves room for a subword of minimal size
+		int lastStart = termLength - this.minSubwordSize;
+		for (int start = 0; start <= lastStart; ++start)
+		{
+			CompoundToken longest = null;
+			// sizes grow until they exceed maxSubwordSize or run past the end of the term
+			for (int size = this.minSubwordSize; size <= this.maxSubwordSize && start + size <= termLength; ++size)
+			{
+				if (!dictionary.contains(termAtt.buffer(), start, size))
+				{
+					continue;
+				}
+				if (!this.onlyLongestMatch)
+				{
+					tokens.AddLast(new CompoundToken(this, start, size));
+				}
+				else if (longest == null || longest.txt.length() < size)
+				{
+					longest = new CompoundToken(this, start, size);
+				}
+			}
+			if (this.onlyLongestMatch && longest != null)
+			{
+			  tokens.AddLast(longest);
+			}
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
new file mode 100644
index 0000000..497d89d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/DictionaryCompoundWordTokenFilterFactory.cs
@@ -0,0 +1,81 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+	using TokenFilterFactory = TokenFilterFactory;
+
+
+	/// <summary>
+	/// Factory for <see cref="DictionaryCompoundWordTokenFilter"/>. 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
+	///         minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class DictionaryCompoundWordTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  // Loaded by inform(); stays null until then.
+	  private CharArraySet dictionary;
+	  private readonly string dictFile;
+	  private readonly int minWordSize;
+	  private readonly int minSubwordSize;
+	  private readonly int maxSubwordSize;
+	  private readonly bool onlyLongestMatch;
+
+	  /// <summary>
+	  /// Creates a new DictionaryCompoundWordTokenFilterFactory </summary>
+	  /// <param name="args">
+	  ///          factory parameters: "dictionary" (required), and optionally "minWordSize",
+	  ///          "minSubwordSize", "maxSubwordSize" and "onlyLongestMatch" (defaults to true here) </param>
+	  /// <exception cref="System.ArgumentException"> if entries remain in <paramref name="args"/>
+	  ///          after the known parameters have been read </exception>
+	  public DictionaryCompoundWordTokenFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		// NOTE(review): require/getInt/getBoolean presumably remove the keys they read,
+		// which is what makes the leftover check below meaningful - confirm in TokenFilterFactory.
+		dictFile = require(args, "dictionary");
+		minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+		minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+		maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+		onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would only print its type name in .NET,
+		  // so the remaining entries are formatted explicitly.
+		  throw new System.ArgumentException("Unknown parameters: " + FormatArgs(args));
+		}
+	  }
+
+	  /// <summary>
+	  /// Renders the entries as <c>{key=value, key=value}</c>, the layout Java's
+	  /// <c>Map.toString()</c> produced in the original code.
+	  /// </summary>
+	  private static string FormatArgs(IDictionary<string, string> args)
+	  {
+		System.Text.StringBuilder text = new System.Text.StringBuilder("{");
+		foreach (KeyValuePair<string, string> entry in args)
+		{
+		  if (text.Length > 1)
+		  {
+			text.Append(", ");
+		  }
+		  text.Append(entry.Key).Append('=').Append(entry.Value);
+		}
+		return text.Append('}').ToString();
+	  }
+
+	  /// <summary>
+	  /// Loads the word set named by the "dictionary" parameter through the given loader.
+	  /// </summary>
+	  /// <param name="loader"> resource loader used to resolve the dictionary file </param>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		dictionary = base.getWordSet(loader, dictFile, false);
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a <see cref="DictionaryCompoundWordTokenFilter"/>,
+	  /// or returns it unchanged when no dictionary is available.
+	  /// </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		// if the dictionary is null, it means it was empty
+		return dictionary == null ? input : new DictionaryCompoundWordTokenFilter(luceneMatchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+	  }
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
new file mode 100644
index 0000000..0b5e99c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilter.cs
@@ -0,0 +1,255 @@
+namespace org.apache.lucene.analysis.compound
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using Hyphenation = org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
+	using HyphenationTree = org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using Version = org.apache.lucene.util.Version;
+	using InputSource = org.xml.sax.InputSource;
+
+	/// <summary>
+	/// A <see cref="TokenFilter"/> that decomposes compound words found in many Germanic languages.
+	/// <para>
+	/// "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
+	/// "Donaudampfschiff" even when you only enter "schiff". Candidate parts are taken
+	/// between hyphenation points and, when a dictionary is given, checked against it.
+	/// </para>
+	/// <para>
+	/// You must specify the required <see cref="Version"/> compatibility when creating
+	/// the filter; see <see cref="CompoundWordTokenFilterBase"/> for what it controls.
+	/// </para>
+	/// </summary>
+	public class HyphenationCompoundWordTokenFilter : CompoundWordTokenFilterBase
+	{
+	  private HyphenationTree hyphenator;
+
+	  /// <summary>
+	  /// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance with the
+	  /// default size limits and <c>onlyLongestMatch = false</c>.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version compatibility switch; see
+	  ///          <see cref="CompoundWordTokenFilterBase"/> for details. </param>
+	  /// <param name="input">
+	  ///          the <see cref="TokenStream"/> to process </param>
+	  /// <param name="hyphenator">
+	  ///          the hyphenation pattern tree to use for hyphenation </param>
+	  /// <param name="dictionary">
+	  ///          the word dictionary to match against. </param>
+	  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary) : this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a new <see cref="HyphenationCompoundWordTokenFilter"/> instance with explicit limits.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version compatibility switch; see
+	  ///          <see cref="CompoundWordTokenFilterBase"/> for details. </param>
+	  /// <param name="input">
+	  ///          the <see cref="TokenStream"/> to process </param>
+	  /// <param name="hyphenator">
+	  ///          the hyphenation pattern tree to use for hyphenation </param>
+	  /// <param name="dictionary">
+	  ///          the word dictionary to match against; null accepts every candidate part. </param>
+	  /// <param name="minWordSize">
+	  ///          only words at least this long get processed </param>
+	  /// <param name="minSubwordSize">
+	  ///          parts shorter than this are skipped </param>
+	  /// <param name="maxSubwordSize">
+	  ///          parts longer than this end the scan for the current start point </param>
+	  /// <param name="onlyLongestMatch">
+	  ///          for each start point, add only the longest matching subword to the stream </param>
+	  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch)
+	  {
+		this.hyphenator = hyphenator;
+	  }
+
+	  /// <summary>
+	  /// Create a HyphenationCompoundWordTokenFilter with no dictionary.
+	  /// <para>
+	  /// Delegates to the full constructor with a null dictionary and
+	  /// <c>onlyLongestMatch = false</c>.
+	  /// </para>
+	  /// </summary>
+	  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, int minWordSize, int minSubwordSize, int maxSubwordSize) : this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Create a HyphenationCompoundWordTokenFilter with no dictionary.
+	  /// <para>
+	  /// Delegates to the dictionary-less constructor with DEFAULT_MIN_WORD_SIZE,
+	  /// DEFAULT_MIN_SUBWORD_SIZE and DEFAULT_MAX_SUBWORD_SIZE.
+	  /// </para>
+	  /// </summary>
+	  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator) : this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Create a hyphenator tree
+	  /// </summary>
+	  /// <param name="hyphenationFilename"> the filename of the XML grammar to load </param>
+	  /// <returns> An object representing the hyphenation patterns </returns>
+	  /// <exception cref="System.IO.IOException"> If there is a low-level I/O error (declared by the Java original). </exception>
+	  public static HyphenationTree getHyphenationTree(string hyphenationFilename)
+	  {
+		InputSource source = new InputSource(hyphenationFilename);
+		return getHyphenationTree(source);
+	  }
+
+	  /// <summary>
+	  /// Create a hyphenator tree
+	  /// </summary>
+	  /// <param name="hyphenationFile"> the file of the XML grammar to load </param>
+	  /// <returns> An object representing the hyphenation patterns </returns>
+	  /// <exception cref="System.IO.IOException"> If there is a low-level I/O error (declared by the Java original). </exception>
+	  // NOTE(review): File, toURI() and toASCIIString() are java.io/java.net leftovers of the
+	  // raw port - a .NET replacement for this overload still has to be chosen.
+	  public static HyphenationTree getHyphenationTree(File hyphenationFile)
+	  {
+		InputSource source = new InputSource(hyphenationFile.toURI().toASCIIString());
+		return getHyphenationTree(source);
+	  }
+
+	  /// <summary>
+	  /// Create a hyphenator tree
+	  /// </summary>
+	  /// <param name="hyphenationSource"> the InputSource pointing to the XML grammar </param>
+	  /// <returns> An object representing the hyphenation patterns </returns>
+	  /// <exception cref="System.IO.IOException"> If there is a low-level I/O error (declared by the Java original). </exception>
+	  public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
+	  {
+		HyphenationTree patterns = new HyphenationTree();
+		patterns.loadPatterns(hyphenationSource);
+		return patterns;
+	  }
+
+	  /// <summary>
+	  /// Hyphenates the current term and queues every part between two hyphenation points
+	  /// that passes the size limits and the dictionary check (if a dictionary is set).
+	  /// </summary>
+	  protected internal override void decompose()
+	  {
+		Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
+		if (hyphens == null)
+		{
+		  // no hyphenation points found, nothing to decompose
+		  return;
+		}
+
+		int[] points = hyphens.HyphenationPoints;
+
+		for (int first = 0; first < points.Length; ++first)
+		{
+		  int start = points[first];
+		  CompoundToken longest = null;
+		  for (int next = first + 1; next < points.Length; next++)
+		  {
+			int partLength = points[next] - start;
+
+			// a part longer than maxSubwordSize ends the round for this start point
+			if (partLength > this.maxSubwordSize)
+			{
+			  break;
+			}
+
+			// Parts shorter than minSubwordSize are not emitted. The original code notes
+			// that the subtraction above can yield negative 'parts' and that it relies on
+			// minSubwordSize being >= 0 to filter them out here.
+			if (partLength < this.minSubwordSize)
+			{
+			  continue;
+			}
+
+			// Decide which length, if any, matched.
+			int matchLength;
+			if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength))
+			{
+			  matchLength = partLength;
+			}
+			else if (dictionary.contains(termAtt.buffer(), start, partLength - 1))
+			{
+			  // retry one character shorter to avoid problems with genitive 's
+			  // characters and other binding characters
+			  matchLength = partLength - 1;
+			}
+			else
+			{
+			  continue;
+			}
+
+			if (!this.onlyLongestMatch)
+			{
+			  tokens.AddLast(new CompoundToken(this, start, matchLength));
+			}
+			else if (longest == null || longest.txt.length() < matchLength)
+			{
+			  longest = new CompoundToken(this, start, matchLength);
+			}
+		  }
+		  if (this.onlyLongestMatch && longest != null)
+		  {
+			tokens.AddLast(longest);
+		  }
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
new file mode 100644
index 0000000..4a51f7b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/HyphenationCompoundWordTokenFilterFactory.cs
@@ -0,0 +1,125 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.compound
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using HyphenationTree = org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+	using TokenFilterFactory = TokenFilterFactory;
+	using IOUtils = org.apache.lucene.util.IOUtils;
+
+	using InputSource = org.xml.sax.InputSource;
+
+	/// <summary>
+	/// Factory for <seealso cref="HyphenationCompoundWordTokenFilter"/>.
+	/// <para>
+	/// This factory accepts the following parameters:
+	/// <ul>
+	///  <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern. 
+	///  See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
+	///  <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
+	///  <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
+	///  <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
+	///  <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
+	///  <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
+	///  <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword 
+	///    to the stream. defaults to false.
+	/// </ul>
+	/// </para>
+	/// <para>
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
+	///         dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// </para>
+	/// </summary>
+	/// <seealso cref="HyphenationCompoundWordTokenFilter"/>
+	public class HyphenationCompoundWordTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  // Word set loaded by inform(); stays null when no "dictionary" argument was supplied.
+	  private CharArraySet dictionary;
+	  // Hyphenation patterns loaded by inform() from the "hyphenator" resource.
+	  private HyphenationTree hyphenator;
+	  private readonly string dictFile;
+	  private readonly string hypFile;
+	  private readonly string encoding;
+	  private readonly int minWordSize;
+	  private readonly int minSubwordSize;
+	  private readonly int maxSubwordSize;
+	  private readonly bool onlyLongestMatch;
+
+	  /// <summary>
+	  /// Creates a new HyphenationCompoundWordTokenFilterFactory.
+	  /// </summary>
+	  /// <param name="args"> factory arguments; "hyphenator" is required, the others are optional.
+	  ///        Any entry still present after the known arguments have been read is rejected. </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> still contains entries
+	  ///         after all known arguments have been read </exception>
+	  public HyphenationCompoundWordTokenFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		// NOTE(review): the leftover check below presumably relies on get/require/getInt/getBoolean
+		// removing the entries they read - confirm against TokenFilterFactory.
+		dictFile = get(args, "dictionary");
+		encoding = get(args, "encoding");
+		hypFile = require(args, "hyphenator");
+		minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+		minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+		maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+		onlyLongestMatch = getBoolean(args, "onlyLongestMatch", false);
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would only print its type name in .NET,
+		  // so the leftover entries are rendered explicitly.
+		  throw new System.ArgumentException("Unknown parameters: " + describe(args));
+		}
+	  }
+
+	  /// <summary>
+	  /// Renders the entries of <paramref name="args"/> as <c>{key=value, key=value}</c>
+	  /// for use in error messages.
+	  /// </summary>
+	  private static string describe(IDictionary<string, string> args)
+	  {
+		System.Text.StringBuilder text = new System.Text.StringBuilder("{");
+		foreach (KeyValuePair<string, string> entry in args)
+		{
+		  if (text.Length > 1)
+		  {
+			text.Append(", ");
+		  }
+		  text.Append(entry.Key).Append('=').Append(entry.Value);
+		}
+		return text.Append('}').ToString();
+	  }
+
+	  /// <summary>
+	  /// Loads the optional dictionary word set and the hyphenation patterns through
+	  /// <paramref name="loader"/>. The pattern stream is always closed, also on failure.
+	  /// </summary>
+	  /// <param name="loader"> resource loader used to open the dictionary and hyphenator files </param>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		InputStream stream = null;
+		try
+		{
+		  if (dictFile != null) // the dictionary can be empty.
+		  {
+			dictionary = getWordSet(loader, dictFile, false);
+		  }
+		  // TODO: Broken, because we cannot resolve real system id
+		  // ResourceLoader should also supply method like ClassLoader to get resource URL
+		  stream = loader.openResource(hypFile);
+		  InputSource @is = new InputSource(stream);
+		  @is.Encoding = encoding; // if it's null let xml parser decide
+		  @is.SystemId = hypFile;
+		  hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(@is);
+		}
+		finally
+		{
+		  IOUtils.closeWhileHandlingException(stream);
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a <see cref="HyphenationCompoundWordTokenFilter"/>
+	  /// configured with the values read by the constructor and the resources loaded by inform().
+	  /// </summary>
+	  public override HyphenationCompoundWordTokenFilter create(TokenStream input)
+	  {
+		return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
new file mode 100644
index 0000000..963ad0d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/ByteVector.cs
@@ -0,0 +1,151 @@
+using System;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	/// <summary>
+	/// This class implements a simple byte vector with access to the underlying
+	/// array.
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+	/// </summary>
+	public class ByteVector
+	{
+
+	  /// <summary>
+	  /// Capacity increment size
+	  /// </summary>
+	  private const int DEFAULT_BLOCK_SIZE = 2048;
+
+	  /// <summary>
+	  /// Step by which the array grows when <see cref="alloc"/> runs out of room; always positive.
+	  /// </summary>
+	  private int blockSize;
+
+	  /// <summary>
+	  /// The encapsulated array
+	  /// </summary>
+	  private sbyte[] array;
+
+	  /// <summary>
+	  /// Points to next free item
+	  /// </summary>
+	  private int n;
+
+	  /// <summary>
+	  /// Creates an empty vector with the default block size.
+	  /// </summary>
+	  public ByteVector() : this(DEFAULT_BLOCK_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates an empty vector whose initial capacity and growth step are
+	  /// <paramref name="capacity"/>; non-positive values select the default block size.
+	  /// </summary>
+	  public ByteVector(int capacity)
+	  {
+		if (capacity > 0)
+		{
+		  blockSize = capacity;
+		}
+		else
+		{
+		  blockSize = DEFAULT_BLOCK_SIZE;
+		}
+		array = new sbyte[blockSize];
+		n = 0;
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="a"/> (kept by reference) with the default block size.
+	  /// The vector starts out with zero items in use.
+	  /// </summary>
+	  public ByteVector(sbyte[] a)
+	  {
+		blockSize = DEFAULT_BLOCK_SIZE;
+		array = a;
+		n = 0;
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="a"/> (kept by reference) using <paramref name="capacity"/> as the
+	  /// growth step; non-positive values select the default block size.
+	  /// The vector starts out with zero items in use.
+	  /// </summary>
+	  public ByteVector(sbyte[] a, int capacity)
+	  {
+		if (capacity > 0)
+		{
+		  blockSize = capacity;
+		}
+		else
+		{
+		  blockSize = DEFAULT_BLOCK_SIZE;
+		}
+		array = a;
+		n = 0;
+	  }
+
+	  /// <summary>
+	  /// The underlying array itself, not a copy.
+	  /// </summary>
+	  public virtual sbyte[] Array
+	  {
+		  get
+		  {
+			return array;
+		  }
+	  }
+
+	  /// <summary>
+	  /// return number of items in array
+	  /// </summary>
+	  public virtual int length()
+	  {
+		return n;
+	  }
+
+	  /// <summary>
+	  /// returns current capacity of array
+	  /// </summary>
+	  public virtual int capacity()
+	  {
+		return array.Length;
+	  }
+
+	  /// <summary>
+	  /// Stores <paramref name="val"/> at <paramref name="index"/>; no bounds handling beyond the array's own.
+	  /// </summary>
+	  public virtual void put(int index, sbyte val)
+	  {
+		array[index] = val;
+	  }
+
+	  /// <summary>
+	  /// Reads the item at <paramref name="index"/>.
+	  /// </summary>
+	  public virtual sbyte get(int index)
+	  {
+		return array[index];
+	  }
+
+	  /// <summary>
+	  /// This is to implement memory allocation in the array. Like malloc().
+	  /// Reserves <paramref name="size"/> consecutive items, growing the array when needed.
+	  /// </summary>
+	  /// <returns> the index of the first reserved item </returns>
+	  public virtual int alloc(int size)
+	  {
+		int index = n;
+		int len = array.Length;
+		if (n + size >= len)
+		{
+		  // Grow by whole blocks. One block is what the original code added; further
+		  // blocks are added only when a single one cannot hold the request.
+		  int newLen = len + blockSize;
+		  while (n + size > newLen)
+		  {
+			newLen += blockSize;
+		  }
+		  sbyte[] aux = new sbyte[newLen];
+		  // System.Array must be spelled out: a bare "Array" binds to the Array property of this class.
+		  System.Array.Copy(array, 0, aux, 0, len);
+		  array = aux;
+		}
+		n += size;
+		return index;
+	  }
+
+	  /// <summary>
+	  /// Shrinks the underlying array to exactly the number of items in use.
+	  /// </summary>
+	  public virtual void trimToSize()
+	  {
+		if (n < array.Length)
+		{
+		  sbyte[] aux = new sbyte[n];
+		  System.Array.Copy(array, 0, aux, 0, n);
+		  array = aux;
+		}
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
new file mode 100644
index 0000000..6868911
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/CharVector.cs
@@ -0,0 +1,163 @@
+using System;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	/// <summary>
+	/// This class implements a simple char vector with access to the underlying
+	/// array.
+	/// 
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+	/// </summary>
+	public class CharVector : ICloneable
+	{
+
+	  /// <summary>
+	  /// Capacity increment size
+	  /// </summary>
+	  private const int DEFAULT_BLOCK_SIZE = 2048;
+
+	  /// <summary>
+	  /// Step by which the array grows when <see cref="alloc"/> runs out of room; always positive.
+	  /// </summary>
+	  private int blockSize;
+
+	  /// <summary>
+	  /// The encapsulated array
+	  /// </summary>
+	  private char[] array;
+
+	  /// <summary>
+	  /// Points to next free item
+	  /// </summary>
+	  private int n;
+
+	  /// <summary>
+	  /// Creates an empty vector with the default block size.
+	  /// </summary>
+	  public CharVector() : this(DEFAULT_BLOCK_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates an empty vector whose initial capacity and growth step are
+	  /// <paramref name="capacity"/>; non-positive values select the default block size.
+	  /// </summary>
+	  public CharVector(int capacity)
+	  {
+		if (capacity > 0)
+		{
+		  blockSize = capacity;
+		}
+		else
+		{
+		  blockSize = DEFAULT_BLOCK_SIZE;
+		}
+		array = new char[blockSize];
+		n = 0;
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="a"/> (kept by reference) with the default block size.
+	  /// All items of <paramref name="a"/> count as in use.
+	  /// </summary>
+	  public CharVector(char[] a)
+	  {
+		blockSize = DEFAULT_BLOCK_SIZE;
+		array = a;
+		n = a.Length;
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="a"/> (kept by reference) using <paramref name="capacity"/> as the
+	  /// growth step; non-positive values select the default block size.
+	  /// All items of <paramref name="a"/> count as in use.
+	  /// </summary>
+	  public CharVector(char[] a, int capacity)
+	  {
+		if (capacity > 0)
+		{
+		  blockSize = capacity;
+		}
+		else
+		{
+		  blockSize = DEFAULT_BLOCK_SIZE;
+		}
+		array = a;
+		n = a.Length;
+	  }
+
+	  /// <summary>
+	  /// Reset Vector but don't resize or clear elements
+	  /// </summary>
+	  public virtual void clear()
+	  {
+		n = 0;
+	  }
+
+	  /// <summary>
+	  /// Returns a copy that owns its own array and has the same block size and item count.
+	  /// </summary>
+	  public virtual CharVector clone()
+	  {
+		// Array.Clone() returns object, so the result has to be cast back to char[].
+		CharVector cv = new CharVector((char[]) array.Clone(), blockSize);
+		cv.n = this.n;
+		return cv;
+	  }
+
+	  /// <summary>
+	  /// <see cref="ICloneable"/> entry point; delegates to <see cref="clone"/>.
+	  /// </summary>
+	  object ICloneable.Clone()
+	  {
+		return clone();
+	  }
+
+	  /// <summary>
+	  /// The underlying array itself, not a copy.
+	  /// </summary>
+	  public virtual char[] Array
+	  {
+		  get
+		  {
+			return array;
+		  }
+	  }
+
+	  /// <summary>
+	  /// return number of items in array
+	  /// </summary>
+	  public virtual int length()
+	  {
+		return n;
+	  }
+
+	  /// <summary>
+	  /// returns current capacity of array
+	  /// </summary>
+	  public virtual int capacity()
+	  {
+		return array.Length;
+	  }
+
+	  /// <summary>
+	  /// Stores <paramref name="val"/> at <paramref name="index"/>; no bounds handling beyond the array's own.
+	  /// </summary>
+	  public virtual void put(int index, char val)
+	  {
+		array[index] = val;
+	  }
+
+	  /// <summary>
+	  /// Reads the item at <paramref name="index"/>.
+	  /// </summary>
+	  public virtual char get(int index)
+	  {
+		return array[index];
+	  }
+
+	  /// <summary>
+	  /// Reserves <paramref name="size"/> consecutive items, growing the array when needed.
+	  /// </summary>
+	  /// <returns> the index of the first reserved item </returns>
+	  public virtual int alloc(int size)
+	  {
+		int index = n;
+		int len = array.Length;
+		if (n + size >= len)
+		{
+		  // Grow by whole blocks. One block is what the original code added; further
+		  // blocks are added only when a single one cannot hold the request.
+		  int newLen = len + blockSize;
+		  while (n + size > newLen)
+		  {
+			newLen += blockSize;
+		  }
+		  char[] aux = new char[newLen];
+		  // System.Array must be spelled out: a bare "Array" binds to the Array property of this class.
+		  System.Array.Copy(array, 0, aux, 0, len);
+		  array = aux;
+		}
+		n += size;
+		return index;
+	  }
+
+	  /// <summary>
+	  /// Shrinks the underlying array to exactly the number of items in use.
+	  /// </summary>
+	  public virtual void trimToSize()
+	  {
+		if (n < array.Length)
+		{
+		  char[] aux = new char[n];
+		  System.Array.Copy(array, 0, aux, 0, n);
+		  array = aux;
+		}
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
new file mode 100644
index 0000000..819d756
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphen.cs
@@ -0,0 +1,76 @@
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	/// <summary>
+	/// This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
+	/// pre-break text, post-break text and no-break. If no line-break is generated
+	/// at this position, the no-break text is used, otherwise, pre-break and
+	/// post-break are used. Typically, pre-break is equal to the hyphen character
+	/// and the others are empty. However, this general scheme allows support for
+	/// cases in some languages where words change spelling if they're split across
+	/// lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
+	/// from TeX.
+	/// 
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+	/// </summary>
+
+	public class Hyphen
+	{
+	  /// <summary> Text used before the break when a line break is generated here. </summary>
+	  public string preBreak;
+
+	  /// <summary> Text used when no line break is generated at this position. </summary>
+	  public string noBreak;
+
+	  /// <summary> Text used after the break when a line break is generated here. </summary>
+	  public string postBreak;
+
+	  /// <summary>
+	  /// Creates a hyphen from its three parts.
+	  /// </summary>
+	  internal Hyphen(string pre, string no, string post)
+	  {
+		preBreak = pre;
+		noBreak = no;
+		postBreak = post;
+	  }
+
+	  /// <summary>
+	  /// Creates a hyphen that only has a pre-break text; the other two parts are null.
+	  /// </summary>
+	  internal Hyphen(string pre) : this(pre, null, null)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Returns "-" for a plain hyphen (pre-break "-" and nothing else), otherwise
+	  /// the three parts in the form {pre}{post}{no}; null parts are rendered empty.
+	  /// </summary>
+	  public override string ToString()
+	  {
+		bool plainHyphen = preBreak == "-" && postBreak == null && noBreak == null;
+		if (plainHyphen)
+		{
+		  return "-";
+		}
+		return "{" + preBreak + "}{" + postBreak + "}{" + noBreak + "}";
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
new file mode 100644
index 0000000..ccf7387
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/Hyphenation.cs
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	/// <summary>
+	/// This class represents a hyphenated word.
+	/// 
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
+	/// </summary>
+	public class Hyphenation
+	{
+
+	  // Array handed to the constructor; stored and returned by reference, never copied.
+	  private int[] points;
+
+	  /// <summary>
+	  /// Wraps the given array of hyphenation points.
+	  /// </summary>
+	  /// <param name="points"> the hyphenation points of the word </param>
+	  internal Hyphenation(int[] points)
+	  {
+		this.points = points;
+	  }
+
+	  /// <returns> the number of hyphenation points in the word </returns>
+	  public virtual int length()
+	  {
+		int[] current = this.points;
+		return current.Length;
+	  }
+
+	  /// <returns> the hyphenation points </returns>
+	  public virtual int[] HyphenationPoints
+	  {
+		  get
+		  {
+			return this.points;
+		  }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
new file mode 100644
index 0000000..9bc4cc0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/HyphenationTree.cs
@@ -0,0 +1,533 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+
+	using InputSource = org.xml.sax.InputSource;
+
+	/// <summary>
+	/// This tree structure stores the hyphenation patterns in an efficient way for
+	/// fast lookup. It provides the method to hyphenate a word.
+	/// 
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+	/// </summary>
+	public class HyphenationTree : TernaryTree, PatternConsumer
+	{
+
+	  /// <summary>
+	  /// value space: stores the interletter values
+	  /// </summary>
+	  protected internal ByteVector vspace;
+
+	  /// <summary>
+	  /// This map stores hyphenation exceptions
+	  /// </summary>
+	  protected internal Dictionary<string, List<object>> stoplist;
+
+	  /// <summary>
+	  /// This map stores the character classes
+	  /// </summary>
+	  protected internal TernaryTree classmap;
+
+	  /// <summary>
+	  /// Temporary map to store interletter values on pattern loading.
+	  /// </summary>
+	  [NonSerialized]
+	  private TernaryTree ivalues;
+
+	  /// <summary>
+	  /// Creates an empty tree with an empty exception list, an empty character
+	  /// class map and a value space whose index 0 is reserved.
+	  /// </summary>
+	  public HyphenationTree()
+	  {
+		// C# has no diamond operator: the type arguments must be repeated.
+		stoplist = new Dictionary<string, List<object>>(23); // usually a small table
+		classmap = new TernaryTree();
+		vspace = new ByteVector();
+		vspace.alloc(1); // this reserves index 0, which we don't use
+	  }
+
+	  /// <summary>
+	  /// Stores a string of interletter digits in the value space, two digits per
+	  /// byte (first digit in the high nibble). Each digit is stored as digit + 1 so
+	  /// that a zero nibble or byte can serve as terminator.
+	  /// </summary>
+	  /// <param name="values"> a string of digits from '0' to '9' representing the
+	  ///        interletter values. </param>
+	  /// <returns> the index into the vspace array where the packed values are stored. </returns>
+	  protected internal virtual int packValues(string values)
+	  {
+		int count = values.Length;
+		// one byte per pair of digits, one more for a trailing odd digit, plus the terminator byte
+		int byteCount = (count >> 1) + 1 + (count & 1);
+		int offset = vspace.alloc(byteCount);
+		sbyte[] packed = vspace.Array;
+		for (int pos = 0; pos < count; pos++)
+		{
+		  int slot = offset + (pos >> 1);
+		  sbyte nibble = (sbyte)((values[pos] - '0' + 1) & 0x0f);
+		  if ((pos & 1) == 0)
+		  {
+			packed[slot] = (sbyte)(nibble << 4); // big endian
+		  }
+		  else
+		  {
+			packed[slot] = (sbyte)(packed[slot] | nibble);
+		  }
+		}
+		packed[offset + byteCount - 1] = 0; // terminator
+		return offset;
+	  }
+
+	  /// <summary>
+	  /// Decodes the nibble-packed interletter values stored in the value space
+	  /// starting at index <paramref name="k"/> back into a string of digits.
+	  /// Decoding stops at the first zero byte or zero low nibble.
+	  /// </summary>
+	  /// <param name="k"> index into the vspace array of the first packed byte </param>
+	  /// <returns> the decoded digits, one character per stored value </returns>
+	  protected internal virtual string unpackValues(int k)
+	  {
+		StringBuilder buf = new StringBuilder();
+		sbyte v = vspace.get(k++);
+		while (v != 0)
+		{
+		  // Mask after shifting: v is signed, so a high nibble of 8 or more (digits
+		  // '7'..'9' as stored by packValues) would otherwise drag sign bits into the result.
+		  char c = (char)(((v >> 4) & 0x0f) - 1 + '0');
+		  buf.Append(c);
+		  c = (char)(v & 0x0f);
+		  if (c == 0)
+		  {
+			break;
+		  }
+		  c = (char)(c - 1 + '0');
+		  buf.Append(c);
+		  v = vspace.get(k++);
+		}
+		return buf.ToString();
+	  }
+
+	  /// <summary>
+	  /// Reads hyphenation patterns from the XML file <paramref name="f"/> by turning
+	  /// its URI into an InputSource and delegating to the InputSource overload.
+	  /// </summary>
+	  /// <param name="f"> the file to read </param>
+	  /// <exception cref="IOException"> In case the parsing fails </exception>
+	  public virtual void loadPatterns(File f)
+	  {
+		loadPatterns(new InputSource(f.toURI().toASCIIString()));
+	  }
+
+	  /// <summary>
+	  /// Read hyphenation patterns from an XML file. After a successful parse the
+	  /// tree, the value space and the class map are trimmed to their used size.
+	  /// </summary>
+	  /// <param name="source"> the InputSource for the file </param>
+	  /// <exception cref="IOException"> In case the parsing fails </exception>
+	  public virtual void loadPatterns(InputSource source)
+	  {
+		PatternParser pp = new PatternParser(this);
+		ivalues = new TernaryTree();
+		try
+		{
+		  pp.parse(source);
+
+		  // patterns/values should be now in the tree
+		  // let's optimize a bit
+		  trimToSize();
+		  vspace.trimToSize();
+		  classmap.trimToSize();
+		}
+		finally
+		{
+		  // get rid of the auxiliary map, also when parsing failed, so the
+		  // temporary tree is not kept alive by this instance
+		  ivalues = null;
+		}
+	  }
+
+	  /// <summary>
+	  /// Looks up <paramref name="pat"/> in the tree and returns its unpacked
+	  /// interletter values, or an empty string when the lookup yields a negative index.
+	  /// </summary>
+	  public virtual string findPattern(string pat)
+	  {
+		int valueIndex = base.find(pat);
+		return valueIndex >= 0 ? unpackValues(valueIndex) : "";
+	  }
+
+	  /// <summary>
+	  /// Compares the zero-terminated strings starting at s[si] and t[ti]. Returns 0
+	  /// when both end together or when t ends first (t is a prefix of s);
+	  /// otherwise the difference of the first pair of differing characters.
+	  /// </summary>
+	  protected internal virtual int hstrcmp(char[] s, int si, char[] t, int ti)
+	  {
+		while (s[si] == t[ti])
+		{
+		  if (s[si] == 0)
+		  {
+			return 0; // both strings ended at the same position
+		  }
+		  si++;
+		  ti++;
+		}
+		return t[ti] == 0 ? 0 : s[si] - t[ti];
+	  }
+
+	  /// <summary>
+	  /// Decodes the nibble-packed interletter values stored in the value space
+	  /// starting at index <paramref name="k"/> into numeric form. Each stored
+	  /// nibble holds value + 1; decoding stops at the first zero byte or zero low nibble.
+	  /// </summary>
+	  /// <param name="k"> index into the vspace array of the first packed byte </param>
+	  /// <returns> one array element per stored value </returns>
+	  protected internal virtual sbyte[] getValues(int k)
+	  {
+		List<sbyte> res = new List<sbyte>();
+		sbyte v = vspace.get(k++);
+		while (v != 0)
+		{
+		  // Mask after shifting: v is signed, so a high nibble of 8 or more would
+		  // otherwise drag sign bits into the result and yield a negative value.
+		  res.Add((sbyte)(((v >> 4) & 0x0f) - 1));
+		  int low = v & 0x0f;
+		  if (low == 0)
+		  {
+			break;
+		  }
+		  res.Add((sbyte)(low - 1));
+		  v = vspace.get(k++);
+		}
+		return res.ToArray();
+	  }
+
+	  /// <summary>
+	  /// <para>
+	  /// Search for all possible partial matches of word starting at index and update
+	  /// interletter values. In other words, it does something like:
+	  /// </para>
+	  /// <code>
+	  /// for(i=0; i&lt;patterns.length; i++) {
+	  /// if ( word.substring(index).startsWith(patterns[i]) )
+	  /// update_interletter_values(patterns[i]);
+	  /// }
+	  /// </code>
+	  /// <para>
+	  /// But it is done in an efficient way since the patterns are stored in a
+	  /// ternary tree. In fact, this is the whole purpose of having the tree: doing
+	  /// this search without having to test every single pattern. The number of
+	  /// patterns for languages such as English range from 4000 to 10000. Thus,
+	  /// doing thousands of string comparisons for each word to hyphenate would be
+	  /// really slow without the tree. The tradeoff is memory, but using a ternary
+	  /// tree instead of a trie, almost halves the memory used by Lout or TeX.
+	  /// It's also faster than using a hash table
+	  /// </para>
+	  /// </summary>
+	  /// <param name="word"> null terminated word to match </param>
+	  /// <param name="index"> start index from word </param>
+	  /// <param name="il"> interletter values array to update; an entry is only ever raised,
+	  ///        never lowered, and positions beyond its end are ignored </param>
+	  protected internal virtual void searchPatterns(char[] word, int index, sbyte[] il)
+	  {
+		sbyte[] values;
+		int i = index;
+		char p, q;
+		char sp = word[i];
+		p = root;
+
+		// walk the tree while p is a valid node index
+		while (p > 0 && p < sc.Length)
+		{
+		  // 0xFFFF marks a compressed branch: the rest of the key is compared as a string
+		  if (sc[p] == 0xFFFF)
+		  {
+			if (hstrcmp(word, i, kv.Array, lo[p]) == 0)
+			{
+			  values = getValues(eq[p]); // data pointer is in eq[]
+			  int j = index;
+			  // keep the maximum of the current and the pattern's interletter value
+			  for (int k = 0; k < values.Length; k++)
+			  {
+				if (j < il.Length && values[k] > il[j])
+				{
+				  il[j] = values[k];
+				}
+				j++;
+			  }
+			}
+			return;
+		  }
+		  int d = sp - sc[p];
+		  if (d == 0)
+		  {
+			if (sp == 0)
+			{
+			  break;
+			}
+			// character matched: advance in the word and follow the eq link
+			sp = word[++i];
+			p = eq[p];
+			q = p;
+
+			// look for a pattern ending at this position by searching for
+			// the null char ( splitchar == 0 )
+			while (q > 0 && q < sc.Length)
+			{
+			  if (sc[q] == 0xFFFF) // stop at compressed branch
+			  {
+				break;
+			  }
+			  if (sc[q] == 0)
+			  {
+				values = getValues(eq[q]);
+				int j = index;
+				// keep the maximum of the current and the pattern's interletter value
+				for (int k = 0; k < values.Length; k++)
+				{
+				  if (j < il.Length && values[k] > il[j])
+				  {
+					il[j] = values[k];
+				  }
+				  j++;
+				}
+				break;
+			  }
+			  else
+			  {
+				q = lo[q];
+
+				// actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
+				// java chars are unsigned
+			  }
+			}
+		  }
+		  else
+		  {
+			// no match at this node: continue in the lower or higher subtree
+			p = d < 0 ? lo[p] : hi[p];
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Hyphenates <paramref name="word"/> by handing its characters to the
+	  /// char-array overload, covering the whole word from offset 0.
+	  /// </summary>
+	  /// <param name="word"> the word to be hyphenated </param>
+	  /// <param name="remainCharCount"> Minimum number of characters allowed before the
+	  ///        hyphenation point. </param>
+	  /// <param name="pushCharCount"> Minimum number of characters allowed after the
+	  ///        hyphenation point. </param>
+	  /// <returns> a <see cref="Hyphenation"/> object representing the
+	  ///         hyphenated word or null if word is not hyphenated. </returns>
+	  public virtual Hyphenation hyphenate(string word, int remainCharCount, int pushCharCount)
+	  {
+		char[] chars = word.ToCharArray();
+		int charCount = chars.Length;
+		return hyphenate(chars, 0, charCount, remainCharCount, pushCharCount);
+	  }
+
+	  // Implementation notes for hyphenate(char[], int, int, int, int):
+	  // w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
+	  // may be absent, the first n is at offset, the first l is at offset +
+	  // iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
+	  // into word. In the first part of the routine len = w.length, in the second
+	  // part of the routine len = word.length. Three indices are used: index(w),
+	  // the index in w, index(word), the index in word, letterindex(word), the
+	  // index in the letter part of word. The following relations exist: index(w) =
+	  // offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
+	  // index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
+	  // offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
+	  // iIgnoreAtBeginning
+
+	  /// <summary>
+	  /// Hyphenate word and return an array of hyphenation points. The returned
+	  /// points are framed by two synthetic entries: 0 at the start and the number
+	  /// of letters in the word at the end.
+	  /// </summary>
+	  /// <param name="w"> char array that contains the word </param>
+	  /// <param name="offset"> Offset to first character in word </param>
+	  /// <param name="len"> Length of word </param>
+	  /// <param name="remainCharCount"> Minimum number of characters allowed before the
+	  ///        hyphenation point. </param>
+	  /// <param name="pushCharCount"> Minimum number of characters allowed after the
+	  ///        hyphenation point. </param>
+	  /// <returns> a <see cref="Hyphenation"/> object representing the
+	  ///         hyphenated word or null if word is not hyphenated (a letter follows
+	  ///         a trailing non-letter, the word is too short, or no point was found). </returns>
+	  public virtual Hyphenation hyphenate(char[] w, int offset, int len, int remainCharCount, int pushCharCount)
+	  {
+		int i;
+		// room for the letters plus start marker, end marker and null terminator
+		char[] word = new char[len + 3];
+
+		// normalize word
+		char[] c = new char[2]; // c[1] stays 0 and terminates the one-character lookup key
+		int iIgnoreAtBeginning = 0;
+		int iLength = len;
+		bool bEndOfLetters = false;
+		for (i = 1; i <= len; i++)
+		{
+		  c[0] = w[offset + i - 1];
+		  int nc = classmap.find(c, 0);
+		  if (nc < 0) // found a non-letter character ...
+		  {
+			if (i == (1 + iIgnoreAtBeginning))
+			{
+			  // ... before any letter character
+			  iIgnoreAtBeginning++;
+			}
+			else
+			{
+			  // ... after a letter character
+			  bEndOfLetters = true;
+			}
+			iLength--;
+		  }
+		  else
+		  {
+			if (!bEndOfLetters)
+			{
+			  // store the value returned by the class map lookup in place of the letter
+			  word[i - iIgnoreAtBeginning] = (char) nc;
+			}
+			else
+			{
+			  // a letter after the trailing non-letters: not a single word, give up
+			  return null;
+			}
+		  }
+		}
+		// from here on len is the number of letters copied into word
+		len = iLength;
+		if (len < (remainCharCount + pushCharCount))
+		{
+		  // word is too short to be hyphenated
+		  return null;
+		}
+		int[] result = new int[len + 1];
+		int k = 0;
+
+		// check exception list first
+		string sw = new string(word, 1, len);
+		if (stoplist.ContainsKey(sw))
+		{
+		  // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
+		  // null)
+		  List<object> hw = stoplist[sw];
+		  int j = 0;
+		  for (i = 0; i < hw.Count; i++)
+		  {
+			object o = hw[i];
+			// j = index(sw) = letterindex(word)?
+			// result[k] = corresponding index(w)
+			if (o is string)
+			{
+			  // only string entries advance the position; other entries are skipped
+			  j += ((string) o).Length;
+			  if (j >= remainCharCount && j < (len - pushCharCount))
+			  {
+				result[k++] = j + iIgnoreAtBeginning;
+			  }
+			}
+		  }
+		}
+		else
+		{
+		  // use algorithm to get hyphenation points
+		  word[0] = '.'; // word start marker
+		  word[len + 1] = '.'; // word end marker
+		  word[len + 2] = (char)0; // null terminated
+		  sbyte[] il = new sbyte[len + 3]; // initialized to zero
+		  // collect pattern matches starting at every position of the marked word
+		  for (i = 0; i < len + 1; i++)
+		  {
+			searchPatterns(word, i, il);
+		  }
+
+		  // hyphenation points are located where interletter value is odd
+		  // i is letterindex(word),
+		  // i + 1 is index(word),
+		  // result[k] = corresponding index(w)
+		  for (i = 0; i < len; i++)
+		  {
+			if (((il[i + 1] & 1) == 1) && i >= remainCharCount && i <= (len - pushCharCount))
+			{
+			  result[k++] = i + iIgnoreAtBeginning;
+			}
+		  }
+		}
+
+		if (k > 0)
+		{
+		  // trim result array
+		  int[] res = new int[k + 2];
+		  Array.Copy(result, 0, res, 1, k);
+		  // We add the synthetical hyphenation points
+		  // at the beginning and end of the word
+		  res[0] = 0;
+		  res[k + 1] = len;
+		  return new Hyphenation(res);
+		}
+		else
+		{
+		  return null;
+		}
+	  }
+
+	  /// <summary>
+	  /// Add a character class to the tree. It is used by
+	  /// <seealso cref="PatternParser PatternParser"/> as callback to add character classes.
+	  /// Character classes define the valid word characters for hyphenation. If a
+	  /// word contains a character not defined in any of the classes, it is not
+	  /// hyphenated. It also defines a way to normalize the characters in order to
+	  /// compare them with the stored patterns. Usually pattern files use only lower
+	  /// case characters, in this case a class for letter 'a', for example, should
+	  /// be defined as "aA", the first character being the normalization char.
+	  /// </summary>
+	  public virtual void addClass(string chargroup)
+	  {
+		if (chargroup.Length == 0)
+		{
+		  return; // an empty group defines nothing
+		}
+
+		// Every member of the group, including the first character itself,
+		// is registered with the first character as its value.
+		char normalized = chargroup[0];
+		char[] lookupKey = new char[] { (char)0, (char)0 };
+		foreach (char member in chargroup)
+		{
+		  lookupKey[0] = member;
+		  classmap.insert(lookupKey, 0, normalized);
+		}
+	  }
+
+	  /// <summary>
+	  /// Add an exception to the tree. It is used by
+	  /// <seealso cref="PatternParser PatternParser"/> class as callback to store the
+	  /// hyphenation exceptions.
+	  /// </summary>
+	  /// <param name="word"> normalized word </param>
+	  /// <param name="hyphenatedword"> a vector of alternating strings and
+	  ///        <seealso cref="Hyphen hyphen"/> objects. </param>
+	  public virtual void addException(string word, List<object> hyphenatedword)
+	  {
+		stoplist[word] = hyphenatedword; // hyphenate() looks this entry up by the normalized word before running the pattern algorithm
+	  }
+
+	  /// <summary>
+	  /// Add a pattern to the tree. Mainly, to be used by
+	  /// <seealso cref="PatternParser PatternParser"/> class as callback to add a pattern to
+	  /// the tree.
+	  /// </summary>
+	  /// <param name="pattern"> the hyphenation pattern </param>
+	  /// <param name="ivalue"> interletter weight values indicating the desirability and
+	  ///        priority of hyphenating at a given point within the pattern. It
+	  ///        should contain only digit characters. (i.e. '0' to '9'). </param>
+	  public virtual void addPattern(string pattern, string ivalue)
+	  {
+		int packedIndex = ivalues.find(ivalue);
+		bool alreadyStored = packedIndex > 0;
+		if (!alreadyStored)
+		{
+		  // Value string not seen before: pack it and remember the result
+		  // so later patterns with the same values can reuse it.
+		  packedIndex = packValues(ivalue);
+		  ivalues.insert(ivalue, (char) packedIndex);
+		}
+		insert(pattern, (char) packedIndex);
+	  }
+
+	  public override void printStats(PrintStream @out)
+	  {
+		// Report the size of the value space first, then the base class statistics.
+		string valueSpaceSize = Convert.ToString(vspace.length());
+		@out.println("Value space size = " + valueSpaceSize);
+		base.printStats(@out);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
new file mode 100644
index 0000000..5b3fc39
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternConsumer.cs
@@ -0,0 +1,57 @@
+using System.Collections.Generic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	/// <summary>
+	/// This interface is used to connect the XML pattern file parser to the
+	/// hyphenation tree.
+	/// 
+	/// This interface was taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
+	/// </summary>
+	public interface PatternConsumer
+	{
+
+	  /// <summary>
+	  /// Add a character class. A character class defines characters that are
+	  /// considered equivalent for the purpose of hyphenation (e.g. "aA");
+	  /// this is usually used to ignore case.
+	  /// </summary>
+	  /// <param name="chargroup"> the characters that form one class </param>
+	  void addClass(string chargroup);
+
+	  /// <summary>
+	  /// Add a hyphenation exception. An exception replaces the result obtained by the
+	  /// algorithm for words where it fails or where the user wants a custom hyphenation.
+	  /// </summary>
+	  /// <param name="word"> the word the exception applies to </param>
+	  /// <param name="hyphenatedword"> alternating strings and <see cref="Hyphen"/> instances </param>
+	  void addException(string word, List<object> hyphenatedword);
+
+	  /// <summary>
+	  /// Add a hyphenation pattern.
+	  /// </summary>
+	  /// <param name="pattern"> the pattern </param>
+	  /// <param name="values"> interletter values expressed as a string of digit characters </param>
+	  void addPattern(string pattern, string values);
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
new file mode 100644
index 0000000..50d3eb8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/PatternParser.cs
@@ -0,0 +1,463 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+	// SAX
+	using XMLReader = org.xml.sax.XMLReader;
+	using InputSource = org.xml.sax.InputSource;
+	using SAXException = org.xml.sax.SAXException;
+	using SAXParseException = org.xml.sax.SAXParseException;
+	using DefaultHandler = org.xml.sax.helpers.DefaultHandler;
+	using Attributes = org.xml.sax.Attributes;
+
+	// Java
+
+	/// <summary>
+	/// A SAX document handler to read and parse hyphenation patterns from a XML
+	/// file.
+	/// 
+	/// This class was taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
+	/// </summary>
+	public class PatternParser : DefaultHandler
+	{
+
+	  internal XMLReader parser; // created by createParser() in the constructor; parse(InputSource) delegates to it
+
+	  internal int currElement; // one of the ELEM_* constants below, or 0; updated by startElement/endElement
+
+	  internal PatternConsumer consumer; // receives the addClass/addException/addPattern callbacks
+
+	  internal StringBuilder token; // characters of the word currently being read, see readToken
+
+	  internal List<object> exception; // parts (strings and Hyphen objects) of the exception being assembled
+
+	  internal char hyphenChar; // '-' by default; replaced by the value of a "hyphen-char" element
+
+	  internal string errMsg; // NOTE(review): not referenced anywhere in the code visible in this class
+
+	  internal const int ELEM_CLASSES = 1; // set when a "classes" element starts
+
+	  internal const int ELEM_EXCEPTIONS = 2; // set when an "exceptions" element starts and restored when a "hyphen" element ends
+
+	  internal const int ELEM_PATTERNS = 3; // set when a "patterns" element starts
+
+	  internal const int ELEM_HYPHEN = 4; // set when a "hyphen" element starts
+
+	  public PatternParser()
+	  {
+		// '-' is the default hyphen character; startElement may replace it.
+		hyphenChar = '-';
+		token = new StringBuilder();
+
+		// Route content, error and entity-resolution callbacks to this instance.
+		XMLReader reader = createParser();
+		reader.ContentHandler = this;
+		reader.ErrorHandler = this;
+		reader.EntityResolver = this;
+		parser = reader;
+	  }
+
+	  public PatternParser(PatternConsumer consumer) : this()
+	  {
+		// The parameterless constructor has already set up the parser.
+		this.consumer = consumer;
+	  }
+
+	  // Write-only: replaces the consumer that receives the parsed classes,
+	  // exceptions and patterns.
+	  public virtual PatternConsumer Consumer
+	  {
+		  set { consumer = value; }
+	  }
+
+	  /// <summary>
+	  /// Parses a hyphenation pattern file.
+	  /// </summary>
+	  /// <param name="filename"> the filename </param>
+	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public void parse(String filename) throws java.io.IOException
+	  public virtual void parse(string filename)
+	  {
+		// Wrap the name in an InputSource and hand over to the InputSource overload.
+		InputSource source = new InputSource(filename);
+		parse(source);
+	  }
+
+	  /// <summary>
+	  /// Parses a hyphenation pattern file.
+	  /// </summary>
+	  /// <param name="file"> the pattern file </param>
+	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public void parse(java.io.File file) throws java.io.IOException
+	  public virtual void parse(File file)
+	  {
+		// The file is addressed through the ASCII form of its URI.
+		parse(new InputSource(file.toURI().toASCIIString()));
+	  }
+
+	  /// <summary>
+	  /// Parses a hyphenation pattern file.
+	  /// </summary>
+	  /// <param name="source"> the InputSource for the file </param>
+	  /// <exception cref="IOException"> In case of an exception while parsing </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public void parse(org.xml.sax.InputSource source) throws java.io.IOException
+	  public virtual void parse(InputSource source)
+	  {
+		try
+		{
+		  parser.parse(source);
+		}
+		catch (SAXException e)
+		{
+		  // .NET's IOException has no constructor that takes only an exception,
+		  // so pass the message explicitly and keep the SAX failure as the inner
+		  // exception. The type is fully qualified because this file does not
+		  // import System.IO.
+		  throw new System.IO.IOException(e.Message, e);
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates a SAX parser using JAXP
+	  /// </summary>
+	  /// <returns> the created SAX parser </returns>
+	  internal static XMLReader createParser()
+	  {
+		try
+		{
+		  SAXParserFactory factory = SAXParserFactory.newInstance();
+		  factory.NamespaceAware = true;
+		  return factory.newSAXParser().XMLReader;
+		}
+		catch (Exception e)
+		{
+		  // Keep the original failure as the inner exception so its type and
+		  // stack trace are not lost; the message is unchanged.
+		  throw new Exception("Couldn't create XMLReader: " + e.Message, e);
+		}
+	  }
+
+	  /// <summary>
+	  /// Extracts the next whitespace-delimited word from <paramref name="chars"/>,
+	  /// consuming the characters it reads. A word that is not yet terminated by
+	  /// whitespace is kept in the token buffer and null is returned.
+	  /// </summary>
+	  /// <param name="chars"> pending input; consumed characters are removed from its front </param>
+	  /// <returns> the completed word, or null if no word is complete yet </returns>
+	  protected internal virtual string readToken(StringBuilder chars)
+	  {
+		// Step 1: drop leading whitespace. If a word is pending in the token
+		// buffer, that whitespace terminates it.
+		int leading = 0;
+		while (leading < chars.Length && char.IsWhiteSpace(chars[leading]))
+		{
+		  leading++;
+		}
+		if (leading > 0)
+		{
+		  chars.Remove(0, leading);
+		  if (token.Length > 0)
+		  {
+			string pending = token.ToString();
+			token.Length = 0;
+			return pending;
+		  }
+		}
+
+		// Step 2: move the run of non-whitespace characters into the token buffer.
+		int end = 0;
+		while (end < chars.Length && !char.IsWhiteSpace(chars[end]))
+		{
+		  end++;
+		}
+		bool terminated = end < chars.Length;
+		token.Append(chars.ToString(0, end));
+		chars.Remove(0, end);
+		if (terminated)
+		{
+		  string complete = token.ToString();
+		  token.Length = 0;
+		  return complete;
+		}
+
+		// No terminating whitespace yet: the partial word stays in the token buffer.
+		token.Append(chars);
+		return null;
+	  }
+
+	  /// <summary>
+	  /// Returns <paramref name="word"/> with every digit character removed.
+	  /// </summary>
+	  protected internal static string getPattern(string word)
+	  {
+		StringBuilder letters = new StringBuilder();
+		foreach (char ch in word)
+		{
+		  if (char.IsDigit(ch))
+		  {
+			continue; // digits are left out of the result
+		  }
+		  letters.Append(ch);
+		}
+		return letters.ToString();
+	  }
+
+	  /// <summary>
+	  /// Builds a new list from <paramref name="ex"/> in which every string is split
+	  /// at the current hyphen character: the text before each hyphen character is
+	  /// added as a string, followed by a <see cref="Hyphen"/> built from that
+	  /// character. Items that are not strings are copied unchanged.
+	  /// </summary>
+	  /// <param name="ex"> the exception parts to normalize </param>
+	  /// <returns> a new list; <paramref name="ex"/> itself is not modified </returns>
+	  protected internal virtual List<object> normalizeException<T1>(List<T1> ex)
+	  {
+		// The type parameter T1 is declared on the method, matching getExceptionWord.
+		List<object> res = new List<object>();
+		for (int i = 0; i < ex.Count; i++)
+		{
+		  object item = ex[i];
+		  string str = item as string;
+		  if (str == null)
+		  {
+			// Not a string: copy through untouched.
+			res.Add(item);
+			continue;
+		  }
+
+		  StringBuilder buf = new StringBuilder();
+		  for (int j = 0; j < str.Length; j++)
+		  {
+			char c = str[j];
+			if (c != hyphenChar)
+			{
+			  buf.Append(c);
+			}
+			else
+			{
+			  // The text collected so far is added even when it is empty.
+			  res.Add(buf.ToString());
+			  buf.Length = 0;
+			  // we use here hyphenChar which is not necessarily
+			  // the one to be printed
+			  res.Add(new Hyphen(hyphenChar.ToString(), null, null));
+			}
+		  }
+		  if (buf.Length > 0)
+		  {
+			res.Add(buf.ToString());
+		  }
+		}
+		return res;
+	  }
+
+	  /// <summary>
+	  /// Concatenates the parts of an exception into a single word: strings are
+	  /// appended as they are, and for any other item the noBreak value of the
+	  /// <see cref="Hyphen"/> is appended when it is not null.
+	  /// </summary>
+	  protected internal virtual string getExceptionWord<T1>(List<T1> ex)
+	  {
+		StringBuilder joined = new StringBuilder();
+		foreach (T1 entry in ex)
+		{
+		  object part = entry;
+		  string text = part as string;
+		  if (text != null)
+		  {
+			joined.Append(text);
+			continue;
+		  }
+
+		  // Anything that is not a string is cast to Hyphen.
+		  Hyphen hyphen = (Hyphen) part;
+		  if (hyphen.noBreak != null)
+		  {
+			joined.Append(hyphen.noBreak);
+		  }
+		}
+		return joined.ToString();
+	  }
+
+	  /// <summary>
+	  /// Returns one character per position of <paramref name="pat"/>: a digit is
+	  /// copied and the character after it is skipped; any other character yields '0'.
+	  /// </summary>
+	  protected internal static string getInterletterValues(string pat)
+	  {
+		StringBuilder values = new StringBuilder();
+		string padded = pat + "a"; // add dummy letter to serve as sentinel
+		int pos = 0;
+		while (pos < padded.Length)
+		{
+		  char current = padded[pos];
+		  if (char.IsDigit(current))
+		  {
+			// Copy the digit and step over the character that follows it.
+			values.Append(current);
+			pos += 2;
+		  }
+		  else
+		  {
+			values.Append('0');
+			pos += 1;
+		  }
+		}
+		return values.ToString();
+	  }
+
+	  //
+	  // EntityResolver methods
+	  //
+	  public override InputSource resolveEntity(string publicId, string systemId)
+	  {
+		// supply the internal hyphenation.dtd if possible; every other entity
+		// is answered with null
+		bool systemIdNamesDtd = systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*");
+		if (!systemIdNamesDtd && !"hyphenation-info".Equals(publicId))
+		{
+		  return null;
+		}
+		return new InputSource(this.GetType().getResource("hyphenation.dtd").toExternalForm());
+	  }
+
+	  //
+	  // ContentHandler methods
+	  //
+
+	  /// <summary> SAX callback; see org.xml.sax.ContentHandler#startElement(String,
+	  /// String, String, Attributes). </summary>
+	  public override void startElement(string uri, string local, string raw, Attributes attrs)
+	  {
+		if (local.Equals("hyphen-char"))
+		{
+		  // Only a single-character value replaces the current hyphen character.
+		  string h = attrs.getValue("value");
+		  if (h != null && h.Length == 1)
+		  {
+			hyphenChar = h[0];
+		  }
+		}
+		else if (local.Equals("classes"))
+		{
+		  currElement = ELEM_CLASSES;
+		}
+		else if (local.Equals("patterns"))
+		{
+		  currElement = ELEM_PATTERNS;
+		}
+		else if (local.Equals("exceptions"))
+		{
+		  currElement = ELEM_EXCEPTIONS;
+		  // C# needs the element type spelled out; it has no Java-style "<>".
+		  exception = new List<object>();
+		}
+		else if (local.Equals("hyphen"))
+		{
+		  // Text collected before the hyphen becomes its own part of the exception.
+		  if (token.Length > 0)
+		  {
+			exception.Add(token.ToString());
+		  }
+		  exception.Add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), attrs.getValue("post")));
+		  currElement = ELEM_HYPHEN;
+		}
+		// Every element start discards the text collected so far.
+		token.Length = 0;
+	  }
+
+	  /// <summary> SAX callback; see org.xml.sax.ContentHandler#endElement(String,
+	  /// String, String). </summary>
+//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
+//ORIGINAL LINE: @Override @SuppressWarnings("unchecked") public void endElement(String uri, String local, String raw)
+	  public override void endElement(string uri, string local, string raw)
+	  {
+
+		// Flush a word that was still pending when the element closed.
+		if (token.Length > 0)
+		{
+		  string word = token.ToString();
+		  switch (currElement)
+		  {
+			case ELEM_CLASSES:
+			  consumer.addClass(word);
+			  break;
+			case ELEM_EXCEPTIONS:
+			  exception.Add(word);
+			  exception = normalizeException(exception);
+			  // Give the consumer its own copy of the list. List<object> has no
+			  // clone() and cannot be cast to ArrayList, so copy-construct instead.
+			  consumer.addException(getExceptionWord(exception), new List<object>(exception));
+			  break;
+			case ELEM_PATTERNS:
+			  consumer.addPattern(getPattern(word), getInterletterValues(word));
+			  break;
+			case ELEM_HYPHEN:
+			  // nothing to do
+			  break;
+		  }
+		  if (currElement != ELEM_HYPHEN)
+		  {
+			token.Length = 0;
+		  }
+		}
+		// Closing a hyphen element returns to the exceptions state; closing
+		// anything else resets the state to 0.
+		if (currElement == ELEM_HYPHEN)
+		{
+		  currElement = ELEM_EXCEPTIONS;
+		}
+		else
+		{
+		  currElement = 0;
+		}
+
+	  }
+
+	  /// <summary> SAX callback; see org.xml.sax.ContentHandler#characters(char[], int, int). </summary>
+//JAVA TO C# CONVERTER TODO TASK: Most Java annotations will not have direct .NET equivalent attributes:
+//ORIGINAL LINE: @SuppressWarnings("unchecked") @Override public void characters(char ch[] , int start, int length)
+	  public override void characters(char[] ch, int start, int length)
+	  {
+		StringBuilder chars = new StringBuilder(length);
+		chars.Append(ch, start, length);
+		// Hand every completed word to the consumer; an unfinished word stays
+		// in the token buffer until more text or the element end arrives.
+		string word = readToken(chars);
+		while (word != null)
+		{
+		  switch (currElement)
+		  {
+			case ELEM_CLASSES:
+			  consumer.addClass(word);
+			  break;
+			case ELEM_EXCEPTIONS:
+			  exception.Add(word);
+			  exception = normalizeException(exception);
+			  // The working list is cleared right below, so the consumer must get
+			  // its own copy. List<object> has no clone() and cannot be cast to
+			  // ArrayList, so copy-construct instead.
+			  consumer.addException(getExceptionWord(exception), new List<object>(exception));
+			  exception.Clear();
+			  break;
+			case ELEM_PATTERNS:
+			  consumer.addPattern(getPattern(word), getInterletterValues(word));
+			  break;
+		  }
+		  word = readToken(chars);
+		}
+
+	  }
+
+	  /// <summary>
+	  /// Returns a string of the location.
+	  /// </summary>
+	  private string getLocationString(SAXParseException ex)
+	  {
+		// Format: "<last path segment of system id>:<line>:<column>"; the first
+		// part is left out when the exception carries no system id.
+		StringBuilder location = new StringBuilder();
+
+		string systemId = ex.SystemId;
+		if (systemId != null)
+		{
+		  int lastSlash = systemId.LastIndexOf('/');
+		  location.Append(lastSlash == -1 ? systemId : systemId.Substring(lastSlash + 1));
+		}
+
+		location.Append(':').Append(ex.LineNumber).Append(':').Append(ex.ColumnNumber);
+		return location.ToString();
+	  }
+	}
+
+}
\ No newline at end of file


Mime
View raw message