lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [04/52] [abbrv] lucenenet git commit: Ported Analysis.Hunspell + tests
Date Thu, 01 Sep 2016 14:39:25 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/e4d9f44c/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
index ff6f4e2..05c2a26 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs
@@ -1,12 +1,19 @@
-﻿using System;
-using System.Diagnostics;
+﻿using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using Lucene.Net.Util.Fst;
+using System;
 using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+using System.IO;
 using System.Text;
+using System.Text.RegularExpressions;
 
-namespace org.apache.lucene.analysis.hunspell
+namespace Lucene.Net.Analysis.Hunspell
 {
-
-	/*
+    /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
@@ -23,1213 +30,1155 @@ namespace org.apache.lucene.analysis.hunspell
 	 * limitations under the License.
 	 */
 
-	using ByteArrayDataOutput = org.apache.lucene.store.ByteArrayDataOutput;
-	using ArrayUtil = org.apache.lucene.util.ArrayUtil;
-	using BytesRef = org.apache.lucene.util.BytesRef;
-	using BytesRefHash = org.apache.lucene.util.BytesRefHash;
-	using CharsRef = org.apache.lucene.util.CharsRef;
-	using IOUtils = org.apache.lucene.util.IOUtils;
-	using IntsRef = org.apache.lucene.util.IntsRef;
-	using OfflineSorter = org.apache.lucene.util.OfflineSorter;
-	using ByteSequencesReader = org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
-	using ByteSequencesWriter = org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
-	using CharacterRunAutomaton = org.apache.lucene.util.automaton.CharacterRunAutomaton;
-	using RegExp = org.apache.lucene.util.automaton.RegExp;
-	using Builder = org.apache.lucene.util.fst.Builder;
-	using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs;
-	using FST = org.apache.lucene.util.fst.FST;
-	using IntSequenceOutputs = org.apache.lucene.util.fst.IntSequenceOutputs;
-	using Outputs = org.apache.lucene.util.fst.Outputs;
-	using Util = org.apache.lucene.util.fst.Util;
-
-
-	/// <summary>
-	/// In-memory structure for the dictionary (.dic) and affix (.aff)
-	/// data of a hunspell dictionary.
-	/// </summary>
-	public class Dictionary
-	{
-
-	  internal static readonly char[] NOFLAGS = new char[0];
-
-	  private const string ALIAS_KEY = "AF";
-	  private const string PREFIX_KEY = "PFX";
-	  private const string SUFFIX_KEY = "SFX";
-	  private const string FLAG_KEY = "FLAG";
-	  private const string COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
-	  private const string CIRCUMFIX_KEY = "CIRCUMFIX";
-	  private const string IGNORE_KEY = "IGNORE";
-	  private const string ICONV_KEY = "ICONV";
-	  private const string OCONV_KEY = "OCONV";
-
-	  private const string NUM_FLAG_TYPE = "num";
-	  private const string UTF8_FLAG_TYPE = "UTF-8";
-	  private const string LONG_FLAG_TYPE = "long";
-
-	  // TODO: really for suffixes we should reverse the automaton and run them backwards
-	  private const string PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
-	  private const string SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
-
-	  internal FST<IntsRef> prefixes;
-	  internal FST<IntsRef> suffixes;
-
-	  // all condition checks used by prefixes and suffixes. these are typically re-used across
-	  // many affix stripping rules. so these are deduplicated, to save RAM.
-	  internal List<CharacterRunAutomaton> patterns = new List<CharacterRunAutomaton>();
-
-	  // the entries in the .dic file, mapping to their set of flags.
-	  // the fst output is the ordinal list for flagLookup
-	  internal FST<IntsRef> words;
-	  // the list of unique flagsets (wordforms). theoretically huge, but practically
-	  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
-	  internal BytesRefHash flagLookup = new BytesRefHash();
-
-	  // the list of unique strip affixes.
-	  internal char[] stripData;
-	  internal int[] stripOffsets;
-
-	  // 8 bytes per affix
-	  internal sbyte[] affixData = new sbyte[64];
-	  private int currentAffix = 0;
-
-	  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
-
-	  private string[] aliases;
-	  private int aliasCount = 0;
-
-	  private readonly File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
-
-	  internal bool ignoreCase;
-	  internal bool complexPrefixes;
-	  internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
-
-	  internal int circumfix = -1; // circumfix flag, or -1 if one is not defined
-
-	  // ignored characters (dictionary, affix, inputs)
-	  private char[] ignore;
-
-	  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
-	  internal FST<CharsRef> iconv;
-	  internal FST<CharsRef> oconv;
-
-	  internal bool needsInputCleaning;
-	  internal bool needsOutputCleaning;
-
-	  /// <summary>
-	  /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
-	  /// and dictionary files.
-	  /// You have to close the provided InputStreams yourself.
-	  /// </summary>
-	  /// <param name="affix"> InputStream for reading the hunspell affix file (won't be closed). </param>
-	  /// <param name="dictionary"> InputStream for reading the hunspell dictionary file (won't be closed). </param>
-	  /// <exception cref="IOException"> Can be thrown while reading from the InputStreams </exception>
-	  /// <exception cref="ParseException"> Can be thrown if the content of the files does not meet expected formats </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public Dictionary(java.io.InputStream affix, java.io.InputStream dictionary) throws java.io.IOException, java.text.ParseException
-	  public Dictionary(InputStream affix, InputStream dictionary) : this(affix, Collections.singletonList(dictionary), false)
-	  {
-	  }
-
-	  /// <summary>
-	  /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
-	  /// and dictionary files.
-	  /// You have to close the provided InputStreams yourself.
-	  /// </summary>
-	  /// <param name="affix"> InputStream for reading the hunspell affix file (won't be closed). </param>
-	  /// <param name="dictionaries"> InputStream for reading the hunspell dictionary files (won't be closed). </param>
-	  /// <exception cref="IOException"> Can be thrown while reading from the InputStreams </exception>
-	  /// <exception cref="ParseException"> Can be thrown if the content of the files does not meet expected formats </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public Dictionary(java.io.InputStream affix, java.util.List<java.io.InputStream> dictionaries, boolean ignoreCase) throws java.io.IOException, java.text.ParseException
-	  public Dictionary(InputStream affix, IList<InputStream> dictionaries, bool ignoreCase)
-	  {
-		this.ignoreCase = ignoreCase;
-		this.needsInputCleaning = ignoreCase;
-		this.needsOutputCleaning = false; // set if we have an OCONV
-		flagLookup.add(new BytesRef()); // no flags -> ord 0
-
-		File aff = File.createTempFile("affix", "aff", tempDir);
-		OutputStream @out = new BufferedOutputStream(new FileOutputStream(aff));
-		InputStream aff1 = null;
-		InputStream aff2 = null;
-		try
-		{
-		  // copy contents of affix stream to temp file
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final byte [] buffer = new byte [1024 * 8];
-		  sbyte[] buffer = new sbyte [1024 * 8];
-		  int len;
-		  while ((len = affix.read(buffer)) > 0)
-		  {
-			@out.write(buffer, 0, len);
-		  }
-		  @out.close();
-
-		  // pass 1: get encoding
-		  aff1 = new BufferedInputStream(new FileInputStream(aff));
-		  string encoding = getDictionaryEncoding(aff1);
-
-		  // pass 2: parse affixes
-		  CharsetDecoder decoder = getJavaEncoding(encoding);
-		  aff2 = new BufferedInputStream(new FileInputStream(aff));
-		  readAffixFile(aff2, decoder);
-
-		  // read dictionary entries
-		  IntSequenceOutputs o = IntSequenceOutputs.Singleton;
-		  Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
-		  readDictionaryFiles(dictionaries, decoder, b);
-		  words = b.finish();
-		  aliases = null; // no longer needed
-		}
-		finally
-		{
-		  IOUtils.closeWhileHandlingException(@out, aff1, aff2);
-		  aff.delete();
-		}
-	  }
-
-	  /// <summary>
-	  /// Looks up Hunspell word forms from the dictionary
-	  /// </summary>
-	  internal virtual IntsRef lookupWord(char[] word, int offset, int length)
-	  {
-		return lookup(words, word, offset, length);
-	  }
-
-	  /// <summary>
-	  /// Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
-	  /// </summary>
-	  /// <param name="word"> Char array to generate the String from </param>
-	  /// <param name="offset"> Offset in the char array that the String starts at </param>
-	  /// <param name="length"> Length from the offset that the String is </param>
-	  /// <returns> List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found </returns>
-	  internal virtual IntsRef lookupPrefix(char[] word, int offset, int length)
-	  {
-		return lookup(prefixes, word, offset, length);
-	  }
-
-	  /// <summary>
-	  /// Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
-	  /// </summary>
-	  /// <param name="word"> Char array to generate the String from </param>
-	  /// <param name="offset"> Offset in the char array that the String starts at </param>
-	  /// <param name="length"> Length from the offset that the String is </param>
-	  /// <returns> List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found </returns>
-	  internal virtual IntsRef lookupSuffix(char[] word, int offset, int length)
-	  {
-		return lookup(suffixes, word, offset, length);
-	  }
-
-	  // TODO: this is pretty stupid, considering how the stemming algorithm works
-	  // we can speed it up to be significantly faster!
-	  internal virtual IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length)
-	  {
-		if (fst == null)
-		{
-		  return null;
-		}
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader bytesReader = fst.getBytesReader();
-		FST.BytesReader bytesReader = fst.BytesReader;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.IntsRef> arc = fst.getFirstArc(new org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.IntsRef>());
-		FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
-		// Accumulate output as we go
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-		IntsRef NO_OUTPUT = fst.outputs.NoOutput;
-		IntsRef output = NO_OUTPUT;
-
-		int l = offset + length;
-		try
-		{
-		  for (int i = offset, cp = 0; i < l; i += char.charCount(cp))
-		  {
-			cp = char.codePointAt(word, i, l);
-			if (fst.findTargetArc(cp, arc, arc, bytesReader) == null)
-			{
-			  return null;
-			}
-			else if (arc.output != NO_OUTPUT)
-			{
-			  output = fst.outputs.add(output, arc.output);
-			}
-		  }
-		  if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null)
-		  {
-			return null;
-		  }
-		  else if (arc.output != NO_OUTPUT)
-		  {
-			return fst.outputs.add(output, arc.output);
-		  }
-		  else
-		  {
-			return output;
-		  }
-		}
-		catch (IOException bogus)
-		{
-		  throw new Exception(bogus);
-		}
-	  }
-
-	  /// <summary>
-	  /// Reads the affix file through the provided InputStream, building up the prefix and suffix maps
-	  /// </summary>
-	  /// <param name="affixStream"> InputStream to read the content of the affix file from </param>
-	  /// <param name="decoder"> CharsetDecoder to decode the content of the file </param>
-	  /// <exception cref="IOException"> Can be thrown while reading from the InputStream </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void readAffixFile(java.io.InputStream affixStream, java.nio.charset.CharsetDecoder decoder) throws java.io.IOException, java.text.ParseException
-	  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
-	  {
-		SortedDictionary<string, IList<char?>> prefixes = new SortedDictionary<string, IList<char?>>();
-		SortedDictionary<string, IList<char?>> suffixes = new SortedDictionary<string, IList<char?>>();
-		IDictionary<string, int?> seenPatterns = new Dictionary<string, int?>();
-
-		// zero condition -> 0 ord
-		seenPatterns[".*"] = 0;
-		patterns.Add(null);
-
-		// zero strip -> 0 ord
-		IDictionary<string, int?> seenStrips = new LinkedHashMap<string, int?>();
-		seenStrips[""] = 0;
-
-		LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
-		string line = null;
-		while ((line = reader.readLine()) != null)
-		{
-		  // ignore any BOM marker on first line
-		  if (reader.LineNumber == 1 && line.StartsWith("\uFEFF", StringComparison.Ordinal))
-		  {
-			line = line.Substring(1);
-		  }
-		  if (line.StartsWith(ALIAS_KEY, StringComparison.Ordinal))
-		  {
-			parseAlias(line);
-		  }
-		  else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal))
-		  {
-			parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
-		  }
-		  else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal))
-		  {
-			parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
-		  }
-		  else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal))
-		  {
-			// Assume that the FLAG line comes before any prefix or suffixes
-			// Store the strategy so it can be used when parsing the dic file
-			flagParsingStrategy = getFlagParsingStrategy(line);
-		  }
-		  else if (line.Equals(COMPLEXPREFIXES_KEY))
-		  {
-			complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
-		  }
-		  else if (line.StartsWith(CIRCUMFIX_KEY, StringComparison.Ordinal))
-		  {
-			string[] parts = line.Split("\\s+", true);
-			if (parts.Length != 2)
-			{
-			  throw new ParseException("Illegal CIRCUMFIX declaration", reader.LineNumber);
-			}
-			circumfix = flagParsingStrategy.parseFlag(parts[1]);
-		  }
-		  else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal))
-		  {
-			string[] parts = line.Split("\\s+", true);
-			if (parts.Length != 2)
-			{
-			  throw new ParseException("Illegal IGNORE declaration", reader.LineNumber);
-			}
-			ignore = parts[1].ToCharArray();
-			Arrays.sort(ignore);
-			needsInputCleaning = true;
-		  }
-		  else if (line.StartsWith(ICONV_KEY, StringComparison.Ordinal) || line.StartsWith(OCONV_KEY, StringComparison.Ordinal))
-		  {
-			string[] parts = line.Split("\\s+", true);
-			string type = parts[0];
-			if (parts.Length != 2)
-			{
-			  throw new ParseException("Illegal " + type + " declaration", reader.LineNumber);
-			}
-			int num = int.Parse(parts[1]);
-			FST<CharsRef> res = parseConversions(reader, num);
-			if (type.Equals("ICONV"))
-			{
-			  iconv = res;
-			  needsInputCleaning |= iconv != null;
-			}
-			else
-			{
-			  oconv = res;
-			  needsOutputCleaning |= oconv != null;
-			}
-		  }
-		}
-
-		this.prefixes = affixFST(prefixes);
-		this.suffixes = affixFST(suffixes);
-
-		int totalChars = 0;
-		foreach (string strip in seenStrips.Keys)
-		{
-		  totalChars += strip.Length;
-		}
-		stripData = new char[totalChars];
-		stripOffsets = new int[seenStrips.Count + 1];
-		int currentOffset = 0;
-		int currentIndex = 0;
-		foreach (string strip in seenStrips.Keys)
-		{
-		  stripOffsets[currentIndex++] = currentOffset;
-		  strip.CopyTo(0, stripData, currentOffset, strip.Length - 0);
-		  currentOffset += strip.Length;
-		}
-		Debug.Assert(currentIndex == seenStrips.Count);
-		stripOffsets[currentIndex] = currentOffset;
-	  }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private org.apache.lucene.util.fst.FST<org.apache.lucene.util.IntsRef> affixFST(java.util.TreeMap<String,java.util.List<Character>> affixes) throws java.io.IOException
-	  private FST<IntsRef> affixFST(SortedDictionary<string, IList<char?>> affixes)
-	  {
-		IntSequenceOutputs outputs = IntSequenceOutputs.Singleton;
-		Builder<IntsRef> builder = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, outputs);
-
-		IntsRef scratch = new IntsRef();
-		foreach (KeyValuePair<string, IList<char?>> entry in affixes.SetOfKeyValuePairs())
-		{
-		  Util.toUTF32(entry.Key, scratch);
-		  IList<char?> entries = entry.Value;
-		  IntsRef output = new IntsRef(entries.Count);
-		  foreach (char? c in entries)
-		  {
-			output.ints[output.length++] = c;
-		  }
-		  builder.add(scratch, output);
-		}
-		return builder.finish();
-	  }
-
-	  /// <summary>
-	  /// Parses a specific affix rule putting the result into the provided affix map
-	  /// </summary>
-	  /// <param name="affixes"> Map where the result of the parsing will be put </param>
-	  /// <param name="header"> Header line of the affix rule </param>
-	  /// <param name="reader"> BufferedReader to read the content of the rule from </param>
-	  /// <param name="conditionPattern"> <seealso cref="String#format(String, Object...)"/> pattern to be used to generate the condition regex
-	  ///                         pattern </param>
-	  /// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param>
-	  /// <exception cref="IOException"> Can be thrown while reading the rule </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void parseAffix(java.util.TreeMap<String,java.util.List<Character>> affixes, String header, java.io.LineNumberReader reader, String conditionPattern, java.util.Map<String,Integer> seenPatterns, java.util.Map<String,Integer> seenStrips) throws java.io.IOException, java.text.ParseException
-	  private void parseAffix(SortedDictionary<string, IList<char?>> affixes, string header, LineNumberReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips)
-	  {
-
-		BytesRef scratch = new BytesRef();
-		StringBuilder sb = new StringBuilder();
-		string[] args = header.Split("\\s+", true);
-
-		bool crossProduct = args[2].Equals("Y");
-
-		int numLines = int.Parse(args[3]);
-		affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
-		ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
-
-		for (int i = 0; i < numLines; i++)
-		{
-		  Debug.Assert(affixWriter.Position == currentAffix << 3);
-		  string line = reader.readLine();
-		  string[] ruleArgs = line.Split("\\s+", true);
-
-		  // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
-		  // condition is optional
-		  if (ruleArgs.Length < 4)
-		  {
-			  throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.LineNumber);
-		  }
-
-		  char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
-		  string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
-		  string affixArg = ruleArgs[3];
-		  char[] appendFlags = null;
-
-		  int flagSep = affixArg.LastIndexOf('/');
-		  if (flagSep != -1)
-		  {
-			string flagPart = affixArg.Substring(flagSep + 1);
-			affixArg = affixArg.Substring(0, flagSep);
-
-			if (aliasCount > 0)
-			{
-			  flagPart = getAliasValue(int.Parse(flagPart));
-			}
-
-			appendFlags = flagParsingStrategy.parseFlags(flagPart);
-			Arrays.sort(appendFlags);
-			twoStageAffix = true;
-		  }
-
-		  // TODO: add test and fix zero-affix handling!
-
-		  string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
-		  // at least the gascon affix file has this issue
-		  if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
-		  {
-			condition = condition + "]";
-		  }
-		  // "dash hasn't got special meaning" (we must escape it)
-		  if (condition.IndexOf('-') >= 0)
-		  {
-			condition = condition.Replace("-", "\\-");
-		  }
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String regex;
-		  string regex;
-		  if (".".Equals(condition))
-		  {
-			regex = ".*"; // Zero condition is indicated by dot
-		  }
-		  else if (condition.Equals(strip))
-		  {
-			regex = ".*"; // TODO: optimize this better:
-						  // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
-						  // but this is complicated...
-		  }
-		  else
-		  {
-			regex = string.format(Locale.ROOT, conditionPattern, condition);
-		  }
-
-		  // deduplicate patterns
-		  int? patternIndex = seenPatterns[regex];
-		  if (patternIndex == null)
-		  {
-			patternIndex = patterns.Count;
-			if (patternIndex > short.MaxValue)
-			{
-			  throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org");
-			}
-			seenPatterns[regex] = patternIndex;
-			CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).toAutomaton());
-			patterns.Add(pattern);
-		  }
-
-		  int? stripOrd = seenStrips[strip];
-		  if (stripOrd == null)
-		  {
-			stripOrd = seenStrips.Count;
-			seenStrips[strip] = stripOrd;
-			if (stripOrd > Char.MaxValue)
-			{
-			  throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org");
-			}
-		  }
-
-		  if (appendFlags == null)
-		  {
-			appendFlags = NOFLAGS;
-		  }
-
-		  encodeFlags(scratch, appendFlags);
-		  int appendFlagsOrd = flagLookup.add(scratch);
-		  if (appendFlagsOrd < 0)
-		  {
-			// already exists in our hash
-			appendFlagsOrd = (-appendFlagsOrd) - 1;
-		  }
-		  else if (appendFlagsOrd > short.MaxValue)
-		  {
-			// this limit is probably flexible, but its a good sanity check too
-			throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org");
-		  }
-
-		  affixWriter.writeShort((short)flag);
-		  affixWriter.writeShort((int)(short)stripOrd);
-		  // encode crossProduct into patternIndex
-		  int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
-		  affixWriter.writeShort((short)patternOrd);
-		  affixWriter.writeShort((short)appendFlagsOrd);
-
-		  if (needsInputCleaning)
-		  {
-			CharSequence cleaned = cleanInput(affixArg, sb);
-			affixArg = cleaned.ToString();
-		  }
-
-		  IList<char?> list = affixes[affixArg];
-		  if (list == null)
-		  {
-			list = new List<>();
-			affixes[affixArg] = list;
-		  }
-
-		  list.Add((char)currentAffix);
-		  currentAffix++;
-		}
-	  }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private org.apache.lucene.util.fst.FST<org.apache.lucene.util.CharsRef> parseConversions(java.io.LineNumberReader reader, int num) throws java.io.IOException, java.text.ParseException
-	  private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
-	  {
-		IDictionary<string, string> mappings = new SortedDictionary<string, string>();
-
-		for (int i = 0; i < num; i++)
-		{
-		  string line = reader.readLine();
-		  string[] parts = line.Split("\\s+", true);
-		  if (parts.Length != 3)
-		  {
-			throw new ParseException("invalid syntax: " + line, reader.LineNumber);
-		  }
-		  if (mappings.put(parts[1], parts[2]) != null)
-		  {
-			throw new System.InvalidOperationException("duplicate mapping specified for: " + parts[1]);
-		  }
-		}
-
-		Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
-		Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
-		IntsRef scratchInts = new IntsRef();
-		foreach (KeyValuePair<string, string> entry in mappings.SetOfKeyValuePairs())
-		{
-		  Util.toUTF16(entry.Key, scratchInts);
-		  builder.add(scratchInts, new CharsRef(entry.Value));
-		}
-
-		return builder.finish();
-	  }
-
-	  /// <summary>
-	  /// pattern accepts optional BOM + SET + any whitespace </summary>
-	  internal static readonly Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");
-
-	  /// <summary>
-	  /// Parses the encoding specified in the affix file readable through the provided InputStream
-	  /// </summary>
-	  /// <param name="affix"> InputStream for reading the affix file </param>
-	  /// <returns> Encoding specified in the affix file </returns>
-	  /// <exception cref="IOException"> Can be thrown while reading from the InputStream </exception>
-	  /// <exception cref="ParseException"> Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>} </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: static String getDictionaryEncoding(java.io.InputStream affix) throws java.io.IOException, java.text.ParseException
-	  internal static string getDictionaryEncoding(InputStream affix)
-	  {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final StringBuilder encoding = new StringBuilder();
-		StringBuilder encoding = new StringBuilder();
-		for (;;)
-		{
-		  encoding.Length = 0;
-		  int ch;
-		  while ((ch = affix.read()) >= 0)
-		  {
-			if (ch == '\n')
-			{
-			  break;
-			}
-			if (ch != '\r')
-			{
-			  encoding.Append((char)ch);
-			}
-		  }
-		  if (encoding.Length == 0 || encoding[0] == '#' || encoding.ToString().Trim().Length == 0)
-		  {
-			  // this test only at the end as ineffective but would allow lines only containing spaces:
-			if (ch < 0)
-			{
-			  throw new ParseException("Unexpected end of affix file.", 0);
-			}
-			continue;
-		  }
-		  Matcher matcher = ENCODING_PATTERN.matcher(encoding);
-		  if (matcher.find())
-		  {
-			int last = matcher.end();
-			return encoding.Substring(last).Trim();
-		  }
-		}
-	  }
-
-	  internal static readonly IDictionary<string, string> CHARSET_ALIASES;
-	  static Dictionary()
-	  {
-		IDictionary<string, string> m = new Dictionary<string, string>();
-		m["microsoft-cp1251"] = "windows-1251";
-		m["TIS620-2533"] = "TIS-620";
-		CHARSET_ALIASES = Collections.unmodifiableMap(m);
-	  }
-
-	  /// <summary>
-	  /// Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
-	  /// MICROSOFT-CP1251 etc are allowed...
-	  /// </summary>
-	  /// <param name="encoding"> Encoding to retrieve the CharsetDecoder for </param>
-	  /// <returns> CharSetDecoder for the given encoding </returns>
-	  private CharsetDecoder getJavaEncoding(string encoding)
-	  {
-		if ("ISO8859-14".Equals(encoding))
-		{
-		  return new ISO8859_14Decoder();
-		}
-		string canon = CHARSET_ALIASES[encoding];
-		if (canon != null)
-		{
-		  encoding = canon;
-		}
-		Charset charset = Charset.forName(encoding);
-		return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
-	  }
-
-	  /// <summary>
-	  /// Determines the appropriate <seealso cref="FlagParsingStrategy"/> based on the FLAG definition line taken from the affix file
-	  /// </summary>
-	  /// <param name="flagLine"> Line containing the flag information </param>
-	  /// <returns> FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition </returns>
-	  internal static FlagParsingStrategy getFlagParsingStrategy(string flagLine)
-	  {
-		string[] parts = flagLine.Split("\\s+", true);
-		if (parts.Length != 2)
-		{
-		  throw new System.ArgumentException("Illegal FLAG specification: " + flagLine);
-		}
-		string flagType = parts[1];
-
-		if (NUM_FLAG_TYPE.Equals(flagType))
-		{
-		  return new NumFlagParsingStrategy();
-		}
-		else if (UTF8_FLAG_TYPE.Equals(flagType))
-		{
-		  return new SimpleFlagParsingStrategy();
-		}
-		else if (LONG_FLAG_TYPE.Equals(flagType))
-		{
-		  return new DoubleASCIIFlagParsingStrategy();
-		}
-
-		throw new System.ArgumentException("Unknown flag type: " + flagType);
-	  }
-
-	  internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping
-
-	  internal virtual string unescapeEntry(string entry)
-	  {
-		StringBuilder sb = new StringBuilder();
-		for (int i = 0; i < entry.Length; i++)
-		{
-		  char ch = entry[i];
-		  if (ch == '\\' && i + 1 < entry.Length)
-		  {
-			sb.Append(entry[i + 1]);
-			i++;
-		  }
-		  else if (ch == '/')
-		  {
-			sb.Append(FLAG_SEPARATOR);
-		  }
-		  else
-		  {
-			sb.Append(ch);
-		  }
-		}
-		return sb.ToString();
-	  }
-
-	  /// <summary>
-	  /// Reads the dictionary file through the provided InputStreams, building up the words map
-	  /// </summary>
-	  /// <param name="dictionaries"> InputStreams to read the dictionary file through </param>
-	  /// <param name="decoder"> CharsetDecoder used to decode the contents of the file </param>
-	  /// <exception cref="IOException"> Can be thrown while reading from the file </exception>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void readDictionaryFiles(java.util.List<java.io.InputStream> dictionaries, java.nio.charset.CharsetDecoder decoder, org.apache.lucene.util.fst.Builder<org.apache.lucene.util.IntsRef> words) throws java.io.IOException
-	  private void readDictionaryFiles(IList<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
-	  {
-		BytesRef flagsScratch = new BytesRef();
-		IntsRef scratchInts = new IntsRef();
-
-		StringBuilder sb = new StringBuilder();
-
-		File unsorted = File.createTempFile("unsorted", "dat", tempDir);
-		OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted);
-		bool success = false;
-		try
-		{
-		  foreach (InputStream dictionary in dictionaries)
-		  {
-			BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
-			string line = lines.readLine(); // first line is number of entries (approximately, sometimes)
-
-			while ((line = lines.readLine()) != null)
-			{
-			  line = unescapeEntry(line);
-			  if (needsInputCleaning)
-			  {
-				int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
-				if (flagSep == -1)
-				{
-				  CharSequence cleansed = cleanInput(line, sb);
-				  writer.write(cleansed.ToString().GetBytes(StandardCharsets.UTF_8));
-				}
-				else
-				{
-				  string text = line.Substring(0, flagSep);
-				  CharSequence cleansed = cleanInput(text, sb);
-				  if (cleansed != sb)
-				  {
-					sb.Length = 0;
-					sb.Append(cleansed);
-				  }
-				  sb.Append(line.Substring(flagSep));
-				  writer.write(sb.ToString().GetBytes(StandardCharsets.UTF_8));
-				}
-			  }
-			  else
-			  {
-				writer.write(line.GetBytes(StandardCharsets.UTF_8));
-			  }
-			}
-		  }
-		  success = true;
-		}
-		finally
-		{
-		  if (success)
-		  {
-			IOUtils.close(writer);
-		  }
-		  else
-		  {
-			IOUtils.closeWhileHandlingException(writer);
-		  }
-		}
-		File sorted = File.createTempFile("sorted", "dat", tempDir);
-
-		OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this));
-		sorter.sort(unsorted, sorted);
-		unsorted.delete();
-
-		OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted);
-		BytesRef scratchLine = new BytesRef();
-
-		// TODO: the flags themselves can be double-chars (long) or also numeric
-		// either way the trick is to encode them as char... but they must be parsed differently
-
-		string currentEntry = null;
-		IntsRef currentOrds = new IntsRef();
-
-		string line;
-		while (reader.read(scratchLine))
-		{
-		  line = scratchLine.utf8ToString();
-		  string entry;
-		  char[] wordForm;
-
-		  int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
-		  if (flagSep == -1)
-		  {
-			wordForm = NOFLAGS;
-			entry = line;
-		  }
-		  else
-		  {
-			// note, there can be comments (morph description) after a flag.
-			// we should really look for any whitespace: currently just tab and space
-			int end = line.IndexOf('\t', flagSep);
-			if (end == -1)
-			{
-			  end = line.Length;
-			}
-			int end2 = line.IndexOf(' ', flagSep);
-			if (end2 == -1)
-			{
-			  end2 = line.Length;
-			}
-			end = Math.Min(end, end2);
-
-			string flagPart = StringHelperClass.SubstringSpecial(line, flagSep + 1, end);
-			if (aliasCount > 0)
-			{
-			  flagPart = getAliasValue(int.Parse(flagPart));
-			}
-
-			wordForm = flagParsingStrategy.parseFlags(flagPart);
-			Arrays.sort(wordForm);
-			entry = line.Substring(0, flagSep);
-		  }
-
-		  int cmp = currentEntry == null ? 1 : entry.CompareTo(currentEntry);
-		  if (cmp < 0)
-		  {
-			throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
-		  }
-		  else
-		  {
-			encodeFlags(flagsScratch, wordForm);
-			int ord = flagLookup.add(flagsScratch);
-			if (ord < 0)
-			{
-			  // already exists in our hash
-			  ord = (-ord) - 1;
-			}
-			// finalize current entry, and switch "current" if necessary
-			if (cmp > 0 && currentEntry != null)
-			{
-			  Util.toUTF32(currentEntry, scratchInts);
-			  words.add(scratchInts, currentOrds);
-			}
-			// swap current
-			if (cmp > 0 || currentEntry == null)
-			{
-			  currentEntry = entry;
-			  currentOrds = new IntsRef(); // must be this way
-			}
-			currentOrds.grow(currentOrds.length + 1);
-			currentOrds.ints[currentOrds.length++] = ord;
-		  }
-		}
-
-		// finalize last entry
-		Util.toUTF32(currentEntry, scratchInts);
-		words.add(scratchInts, currentOrds);
-
-		reader.close();
-		sorted.delete();
-	  }
-
-	  private class ComparatorAnonymousInnerClassHelper : IComparer<BytesRef>
-	  {
-		  private readonly Dictionary outerInstance;
-
-		  public ComparatorAnonymousInnerClassHelper(Dictionary outerInstance)
-		  {
-			  this.outerInstance = outerInstance;
-			  scratch1 = new BytesRef();
-			  scratch2 = new BytesRef();
-		  }
-
-		  internal BytesRef scratch1;
-		  internal BytesRef scratch2;
-
-		  public virtual int Compare(BytesRef o1, BytesRef o2)
-		  {
-			scratch1.bytes = o1.bytes;
-			scratch1.offset = o1.offset;
-			scratch1.length = o1.length;
-
-			for (int i = scratch1.length - 1; i >= 0; i--)
-			{
-			  if (scratch1.bytes[scratch1.offset + i] == outerInstance.FLAG_SEPARATOR)
-			  {
-				scratch1.length = i;
-				break;
-			  }
-			}
-
-			scratch2.bytes = o2.bytes;
-			scratch2.offset = o2.offset;
-			scratch2.length = o2.length;
-
-			for (int i = scratch2.length - 1; i >= 0; i--)
-			{
-			  if (scratch2.bytes[scratch2.offset + i] == outerInstance.FLAG_SEPARATOR)
-			  {
-				scratch2.length = i;
-				break;
-			  }
-			}
-
-			int cmp = scratch1.compareTo(scratch2);
-			if (cmp == 0)
-			{
-			  // tie break on whole row
-			  return o1.compareTo(o2);
-			}
-			else
-			{
-			  return cmp;
-			}
-		  }
-	  }
-
-	  internal static char[] decodeFlags(BytesRef b)
-	  {
-		if (b.length == 0)
-		{
-		  return CharsRef.EMPTY_CHARS;
-		}
-		int len = (int)((uint)b.length >> 1);
-		char[] flags = new char[len];
-		int upto = 0;
-		int end = b.offset + b.length;
-		for (int i = b.offset; i < end; i += 2)
-		{
-		  flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i + 1] & 0xff));
-		}
-		return flags;
-	  }
-
-	  internal static void encodeFlags(BytesRef b, char[] flags)
-	  {
-		int len = flags.Length << 1;
-		b.grow(len);
-		b.length = len;
-		int upto = b.offset;
-		for (int i = 0; i < flags.Length; i++)
-		{
-		  int flag = flags[i];
-		  b.bytes[upto++] = unchecked((sbyte)((flag >> 8) & 0xff));
-		  b.bytes[upto++] = unchecked((sbyte)(flag & 0xff));
-		}
-	  }
-
-	  private void parseAlias(string line)
-	  {
-		string[] ruleArgs = line.Split("\\s+", true);
-		if (aliases == null)
-		{
-		  //first line should be the aliases count
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int count = Integer.parseInt(ruleArgs[1]);
-		  int count = int.Parse(ruleArgs[1]);
-		  aliases = new string[count];
-		}
-		else
-		{
-		  // an alias can map to no flags
-		  string aliasValue = ruleArgs.Length == 1 ? "" : ruleArgs[1];
-		  aliases[aliasCount++] = aliasValue;
-		}
-	  }
-
-	  private string getAliasValue(int id)
-	  {
-		try
-		{
-		  return aliases[id - 1];
-		}
-		catch (System.IndexOutOfRangeException ex)
-		{
-		  throw new System.ArgumentException("Bad flag alias number:" + id, ex);
-		}
-	  }
-
-	  /// <summary>
-	  /// Abstraction of the process of parsing flags taken from the affix and dic files
-	  /// </summary>
-	  internal abstract class FlagParsingStrategy
-	  {
-
-		/// <summary>
-		/// Parses the given String into a single flag
-		/// </summary>
-		/// <param name="rawFlag"> String to parse into a flag </param>
-		/// <returns> Parsed flag </returns>
-		internal virtual char parseFlag(string rawFlag)
-		{
-		  char[] flags = parseFlags(rawFlag);
-		  if (flags.Length != 1)
-		  {
-			throw new System.ArgumentException("expected only one flag, got: " + rawFlag);
-		  }
-		  return flags[0];
-		}
-
-		/// <summary>
-		/// Parses the given String into multiple flags
-		/// </summary>
-		/// <param name="rawFlags"> String to parse into flags </param>
-		/// <returns> Parsed flags </returns>
-		internal abstract char[] parseFlags(string rawFlags);
-	  }
-
-	  /// <summary>
-	  /// Simple implementation of <seealso cref="FlagParsingStrategy"/> that treats the chars in each String as a individual flags.
-	  /// Can be used with both the ASCII and UTF-8 flag types.
-	  /// </summary>
-	  private class SimpleFlagParsingStrategy : FlagParsingStrategy
-	  {
-		public override char[] parseFlags(string rawFlags)
-		{
-		  return rawFlags.ToCharArray();
-		}
-	  }
-
-	  /// <summary>
-	  /// Implementation of <seealso cref="FlagParsingStrategy"/> that assumes each flag is encoded in its numerical form.  In the case
-	  /// of multiple flags, each number is separated by a comma.
-	  /// </summary>
-	  private class NumFlagParsingStrategy : FlagParsingStrategy
-	  {
-		public override char[] parseFlags(string rawFlags)
-		{
-		  string[] rawFlagParts = rawFlags.Trim().Split(",", true);
-		  char[] flags = new char[rawFlagParts.Length];
-		  int upto = 0;
-
-		  for (int i = 0; i < rawFlagParts.Length; i++)
-		  {
-			// note, removing the trailing X/leading I for nepali... what is the rule here?! 
-			string replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
-			// note, ignoring empty flags (this happens in danish, for example)
-			if (replacement.Length == 0)
-			{
-			  continue;
-			}
-			flags[upto++] = (char) int.Parse(replacement);
-		  }
-
-		  if (upto < flags.Length)
-		  {
-			flags = Arrays.copyOf(flags, upto);
-		  }
-		  return flags;
-		}
-	  }
-
-	  /// <summary>
-	  /// Implementation of <seealso cref="FlagParsingStrategy"/> that assumes each flag is encoded as two ASCII characters whose codes
-	  /// must be combined into a single character.
-	  /// 
-	  /// TODO (rmuir) test
-	  /// </summary>
-	  private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy
-	  {
-
-		public override char[] parseFlags(string rawFlags)
-		{
-		  if (rawFlags.Length == 0)
-		  {
-			return new char[0];
-		  }
-
-		  StringBuilder builder = new StringBuilder();
-		  if (rawFlags.Length % 2 == 1)
-		  {
-			throw new System.ArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
-		  }
-		  for (int i = 0; i < rawFlags.Length; i += 2)
-		  {
-			char cookedFlag = (char)((int) rawFlags[i] + (int) rawFlags[i + 1]);
-			builder.Append(cookedFlag);
-		  }
-
-		  char[] flags = new char[builder.Length];
-		  builder.getChars(0, builder.Length, flags, 0);
-		  return flags;
-		}
-	  }
-
-	  internal static bool hasFlag(char[] flags, char flag)
-	  {
-		return Arrays.binarySearch(flags, flag) >= 0;
-	  }
-
-	  internal virtual CharSequence cleanInput(CharSequence input, StringBuilder reuse)
-	  {
-		reuse.Length = 0;
-
-		for (int i = 0; i < input.length(); i++)
-		{
-		  char ch = input.charAt(i);
-
-		  if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0)
-		  {
-			continue;
-		  }
-
-		  if (ignoreCase && iconv == null)
-		  {
-			// if we have no input conversion mappings, do this on-the-fly
-			ch = char.ToLower(ch);
-		  }
-
-		  reuse.Append(ch);
-		}
-
-		if (iconv != null)
-		{
-		  try
-		  {
-			applyMappings(iconv, reuse);
-		  }
-		  catch (IOException bogus)
-		  {
-			throw new Exception(bogus);
-		  }
-		  if (ignoreCase)
-		  {
-			for (int i = 0; i < reuse.Length; i++)
-			{
-			  reuse[i] = char.ToLower(reuse[i]);
-			}
-		  }
-		}
-
-		return reuse;
-	  }
-
-	  // TODO: this could be more efficient!
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: static void applyMappings(org.apache.lucene.util.fst.FST<org.apache.lucene.util.CharsRef> fst, StringBuilder sb) throws java.io.IOException
-	  internal static void applyMappings(FST<CharsRef> fst, StringBuilder sb)
-	  {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader bytesReader = fst.getBytesReader();
-		FST.BytesReader bytesReader = fst.BytesReader;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.CharsRef> firstArc = fst.getFirstArc(new org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.CharsRef>());
-		FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
-		CharsRef NO_OUTPUT = fst.outputs.NoOutput;
-
-		// temporary stuff
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.CharsRef> arc = new org.apache.lucene.util.fst.FST.Arc<>();
-		FST.Arc<CharsRef> arc = new FST.Arc<CharsRef>();
-		int longestMatch;
-		CharsRef longestOutput;
-
-		for (int i = 0; i < sb.Length; i++)
-		{
-		  arc.copyFrom(firstArc);
-		  CharsRef output = NO_OUTPUT;
-		  longestMatch = -1;
-		  longestOutput = null;
-
-		  for (int j = i; j < sb.Length; j++)
-		  {
-			char ch = sb[j];
-			if (fst.findTargetArc(ch, arc, arc, bytesReader) == null)
-			{
-			  break;
-			}
-			else
-			{
-			  output = fst.outputs.add(output, arc.output);
-			}
-			if (arc.Final)
-			{
-			  longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
-			  longestMatch = j;
-			}
-		  }
-
-		  if (longestMatch >= 0)
-		  {
-			sb.Remove(i, longestMatch + 1 - i);
-			sb.Insert(i, longestOutput);
-			i += (longestOutput.length - 1);
-		  }
-		}
-	  }
-	}
-
+    /// <summary>
+    /// In-memory structure for the dictionary (.dic) and affix (.aff)
+    /// data of a hunspell dictionary.
+    /// </summary>
+    public class Dictionary
+    {
+        internal static readonly char[] NOFLAGS = new char[0];
+
+        private const string ALIAS_KEY = "AF";
+        private const string PREFIX_KEY = "PFX";
+        private const string SUFFIX_KEY = "SFX";
+        private const string FLAG_KEY = "FLAG";
+        private const string COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
+        private const string CIRCUMFIX_KEY = "CIRCUMFIX";
+        private const string IGNORE_KEY = "IGNORE";
+        private const string ICONV_KEY = "ICONV";
+        private const string OCONV_KEY = "OCONV";
+
+        private const string NUM_FLAG_TYPE = "num";
+        private const string UTF8_FLAG_TYPE = "UTF-8";
+        private const string LONG_FLAG_TYPE = "long";
+
+        // TODO: really for suffixes we should reverse the automaton and run them backwards
+        private const string PREFIX_CONDITION_REGEX_PATTERN = "{0}.*";
+        private const string SUFFIX_CONDITION_REGEX_PATTERN = ".*{0}";
+
+        internal FST<IntsRef> prefixes;
+        internal FST<IntsRef> suffixes;
+
+        // all condition checks used by prefixes and suffixes. these are typically re-used across
+        // many affix stripping rules. so these are deduplicated, to save RAM.
+        internal List<CharacterRunAutomaton> patterns = new List<CharacterRunAutomaton>();
+
+        // the entries in the .dic file, mapping to their set of flags.
+        // the fst output is the ordinal list for flagLookup
+        internal FST<IntsRef> words;
+        // the list of unique flagsets (wordforms). theoretically huge, but practically
+        // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
+        internal BytesRefHash flagLookup = new BytesRefHash();
+
+        // the list of unique strip affixes.
+        internal char[] stripData;
+        internal int[] stripOffsets;
+
+        // 8 bytes per affix
+        internal byte[] affixData = new byte[64];
+        private int currentAffix = 0;
+
+        private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+
+        private string[] aliases;
+        private int aliasCount = 0;
+
+        private readonly DirectoryInfo tempDir = OfflineSorter.DefaultTempDir(); // TODO: make this configurable?
+
+        internal bool ignoreCase;
+        internal bool complexPrefixes;
+        internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
+
+        internal int circumfix = -1; // circumfix flag, or -1 if one is not defined
+
+        // ignored characters (dictionary, affix, inputs)
+        private char[] ignore;
+
+        // FSTs used for ICONV/OCONV, output ord pointing to replacement text
+        internal FST<CharsRef> iconv;
+        internal FST<CharsRef> oconv;
+
+        internal bool needsInputCleaning;
+        internal bool needsOutputCleaning;
+
+        // LUCENENET: Added so we can get better performance than creating the regex in every tight loop.
+        // Declared readonly because the shared instance is only ever read (via Split) and must never be reassigned.
+        private static readonly Regex whitespacePattern = new Regex("\\s+", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Builds a Dictionary from one hunspell affix stream and one hunspell dictionary stream.
+        /// Delegates to the multi-dictionary constructor with a single-element list and
+        /// <c>ignoreCase</c> set to <c>false</c>.
+        /// The caller remains responsible for closing both streams.
+        /// </summary>
+        /// <param name="affix"> Stream for reading the hunspell affix file (won't be closed). </param>
+        /// <param name="dictionary"> Stream for reading the hunspell dictionary file (won't be closed). </param>
+        /// <exception cref="IOException"> Can be thrown while reading from the streams </exception>
+        /// <exception cref="Exception"> Can be thrown if the content of the files does not meet expected formats </exception>
+        public Dictionary(Stream affix, Stream dictionary)
+            : this(affix, new List<Stream> { dictionary }, ignoreCase: false)
+        {
+        }
+
+        /// <summary>
+        /// Creates a new Dictionary containing the information read from the provided streams to hunspell affix
+        /// and dictionary files.
+        /// You have to close the provided streams yourself.
+        /// </summary>
+        /// <param name="affix"> Stream for reading the hunspell affix file (won't be closed). </param>
+        /// <param name="dictionaries"> Streams for reading the hunspell dictionary files (won't be closed). </param>
+        /// <param name="ignoreCase"> Stored in <c>ignoreCase</c>; when <c>true</c>, input cleaning is also switched on. </param>
+        /// <exception cref="IOException"> Can be thrown while reading from the streams or while spooling the affix data to a temporary file </exception>
+        /// <exception cref="Exception"> Can be thrown if the content of the files does not meet expected formats </exception>
+        public Dictionary(Stream affix, IList<Stream> dictionaries, bool ignoreCase)
+        {
+            this.ignoreCase = ignoreCase;
+            this.needsInputCleaning = ignoreCase;
+            this.needsOutputCleaning = false; // set if we have an OCONV
+            flagLookup.Add(new BytesRef()); // no flags -> ord 0
+
+            // The affix data is read twice (once to find the encoding, once to parse), so it is spooled to a
+            // temp file first. The file name is unique per instance so that dictionaries constructed
+            // concurrently (or by other processes sharing tempDir) do not overwrite or delete each other's copy.
+            string affixFileName = "affix" + Guid.NewGuid().ToString("N") + ".aff";
+            FileInfo aff = new FileInfo(System.IO.Path.Combine(tempDir.FullName, affixFileName));
+            Stream aff1 = null;
+            Stream aff2 = null;
+            try
+            {
+                // copy contents of affix stream to temp file; the using block releases the write handle
+                // before the file is reopened for reading and before it is deleted in the finally below
+                using (Stream @out = aff.Create())
+                {
+                    byte[] buffer = new byte[1024 * 8];
+                    int len;
+                    while ((len = affix.Read(buffer, 0, buffer.Length)) > 0)
+                    {
+                        @out.Write(buffer, 0, len);
+                    }
+                }
+
+                // pass 1: get encoding
+                aff1 = File.OpenRead(aff.FullName);
+                string encoding = GetDictionaryEncoding(aff1);
+
+                // pass 2: parse affixes
+                Encoding decoder = GetSystemEncoding(encoding);
+                aff2 = File.OpenRead(aff.FullName);
+                ReadAffixFile(aff2, decoder);
+
+                // read dictionary entries
+                IntSequenceOutputs o = IntSequenceOutputs.Singleton;
+                Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
+                ReadDictionaryFiles(dictionaries, decoder, b);
+                words = b.Finish();
+                aliases = null; // no longer needed
+            }
+            finally
+            {
+                // every handle on the temp file is closed at this point, so the delete cannot hit a
+                // sharing violation and hide an exception thrown above
+                IOUtils.CloseWhileHandlingException(aff1, aff2);
+                aff.Delete();
+            }
+        }
+
+        /// <summary>
+        /// Looks up the given character range in the <c>words</c> FST.
+        /// </summary>
+        /// <param name="word"> Char array holding the word </param>
+        /// <param name="offset"> Index in the char array at which the word starts </param>
+        /// <param name="length"> Number of chars, counted from the offset, that make up the word </param>
+        /// <returns> The output accumulated for the word, or <c>null</c> if it is not in the FST </returns>
+        internal virtual IntsRef LookupWord(char[] word, int offset, int length)
+        {
+            IntsRef forms = Lookup(words, word, offset, length);
+            return forms;
+        }
+
+        /// <summary>
+        /// Looks up the given character range in the <c>prefixes</c> FST.
+        /// </summary>
+        /// <param name="word"> Char array holding the text to look up </param>
+        /// <param name="offset"> Index in the char array at which the text starts </param>
+        /// <param name="length"> Number of chars, counted from the offset, that make up the text </param>
+        /// <returns> The output accumulated for the matching prefix entry, or <c>null</c> if none is found </returns>
+        internal virtual IntsRef LookupPrefix(char[] word, int offset, int length)
+        {
+            IntsRef matches = Lookup(prefixes, word, offset, length);
+            return matches;
+        }
+
+        /// <summary>
+        /// Looks up the given character range in the <c>suffixes</c> FST.
+        /// </summary>
+        /// <param name="word"> Char array holding the text to look up </param>
+        /// <param name="offset"> Index in the char array at which the text starts </param>
+        /// <param name="length"> Number of chars, counted from the offset, that make up the text </param>
+        /// <returns> The output accumulated for the matching suffix entry, or <c>null</c> if none is found </returns>
+        internal virtual IntsRef LookupSuffix(char[] word, int offset, int length)
+        {
+            IntsRef matches = Lookup(suffixes, word, offset, length);
+            return matches;
+        }
+
+        // TODO: this is pretty stupid, considering how the stemming algorithm works
+        // we can speed it up to be significantly faster!
+        /// <summary>
+        /// Walks <paramref name="fst"/> over the code points of the given character range, followed by the
+        /// end label, and returns the outputs accumulated along the way.
+        /// </summary>
+        /// <param name="fst"> FST to search; when <c>null</c> the result is <c>null</c> </param>
+        /// <param name="word"> Char array holding the text to look up </param>
+        /// <param name="offset"> Index in the char array at which the text starts </param>
+        /// <param name="length"> Number of chars, counted from the offset, that make up the text </param>
+        /// <returns> The accumulated output, or <c>null</c> if any code point or the end label has no matching arc </returns>
+        /// <exception cref="Exception"> Wraps any <see cref="IOException"/> raised while reading the FST </exception>
+        internal virtual IntsRef Lookup(FST<IntsRef> fst, char[] word, int offset, int length)
+        {
+            if (fst == null)
+            {
+                return null;
+            }
+
+            FST.BytesReader fstReader = fst.BytesReader;
+            FST.Arc<IntsRef> current = fst.GetFirstArc(new FST.Arc<IntsRef>());
+            // Sentinel meaning "this arc carries no output"; compared by reference below
+            IntsRef noOutput = fst.Outputs.NoOutput;
+            IntsRef accumulated = noOutput;
+
+            int limit = offset + length;
+            try
+            {
+                int pos = offset;
+                while (pos < limit)
+                {
+                    int codePoint = Character.CodePointAt(word, pos, limit);
+                    if (fst.FindTargetArc(codePoint, current, current, fstReader) == null)
+                    {
+                        return null;
+                    }
+                    if (current.Output != noOutput)
+                    {
+                        accumulated = fst.Outputs.Add(accumulated, current.Output);
+                    }
+                    // advance by however many chars the code point occupied
+                    pos += Character.CharCount(codePoint);
+                }
+
+                // the text only counts as a match if the FST also accepts the end label here
+                if (fst.FindTargetArc(FST<IntsRef>.END_LABEL, current, current, fstReader) == null)
+                {
+                    return null;
+                }
+
+                return current.Output != noOutput
+                    ? fst.Outputs.Add(accumulated, current.Output)
+                    : accumulated;
+            }
+            catch (IOException bogus)
+            {
+                throw new Exception(bogus.Message, bogus);
+            }
+        }
+
+        /// <summary>
+        /// Reads the affix file through the provided stream, building up the prefix and suffix FSTs,
+        /// the strip data, and the settings declared by the FLAG, COMPLEXPREFIXES, CIRCUMFIX, IGNORE,
+        /// ICONV and OCONV lines.
+        /// </summary>
+        /// <param name="affixStream"> Stream to read the content of the affix file from </param>
+        /// <param name="decoder"> Encoding used to decode the content of the file </param>
+        /// <exception cref="IOException"> Can be thrown while reading from the stream </exception>
+        /// <exception cref="Exception"> Thrown for a CIRCUMFIX, IGNORE, ICONV or OCONV line that does not have exactly two fields </exception>
+        private void ReadAffixFile(Stream affixStream, Encoding decoder)
+        {
+            SortedDictionary<string, IList<char?>> prefixes = new SortedDictionary<string, IList<char?>>();
+            SortedDictionary<string, IList<char?>> suffixes = new SortedDictionary<string, IList<char?>>();
+            IDictionary<string, int?> seenPatterns = new Dictionary<string, int?>();
+
+            // zero condition -> 0 ord
+            seenPatterns[".*"] = 0;
+            patterns.Add(null);
+
+            // zero strip -> 0 ord
+            IDictionary<string, int?> seenStrips = new Dictionary<string, int?>();
+            seenStrips[""] = 0;
+
+            // Not wrapped in a using block: disposing the reader would also close affixStream,
+            // which this method did not open.
+            var reader = new StreamReader(affixStream, decoder);
+            string line = null;
+            int lineNumber = 0;
+            while ((line = reader.ReadLine()) != null)
+            {
+                lineNumber++;
+                // ignore any BOM marker on first line
+                if (lineNumber == 1 && line.StartsWith("\uFEFF", StringComparison.Ordinal))
+                {
+                    line = line.Substring(1);
+                }
+                if (line.StartsWith(ALIAS_KEY, StringComparison.Ordinal))
+                {
+                    ParseAlias(line);
+                }
+                else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal))
+                {
+                    ParseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
+                }
+                else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal))
+                {
+                    ParseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
+                }
+                else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal))
+                {
+                    // Assume that the FLAG line comes before any prefix or suffixes
+                    // Store the strategy so it can be used when parsing the dic file
+                    flagParsingStrategy = GetFlagParsingStrategy(line);
+                }
+                else if (line.Equals(COMPLEXPREFIXES_KEY, StringComparison.Ordinal))
+                {
+                    complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
+                }
+                else if (line.StartsWith(CIRCUMFIX_KEY, StringComparison.Ordinal))
+                {
+                    string[] parts = whitespacePattern.Split(line);
+                    if (parts.Length != 2)
+                    {
+                        throw new Exception(string.Format(CultureInfo.InvariantCulture, "Illegal CIRCUMFIX declaration, line {0}", lineNumber));
+                    }
+                    circumfix = flagParsingStrategy.parseFlag(parts[1]);
+                }
+                else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal))
+                {
+                    string[] parts = whitespacePattern.Split(line);
+                    if (parts.Length != 2)
+                    {
+                        throw new Exception(string.Format(CultureInfo.InvariantCulture, "Illegal IGNORE declaration, line {0}", lineNumber));
+                    }
+                    ignore = parts[1].ToCharArray();
+                    Array.Sort(ignore); // kept sorted after parsing
+                    needsInputCleaning = true;
+                }
+                else if (line.StartsWith(ICONV_KEY, StringComparison.Ordinal) || line.StartsWith(OCONV_KEY, StringComparison.Ordinal))
+                {
+                    string[] parts = whitespacePattern.Split(line);
+                    string type = parts[0];
+                    if (parts.Length != 2)
+                    {
+                        throw new Exception(string.Format(CultureInfo.InvariantCulture, "Illegal {0} declaration, line {1}", type, lineNumber));
+                    }
+                    int num = int.Parse(parts[1], CultureInfo.InvariantCulture);
+                    FST<CharsRef> res = ParseConversions(reader, num);
+                    if (type.Equals(ICONV_KEY, StringComparison.Ordinal))
+                    {
+                        iconv = res;
+                        needsInputCleaning |= iconv != null;
+                    }
+                    else
+                    {
+                        oconv = res;
+                        needsOutputCleaning |= oconv != null;
+                    }
+                }
+            }
+
+            this.prefixes = AffixFST(prefixes);
+            this.suffixes = AffixFST(suffixes);
+
+            // Lay the strips out by the ord stored in seenStrips instead of by enumeration position:
+            // Dictionary<TKey, TValue> does not promise to enumerate in insertion order, so position
+            // and ord are only guaranteed to agree when the ord itself is used as the index.
+            // NOTE(review): assumes the ords are dense (0..Count-1), exactly as the former
+            // position-based layout did - confirm against the code that fills seenStrips.
+            string[] stripsByOrd = new string[seenStrips.Count];
+            foreach (KeyValuePair<string, int?> seen in seenStrips)
+            {
+                stripsByOrd[seen.Value.Value] = seen.Key;
+            }
+
+            int totalChars = 0;
+            foreach (string strip in seenStrips.Keys)
+            {
+                totalChars += strip.Length;
+            }
+            stripData = new char[totalChars];
+            // one extra slot so that stripOffsets[ord + 1] marks the end of strip 'ord'
+            stripOffsets = new int[stripsByOrd.Length + 1];
+            int currentOffset = 0;
+            for (int ord = 0; ord < stripsByOrd.Length; ord++)
+            {
+                string strip = stripsByOrd[ord];
+                Debug.Assert(strip != null, "strip ords must be dense");
+                stripOffsets[ord] = currentOffset;
+                strip.CopyTo(0, stripData, currentOffset, strip.Length);
+                currentOffset += strip.Length;
+            }
+            stripOffsets[stripsByOrd.Length] = currentOffset;
+        }
+
+        /// <summary>
+        /// Builds an FST from the given affix map. Each key, converted to UTF-32, becomes an input; its
+        /// output is the key's value list widened to ints, with a <c>null</c> entry stored as 0.
+        /// Keys are added in the enumeration order of the <see cref="SortedDictionary{TKey, TValue}"/>.
+        /// </summary>
+        /// <param name="affixes"> Map from affix text to the values recorded for it </param>
+        /// <returns> The finished FST </returns>
+        private FST<IntsRef> AffixFST(SortedDictionary<string, IList<char?>> affixes)
+        {
+            Builder<IntsRef> fstBuilder = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.Singleton);
+            IntsRef keyScratch = new IntsRef(); // reused for every key
+
+            foreach (KeyValuePair<string, IList<char?>> pair in affixes)
+            {
+                Lucene.Net.Util.Fst.Util.ToUTF32(pair.Key, keyScratch);
+
+                IList<char?> values = pair.Value;
+                IntsRef ids = new IntsRef(values.Count);
+                for (int i = 0; i < values.Count; i++)
+                {
+                    // GetValueOrDefault() yields '\0' (0) for a null entry
+                    ids.Ints[ids.Length++] = values[i].GetValueOrDefault();
+                }
+                fstBuilder.Add(keyScratch, ids);
+            }
+
+            return fstBuilder.Finish();
+        }
+
+        /// <summary>
+        /// Parses a specific affix rule putting the result into the provided affix map
+        /// </summary>
+        /// <param name="affixes"> Map where the result of the parsing will be put </param>
+        /// <param name="header"> Header line of the affix rule </param>
+        /// <param name="reader"> BufferedReader to read the content of the rule from </param>
+        /// <param name="conditionPattern"> <seealso cref="String#format(String, Object...)"/> pattern to be used to generate the condition regex
+        ///                         pattern </param>
+        /// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param>
+        /// <exception cref="IOException"> Can be thrown while reading the rule </exception>
+        /// <summary>
+        /// Parses one affix group: the already-read header line plus the rule lines that follow it.
+        /// Each rule is written to <c>affixData</c> as four shorts (flag, strip ordinal, pattern ordinal
+        /// with the cross-product bit, append-flags ordinal) and registered in <paramref name="affixes"/>.
+        /// </summary>
+        /// <param name="affixes"> Map from affix text to the ordinals of the rules using it; updated in place </param>
+        /// <param name="header"> Header line of the group; token 2 is the cross-product marker ("Y") and token 3 the number of rule lines </param>
+        /// <param name="reader"> Reader positioned on the first rule line of the group </param>
+        /// <param name="conditionPattern"> Composite format string into which a rule's condition is inserted to build its regular expression </param>
+        /// <param name="seenPatterns"> Map from regular expression to its index in the pattern list, used to deduplicate patterns </param>
+        /// <param name="seenStrips"> Map from strip string to its ordinal, used to deduplicate strips </param>
+        /// <exception cref="Exception"> Thrown if the file ends before all announced rules were read, or a rule has fewer than four elements </exception>
+        /// <exception cref="NotSupportedException"> Thrown if the number of patterns, strips or append-flag sets exceeds what the encoding can hold </exception>
+        private void ParseAffix(SortedDictionary<string, IList<char?>> affixes, string header, TextReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips)
+        {
+            BytesRef scratch = new BytesRef();
+            StringBuilder sb = new StringBuilder();
+            string[] args = whitespacePattern.Split(header);
+
+            bool crossProduct = args[2].Equals("Y");
+
+            int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);
+            // every rule occupies 8 bytes (four shorts) in affixData, hence the << 3
+            affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
+            ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
+
+            for (int i = 0; i < numLines; i++)
+            {
+                Debug.Assert(affixWriter.Position == currentAffix << 3);
+                string line = reader.ReadLine();
+                if (line == null)
+                {
+                    // the header announced more rules than the file contains; fail with a clear message
+                    // instead of letting Regex.Split(null) raise an ArgumentNullException
+                    throw new Exception("Unexpected end of affix file: expected " + numLines + " rules but found only " + i + " for: " + header);
+                }
+                string[] ruleArgs = whitespacePattern.Split(line);
+
+                // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
+                // condition is optional
+                if (ruleArgs.Length < 4)
+                {
+                    throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader
+                }
+
+                char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
+                string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
+                string affixArg = ruleArgs[3];
+                char[] appendFlags = null;
+
+                // an affix may carry its own flags after the last '/'
+                int flagSep = affixArg.LastIndexOf('/');
+                if (flagSep != -1)
+                {
+                    string flagPart = affixArg.Substring(flagSep + 1);
+                    affixArg = affixArg.Substring(0, flagSep);
+
+                    if (aliasCount > 0)
+                    {
+                        flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
+                    }
+
+                    appendFlags = flagParsingStrategy.ParseFlags(flagPart);
+                    Array.Sort(appendFlags);
+                    twoStageAffix = true;
+                }
+
+                // TODO: add test and fix zero-affix handling!
+
+                string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
+                // at least the gascon affix file has this issue
+                if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
+                {
+                    condition = condition + "]";
+                }
+                // "dash hasn't got special meaning" (we must escape it)
+                if (condition.IndexOf('-') >= 0)
+                {
+                    condition = condition.Replace("-", "\\-");
+                }
+
+                string regex;
+                if (".".Equals(condition))
+                {
+                    regex = ".*"; // Zero condition is indicated by dot
+                }
+                else if (condition.Equals(strip))
+                {
+                    regex = ".*"; // TODO: optimize this better:
+                                  // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
+                                  // but this is complicated...
+                }
+                else
+                {
+                    regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
+                }
+
+                // deduplicate patterns (single lookup; a missing key leaves patternIndex null)
+                int? patternIndex;
+                seenPatterns.TryGetValue(regex, out patternIndex);
+                if (patternIndex == null)
+                {
+                    patternIndex = patterns.Count;
+                    if (patternIndex > short.MaxValue)
+                    {
+                        throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org");
+                    }
+                    seenPatterns[regex] = patternIndex;
+                    CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton());
+                    patterns.Add(pattern);
+                }
+
+                // deduplicate strips the same way
+                int? stripOrd;
+                seenStrips.TryGetValue(strip, out stripOrd);
+                if (stripOrd == null)
+                {
+                    stripOrd = seenStrips.Count;
+                    seenStrips[strip] = stripOrd;
+                    if (stripOrd > char.MaxValue)
+                    {
+                        throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org");
+                    }
+                }
+
+                if (appendFlags == null)
+                {
+                    appendFlags = NOFLAGS;
+                }
+
+                EncodeFlags(scratch, appendFlags);
+                int appendFlagsOrd = flagLookup.Add(scratch);
+                if (appendFlagsOrd < 0)
+                {
+                    // already exists in our hash
+                    appendFlagsOrd = (-appendFlagsOrd) - 1;
+                }
+                else if (appendFlagsOrd > short.MaxValue)
+                {
+                    // this limit is probably flexible, but its a good sanity check too
+                    throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org");
+                }
+
+                affixWriter.WriteShort((short)flag);
+                affixWriter.WriteShort((short)stripOrd);
+                // encode crossProduct into patternIndex
+                int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
+                affixWriter.WriteShort((short)patternOrd);
+                affixWriter.WriteShort((short)appendFlagsOrd);
+
+                if (needsInputCleaning)
+                {
+                    affixArg = CleanInput(affixArg, sb);
+                }
+
+                IList<char?> list;
+                if (!affixes.TryGetValue(affixArg, out list) || list == null)
+                {
+                    list = new List<char?>();
+                    affixes[affixArg] = list;
+                }
+
+                list.Add((char)currentAffix);
+                currentAffix++;
+            }
+        }
+
+        /// <summary>
+        /// Reads <paramref name="num"/> conversion lines from <paramref name="reader"/> and compiles them
+        /// into an FST that maps each source string to its replacement.
+        /// </summary>
+        /// <param name="reader"> Reader positioned on the first conversion line </param>
+        /// <param name="num"> Number of conversion lines to read </param>
+        /// <returns> The FST produced by the builder once all mappings were added </returns>
+        /// <exception cref="Exception"> Thrown if the file ends early or a line does not consist of exactly three tokens </exception>
+        /// <exception cref="InvalidOperationException"> Thrown if the same source string is mapped twice </exception>
+        private FST<CharsRef> ParseConversions(TextReader reader, int num)
+        {
+            // sorted, because the mappings are fed to the FST builder in enumeration order below
+            IDictionary<string, string> mappings = new SortedDictionary<string, string>();
+
+            for (int i = 0; i < num; i++)
+            {
+                string line = reader.ReadLine();
+                if (line == null)
+                {
+                    // fewer lines than announced; report it instead of failing inside Regex.Split(null)
+                    throw new Exception("Unexpected end of affix file: expected " + num + " conversions but found only " + i);
+                }
+                string[] parts = whitespacePattern.Split(line);
+                if (parts.Length != 3)
+                {
+                    throw new Exception("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader
+                }
+                if (mappings.Put(parts[1], parts[2]) != null)
+                {
+                    throw new System.InvalidOperationException("duplicate mapping specified for: " + parts[1]);
+                }
+            }
+
+            Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
+            Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
+            IntsRef scratchInts = new IntsRef();
+            foreach (KeyValuePair<string, string> entry in mappings)
+            {
+                Lucene.Net.Util.Fst.Util.ToUTF16(entry.Key, scratchInts);
+                builder.Add(scratchInts, new CharsRef(entry.Value));
+            }
+
+            return builder.Finish();
+        }
+
+        /// <summary>
+        /// Pattern that accepts an optional BOM, followed by SET and at least one whitespace character.
+        /// The BOM is spelled as the three chars U+00EF U+00BB U+00BF because
+        /// <see cref="GetDictionaryEncoding(Stream)"/> widens every raw byte to a char before matching.
+        /// </summary>
+        internal static readonly Regex ENCODING_PATTERN = new Regex("^(\u00EF\u00BB\u00BF)?SET\\s+", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Parses the encoding specified in the affix file readable through the provided stream.
+        /// The stream is read byte by byte, one line at a time; empty lines, whitespace-only lines and
+        /// lines starting with '#' are skipped, and lines that do not match <c>SET &lt;encoding&gt;</c> are ignored.
+        /// </summary>
+        /// <param name="affix"> Stream for reading the affix file </param>
+        /// <returns> Encoding name specified in the affix file, trimmed of surrounding whitespace </returns>
+        /// <exception cref="IOException"> Can be thrown while reading from the stream </exception>
+        /// <exception cref="Exception"> Thrown if the end of the stream is reached before a <c>SET &lt;encoding&gt;</c> line was found </exception>
+        internal static string GetDictionaryEncoding(Stream affix)
+        {
+            StringBuilder encoding = new StringBuilder();
+            for (;;)
+            {
+                encoding.Length = 0;
+                int ch;
+                // ReadByte returns -1 at end of stream; 0 is a legitimate byte value and must not end the line
+                while ((ch = affix.ReadByte()) >= 0)
+                {
+                    if (ch == '\n')
+                    {
+                        break;
+                    }
+                    if (ch != '\r')
+                    {
+                        encoding.Append((char)ch);
+                    }
+                }
+                // the whitespace-only test comes last because it is the most expensive of the three
+                if (encoding.Length == 0 || encoding[0] == '#' || encoding.ToString().Trim().Length == 0)
+                {
+                    if (ch < 0)
+                    {
+                        throw new Exception("Unexpected end of affix file." /*, 0*/);
+                    }
+                    continue;
+                }
+                Match matcher = ENCODING_PATTERN.Match(encoding.ToString());
+                if (matcher.Success)
+                {
+                    // everything after "SET" and its whitespace is the encoding name
+                    int last = matcher.Index + matcher.Length;
+                    return encoding.ToString(last, encoding.Length - last).Trim();
+                }
+            }
+        }
+
+        /// <summary>
+        /// Read-only table mapping charset names found in affix files to their canonical names.
+        /// </summary>
+        internal static readonly IDictionary<string, string> CHARSET_ALIASES;
+        static Dictionary()
+        {
+            IDictionary<string, string> aliasTable = new Dictionary<string, string>
+            {
+                { "microsoft-cp1251", "windows-1251" },
+                { "TIS620-2533", "TIS-620" }
+            };
+            CHARSET_ALIASES = Collections.UnmodifiableMap(aliasTable);
+        }
+
+        /// <summary>
+        /// Retrieves the <see cref="Encoding"/> for the given encoding name.  Note, This isn't perfect as I think
+        /// ISCII-DEVANAGARI and MICROSOFT-CP1251 etc are allowed...
+        /// <para/>
+        /// "ISO8859-14" (any casing) is served by <see cref="ISO8859_14Encoding"/>; any other name is first
+        /// translated through <see cref="CHARSET_ALIASES"/> and then resolved with <see cref="Encoding.GetEncoding(string)"/>.
+        /// </summary>
+        /// <param name="encoding"> Name of the encoding to retrieve </param>
+        /// <returns> Encoding for the given name </returns>
+        // LUCENENET NOTE: This was getJavaEncoding in the original
+        private Encoding GetSystemEncoding(string encoding)
+        {
+            if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase))
+            {
+                return new ISO8859_14Encoding();
+            }
+            // translate charset aliases used by affix files into their canonical names;
+            // without this step the alias table is never consulted
+            string canonical;
+            if (encoding != null && CHARSET_ALIASES.TryGetValue(encoding, out canonical))
+            {
+                encoding = canonical;
+            }
+            return Encoding.GetEncoding(encoding);
+        }
+
+
+        /// <summary>
+        /// Picks the <seealso cref="FlagParsingStrategy"/> matching the FLAG definition line of the affix file
+        /// </summary>
+        /// <param name="flagLine"> Line containing the flag information; must split into exactly two whitespace-separated tokens </param>
+        /// <returns> FlagParsingStrategy that parses flags in the way the FLAG definition asks for </returns>
+        /// <exception cref="ArgumentException"> Thrown if the line is malformed or names an unknown flag type </exception>
+        internal static FlagParsingStrategy GetFlagParsingStrategy(string flagLine)
+        {
+            string[] tokens = whitespacePattern.Split(flagLine);
+            if (tokens.Length != 2)
+            {
+                throw new System.ArgumentException("Illegal FLAG specification: " + flagLine);
+            }
+
+            // the second token names the flag type
+            string flagType = tokens[1];
+            if (NUM_FLAG_TYPE.Equals(flagType))
+            {
+                return new NumFlagParsingStrategy();
+            }
+            if (UTF8_FLAG_TYPE.Equals(flagType))
+            {
+                return new SimpleFlagParsingStrategy();
+            }
+            if (LONG_FLAG_TYPE.Equals(flagType))
+            {
+                return new DoubleASCIIFlagParsingStrategy();
+            }
+
+            throw new System.ArgumentException("Unknown flag type: " + flagType);
+        }
+
+        // Character (U+001F) that UnescapeEntry substitutes for an unescaped '/', so that later stages
+        // can find the start of the flags without confusing it with an escaped slash inside the word.
+        internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping
+
+        /// <summary>
+        /// Resolves backslash escapes in a dictionary entry and marks the flag boundary: a backslash
+        /// followed by another character yields that character verbatim, an unescaped '/' becomes
+        /// <see cref="FLAG_SEPARATOR"/>, and every other character (including a trailing backslash) is copied as is.
+        /// </summary>
+        /// <param name="entry"> Raw entry line </param>
+        /// <returns> The entry with escapes resolved and unescaped slashes replaced </returns>
+        internal virtual string UnescapeEntry(string entry)
+        {
+            StringBuilder unescaped = new StringBuilder();
+            int pos = 0;
+            while (pos < entry.Length)
+            {
+                char current = entry[pos];
+                bool escapesNext = current == '\\' && pos + 1 < entry.Length;
+                if (escapesNext)
+                {
+                    // emit the escaped character and skip over both
+                    unescaped.Append(entry[pos + 1]);
+                    pos += 2;
+                    continue;
+                }
+
+                unescaped.Append(current == '/' ? FLAG_SEPARATOR : current);
+                pos++;
+            }
+            return unescaped.ToString();
+        }
+
+        /// <summary>
+        /// Reads the dictionary files through the provided streams, building up the words map.
+        /// The entries are first written to a temporary file, sorted offline, and then read back in
+        /// sorted order so that all flag sets of one entry can be grouped before the entry is added.
+        /// </summary>
+        /// <param name="dictionaries"> Streams to read the dictionary files through </param>
+        /// <param name="decoder"> Encoding used to decode the contents of the files </param>
+        /// <param name="words"> Builder that receives each distinct entry together with the ordinals of its flag sets </param>
+        /// <exception cref="IOException"> Can be thrown while reading from the file </exception>
+        /// <exception cref="ArgumentException"> Thrown if the sorted entries are encountered out of order </exception>
+        private void ReadDictionaryFiles(IList<Stream> dictionaries, Encoding decoder, Builder<IntsRef> words)
+        {
+            BytesRef flagsScratch = new BytesRef();
+            IntsRef scratchInts = new IntsRef();
+
+            StringBuilder sb = new StringBuilder();
+
+            FileInfo unsorted = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "unsorted.dat"));
+            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted);
+            bool success = false;
+            try
+            {
+                foreach (Stream dictionary in dictionaries)
+                {
+                    var lines = new StreamReader(dictionary, decoder);
+                    string line = lines.ReadLine(); // first line is number of entries (approximately, sometimes)
+
+                    while ((line = lines.ReadLine()) != null)
+                    {
+                        line = UnescapeEntry(line);
+                        if (needsInputCleaning)
+                        {
+                            int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
+                            if (flagSep == -1)
+                            {
+                                string cleansed = CleanInput(line, sb);
+                                writer.Write(cleansed.GetBytes(Encoding.UTF8));
+                            }
+                            else
+                            {
+                                // clean only the word itself, then re-attach the separator and flags untouched
+                                string text = line.Substring(0, flagSep);
+                                string cleansed = CleanInput(text, sb);
+                                if (cleansed != sb.ToString())
+                                {
+                                    sb.Length = 0;
+                                    sb.Append(cleansed);
+                                }
+                                sb.Append(line.Substring(flagSep));
+                                writer.Write(sb.ToString().GetBytes(Encoding.UTF8));
+                            }
+                        }
+                        else
+                        {
+                            writer.Write(line.GetBytes(Encoding.UTF8));
+                        }
+                    }
+                }
+                success = true;
+            }
+            finally
+            {
+                if (success)
+                {
+                    IOUtils.Close(writer);
+                }
+                else
+                {
+                    IOUtils.CloseWhileHandlingException(writer);
+                }
+            }
+            FileInfo sorted = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "sorted.dat"));
+            using (var temp = sorted.Create()) { }
+
+            OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this));
+            sorter.Sort(unsorted, sorted);
+            unsorted.Delete();
+
+            OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted);
+            try
+            {
+                BytesRef scratchLine = new BytesRef();
+
+                // TODO: the flags themselves can be double-chars (long) or also numeric
+                // either way the trick is to encode them as char... but they must be parsed differently
+
+                string currentEntry = null;
+                IntsRef currentOrds = new IntsRef();
+
+                string line2;
+                while (reader.Read(scratchLine))
+                {
+                    line2 = scratchLine.Utf8ToString();
+                    string entry;
+                    char[] wordForm;
+
+                    int flagSep = line2.LastIndexOf(FLAG_SEPARATOR);
+                    if (flagSep == -1)
+                    {
+                        wordForm = NOFLAGS;
+                        entry = line2;
+                    }
+                    else
+                    {
+                        // note, there can be comments (morph description) after a flag.
+                        // we should really look for any whitespace: currently just tab and space
+                        int end = line2.IndexOf('\t', flagSep);
+                        if (end == -1)
+                        {
+                            end = line2.Length;
+                        }
+                        int end2 = line2.IndexOf(' ', flagSep);
+                        if (end2 == -1)
+                        {
+                            end2 = line2.Length;
+                        }
+                        end = Math.Min(end, end2);
+
+                        string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1));
+                        if (aliasCount > 0)
+                        {
+                            flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
+                        }
+
+                        wordForm = flagParsingStrategy.ParseFlags(flagPart);
+                        Array.Sort(wordForm);
+                        entry = line2.Substring(0, flagSep);
+                    }
+
+                    // The rows were sorted by their raw bytes, so the order check must be culture-independent:
+                    // string.CompareTo uses the current culture and may report a false "out of order" or
+                    // treat two different entries as equal.
+                    int cmp = currentEntry == null ? 1 : string.CompareOrdinal(entry, currentEntry);
+                    if (cmp < 0)
+                    {
+                        throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
+                    }
+                    else
+                    {
+                        EncodeFlags(flagsScratch, wordForm);
+                        int ord = flagLookup.Add(flagsScratch);
+                        if (ord < 0)
+                        {
+                            // already exists in our hash
+                            ord = (-ord) - 1;
+                        }
+                        // finalize current entry, and switch "current" if necessary
+                        if (cmp > 0 && currentEntry != null)
+                        {
+                            Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
+                            words.Add(scratchInts, currentOrds);
+                        }
+                        // swap current
+                        if (cmp > 0 || currentEntry == null)
+                        {
+                            currentEntry = entry;
+                            currentOrds = new IntsRef(); // must be this way
+                        }
+                        currentOrds.Grow(currentOrds.Length + 1);
+                        currentOrds.Ints[currentOrds.Length++] = ord;
+                    }
+                }
+
+                // finalize last entry
+                Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
+                words.Add(scratchInts, currentOrds);
+            }
+            finally
+            {
+                // release the sorted-file reader even when parsing fails part-way through
+                reader.Dispose();
+            }
+            sorted.Delete();
+        }
+
+        /// <summary>
+        /// Orders dictionary rows by their entry part (the bytes before the last flag separator) and
+        /// breaks ties by comparing the complete rows.
+        /// </summary>
+        private class ComparatorAnonymousInnerClassHelper : IComparer<BytesRef>
+        {
+            private readonly Dictionary outerInstance;
+
+            // reusable views onto the two rows being compared
+            internal BytesRef scratch1;
+            internal BytesRef scratch2;
+
+            public ComparatorAnonymousInnerClassHelper(Dictionary outerInstance)
+            {
+                this.outerInstance = outerInstance;
+                scratch1 = new BytesRef();
+                scratch2 = new BytesRef();
+            }
+
+            public virtual int Compare(BytesRef o1, BytesRef o2)
+            {
+                PointAtEntry(scratch1, o1);
+                PointAtEntry(scratch2, o2);
+
+                int byEntry = scratch1.CompareTo(scratch2);
+                // tie break on whole row
+                return byEntry != 0 ? byEntry : o1.CompareTo(o2);
+            }
+
+            /// <summary>
+            /// Makes <paramref name="view"/> share the bytes of <paramref name="row"/> and shortens it so
+            /// that it ends just before the last flag separator, if the row contains one.
+            /// </summary>
+            private void PointAtEntry(BytesRef view, BytesRef row)
+            {
+                view.Bytes = row.Bytes;
+                view.Offset = row.Offset;
+                view.Length = row.Length;
+
+                int pos = view.Length;
+                while (--pos >= 0)
+                {
+                    if (view.Bytes[view.Offset + pos] == outerInstance.FLAG_SEPARATOR)
+                    {
+                        view.Length = pos;
+                        break;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Turns a byte sequence back into flags: every two consecutive bytes (high byte first) form one char.
+        /// </summary>
+        /// <param name="b"> Bytes to decode </param>
+        /// <returns> The decoded flags, or <c>CharsRef.EMPTY_CHARS</c> when <paramref name="b"/> has length zero </returns>
+        internal static char[] DecodeFlags(BytesRef b)
+        {
+            if (b.Length == 0)
+            {
+                return CharsRef.EMPTY_CHARS;
+            }
+
+            char[] decoded = new char[(int)((uint)b.Length >> 1)];
+            int limit = b.Offset + b.Length;
+            int pos = b.Offset;
+            int next = 0;
+            while (pos < limit)
+            {
+                int high = b.Bytes[pos] << 8;
+                int low = b.Bytes[pos + 1] & 0xff;
+                decoded[next] = (char)(high | low);
+                next++;
+                pos += 2;
+            }
+            return decoded;
+        }
+
+        /// <summary>
+        /// Writes the flags into <paramref name="b"/> as two bytes per char, high byte first, starting at
+        /// the offset of <paramref name="b"/>, and sets its length to twice the number of flags.
+        /// </summary>
+        /// <param name="b"> Destination; grown as needed </param>
+        /// <param name="flags"> Flags to encode </param>
+        internal static void EncodeFlags(BytesRef b, char[] flags)
+        {
+            int byteCount = flags.Length << 1;
+            b.Grow(byteCount);
+            b.Length = byteCount;
+
+            int cursor = b.Offset;
+            foreach (char flag in flags)
+            {
+                b.Bytes[cursor] = (byte)((flag >> 8) & 0xff);
+                b.Bytes[cursor + 1] = (byte)(flag & 0xff);
+                cursor += 2;
+            }
+        }
+
+        /// <summary>
+        /// Handles one alias line. The first line seen allocates the alias table from the count in its
+        /// second token; every later line stores its second token (or the empty string if there is none)
+        /// in the next free slot.
+        /// </summary>
+        /// <param name="line"> Alias line from the affix file </param>
+        private void ParseAlias(string line)
+        {
+            string[] tokens = whitespacePattern.Split(line);
+            if (aliases != null)
+            {
+                // an alias can map to no flags
+                string value = tokens.Length == 1 ? "" : tokens[1];
+                aliases[aliasCount++] = value;
+                return;
+            }
+
+            //first line should be the aliases count
+            int declared = int.Parse(tokens[1], CultureInfo.InvariantCulture);
+            aliases = new string[declared];
+        }
+
+        /// <summary>
+        /// Looks up the alias with the given one-based number.
+        /// </summary>
+        /// <param name="id"> One-based alias number </param>
+        /// <returns> The value stored for that alias </returns>
+        /// <exception cref="ArgumentException"> Thrown if the number lies outside the alias table </exception>
+        private string GetAliasValue(int id)
+        {
+            int slot = id - 1; // alias numbers start at 1
+            try
+            {
+                return aliases[slot];
+            }
+            catch (System.IndexOutOfRangeException outOfRange)
+            {
+                throw new System.ArgumentException("Bad flag alias number:" + id, outOfRange);
+            }
+        }
+
+        /// <summary>
+        /// Base class for the different ways flags can be written in the affix and dic files
+        /// </summary>
+        internal abstract class FlagParsingStrategy
+        {
+            /// <summary>
+            /// Parses the given string, which must contain exactly one flag
+            /// </summary>
+            /// <param name="rawFlag"> String to parse into a flag </param>
+            /// <returns> The single parsed flag </returns>
+            /// <exception cref="ArgumentException"> Thrown if the string does not parse to exactly one flag </exception>
+            internal virtual char parseFlag(string rawFlag)
+            {
+                char[] parsed = ParseFlags(rawFlag);
+                if (parsed.Length == 1)
+                {
+                    return parsed[0];
+                }
+                throw new System.ArgumentException("expected only one flag, got: " + rawFlag);
+            }
+
+            /// <summary>
+            /// Parses the given string into zero or more flags
+            /// </summary>
+            /// <param name="rawFlags"> String to parse into flags </param>
+            /// <returns> Parsed flags </returns>
+            internal abstract char[] ParseFlags(string rawFlags);
+        }
+
+        /// <summary>
+        /// <seealso cref="FlagParsingStrategy"/> in which every char of the string is one flag.
+        /// Can be used with both the ASCII and UTF-8 flag types.
+        /// </summary>
+        private class SimpleFlagParsingStrategy : FlagParsingStrategy
+        {
+            internal override char[] ParseFlags(string rawFlags)
+            {
+                // one flag per char, in the order written
+                char[] flags = rawFlags.ToCharArray();
+                return flags;
+            }
+        }
+
+        /// <summary>
+        /// <seealso cref="FlagParsingStrategy"/> for flags written as decimal numbers. Multiple flags are
+        /// separated by commas; each number is converted to the char with that code.
+        /// </summary>
+        private class NumFlagParsingStrategy : FlagParsingStrategy
+        {
+            internal override char[] ParseFlags(string rawFlags)
+            {
+                string[] pieces = rawFlags.Trim().Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
+                List<char> parsed = new List<char>(pieces.Length);
+
+                foreach (string piece in pieces)
+                {
+                    // note, removing the trailing X/leading I for nepali... what is the rule here?! 
+                    string digits = Regex.Replace(piece, "[^0-9]", "");
+                    // note, ignoring empty flags (this happens in danish, for example)
+                    if (digits.Length > 0)
+                    {
+                        parsed.Add((char)int.Parse(digits, CultureInfo.InvariantCulture));
+                    }
+                }
+
+                return parsed.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Implementation of <seealso cref="FlagParsingStrategy"/> that assumes each flag is encoded as two ASCII characters whose codes
+        /// must be combined into a single character.
+        /// 
+        /// TODO (rmuir) test
+        /// </summary>
+        private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy
+        {
+            internal override char[] ParseFlags(string rawFlags)
+            {
+                if (rawFlags.Length == 0)
+                {
+                    return new char[0];
+                }
+
+                StringBuilder builder = new StringBuilder();
+                if (rawFlags.Length % 2 == 1)
+                {
+                    throw new System.ArgumentException("Invalid flags (should be even number of characters): " + rawFlags);

<TRUNCATED>

Mime
View raw message