Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id A772A200B92 for ; Wed, 24 Aug 2016 01:17:59 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id A5ECB160ABF; Tue, 23 Aug 2016 23:17:59 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id CC270160AC7 for ; Wed, 24 Aug 2016 01:17:55 +0200 (CEST) Received: (qmail 42892 invoked by uid 500); 23 Aug 2016 23:17:54 -0000 Mailing-List: contact commits-help@lucenenet.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: lucene-net-dev@lucenenet.apache.org Delivered-To: mailing list commits@lucenenet.apache.org Received: (qmail 42662 invoked by uid 99); 23 Aug 2016 23:17:54 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 23 Aug 2016 23:17:54 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 6C995E1777; Tue, 23 Aug 2016 23:17:54 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: synhershko@apache.org To: commits@lucenenet.apache.org Date: Tue, 23 Aug 2016 23:17:59 -0000 Message-Id: In-Reply-To: <7ea169ebc34c46fb8a7c1c3199804cae@git.apache.org> References: <7ea169ebc34c46fb8a7c1c3199804cae@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [06/50] [abbrv] lucenenet git commit: Ported Analysis.Hunspell + tests archived-at: Tue, 23 Aug 2016 23:17:59 -0000 http://git-wip-us.apache.org/repos/asf/lucenenet/blob/e4d9f44c/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs ---------------------------------------------------------------------- diff 
--git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs index ff6f4e2..05c2a26 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs @@ -1,12 +1,19 @@ -using System; -using System.Diagnostics; +using Lucene.Net.Store; +using Lucene.Net.Support; +using Lucene.Net.Util; +using Lucene.Net.Util.Automaton; +using Lucene.Net.Util.Fst; +using System; using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.IO; using System.Text; +using System.Text.RegularExpressions; -namespace org.apache.lucene.analysis.hunspell +namespace Lucene.Net.Analysis.Hunspell { - - /* + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -23,1213 +30,1155 @@ namespace org.apache.lucene.analysis.hunspell * limitations under the License. 
*/ - using ByteArrayDataOutput = org.apache.lucene.store.ByteArrayDataOutput; - using ArrayUtil = org.apache.lucene.util.ArrayUtil; - using BytesRef = org.apache.lucene.util.BytesRef; - using BytesRefHash = org.apache.lucene.util.BytesRefHash; - using CharsRef = org.apache.lucene.util.CharsRef; - using IOUtils = org.apache.lucene.util.IOUtils; - using IntsRef = org.apache.lucene.util.IntsRef; - using OfflineSorter = org.apache.lucene.util.OfflineSorter; - using ByteSequencesReader = org.apache.lucene.util.OfflineSorter.ByteSequencesReader; - using ByteSequencesWriter = org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; - using CharacterRunAutomaton = org.apache.lucene.util.automaton.CharacterRunAutomaton; - using RegExp = org.apache.lucene.util.automaton.RegExp; - using Builder = org.apache.lucene.util.fst.Builder; - using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs; - using FST = org.apache.lucene.util.fst.FST; - using IntSequenceOutputs = org.apache.lucene.util.fst.IntSequenceOutputs; - using Outputs = org.apache.lucene.util.fst.Outputs; - using Util = org.apache.lucene.util.fst.Util; - - - /// - /// In-memory structure for the dictionary (.dic) and affix (.aff) - /// data of a hunspell dictionary. 
- /// - public class Dictionary - { - - internal static readonly char[] NOFLAGS = new char[0]; - - private const string ALIAS_KEY = "AF"; - private const string PREFIX_KEY = "PFX"; - private const string SUFFIX_KEY = "SFX"; - private const string FLAG_KEY = "FLAG"; - private const string COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES"; - private const string CIRCUMFIX_KEY = "CIRCUMFIX"; - private const string IGNORE_KEY = "IGNORE"; - private const string ICONV_KEY = "ICONV"; - private const string OCONV_KEY = "OCONV"; - - private const string NUM_FLAG_TYPE = "num"; - private const string UTF8_FLAG_TYPE = "UTF-8"; - private const string LONG_FLAG_TYPE = "long"; - - // TODO: really for suffixes we should reverse the automaton and run them backwards - private const string PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; - private const string SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - - internal FST prefixes; - internal FST suffixes; - - // all condition checks used by prefixes and suffixes. these are typically re-used across - // many affix stripping rules. so these are deduplicated, to save RAM. - internal List patterns = new List(); - - // the entries in the .dic file, mapping to their set of flags. - // the fst output is the ordinal list for flagLookup - internal FST words; - // the list of unique flagsets (wordforms). theoretically huge, but practically - // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. - internal BytesRefHash flagLookup = new BytesRefHash(); - - // the list of unique strip affixes. 
- internal char[] stripData; - internal int[] stripOffsets; - - // 8 bytes per affix - internal sbyte[] affixData = new sbyte[64]; - private int currentAffix = 0; - - private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy - - private string[] aliases; - private int aliasCount = 0; - - private readonly File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable? - - internal bool ignoreCase; - internal bool complexPrefixes; - internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping - - internal int circumfix = -1; // circumfix flag, or -1 if one is not defined - - // ignored characters (dictionary, affix, inputs) - private char[] ignore; - - // FSTs used for ICONV/OCONV, output ord pointing to replacement text - internal FST iconv; - internal FST oconv; - - internal bool needsInputCleaning; - internal bool needsOutputCleaning; - - /// - /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix - /// and dictionary files. - /// You have to close the provided InputStreams yourself. - /// - /// InputStream for reading the hunspell affix file (won't be closed). - /// InputStream for reading the hunspell dictionary file (won't be closed). - /// Can be thrown while reading from the InputStreams - /// Can be thrown if the content of the files does not meet expected formats -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: public Dictionary(java.io.InputStream affix, java.io.InputStream dictionary) throws java.io.IOException, java.text.ParseException - public Dictionary(InputStream affix, InputStream dictionary) : this(affix, Collections.singletonList(dictionary), false) - { - } - - /// - /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix - /// and dictionary files. 
- /// You have to close the provided InputStreams yourself. - /// - /// InputStream for reading the hunspell affix file (won't be closed). - /// InputStream for reading the hunspell dictionary files (won't be closed). - /// Can be thrown while reading from the InputStreams - /// Can be thrown if the content of the files does not meet expected formats -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: public Dictionary(java.io.InputStream affix, java.util.List dictionaries, boolean ignoreCase) throws java.io.IOException, java.text.ParseException - public Dictionary(InputStream affix, IList dictionaries, bool ignoreCase) - { - this.ignoreCase = ignoreCase; - this.needsInputCleaning = ignoreCase; - this.needsOutputCleaning = false; // set if we have an OCONV - flagLookup.add(new BytesRef()); // no flags -> ord 0 - - File aff = File.createTempFile("affix", "aff", tempDir); - OutputStream @out = new BufferedOutputStream(new FileOutputStream(aff)); - InputStream aff1 = null; - InputStream aff2 = null; - try - { - // copy contents of affix stream to temp file -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final byte [] buffer = new byte [1024 * 8]; - sbyte[] buffer = new sbyte [1024 * 8]; - int len; - while ((len = affix.read(buffer)) > 0) - { - @out.write(buffer, 0, len); - } - @out.close(); - - // pass 1: get encoding - aff1 = new BufferedInputStream(new FileInputStream(aff)); - string encoding = getDictionaryEncoding(aff1); - - // pass 2: parse affixes - CharsetDecoder decoder = getJavaEncoding(encoding); - aff2 = new BufferedInputStream(new FileInputStream(aff)); - readAffixFile(aff2, decoder); - - // read dictionary entries - IntSequenceOutputs o = IntSequenceOutputs.Singleton; - Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFiles(dictionaries, decoder, b); - words = b.finish(); - aliases = null; // no longer needed - } - finally - { - 
IOUtils.closeWhileHandlingException(@out, aff1, aff2); - aff.delete(); - } - } - - /// - /// Looks up Hunspell word forms from the dictionary - /// - internal virtual IntsRef lookupWord(char[] word, int offset, int length) - { - return lookup(words, word, offset, length); - } - - /// - /// Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length - /// - /// Char array to generate the String from - /// Offset in the char array that the String starts at - /// Length from the offset that the String is - /// List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found - internal virtual IntsRef lookupPrefix(char[] word, int offset, int length) - { - return lookup(prefixes, word, offset, length); - } - - /// - /// Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length - /// - /// Char array to generate the String from - /// Offset in the char array that the String starts at - /// Length from the offset that the String is - /// List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found - internal virtual IntsRef lookupSuffix(char[] word, int offset, int length) - { - return lookup(suffixes, word, offset, length); - } - - // TODO: this is pretty stupid, considering how the stemming algorithm works - // we can speed it up to be significantly faster! 
- internal virtual IntsRef lookup(FST fst, char[] word, int offset, int length) - { - if (fst == null) - { - return null; - } -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader bytesReader = fst.getBytesReader(); - FST.BytesReader bytesReader = fst.BytesReader; -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc arc = fst.getFirstArc(new org.apache.lucene.util.fst.FST.Arc()); - FST.Arc arc = fst.getFirstArc(new FST.Arc()); - // Accumulate output as we go -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); - IntsRef NO_OUTPUT = fst.outputs.NoOutput; - IntsRef output = NO_OUTPUT; - - int l = offset + length; - try - { - for (int i = offset, cp = 0; i < l; i += char.charCount(cp)) - { - cp = char.codePointAt(word, i, l); - if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) - { - return null; - } - else if (arc.output != NO_OUTPUT) - { - output = fst.outputs.add(output, arc.output); - } - } - if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) - { - return null; - } - else if (arc.output != NO_OUTPUT) - { - return fst.outputs.add(output, arc.output); - } - else - { - return output; - } - } - catch (IOException bogus) - { - throw new Exception(bogus); - } - } - - /// - /// Reads the affix file through the provided InputStream, building up the prefix and suffix maps - /// - /// InputStream to read the content of the affix file from - /// CharsetDecoder to decode the content of the file - /// Can be thrown while reading from the InputStream -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: private void readAffixFile(java.io.InputStream affixStream, java.nio.charset.CharsetDecoder decoder) throws 
java.io.IOException, java.text.ParseException - private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) - { - SortedDictionary> prefixes = new SortedDictionary>(); - SortedDictionary> suffixes = new SortedDictionary>(); - IDictionary seenPatterns = new Dictionary(); - - // zero condition -> 0 ord - seenPatterns[".*"] = 0; - patterns.Add(null); - - // zero strip -> 0 ord - IDictionary seenStrips = new LinkedHashMap(); - seenStrips[""] = 0; - - LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); - string line = null; - while ((line = reader.readLine()) != null) - { - // ignore any BOM marker on first line - if (reader.LineNumber == 1 && line.StartsWith("\uFEFF", StringComparison.Ordinal)) - { - line = line.Substring(1); - } - if (line.StartsWith(ALIAS_KEY, StringComparison.Ordinal)) - { - parseAlias(line); - } - else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) - { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); - } - else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal)) - { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); - } - else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal)) - { - // Assume that the FLAG line comes before any prefix or suffixes - // Store the strategy so it can be used when parsing the dic file - flagParsingStrategy = getFlagParsingStrategy(line); - } - else if (line.Equals(COMPLEXPREFIXES_KEY)) - { - complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix - } - else if (line.StartsWith(CIRCUMFIX_KEY, StringComparison.Ordinal)) - { - string[] parts = line.Split("\\s+", true); - if (parts.Length != 2) - { - throw new ParseException("Illegal CIRCUMFIX declaration", reader.LineNumber); - } - circumfix = flagParsingStrategy.parseFlag(parts[1]); - } - else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal)) - { - 
string[] parts = line.Split("\\s+", true); - if (parts.Length != 2) - { - throw new ParseException("Illegal IGNORE declaration", reader.LineNumber); - } - ignore = parts[1].ToCharArray(); - Arrays.sort(ignore); - needsInputCleaning = true; - } - else if (line.StartsWith(ICONV_KEY, StringComparison.Ordinal) || line.StartsWith(OCONV_KEY, StringComparison.Ordinal)) - { - string[] parts = line.Split("\\s+", true); - string type = parts[0]; - if (parts.Length != 2) - { - throw new ParseException("Illegal " + type + " declaration", reader.LineNumber); - } - int num = int.Parse(parts[1]); - FST res = parseConversions(reader, num); - if (type.Equals("ICONV")) - { - iconv = res; - needsInputCleaning |= iconv != null; - } - else - { - oconv = res; - needsOutputCleaning |= oconv != null; - } - } - } - - this.prefixes = affixFST(prefixes); - this.suffixes = affixFST(suffixes); - - int totalChars = 0; - foreach (string strip in seenStrips.Keys) - { - totalChars += strip.Length; - } - stripData = new char[totalChars]; - stripOffsets = new int[seenStrips.Count + 1]; - int currentOffset = 0; - int currentIndex = 0; - foreach (string strip in seenStrips.Keys) - { - stripOffsets[currentIndex++] = currentOffset; - strip.CopyTo(0, stripData, currentOffset, strip.Length - 0); - currentOffset += strip.Length; - } - Debug.Assert(currentIndex == seenStrips.Count); - stripOffsets[currentIndex] = currentOffset; - } - -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: private org.apache.lucene.util.fst.FST affixFST(java.util.TreeMap> affixes) throws java.io.IOException - private FST affixFST(SortedDictionary> affixes) - { - IntSequenceOutputs outputs = IntSequenceOutputs.Singleton; - Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, outputs); - - IntsRef scratch = new IntsRef(); - foreach (KeyValuePair> entry in affixes.SetOfKeyValuePairs()) - { - Util.toUTF32(entry.Key, scratch); - IList entries = entry.Value; - IntsRef output = new 
IntsRef(entries.Count); - foreach (char? c in entries) - { - output.ints[output.length++] = c; - } - builder.add(scratch, output); - } - return builder.finish(); - } - - /// - /// Parses a specific affix rule putting the result into the provided affix map - /// - /// Map where the result of the parsing will be put - /// Header line of the affix rule - /// BufferedReader to read the content of the rule from - /// pattern to be used to generate the condition regex - /// pattern - /// map from condition -> index of patterns, for deduplication. - /// Can be thrown while reading the rule -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: private void parseAffix(java.util.TreeMap> affixes, String header, java.io.LineNumberReader reader, String conditionPattern, java.util.Map seenPatterns, java.util.Map seenStrips) throws java.io.IOException, java.text.ParseException - private void parseAffix(SortedDictionary> affixes, string header, LineNumberReader reader, string conditionPattern, IDictionary seenPatterns, IDictionary seenStrips) - { - - BytesRef scratch = new BytesRef(); - StringBuilder sb = new StringBuilder(); - string[] args = header.Split("\\s+", true); - - bool crossProduct = args[2].Equals("Y"); - - int numLines = int.Parse(args[3]); - affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); - ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); - - for (int i = 0; i < numLines; i++) - { - Debug.Assert(affixWriter.Position == currentAffix << 3); - string line = reader.readLine(); - string[] ruleArgs = line.Split("\\s+", true); - - // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] - // condition is optional - if (ruleArgs.Length < 4) - { - throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.LineNumber); - } - - char flag = 
flagParsingStrategy.parseFlag(ruleArgs[1]); - string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2]; - string affixArg = ruleArgs[3]; - char[] appendFlags = null; - - int flagSep = affixArg.LastIndexOf('/'); - if (flagSep != -1) - { - string flagPart = affixArg.Substring(flagSep + 1); - affixArg = affixArg.Substring(0, flagSep); - - if (aliasCount > 0) - { - flagPart = getAliasValue(int.Parse(flagPart)); - } - - appendFlags = flagParsingStrategy.parseFlags(flagPart); - Arrays.sort(appendFlags); - twoStageAffix = true; - } - - // TODO: add test and fix zero-affix handling! - - string condition = ruleArgs.Length > 4 ? ruleArgs[4] : "."; - // at least the gascon affix file has this issue - if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal)) - { - condition = condition + "]"; - } - // "dash hasn't got special meaning" (we must escape it) - if (condition.IndexOf('-') >= 0) - { - condition = condition.Replace("-", "\\-"); - } - -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final String regex; - string regex; - if (".".Equals(condition)) - { - regex = ".*"; // Zero condition is indicated by dot - } - else if (condition.Equals(strip)) - { - regex = ".*"; // TODO: optimize this better: - // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! - // but this is complicated... - } - else - { - regex = string.format(Locale.ROOT, conditionPattern, condition); - } - - // deduplicate patterns - int? patternIndex = seenPatterns[regex]; - if (patternIndex == null) - { - patternIndex = patterns.Count; - if (patternIndex > short.MaxValue) - { - throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org"); - } - seenPatterns[regex] = patternIndex; - CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).toAutomaton()); - patterns.Add(pattern); - } - - int? 
stripOrd = seenStrips[strip]; - if (stripOrd == null) - { - stripOrd = seenStrips.Count; - seenStrips[strip] = stripOrd; - if (stripOrd > Char.MaxValue) - { - throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org"); - } - } - - if (appendFlags == null) - { - appendFlags = NOFLAGS; - } - - encodeFlags(scratch, appendFlags); - int appendFlagsOrd = flagLookup.add(scratch); - if (appendFlagsOrd < 0) - { - // already exists in our hash - appendFlagsOrd = (-appendFlagsOrd) - 1; - } - else if (appendFlagsOrd > short.MaxValue) - { - // this limit is probably flexible, but its a good sanity check too - throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org"); - } - - affixWriter.writeShort((short)flag); - affixWriter.writeShort((int)(short)stripOrd); - // encode crossProduct into patternIndex - int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0); - affixWriter.writeShort((short)patternOrd); - affixWriter.writeShort((short)appendFlagsOrd); - - if (needsInputCleaning) - { - CharSequence cleaned = cleanInput(affixArg, sb); - affixArg = cleaned.ToString(); - } - - IList list = affixes[affixArg]; - if (list == null) - { - list = new List<>(); - affixes[affixArg] = list; - } - - list.Add((char)currentAffix); - currentAffix++; - } - } - -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: private org.apache.lucene.util.fst.FST parseConversions(java.io.LineNumberReader reader, int num) throws java.io.IOException, java.text.ParseException - private FST parseConversions(LineNumberReader reader, int num) - { - IDictionary mappings = new SortedDictionary(); - - for (int i = 0; i < num; i++) - { - string line = reader.readLine(); - string[] parts = line.Split("\\s+", true); - if (parts.Length != 3) - { - throw new ParseException("invalid syntax: " + line, reader.LineNumber); - } - if (mappings.put(parts[1], 
parts[2]) != null) - { - throw new System.InvalidOperationException("duplicate mapping specified for: " + parts[1]); - } - } - - Outputs outputs = CharSequenceOutputs.Singleton; - Builder builder = new Builder(FST.INPUT_TYPE.BYTE2, outputs); - IntsRef scratchInts = new IntsRef(); - foreach (KeyValuePair entry in mappings.SetOfKeyValuePairs()) - { - Util.toUTF16(entry.Key, scratchInts); - builder.add(scratchInts, new CharsRef(entry.Value)); - } - - return builder.finish(); - } - - /// - /// pattern accepts optional BOM + SET + any whitespace - internal static readonly Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+"); - - /// - /// Parses the encoding specified in the affix file readable through the provided InputStream - /// - /// InputStream for reading the affix file - /// Encoding specified in the affix file - /// Can be thrown while reading from the InputStream - /// Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: static String getDictionaryEncoding(java.io.InputStream affix) throws java.io.IOException, java.text.ParseException - internal static string getDictionaryEncoding(InputStream affix) - { -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final StringBuilder encoding = new StringBuilder(); - StringBuilder encoding = new StringBuilder(); - for (;;) - { - encoding.Length = 0; - int ch; - while ((ch = affix.read()) >= 0) - { - if (ch == '\n') - { - break; - } - if (ch != '\r') - { - encoding.Append((char)ch); - } - } - if (encoding.Length == 0 || encoding[0] == '#' || encoding.ToString().Trim().Length == 0) - { - // this test only at the end as ineffective but would allow lines only containing spaces: - if (ch < 0) - { - throw new ParseException("Unexpected end of affix file.", 0); - } - continue; - } - Matcher matcher = 
ENCODING_PATTERN.matcher(encoding); - if (matcher.find()) - { - int last = matcher.end(); - return encoding.Substring(last).Trim(); - } - } - } - - internal static readonly IDictionary CHARSET_ALIASES; - static Dictionary() - { - IDictionary m = new Dictionary(); - m["microsoft-cp1251"] = "windows-1251"; - m["TIS620-2533"] = "TIS-620"; - CHARSET_ALIASES = Collections.unmodifiableMap(m); - } - - /// - /// Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and - /// MICROSOFT-CP1251 etc are allowed... - /// - /// Encoding to retrieve the CharsetDecoder for - /// CharSetDecoder for the given encoding - private CharsetDecoder getJavaEncoding(string encoding) - { - if ("ISO8859-14".Equals(encoding)) - { - return new ISO8859_14Decoder(); - } - string canon = CHARSET_ALIASES[encoding]; - if (canon != null) - { - encoding = canon; - } - Charset charset = Charset.forName(encoding); - return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE); - } - - /// - /// Determines the appropriate based on the FLAG definition line taken from the affix file - /// - /// Line containing the flag information - /// FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition - internal static FlagParsingStrategy getFlagParsingStrategy(string flagLine) - { - string[] parts = flagLine.Split("\\s+", true); - if (parts.Length != 2) - { - throw new System.ArgumentException("Illegal FLAG specification: " + flagLine); - } - string flagType = parts[1]; - - if (NUM_FLAG_TYPE.Equals(flagType)) - { - return new NumFlagParsingStrategy(); - } - else if (UTF8_FLAG_TYPE.Equals(flagType)) - { - return new SimpleFlagParsingStrategy(); - } - else if (LONG_FLAG_TYPE.Equals(flagType)) - { - return new DoubleASCIIFlagParsingStrategy(); - } - - throw new System.ArgumentException("Unknown flag type: " + flagType); - } - - internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping - - 
internal virtual string unescapeEntry(string entry) - { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < entry.Length; i++) - { - char ch = entry[i]; - if (ch == '\\' && i + 1 < entry.Length) - { - sb.Append(entry[i + 1]); - i++; - } - else if (ch == '/') - { - sb.Append(FLAG_SEPARATOR); - } - else - { - sb.Append(ch); - } - } - return sb.ToString(); - } - - /// - /// Reads the dictionary file through the provided InputStreams, building up the words map - /// - /// InputStreams to read the dictionary file through - /// CharsetDecoder used to decode the contents of the file - /// Can be thrown while reading from the file -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: private void readDictionaryFiles(java.util.List dictionaries, java.nio.charset.CharsetDecoder decoder, org.apache.lucene.util.fst.Builder words) throws java.io.IOException - private void readDictionaryFiles(IList dictionaries, CharsetDecoder decoder, Builder words) - { - BytesRef flagsScratch = new BytesRef(); - IntsRef scratchInts = new IntsRef(); - - StringBuilder sb = new StringBuilder(); - - File unsorted = File.createTempFile("unsorted", "dat", tempDir); - OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted); - bool success = false; - try - { - foreach (InputStream dictionary in dictionaries) - { - BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); - string line = lines.readLine(); // first line is number of entries (approximately, sometimes) - - while ((line = lines.readLine()) != null) - { - line = unescapeEntry(line); - if (needsInputCleaning) - { - int flagSep = line.LastIndexOf(FLAG_SEPARATOR); - if (flagSep == -1) - { - CharSequence cleansed = cleanInput(line, sb); - writer.write(cleansed.ToString().GetBytes(StandardCharsets.UTF_8)); - } - else - { - string text = line.Substring(0, flagSep); - CharSequence cleansed = cleanInput(text, sb); - if 
(cleansed != sb) - { - sb.Length = 0; - sb.Append(cleansed); - } - sb.Append(line.Substring(flagSep)); - writer.write(sb.ToString().GetBytes(StandardCharsets.UTF_8)); - } - } - else - { - writer.write(line.GetBytes(StandardCharsets.UTF_8)); - } - } - } - success = true; - } - finally - { - if (success) - { - IOUtils.close(writer); - } - else - { - IOUtils.closeWhileHandlingException(writer); - } - } - File sorted = File.createTempFile("sorted", "dat", tempDir); - - OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this)); - sorter.sort(unsorted, sorted); - unsorted.delete(); - - OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted); - BytesRef scratchLine = new BytesRef(); - - // TODO: the flags themselves can be double-chars (long) or also numeric - // either way the trick is to encode them as char... but they must be parsed differently - - string currentEntry = null; - IntsRef currentOrds = new IntsRef(); - - string line; - while (reader.read(scratchLine)) - { - line = scratchLine.utf8ToString(); - string entry; - char[] wordForm; - - int flagSep = line.LastIndexOf(FLAG_SEPARATOR); - if (flagSep == -1) - { - wordForm = NOFLAGS; - entry = line; - } - else - { - // note, there can be comments (morph description) after a flag. - // we should really look for any whitespace: currently just tab and space - int end = line.IndexOf('\t', flagSep); - if (end == -1) - { - end = line.Length; - } - int end2 = line.IndexOf(' ', flagSep); - if (end2 == -1) - { - end2 = line.Length; - } - end = Math.Min(end, end2); - - string flagPart = StringHelperClass.SubstringSpecial(line, flagSep + 1, end); - if (aliasCount > 0) - { - flagPart = getAliasValue(int.Parse(flagPart)); - } - - wordForm = flagParsingStrategy.parseFlags(flagPart); - Arrays.sort(wordForm); - entry = line.Substring(0, flagSep); - } - - int cmp = currentEntry == null ? 
1 : entry.CompareTo(currentEntry); - if (cmp < 0) - { - throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry); - } - else - { - encodeFlags(flagsScratch, wordForm); - int ord = flagLookup.add(flagsScratch); - if (ord < 0) - { - // already exists in our hash - ord = (-ord) - 1; - } - // finalize current entry, and switch "current" if necessary - if (cmp > 0 && currentEntry != null) - { - Util.toUTF32(currentEntry, scratchInts); - words.add(scratchInts, currentOrds); - } - // swap current - if (cmp > 0 || currentEntry == null) - { - currentEntry = entry; - currentOrds = new IntsRef(); // must be this way - } - currentOrds.grow(currentOrds.length + 1); - currentOrds.ints[currentOrds.length++] = ord; - } - } - - // finalize last entry - Util.toUTF32(currentEntry, scratchInts); - words.add(scratchInts, currentOrds); - - reader.close(); - sorted.delete(); - } - - private class ComparatorAnonymousInnerClassHelper : IComparer - { - private readonly Dictionary outerInstance; - - public ComparatorAnonymousInnerClassHelper(Dictionary outerInstance) - { - this.outerInstance = outerInstance; - scratch1 = new BytesRef(); - scratch2 = new BytesRef(); - } - - internal BytesRef scratch1; - internal BytesRef scratch2; - - public virtual int Compare(BytesRef o1, BytesRef o2) - { - scratch1.bytes = o1.bytes; - scratch1.offset = o1.offset; - scratch1.length = o1.length; - - for (int i = scratch1.length - 1; i >= 0; i--) - { - if (scratch1.bytes[scratch1.offset + i] == outerInstance.FLAG_SEPARATOR) - { - scratch1.length = i; - break; - } - } - - scratch2.bytes = o2.bytes; - scratch2.offset = o2.offset; - scratch2.length = o2.length; - - for (int i = scratch2.length - 1; i >= 0; i--) - { - if (scratch2.bytes[scratch2.offset + i] == outerInstance.FLAG_SEPARATOR) - { - scratch2.length = i; - break; - } - } - - int cmp = scratch1.compareTo(scratch2); - if (cmp == 0) - { - // tie break on whole row - return o1.compareTo(o2); - } - else - { - return cmp; - } - } 
- } - - internal static char[] decodeFlags(BytesRef b) - { - if (b.length == 0) - { - return CharsRef.EMPTY_CHARS; - } - int len = (int)((uint)b.length >> 1); - char[] flags = new char[len]; - int upto = 0; - int end = b.offset + b.length; - for (int i = b.offset; i < end; i += 2) - { - flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i + 1] & 0xff)); - } - return flags; - } - - internal static void encodeFlags(BytesRef b, char[] flags) - { - int len = flags.Length << 1; - b.grow(len); - b.length = len; - int upto = b.offset; - for (int i = 0; i < flags.Length; i++) - { - int flag = flags[i]; - b.bytes[upto++] = unchecked((sbyte)((flag >> 8) & 0xff)); - b.bytes[upto++] = unchecked((sbyte)(flag & 0xff)); - } - } - - private void parseAlias(string line) - { - string[] ruleArgs = line.Split("\\s+", true); - if (aliases == null) - { - //first line should be the aliases count -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final int count = Integer.parseInt(ruleArgs[1]); - int count = int.Parse(ruleArgs[1]); - aliases = new string[count]; - } - else - { - // an alias can map to no flags - string aliasValue = ruleArgs.Length == 1 ? 
"" : ruleArgs[1]; - aliases[aliasCount++] = aliasValue; - } - } - - private string getAliasValue(int id) - { - try - { - return aliases[id - 1]; - } - catch (System.IndexOutOfRangeException ex) - { - throw new System.ArgumentException("Bad flag alias number:" + id, ex); - } - } - - /// - /// Abstraction of the process of parsing flags taken from the affix and dic files - /// - internal abstract class FlagParsingStrategy - { - - /// - /// Parses the given String into a single flag - /// - /// String to parse into a flag - /// Parsed flag - internal virtual char parseFlag(string rawFlag) - { - char[] flags = parseFlags(rawFlag); - if (flags.Length != 1) - { - throw new System.ArgumentException("expected only one flag, got: " + rawFlag); - } - return flags[0]; - } - - /// - /// Parses the given String into multiple flags - /// - /// String to parse into flags - /// Parsed flags - internal abstract char[] parseFlags(string rawFlags); - } - - /// - /// Simple implementation of that treats the chars in each String as a individual flags. - /// Can be used with both the ASCII and UTF-8 flag types. - /// - private class SimpleFlagParsingStrategy : FlagParsingStrategy - { - public override char[] parseFlags(string rawFlags) - { - return rawFlags.ToCharArray(); - } - } - - /// - /// Implementation of that assumes each flag is encoded in its numerical form. In the case - /// of multiple flags, each number is separated by a comma. - /// - private class NumFlagParsingStrategy : FlagParsingStrategy - { - public override char[] parseFlags(string rawFlags) - { - string[] rawFlagParts = rawFlags.Trim().Split(",", true); - char[] flags = new char[rawFlagParts.Length]; - int upto = 0; - - for (int i = 0; i < rawFlagParts.Length; i++) - { - // note, removing the trailing X/leading I for nepali... what is the rule here?! 
- string replacement = rawFlagParts[i].replaceAll("[^0-9]", ""); - // note, ignoring empty flags (this happens in danish, for example) - if (replacement.Length == 0) - { - continue; - } - flags[upto++] = (char) int.Parse(replacement); - } - - if (upto < flags.Length) - { - flags = Arrays.copyOf(flags, upto); - } - return flags; - } - } - - /// - /// Implementation of that assumes each flag is encoded as two ASCII characters whose codes - /// must be combined into a single character. - /// - /// TODO (rmuir) test - /// - private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy - { - - public override char[] parseFlags(string rawFlags) - { - if (rawFlags.Length == 0) - { - return new char[0]; - } - - StringBuilder builder = new StringBuilder(); - if (rawFlags.Length % 2 == 1) - { - throw new System.ArgumentException("Invalid flags (should be even number of characters): " + rawFlags); - } - for (int i = 0; i < rawFlags.Length; i += 2) - { - char cookedFlag = (char)((int) rawFlags[i] + (int) rawFlags[i + 1]); - builder.Append(cookedFlag); - } - - char[] flags = new char[builder.Length]; - builder.getChars(0, builder.Length, flags, 0); - return flags; - } - } - - internal static bool hasFlag(char[] flags, char flag) - { - return Arrays.binarySearch(flags, flag) >= 0; - } - - internal virtual CharSequence cleanInput(CharSequence input, StringBuilder reuse) - { - reuse.Length = 0; - - for (int i = 0; i < input.length(); i++) - { - char ch = input.charAt(i); - - if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) - { - continue; - } - - if (ignoreCase && iconv == null) - { - // if we have no input conversion mappings, do this on-the-fly - ch = char.ToLower(ch); - } - - reuse.Append(ch); - } - - if (iconv != null) - { - try - { - applyMappings(iconv, reuse); - } - catch (IOException bogus) - { - throw new Exception(bogus); - } - if (ignoreCase) - { - for (int i = 0; i < reuse.Length; i++) - { - reuse[i] = char.ToLower(reuse[i]); - } - } - } - - return 
reuse; - } - - // TODO: this could be more efficient! -//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: -//ORIGINAL LINE: static void applyMappings(org.apache.lucene.util.fst.FST fst, StringBuilder sb) throws java.io.IOException - internal static void applyMappings(FST fst, StringBuilder sb) - { -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader bytesReader = fst.getBytesReader(); - FST.BytesReader bytesReader = fst.BytesReader; -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc firstArc = fst.getFirstArc(new org.apache.lucene.util.fst.FST.Arc()); - FST.Arc firstArc = fst.getFirstArc(new FST.Arc()); -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.CharsRef NO_OUTPUT = fst.outputs.getNoOutput(); - CharsRef NO_OUTPUT = fst.outputs.NoOutput; - - // temporary stuff -//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': -//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc arc = new org.apache.lucene.util.fst.FST.Arc<>(); - FST.Arc arc = new FST.Arc(); - int longestMatch; - CharsRef longestOutput; - - for (int i = 0; i < sb.Length; i++) - { - arc.copyFrom(firstArc); - CharsRef output = NO_OUTPUT; - longestMatch = -1; - longestOutput = null; - - for (int j = i; j < sb.Length; j++) - { - char ch = sb[j]; - if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) - { - break; - } - else - { - output = fst.outputs.add(output, arc.output); - } - if (arc.Final) - { - longestOutput = fst.outputs.add(output, arc.nextFinalOutput); - longestMatch = j; - } - } - - if (longestMatch >= 0) - { - sb.Remove(i, longestMatch + 1 - i); - sb.Insert(i, longestOutput); - i += (longestOutput.length - 1); - } - } - } - } - + /// + /// In-memory structure for the dictionary 
(.dic) and affix (.aff) + /// data of a hunspell dictionary. + /// + public class Dictionary + { + internal static readonly char[] NOFLAGS = new char[0]; + + private const string ALIAS_KEY = "AF"; + private const string PREFIX_KEY = "PFX"; + private const string SUFFIX_KEY = "SFX"; + private const string FLAG_KEY = "FLAG"; + private const string COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES"; + private const string CIRCUMFIX_KEY = "CIRCUMFIX"; + private const string IGNORE_KEY = "IGNORE"; + private const string ICONV_KEY = "ICONV"; + private const string OCONV_KEY = "OCONV"; + + private const string NUM_FLAG_TYPE = "num"; + private const string UTF8_FLAG_TYPE = "UTF-8"; + private const string LONG_FLAG_TYPE = "long"; + + // TODO: really for suffixes we should reverse the automaton and run them backwards + private const string PREFIX_CONDITION_REGEX_PATTERN = "{0}.*"; + private const string SUFFIX_CONDITION_REGEX_PATTERN = ".*{0}"; + + internal FST prefixes; + internal FST suffixes; + + // all condition checks used by prefixes and suffixes. these are typically re-used across + // many affix stripping rules. so these are deduplicated, to save RAM. + internal List patterns = new List(); + + // the entries in the .dic file, mapping to their set of flags. + // the fst output is the ordinal list for flagLookup + internal FST words; + // the list of unique flagsets (wordforms). theoretically huge, but practically + // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. + internal BytesRefHash flagLookup = new BytesRefHash(); + + // the list of unique strip affixes. 
+ internal char[] stripData; + internal int[] stripOffsets; + + // 8 bytes per affix + internal byte[] affixData = new byte[64]; + private int currentAffix = 0; + + private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy + + private string[] aliases; + private int aliasCount = 0; + + private readonly DirectoryInfo tempDir = OfflineSorter.DefaultTempDir(); // TODO: make this configurable? + + internal bool ignoreCase; + internal bool complexPrefixes; + internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping + + internal int circumfix = -1; // circumfix flag, or -1 if one is not defined + + // ignored characters (dictionary, affix, inputs) + private char[] ignore; + + // FSTs used for ICONV/OCONV, output ord pointing to replacement text + internal FST iconv; + internal FST oconv; + + internal bool needsInputCleaning; + internal bool needsOutputCleaning; + + // LUCENENET: Added so we can get better performance than creating the regex in every tight loop. + private static Regex whitespacePattern = new Regex("\\s+", RegexOptions.Compiled); + + /// + /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix + /// and dictionary files. + /// You have to close the provided InputStreams yourself. + /// + /// InputStream for reading the hunspell affix file (won't be closed). + /// InputStream for reading the hunspell dictionary file (won't be closed). + /// Can be thrown while reading from the InputStreams + /// Can be thrown if the content of the files does not meet expected formats + public Dictionary(Stream affix, Stream dictionary) + : this(affix, new List() { dictionary }, false) + { + } + + /// + /// Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix + /// and dictionary files. + /// You have to close the provided InputStreams yourself. 
+ /// + /// InputStream for reading the hunspell affix file (won't be closed). + /// InputStream for reading the hunspell dictionary files (won't be closed). + /// Can be thrown while reading from the InputStreams + /// Can be thrown if the content of the files does not meet expected formats + public Dictionary(Stream affix, IList dictionaries, bool ignoreCase) + { + this.ignoreCase = ignoreCase; + this.needsInputCleaning = ignoreCase; + this.needsOutputCleaning = false; // set if we have an OCONV + flagLookup.Add(new BytesRef()); // no flags -> ord 0 + + FileInfo aff = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "affix.aff")); + using (Stream @out = aff.Create()) + { + Stream aff1 = null; + Stream aff2 = null; + try + { + // copy contents of affix stream to temp file + byte[] buffer = new byte[1024 * 8]; + int len; + while ((len = affix.Read(buffer, 0, buffer.Length)) > 0) + { + @out.Write(buffer, 0, len); + } + @out.Close(); // LUCENENET: Release the file handle - we dispose @out later + + // pass 1: get encoding + aff1 = File.OpenRead(aff.FullName); + string encoding = GetDictionaryEncoding(aff1); + + // pass 2: parse affixes + Encoding decoder = GetSystemEncoding(encoding); + aff2 = File.OpenRead(aff.FullName); + ReadAffixFile(aff2, decoder); + + // read dictionary entries + IntSequenceOutputs o = IntSequenceOutputs.Singleton; + Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); + ReadDictionaryFiles(dictionaries, decoder, b); + words = b.Finish(); + aliases = null; // no longer needed + } + finally + { + IOUtils.CloseWhileHandlingException(aff1, aff2); + aff.Delete(); + } + } + } + + /// + /// Looks up Hunspell word forms from the dictionary + /// + internal virtual IntsRef LookupWord(char[] word, int offset, int length) + { + return Lookup(words, word, offset, length); + } + + /// + /// Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length + /// + /// Char array to generate 
the String from + /// Offset in the char array that the String starts at + /// Length from the offset that the String is + /// List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found + internal virtual IntsRef LookupPrefix(char[] word, int offset, int length) + { + return Lookup(prefixes, word, offset, length); + } + + /// + /// Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length + /// + /// Char array to generate the String from + /// Offset in the char array that the String starts at + /// Length from the offset that the String is + /// List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found + internal virtual IntsRef LookupSuffix(char[] word, int offset, int length) + { + return Lookup(suffixes, word, offset, length); + } + + // TODO: this is pretty stupid, considering how the stemming algorithm works + // we can speed it up to be significantly faster! 
+ internal virtual IntsRef Lookup(FST fst, char[] word, int offset, int length) + { + if (fst == null) + { + return null; + } + FST.BytesReader bytesReader = fst.BytesReader; + FST.Arc arc = fst.GetFirstArc(new FST.Arc()); + // Accumulate output as we go + IntsRef NO_OUTPUT = fst.Outputs.NoOutput; + IntsRef output = NO_OUTPUT; + + int l = offset + length; + try + { + for (int i = offset, cp = 0; i < l; i += Character.CharCount(cp)) + { + cp = Character.CodePointAt(word, i, l); + if (fst.FindTargetArc(cp, arc, arc, bytesReader) == null) + { + return null; + } + else if (arc.Output != NO_OUTPUT) + { + output = fst.Outputs.Add(output, arc.Output); + } + } + if (fst.FindTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) + { + return null; + } + else if (arc.Output != NO_OUTPUT) + { + return fst.Outputs.Add(output, arc.Output); + } + else + { + return output; + } + } + catch (IOException bogus) + { + throw new Exception(bogus.Message, bogus); + } + } + + /// + /// Reads the affix file through the provided InputStream, building up the prefix and suffix maps + /// + /// InputStream to read the content of the affix file from + /// CharsetDecoder to decode the content of the file + /// Can be thrown while reading from the InputStream + private void ReadAffixFile(Stream affixStream, Encoding decoder) + { + SortedDictionary> prefixes = new SortedDictionary>(); + SortedDictionary> suffixes = new SortedDictionary>(); + IDictionary seenPatterns = new Dictionary(); + + // zero condition -> 0 ord + seenPatterns[".*"] = 0; + patterns.Add(null); + + // zero strip -> 0 ord + IDictionary seenStrips = new Dictionary(); + seenStrips[""] = 0; + + var reader = new StreamReader(affixStream, decoder); + string line = null; + int lineNumber = 0; + while ((line = reader.ReadLine()) != null) + { + lineNumber++; + // ignore any BOM marker on first line + if (lineNumber == 1 && line.StartsWith("\uFEFF", StringComparison.Ordinal)) + { + line = line.Substring(1); + } + if 
(line.StartsWith(ALIAS_KEY, StringComparison.Ordinal)) + { + ParseAlias(line); + } + else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) + { + ParseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); + } + else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal)) + { + ParseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); + } + else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal)) + { + // Assume that the FLAG line comes before any prefix or suffixes + // Store the strategy so it can be used when parsing the dic file + flagParsingStrategy = GetFlagParsingStrategy(line); + } + else if (line.Equals(COMPLEXPREFIXES_KEY)) + { + complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix + } + else if (line.StartsWith(CIRCUMFIX_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line); + if (parts.Length != 2) + { + throw new Exception(string.Format("Illegal CIRCUMFIX declaration, line {0}", lineNumber)); + } + circumfix = flagParsingStrategy.parseFlag(parts[1]); + } + else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line); + if (parts.Length != 2) + { + throw new Exception(string.Format("Illegal IGNORE declaration, line {0}", lineNumber)); + } + ignore = parts[1].ToCharArray(); + Array.Sort(ignore); + needsInputCleaning = true; + } + else if (line.StartsWith(ICONV_KEY, StringComparison.Ordinal) || line.StartsWith(OCONV_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line); + string type = parts[0]; + if (parts.Length != 2) + { + throw new Exception(string.Format("Illegal {0} declaration, line {1}", type, lineNumber)); + } + int num = int.Parse(parts[1], CultureInfo.InvariantCulture); + FST res = ParseConversions(reader, num); + if (type.Equals("ICONV")) + { + iconv = res; + needsInputCleaning |= iconv != null; + 
} + else + { + oconv = res; + needsOutputCleaning |= oconv != null; + } + } + } + + this.prefixes = AffixFST(prefixes); + this.suffixes = AffixFST(suffixes); + + int totalChars = 0; + foreach (string strip in seenStrips.Keys) + { + totalChars += strip.Length; + } + stripData = new char[totalChars]; + stripOffsets = new int[seenStrips.Count + 1]; + int currentOffset = 0; + int currentIndex = 0; + foreach (string strip in seenStrips.Keys) + { + stripOffsets[currentIndex++] = currentOffset; + strip.CopyTo(0, stripData, currentOffset, strip.Length - 0); + currentOffset += strip.Length; + } + Debug.Assert(currentIndex == seenStrips.Count); + stripOffsets[currentIndex] = currentOffset; + } + + private FST AffixFST(SortedDictionary> affixes) + { + IntSequenceOutputs outputs = IntSequenceOutputs.Singleton; + Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, outputs); + + IntsRef scratch = new IntsRef(); + foreach (KeyValuePair> entry in affixes) + { + Lucene.Net.Util.Fst.Util.ToUTF32(entry.Key, scratch); + IList entries = entry.Value; + IntsRef output = new IntsRef(entries.Count); + foreach (char? c in entries) + { + output.Ints[output.Length++] = c.HasValue ? c.Value : 0; + } + builder.Add(scratch, output); + } + return builder.Finish(); + } + + /// + /// Parses a specific affix rule putting the result into the provided affix map + /// + /// Map where the result of the parsing will be put + /// Header line of the affix rule + /// BufferedReader to read the content of the rule from + /// pattern to be used to generate the condition regex + /// pattern + /// map from condition -> index of patterns, for deduplication. 
+ /// Can be thrown while reading the rule + private void ParseAffix(SortedDictionary> affixes, string header, TextReader reader, string conditionPattern, IDictionary seenPatterns, IDictionary seenStrips) + { + + BytesRef scratch = new BytesRef(); + StringBuilder sb = new StringBuilder(); + string[] args = whitespacePattern.Split(header); + + bool crossProduct = args[2].Equals("Y"); + + int numLines = int.Parse(args[3], CultureInfo.InvariantCulture); + affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3)); + ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); + + for (int i = 0; i < numLines; i++) + { + Debug.Assert(affixWriter.Position == currentAffix << 3); + string line = reader.ReadLine(); + string[] ruleArgs = whitespacePattern.Split(line); + + // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] + // condition is optional + if (ruleArgs.Length < 4) + { + throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader + } + + char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); + string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2]; + string affixArg = ruleArgs[3]; + char[] appendFlags = null; + + int flagSep = affixArg.LastIndexOf('/'); + if (flagSep != -1) + { + string flagPart = affixArg.Substring(flagSep + 1); + affixArg = affixArg.Substring(0, flagSep - 0); + + if (aliasCount > 0) + { + flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture)); + } + + appendFlags = flagParsingStrategy.ParseFlags(flagPart); + Array.Sort(appendFlags); + twoStageAffix = true; + } + + // TODO: add test and fix zero-affix handling! + + string condition = ruleArgs.Length > 4 ? 
ruleArgs[4] : "."; + // at least the gascon affix file has this issue + if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal)) + { + condition = condition + "]"; + } + // "dash hasn't got special meaning" (we must escape it) + if (condition.IndexOf('-') >= 0) + { + condition = condition.Replace("-", "\\-"); + } + + string regex; + if (".".Equals(condition)) + { + regex = ".*"; // Zero condition is indicated by dot + } + else if (condition.Equals(strip)) + { + regex = ".*"; // TODO: optimize this better: + // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! + // but this is complicated... + } + else + { + regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition); + } + + // deduplicate patterns + int? patternIndex = seenPatterns.ContainsKey(regex) ? seenPatterns[regex] : null; + if (patternIndex == null) + { + patternIndex = patterns.Count; + if (patternIndex > short.MaxValue) + { + throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org"); + } + seenPatterns[regex] = patternIndex; + CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton()); + patterns.Add(pattern); + } + + int? stripOrd = seenStrips.ContainsKey(strip) ? 
seenStrips[strip] : null; + if (stripOrd == null) + { + stripOrd = seenStrips.Count; + seenStrips[strip] = stripOrd; + if (stripOrd > char.MaxValue) + { + throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org"); + } + } + + if (appendFlags == null) + { + appendFlags = NOFLAGS; + } + + EncodeFlags(scratch, appendFlags); + int appendFlagsOrd = flagLookup.Add(scratch); + if (appendFlagsOrd < 0) + { + // already exists in our hash + appendFlagsOrd = (-appendFlagsOrd) - 1; + } + else if (appendFlagsOrd > short.MaxValue) + { + // this limit is probably flexible, but its a good sanity check too + throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org"); + } + + affixWriter.WriteShort((short)flag); + affixWriter.WriteShort((short)stripOrd); + // encode crossProduct into patternIndex + int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0); + affixWriter.WriteShort((short)patternOrd); + affixWriter.WriteShort((short)appendFlagsOrd); + + if (needsInputCleaning) + { + string cleaned = CleanInput(affixArg, sb); + affixArg = cleaned.ToString(); + } + + IList list = affixes.ContainsKey(affixArg) ? 
affixes[affixArg] : null; + if (list == null) + { + list = new List(); + affixes[affixArg] = list; + } + + list.Add((char)currentAffix); + currentAffix++; + } + } + + private FST ParseConversions(TextReader reader, int num) + { + IDictionary mappings = new SortedDictionary(); + + for (int i = 0; i < num; i++) + { + string line = reader.ReadLine(); + string[] parts = whitespacePattern.Split(line); + if (parts.Length != 3) + { + throw new Exception("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader + } + if (mappings.Put(parts[1], parts[2]) != null) + { + throw new System.InvalidOperationException("duplicate mapping specified for: " + parts[1]); + } + } + + Outputs outputs = CharSequenceOutputs.Singleton; + Builder builder = new Builder(FST.INPUT_TYPE.BYTE2, outputs); + IntsRef scratchInts = new IntsRef(); + foreach (KeyValuePair entry in mappings) + { + Lucene.Net.Util.Fst.Util.ToUTF16(entry.Key, scratchInts); + builder.Add(scratchInts, new CharsRef(entry.Value)); + } + + return builder.Finish(); + } + + /// + /// pattern accepts optional BOM + SET + any whitespace + internal static readonly Regex ENCODING_PATTERN = new Regex("^(\u00EF\u00BB\u00BF)?SET\\s+", RegexOptions.Compiled); + + /// + /// Parses the encoding specified in the affix file readable through the provided InputStream + /// + /// InputStream for reading the affix file + /// Encoding specified in the affix file + /// Can be thrown while reading from the InputStream + /// Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } + internal static string GetDictionaryEncoding(Stream affix) + { + StringBuilder encoding = new StringBuilder(); + for (;;) + { + encoding.Length = 0; + int ch; + while ((ch = affix.ReadByte()) > 0) + { + if (ch == '\n') + { + break; + } + if (ch != '\r') + { + encoding.Append((char)ch); + } + } + if (encoding.Length == 0 || encoding[0] == '#' || encoding.ToString().Trim().Length == 0) + { 
+ // this test only at the end as ineffective but would allow lines only containing spaces: + if (ch < 0) + { + throw new Exception("Unexpected end of affix file." /*, 0*/); + } + continue; + } + Match matcher = ENCODING_PATTERN.Match(encoding.ToString()); + if (matcher.Success) + { + int last = matcher.Index + matcher.Length; + return encoding.ToString(last, encoding.Length - last).Trim(); + } + } + } + + internal static readonly IDictionary CHARSET_ALIASES; + static Dictionary() + { + IDictionary m = new Dictionary(); + m["microsoft-cp1251"] = "windows-1251"; + m["TIS620-2533"] = "TIS-620"; + CHARSET_ALIASES = Collections.UnmodifiableMap(m); + } + + /// + /// Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and + /// MICROSOFT-CP1251 etc are allowed... + /// + /// Encoding to retrieve the CharsetDecoder for + /// CharSetDecoder for the given encoding + // LUCENENET NOTE: This was getJavaEncoding in the original + private Encoding GetSystemEncoding(string encoding) + { + if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase)) + { + return new ISO8859_14Encoding(); + } + return Encoding.GetEncoding(encoding); + } + + + /// + /// Determines the appropriate based on the FLAG definition line taken from the affix file + /// + /// Line containing the flag information + /// FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition + internal static FlagParsingStrategy GetFlagParsingStrategy(string flagLine) + { + string[] parts = whitespacePattern.Split(flagLine); + if (parts.Length != 2) + { + throw new System.ArgumentException("Illegal FLAG specification: " + flagLine); + } + string flagType = parts[1]; + + if (NUM_FLAG_TYPE.Equals(flagType)) + { + return new NumFlagParsingStrategy(); + } + else if (UTF8_FLAG_TYPE.Equals(flagType)) + { + return new SimpleFlagParsingStrategy(); + } + else if (LONG_FLAG_TYPE.Equals(flagType)) + { + return new 
DoubleASCIIFlagParsingStrategy(); + } + + throw new System.ArgumentException("Unknown flag type: " + flagType); + } + + internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping + + internal virtual string UnescapeEntry(string entry) + { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < entry.Length; i++) + { + char ch = entry[i]; + if (ch == '\\' && i + 1 < entry.Length) + { + sb.Append(entry[i + 1]); + i++; + } + else if (ch == '/') + { + sb.Append(FLAG_SEPARATOR); + } + else + { + sb.Append(ch); + } + } + return sb.ToString(); + } + + /// + /// Reads the dictionary file through the provided InputStreams, building up the words map + /// + /// InputStreams to read the dictionary file through + /// CharsetDecoder used to decode the contents of the file + /// Can be thrown while reading from the file + private void ReadDictionaryFiles(IList dictionaries, Encoding decoder, Builder words) + { + BytesRef flagsScratch = new BytesRef(); + IntsRef scratchInts = new IntsRef(); + + StringBuilder sb = new StringBuilder(); + + FileInfo unsorted = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "unsorted.dat")); + OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted); + bool success = false; + try + { + foreach (Stream dictionary in dictionaries) + { + var lines = new StreamReader(dictionary, decoder); + string line = lines.ReadLine(); // first line is number of entries (approximately, sometimes) + + while ((line = lines.ReadLine()) != null) + { + line = UnescapeEntry(line); + if (needsInputCleaning) + { + int flagSep = line.LastIndexOf(FLAG_SEPARATOR); + if (flagSep == -1) + { + string cleansed = CleanInput(line, sb); + writer.Write(cleansed.ToString().GetBytes(Encoding.UTF8)); + } + else + { + string text = line.Substring(0, flagSep - 0); + string cleansed = CleanInput(text, sb); + if (cleansed != sb.ToString()) + { + sb.Length = 0; + sb.Append(cleansed); + } + 
sb.Append(line.Substring(flagSep)); + writer.Write(sb.ToString().GetBytes(Encoding.UTF8)); + } + } + else + { + writer.Write(line.GetBytes(Encoding.UTF8)); + } + } + } + success = true; + } + finally + { + if (success) + { + IOUtils.Close(writer); + } + else + { + IOUtils.CloseWhileHandlingException(writer); + } + } + FileInfo sorted = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "sorted.dat")); + using (var temp = sorted.Create()) { } + + OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this)); + sorter.Sort(unsorted, sorted); + unsorted.Delete(); + + OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted); + BytesRef scratchLine = new BytesRef(); + + // TODO: the flags themselves can be double-chars (long) or also numeric + // either way the trick is to encode them as char... but they must be parsed differently + + string currentEntry = null; + IntsRef currentOrds = new IntsRef(); + + string line2; + while (reader.Read(scratchLine)) + { + line2 = scratchLine.Utf8ToString(); + string entry; + char[] wordForm; + + int flagSep = line2.LastIndexOf(FLAG_SEPARATOR); + if (flagSep == -1) + { + wordForm = NOFLAGS; + entry = line2; + } + else + { + // note, there can be comments (morph description) after a flag. + // we should really look for any whitespace: currently just tab and space + int end = line2.IndexOf('\t', flagSep); + if (end == -1) + { + end = line2.Length; + } + int end2 = line2.IndexOf(' ', flagSep); + if (end2 == -1) + { + end2 = line2.Length; + } + end = Math.Min(end, end2); + + string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1)); + if (aliasCount > 0) + { + flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture)); + } + + wordForm = flagParsingStrategy.ParseFlags(flagPart); + Array.Sort(wordForm); + entry = line2.Substring(0, flagSep - 0); + } + + int cmp = currentEntry == null ? 
1 : string.CompareOrdinal(entry, currentEntry); // ordinal on purpose: string.CompareTo is culture-sensitive and can disagree with the sorted input order
                if (cmp < 0)
                {
                    throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
                }
                else
                {
                    EncodeFlags(flagsScratch, wordForm);
                    int ord = flagLookup.Add(flagsScratch);
                    if (ord < 0)
                    {
                        // already exists in our hash
                        ord = (-ord) - 1;
                    }
                    // finalize current entry, and switch "current" if necessary
                    if (cmp > 0 && currentEntry != null)
                    {
                        Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                        words.Add(scratchInts, currentOrds);
                    }
                    // swap current
                    if (cmp > 0 || currentEntry == null)
                    {
                        currentEntry = entry;
                        currentOrds = new IntsRef(); // must be this way
                    }
                    currentOrds.Grow(currentOrds.Length + 1);
                    currentOrds.Ints[currentOrds.Length++] = ord;
                }
            }

            // finalize last entry
            Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
            words.Add(scratchInts, currentOrds);

            reader.Dispose();
            sorted.Delete();
        }

        /// <summary>
        /// Orders raw dictionary lines by the bytes that precede the last flag separator,
        /// falling back to a comparison of the whole line when those prefixes are equal.
        /// </summary>
        private class ComparatorAnonymousInnerClassHelper : IComparer<BytesRef>
        {
            private readonly Dictionary outerInstance;

            public ComparatorAnonymousInnerClassHelper(Dictionary outerInstance)
            {
                this.outerInstance = outerInstance;
                scratch1 = new BytesRef();
                scratch2 = new BytesRef();
            }

            internal BytesRef scratch1;
            internal BytesRef scratch2;

            /// <summary>
            /// Compares two lines, ignoring everything from the last flag separator onwards
            /// unless the remaining prefixes are equal.
            /// </summary>
            /// <param name="o1">First line.</param>
            /// <param name="o2">Second line.</param>
            /// <returns>The result of comparing the truncated views, or of comparing the full lines on a tie.</returns>
            public virtual int Compare(BytesRef o1, BytesRef o2)
            {
                PointAtEntry(scratch1, o1);
                PointAtEntry(scratch2, o2);

                int cmp = scratch1.CompareTo(scratch2);
                if (cmp == 0)
                {
                    // tie break on whole row
                    return o1.CompareTo(o2);
                }
                else
                {
                    return cmp;
                }
            }

            /// <summary>
            /// Makes <paramref name="view"/> share the bytes of <paramref name="line"/> and shortens it
            /// so that it ends just before the last flag separator; when no separator is found the
            /// view covers the whole line.
            /// </summary>
            private void PointAtEntry(BytesRef view, BytesRef line)
            {
                view.Bytes = line.Bytes;
                view.Offset = line.Offset;
                view.Length = line.Length;

                // scan backwards so that the LAST separator wins
                for (int i = view.Length - 1; i >= 0; i--)
                {
                    if (view.Bytes[view.Offset + i] == outerInstance.FLAG_SEPARATOR)
                    {
                        view.Length = i;
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Rebuilds a flag array from its two-bytes-per-flag form (high byte first),
        /// i.e. the layout written by <see cref="EncodeFlags(BytesRef, char[])"/>.
        /// </summary>
        /// <param name="b">Encoded flags.</param>
        /// <returns>The decoded flags; <c>CharsRef.EMPTY_CHARS</c> when <paramref name="b"/> has length 0.</returns>
        internal static char[] DecodeFlags(BytesRef b)
        {
            if (b.Length == 0)
            {
                return CharsRef.EMPTY_CHARS;
            }
            int len = (int)((uint)b.Length >> 1);
            char[] flags = new char[len];
            int upto = 0;
            int end = b.Offset + b.Length;
            for (int i = b.Offset; i < end; i += 2)
            {
                // high byte first, then low byte
                flags[upto++] = (char)((b.Bytes[i] << 8) | (b.Bytes[i + 1] & 0xff));
            }
            return flags;
        }

        /// <summary>
        /// Writes each flag into <paramref name="b"/> as two bytes, high byte first,
        /// growing the buffer as needed and setting its length to twice the flag count.
        /// </summary>
        /// <param name="b">Destination buffer; written starting at its current offset.</param>
        /// <param name="flags">Flags to encode.</param>
        internal static void EncodeFlags(BytesRef b, char[] flags)
        {
            int len = flags.Length << 1;
            b.Grow(len);
            b.Length = len;
            int upto = b.Offset;
            for (int i = 0; i < flags.Length; i++)
            {
                int flag = flags[i];
                b.Bytes[upto++] = (byte)((flag >> 8) & 0xff);
                b.Bytes[upto++] = (byte)(flag & 0xff);
            }
        }

        /// <summary>
        /// Handles one alias line. The first line seen carries the number of aliases and
        /// allocates the table; every later line contributes the next alias value.
        /// </summary>
        /// <param name="line">The alias line to parse.</param>
        private void ParseAlias(string line)
        {
            string[] ruleArgs = whitespacePattern.Split(line);
            if (aliases == null)
            {
                //first line should be the aliases count
                int count = int.Parse(ruleArgs[1], CultureInfo.InvariantCulture);
                aliases = new string[count];
            }
            else
            {
                // an alias can map to no flags
                string aliasValue = ruleArgs.Length == 1 ? "" : ruleArgs[1];
                aliases[aliasCount++] = aliasValue;
            }
        }

        /// <summary>
        /// Looks up the flags registered for a 1-based alias number.
        /// </summary>
        /// <param name="id">1-based alias number.</param>
        /// <returns>The alias value stored at that position.</returns>
        /// <exception cref="System.ArgumentException">
        /// No alias table has been read, or <paramref name="id"/> lies outside the table.
        /// </exception>
        private string GetAliasValue(int id)
        {
            // explicit range check instead of catching IndexOutOfRangeException
            if (aliases == null || id < 1 || id > aliases.Length)
            {
                throw new System.ArgumentException("Bad flag alias number:" + id);
            }
            return aliases[id - 1];
        }

        /// <summary>
        /// Abstraction of the process of parsing flags taken from the affix and dic files
        /// </summary>
        internal abstract class FlagParsingStrategy
        {

            /// <summary>
            /// Parses the given String into a single flag
            /// </summary>
            /// <param name="rawFlag">String to parse into a flag</param>
            /// <returns>Parsed flag</returns>
            /// <exception cref="System.ArgumentException">The string does not parse to exactly one flag.</exception>
            internal virtual char parseFlag(string rawFlag)
            {
                char[] flags = ParseFlags(rawFlag);
                if (flags.Length != 1)
                {
                    throw new System.ArgumentException("expected only one flag, got: " + rawFlag);
                }
                return flags[0];
            }

            /// <summary>
            /// Parses the given String into multiple flags
            /// </summary>
            /// <param name="rawFlags">String to parse into flags</param>
            /// <returns>Parsed flags</returns>
            internal abstract char[] ParseFlags(string rawFlags);
        }

        /// <summary>
        /// Simple implementation of <see cref="FlagParsingStrategy"/> that treats the chars in each String as individual flags.
        /// Can be used with both the ASCII and UTF-8 flag types.
        /// </summary>
        private class SimpleFlagParsingStrategy : FlagParsingStrategy
        {
            internal override char[] ParseFlags(string rawFlags)
            {
                return rawFlags.ToCharArray();
            }
        }

        /// <summary>
        /// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded in its numerical form. In the case
        /// of multiple flags, each number is separated by a comma.
        /// </summary>
        private class NumFlagParsingStrategy : FlagParsingStrategy
        {
            // Built once instead of on every loop iteration; same pattern as before.
            private static readonly Regex NON_DIGITS = new Regex("[^0-9]");

            internal override char[] ParseFlags(string rawFlags)
            {
                string[] rawFlagParts = rawFlags.Trim().Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                char[] flags = new char[rawFlagParts.Length];
                int upto = 0;

                for (int i = 0; i < rawFlagParts.Length; i++)
                {
                    // note, removing the trailing X/leading I for nepali... what is the rule here?!
                    string replacement = NON_DIGITS.Replace(rawFlagParts[i], "");
                    // note, ignoring empty flags (this happens in danish, for example)
                    if (replacement.Length == 0)
                    {
                        continue;
                    }
                    flags[upto++] = (char)int.Parse(replacement, CultureInfo.InvariantCulture);
                }

                // shrink when some parts were skipped as empty
                if (upto < flags.Length)
                {
                    flags = Arrays.CopyOf(flags, upto);
                }
                return flags;
            }
        }

        /// <summary>
        /// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as two ASCII characters whose codes
        /// must be combined into a single character.
        ///
        /// TODO (rmuir) test
        /// </summary>
        private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy
        {
            internal override char[] ParseFlags(string rawFlags)
            {
                if (rawFlags.Length == 0)
                {
                    return new char[0];
                }

                StringBuilder builder = new StringBuilder();
                if (rawFlags.Length % 2 == 1)
                {
                    throw new System.ArgumentException("Invalid flags (should be even number of characters): " + rawFlags);