lucenenet-commits mailing list archives

From synhers...@apache.org
Subject [05/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:09 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
new file mode 100644
index 0000000..d3bb929
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
@@ -0,0 +1,789 @@
+using System;
+using System.Diagnostics;
+
+namespace org.apache.lucene.analysis.synonym
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+	using ByteArrayDataInput = org.apache.lucene.store.ByteArrayDataInput;
+	using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+	using BytesRef = org.apache.lucene.util.BytesRef;
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using RamUsageEstimator = org.apache.lucene.util.RamUsageEstimator;
+	using UnicodeUtil = org.apache.lucene.util.UnicodeUtil;
+	using FST = org.apache.lucene.util.fst.FST;
+
+	/// <summary>
+	/// Matches single or multi word synonyms in a token stream.
+	/// This token stream cannot properly handle position
+	/// increments != 1, i.e., you should place this filter before
+	/// filtering out stop words.
+	/// 
+	/// <para>Note that with the current implementation, parsing is
+	/// greedy, so whenever multiple parses would apply, the rule
+	/// starting the earliest and parsing the most tokens wins.
+	/// For example if you have these rules:
+	///      
+	/// <pre>
+	///   a -> x
+	///   a b -> y
+	///   b c d -> z
+	/// </pre>
+	/// 
+	/// Then input <code>a b c d e</code> parses to <code>y b c
+	/// d</code>, i.e. the 2nd rule "wins" because it started
+	/// earliest and matched the most input tokens among the rules
+	/// starting at that point.</para>
+	/// 
+	/// <para>A future improvement to this filter could allow
+	/// non-greedy parsing, such that the 3rd rule would win, and
+	/// also separately allow multiple parses, such that all 3
+	/// rules would match, perhaps even on a rule by rule
+	/// basis.</para>
+	/// 
+	/// <para><b>NOTE</b>: when a match occurs, the output tokens
+	/// associated with the matching rule are "stacked" on top of
+	/// the input stream (if the rule had
+	/// <code>keepOrig=true</code>) and also on top of another
+	/// matched rule's output tokens.  This is not a correct
+	/// solution, as really the output should be an arbitrary
+	/// graph/lattice.  For example, with the above match, you
+	/// would expect an exact <code>PhraseQuery</code> <code>"y b
+	/// c"</code> to match the parsed tokens, but it will fail to
+	/// do so.  This limitation is necessary because Lucene's
+	/// TokenStream (and index) cannot yet represent an arbitrary
+	/// graph.</para>
+	/// 
+	/// <para><b>NOTE</b>: If multiple incoming tokens arrive on the
+	/// same position, only the first token at that position is
+	/// used for parsing.  Subsequent tokens simply pass through
+	/// and are not parsed.  A future improvement would be to
+	/// allow these tokens to also be matched.</para>
+	/// </summary>
+
+	// TODO: maybe we should resolve token -> wordID then run
+	// FST on wordIDs, for better perf?
+
+	// TODO: a more efficient approach would be Aho/Corasick's
+	// algorithm
+	// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+	// It improves over the current approach here
+	// because it does not fully re-start matching at every
+	// token.  For example if one pattern is "a b c x"
+	// and another is "b c d" and the input is "a b c d", on
+	// trying to parse "a b c x" but failing when you got to x,
+	// rather than starting over again you really should
+	// immediately recognize that "b c d" matches at the next
+	// input.  I suspect this won't matter that much in
+	// practice, but it's possible on some set of synonyms it
+	// will.  We'd have to modify Aho/Corasick to enforce our
+	// conflict resolving (eg greedy matching) because that algo
+	// finds all matches.  This really amounts to adding a .*
+	// closure to the FST and then determinizing it.
+
+	public sealed class SynonymFilter : TokenFilter
+	{
+
+	  public const string TYPE_SYNONYM = "SYNONYM";
+
+	  private readonly SynonymMap synonyms;
+
+	  private readonly bool ignoreCase;
+	  private readonly int rollBufferSize;
+
+	  private int captureCount;
+
+	  // TODO: we should set PositionLengthAttr too...
+
+	  // C# field initializers cannot reference the instance method addAttribute;
+	  // these attributes are assigned in the constructor below.
+	  private readonly CharTermAttribute termAtt;
+	  private readonly PositionIncrementAttribute posIncrAtt;
+	  private readonly PositionLengthAttribute posLenAtt;
+	  private readonly TypeAttribute typeAtt;
+	  private readonly OffsetAttribute offsetAtt;
+
+	  // How many future input tokens have already been matched
+	  // to a synonym; because the matching is "greedy" we don't
+	  // try to do any more matching for such tokens:
+	  private int inputSkipCount;
+
+	  // Hold all buffered (read ahead) stacked input tokens for
+	  // a future position.  When multiple tokens are at the
+	  // same position, we only store (and match against) the
+	  // term for the first token at the position, but capture
+	  // state for (and enumerate) all other tokens at this
+	  // position:
+	  private class PendingInput
+	  {
+		internal readonly CharsRef term = new CharsRef();
+		internal AttributeSource.State state;
+		internal bool keepOrig;
+		internal bool matched;
+		internal bool consumed = true;
+		internal int startOffset;
+		internal int endOffset;
+
+		public virtual void reset()
+		{
+		  state = null;
+		  consumed = true;
+		  keepOrig = false;
+		  matched = false;
+		}
+	  }
+
+	  // Rolling buffer, holding pending input tokens we had to
+	  // clone because we needed to look ahead, indexed by
+	  // position:
+	  private readonly PendingInput[] futureInputs;
+
+	  // Holds pending output synonyms for one future position:
+	  private class PendingOutputs
+	  {
+		internal CharsRef[] outputs;
+		internal int[] endOffsets;
+		internal int[] posLengths;
+		internal int upto;
+		internal int count;
+		internal int posIncr = 1;
+		internal int lastEndOffset;
+		internal int lastPosLength;
+
+		public PendingOutputs()
+		{
+		  outputs = new CharsRef[1];
+		  endOffsets = new int[1];
+		  posLengths = new int[1];
+		}
+
+		public virtual void reset()
+		{
+		  upto = count = 0;
+		  posIncr = 1;
+		}
+
+		public virtual CharsRef pullNext()
+		{
+		  Debug.Assert(upto < count);
+		  lastEndOffset = endOffsets[upto];
+		  lastPosLength = posLengths[upto];
+		  CharsRef result = outputs[upto++];
+		  posIncr = 0;
+		  if (upto == count)
+		  {
+			reset();
+		  }
+		  return result;
+		}
+
+		public virtual int LastEndOffset
+		{
+			get
+			{
+			  return lastEndOffset;
+			}
+		}
+
+		public virtual int LastPosLength
+		{
+			get
+			{
+			  return lastPosLength;
+			}
+		}
+
+		public virtual void add(char[] output, int offset, int len, int endOffset, int posLength)
+		{
+		  if (count == outputs.Length)
+		  {
+			CharsRef[] next = new CharsRef[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+			Array.Copy(outputs, 0, next, 0, count);
+			outputs = next;
+		  }
+		  if (count == endOffsets.Length)
+		  {
+			int[] next = new int[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)];
+			Array.Copy(endOffsets, 0, next, 0, count);
+			endOffsets = next;
+		  }
+		  if (count == posLengths.Length)
+		  {
+			int[] next = new int[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)];
+			Array.Copy(posLengths, 0, next, 0, count);
+			posLengths = next;
+		  }
+		  if (outputs[count] == null)
+		  {
+			outputs[count] = new CharsRef();
+		  }
+		  outputs[count].copyChars(output, offset, len);
+		  // endOffset can be -1, in which case we should simply
+		  // use the endOffset of the input token, or X >= 0, in
+		  // which case we use X as the endOffset for this output
+		  endOffsets[count] = endOffset;
+		  posLengths[count] = posLength;
+		  count++;
+		}
+	  }
+
+	  private readonly ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+
+	  // Rolling buffer, holding stack of pending synonym
+	  // outputs, indexed by position:
+	  private readonly PendingOutputs[] futureOutputs;
+
+	  // Where (in rolling buffers) to write next input saved state:
+	  private int nextWrite;
+
+	  // Where (in rolling buffers) to read next input saved state:
+	  private int nextRead;
+
+	  // True once we've read last token
+	  private bool finished;
+
+	  private readonly FST.Arc<BytesRef> scratchArc;
+
+	  private readonly FST<BytesRef> fst;
+
+	  private readonly FST.BytesReader fstReader;
+
+
+	  private readonly BytesRef scratchBytes = new BytesRef();
+	  private readonly CharsRef scratchChars = new CharsRef();
+
+	  /// <param name="input"> input tokenstream </param>
+	  /// <param name="synonyms"> synonym map </param>
+	  /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character#toLowerCase(int)"/>.
+	  ///                   Note, if you set this to true, it's your responsibility to lowercase
+	  ///                   the input entries when you create the <seealso cref="SynonymMap"/> </param>
+	  public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) : base(input)
+	  {
+		termAtt = addAttribute(typeof(CharTermAttribute));
+		posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+		posLenAtt = addAttribute(typeof(PositionLengthAttribute));
+		typeAtt = addAttribute(typeof(TypeAttribute));
+		offsetAtt = addAttribute(typeof(OffsetAttribute));
+		this.synonyms = synonyms;
+		this.ignoreCase = ignoreCase;
+		this.fst = synonyms.fst;
+		if (fst == null)
+		{
+		  throw new System.ArgumentException("fst must be non-null");
+		}
+		this.fstReader = fst.BytesReader;
+
+		// Must be 1+ so that when roll buffer is at full
+		// lookahead we can distinguish this full buffer from
+		// the empty buffer:
+		rollBufferSize = 1 + synonyms.maxHorizontalContext;
+
+		futureInputs = new PendingInput[rollBufferSize];
+		futureOutputs = new PendingOutputs[rollBufferSize];
+		for (int pos = 0;pos < rollBufferSize;pos++)
+		{
+		  futureInputs[pos] = new PendingInput();
+		  futureOutputs[pos] = new PendingOutputs();
+		}
+
+		//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
+
+		scratchArc = new FST.Arc<BytesRef>();
+	  }
+
+	  private void capture()
+	  {
+		captureCount++;
+		//System.out.println("  capture slot=" + nextWrite);
+		PendingInput input = futureInputs[nextWrite];
+
+		input.state = captureState();
+		input.consumed = false;
+		input.term.copyChars(termAtt.buffer(), 0, termAtt.length());
+
+		nextWrite = rollIncr(nextWrite);
+
+		// Buffer head should never catch up to tail:
+		Debug.Assert(nextWrite != nextRead);
+	  }
+
+	  /*
+	   This is the core of this TokenFilter: it locates the
+	   synonym matches and buffers up the results into
+	   futureInputs/Outputs.
+	
+	   NOTE: this calls input.incrementToken and does not
+	   capture the state if no further tokens were checked.  So
+	   caller must then forward state to our caller, or capture:
+	  */
+	  private int lastStartOffset;
+	  private int lastEndOffset;
+
+	  private void parse()
+	  {
+		//System.out.println("\nS: parse");
+
+		Debug.Assert(inputSkipCount == 0);
+
+		int curNextRead = nextRead;
+
+		// Holds the longest match we've seen so far:
+		BytesRef matchOutput = null;
+		int matchInputLength = 0;
+		int matchEndOffset = -1;
+
+		BytesRef pendingOutput = fst.outputs.NoOutput;
+		fst.getFirstArc(scratchArc);
+
+		Debug.Assert(scratchArc.output == fst.outputs.NoOutput);
+
+		int tokenCount = 0;
+
+		while (true)
+		{
+
+		  // Pull next token's chars:
+		  char[] buffer;
+		  int bufferLen;
+		  //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
+
+		  int inputEndOffset = 0;
+
+		  if (curNextRead == nextWrite)
+		  {
+
+			// We used up our lookahead buffer of input tokens
+			// -- pull next real input token:
+
+			if (finished)
+			{
+			  break;
+			}
+			else
+			{
+			  //System.out.println("  input.incrToken");
+			  Debug.Assert(futureInputs[nextWrite].consumed);
+			  // Not correct: a syn match whose output is longer
+			  // than its input can set future inputs keepOrig
+			  // to true:
+			  //assert !futureInputs[nextWrite].keepOrig;
+			  if (input.incrementToken())
+			  {
+				buffer = termAtt.buffer();
+				bufferLen = termAtt.length();
+				// named pendingInput: a local named "input" would conflict with the
+				// enclosing use of the TokenFilter's input field above (CS0135)
+				PendingInput pendingInput = futureInputs[nextWrite];
+				lastStartOffset = pendingInput.startOffset = offsetAtt.startOffset();
+				lastEndOffset = pendingInput.endOffset = offsetAtt.endOffset();
+				inputEndOffset = pendingInput.endOffset;
+				//System.out.println("  new token=" + new String(buffer, 0, bufferLen));
+				if (nextRead != nextWrite)
+				{
+				  capture();
+				}
+				else
+				{
+				  pendingInput.consumed = false;
+				}
+				}
+
+			  }
+			  else
+			  {
+				// No more input tokens
+				//System.out.println("      set end");
+				finished = true;
+				break;
+			  }
+			}
+		  }
+		  else
+		  {
+			// Still in our lookahead
+			buffer = futureInputs[curNextRead].term.chars;
+			bufferLen = futureInputs[curNextRead].term.length;
+			inputEndOffset = futureInputs[curNextRead].endOffset;
+			//System.out.println("  old token=" + new String(buffer, 0, bufferLen));
+		  }
+
+		  tokenCount++;
+
+		  // Run each char in this token through the FST:
+		  int bufUpto = 0;
+		  while (bufUpto < bufferLen)
+		  {
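+			// NOTE (raw port): char.codePointAt / char.charCount / char.ToLower(int) below are Java API residue and still need surrogate-aware .NET replacements.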
+			int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
+			if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+			{
+			  //System.out.println("    stop");
+			  goto byTokenBreak;
+			}
+
+			// Accum the output
+			pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+			//System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
+			bufUpto += char.charCount(codePoint);
+		  }
+
+		  // OK, entire token matched; now see if this is a final
+		  // state:
+		  if (scratchArc.Final)
+		  {
+			matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+			matchInputLength = tokenCount;
+			matchEndOffset = inputEndOffset;
+			//System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
+		  }
+
+		  // See if the FST wants to continue matching (ie, needs to
+		  // see the next input token):
+		  if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
+		  {
+			// No further rules can match here; we're done
+			// searching for matching rules starting at the
+			// current input position.
+			break;
+		  }
+		  else
+		  {
+			// More matching is possible -- accum the output (if
+			// any) of the WORD_SEP arc:
+			pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+			if (nextRead == nextWrite)
+			{
+			  capture();
+			}
+		  }
+
+		  curNextRead = rollIncr(curNextRead);
+		}
+		byTokenBreak:
+
+		if (nextRead == nextWrite && !finished)
+		{
+		  //System.out.println("  skip write slot=" + nextWrite);
+		  nextWrite = rollIncr(nextWrite);
+		}
+
+		if (matchOutput != null)
+		{
+		  //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
+		  inputSkipCount = matchInputLength;
+		  addOutput(matchOutput, matchInputLength, matchEndOffset);
+		}
+		else if (nextRead != nextWrite)
+		{
+		  // Even though we had no match here, we set to 1
+		  // because we need to skip current input token before
+		  // trying to match again:
+		  inputSkipCount = 1;
+		}
+		else
+		{
+		  Debug.Assert(finished);
+		}
+
+		//System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+	  }
+
+	  // Interleaves all output tokens onto the futureOutputs:
+	  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
+	  {
+		bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+
+		int code = bytesReader.readVInt();
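+		// Low bit of the header vInt is the inverted keepOrig flag; SynonymMap.Builder.build() is the writer side.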
+		bool keepOrig = (code & 0x1) == 0;
+		int count = (int)((uint)code >> 1); // Java's code >>> 1 (unsigned right shift)
+		//System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
+		for (int outputIDX = 0;outputIDX < count;outputIDX++)
+		{
+		  synonyms.words.get(bytesReader.readVInt(), scratchBytes);
+		  //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
+		  UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
+		  int lastStart = scratchChars.offset;
+		  int chEnd = lastStart + scratchChars.length;
+		  int outputUpto = nextRead;
+		  for (int chIDX = lastStart;chIDX <= chEnd;chIDX++)
+		  {
+			if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR)
+			{
+			  int outputLen = chIDX - lastStart;
+			  // Caller is not allowed to have empty string in
+			  // the output:
+			  Debug.Assert(outputLen > 0, "output contains empty string: " + scratchChars);
+			  int endOffset;
+			  int posLen;
+			  if (chIDX == chEnd && lastStart == scratchChars.offset)
+			  {
+				// This rule had a single output token, so, we set
+				// this output's endOffset to the current
+				// endOffset (ie, endOffset of the last input
+				// token it matched):
+				endOffset = matchEndOffset;
+				posLen = keepOrig ? matchInputLength : 1;
+			  }
+			  else
+			  {
+				// This rule has more than one output token; we
+				// can't pick any particular endOffset for this
+				// case, so, we inherit the endOffset for the
+				// input token which this output overlaps:
+				endOffset = -1;
+				posLen = 1;
+			  }
+			  futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset, posLen);
+			  //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
+			  lastStart = 1 + chIDX;
+			  //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
+			  outputUpto = rollIncr(outputUpto);
+			  Debug.Assert(futureOutputs[outputUpto].posIncr == 1, "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite);
+			}
+		  }
+		}
+
+		int upto = nextRead;
+		for (int idx = 0;idx < matchInputLength;idx++)
+		{
+		  futureInputs[upto].keepOrig |= keepOrig;
+		  futureInputs[upto].matched = true;
+		  upto = rollIncr(upto);
+		}
+	  }
+
+	  // ++ mod rollBufferSize
+	  private int rollIncr(int count)
+	  {
+		count++;
+		if (count == rollBufferSize)
+		{
+		  return 0;
+		}
+		else
+		{
+		  return count;
+		}
+	  }
+
+	  // for testing
+	  internal int CaptureCount
+	  {
+		  get
+		  {
+			return captureCount;
+		  }
+	  }
+
+	  public override bool incrementToken()
+	  {
+
+		//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+
+		while (true)
+		{
+
+		  // First play back any buffered future inputs/outputs
+		  // w/o running parsing again:
+		  while (inputSkipCount != 0)
+		  {
+
+			// At each position, we first output the original
+			// token
+
+			// TODO: maybe just a PendingState class, holding
+			// both input & outputs?
+			PendingInput input = futureInputs[nextRead];
+			PendingOutputs outputs = futureOutputs[nextRead];
+
+			//System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
+
+			if (!input.consumed && (input.keepOrig || !input.matched))
+			{
+			  if (input.state != null)
+			  {
+				// Return a previously saved token (because we
+				// had to lookahead):
+				restoreState(input.state);
+			  }
+			  else
+			  {
+				// Pass-through case: return token we just pulled
+				// but didn't capture:
+				Debug.Assert(inputSkipCount == 1, "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead);
+			  }
+			  input.reset();
+			  if (outputs.count > 0)
+			  {
+				outputs.posIncr = 0;
+			  }
+			  else
+			  {
+				nextRead = rollIncr(nextRead);
+				inputSkipCount--;
+			  }
+			  //System.out.println("  return token=" + termAtt.toString());
+			  return true;
+			}
+			else if (outputs.upto < outputs.count)
+			{
+			  // Still have pending outputs to replay at this
+			  // position
+			  input.reset();
+			  int posIncr = outputs.posIncr;
+			  CharsRef output = outputs.pullNext();
+			  clearAttributes();
+			  termAtt.copyBuffer(output.chars, output.offset, output.length);
+			  typeAtt.Type = TYPE_SYNONYM;
+			  int endOffset = outputs.LastEndOffset;
+			  if (endOffset == -1)
+			  {
+				endOffset = input.endOffset;
+			  }
+			  offsetAtt.setOffset(input.startOffset, endOffset);
+			  posIncrAtt.PositionIncrement = posIncr;
+			  posLenAtt.PositionLength = outputs.LastPosLength;
+			  if (outputs.count == 0)
+			  {
+				// Done with the buffered input and all outputs at
+				// this position
+				nextRead = rollIncr(nextRead);
+				inputSkipCount--;
+			  }
+			  //System.out.println("  return token=" + termAtt.toString());
+			  return true;
+			}
+			else
+			{
+			  // Done with the buffered input and all outputs at
+			  // this position
+			  input.reset();
+			  nextRead = rollIncr(nextRead);
+			  inputSkipCount--;
+			}
+		  }
+
+		  if (finished && nextRead == nextWrite)
+		  {
+			// End case: if any output syns went beyond end of
+			// input stream, enumerate them now:
+			PendingOutputs outputs = futureOutputs[nextRead];
+			if (outputs.upto < outputs.count)
+			{
+			  int posIncr = outputs.posIncr;
+			  CharsRef output = outputs.pullNext();
+			  futureInputs[nextRead].reset();
+			  if (outputs.count == 0)
+			  {
+				nextWrite = nextRead = rollIncr(nextRead);
+			  }
+			  clearAttributes();
+			  // Keep offset from last input token:
+			  offsetAtt.setOffset(lastStartOffset, lastEndOffset);
+			  termAtt.copyBuffer(output.chars, output.offset, output.length);
+			  typeAtt.Type = TYPE_SYNONYM;
+			  //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
+			  posIncrAtt.PositionIncrement = posIncr;
+			  //System.out.println("  return token=" + termAtt.toString());
+			  return true;
+			}
+			else
+			{
+			  return false;
+			}
+		  }
+
+		  // Find new synonym matches:
+		  parse();
+		}
+	  }
+
+	  public override void reset()
+	  {
+		base.reset();
+		captureCount = 0;
+		finished = false;
+		inputSkipCount = 0;
+		nextRead = nextWrite = 0;
+
+		// In normal usage these resets would not be needed,
+		// since they reset-as-they-are-consumed, but the app
+		// may not consume all input tokens (or we might hit an
+		// exception), in which case we have leftover state
+		// here:
+		foreach (PendingInput input in futureInputs)
+		{
+		  input.reset();
+		}
+		foreach (PendingOutputs output in futureOutputs)
+		{
+		  output.reset();
+		}
+	  }
+	}
+
+}
\ No newline at end of file
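
A minimal usage sketch for the filter above, assuming the Java-flavored API surface kept by this raw port (SynonymMap.Builder.join/add/build and the three-argument SynonymFilter constructor); "tokenizer" stands in for any upstream Tokenizer:

    // Build the single rule "a b -> y" from the class comment (dedup = true).
    var builder = new SynonymMap.Builder(true);
    builder.add(SynonymMap.Builder.join(new[] { "a", "b" }, new CharsRef()),
                SynonymMap.Builder.join(new[] { "y" }, new CharsRef()),
                false);                                  // includeOrig = false
    SynonymMap map = builder.build();

    // For input "a b c d e" the greedy parse replaces "a b" with "y",
    // yielding "y c d e"; with includeOrig = true the originals would be
    // kept and "y" stacked on top of "a".
    TokenStream stream = new SynonymFilter(tokenizer, map, true); // ignoreCase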

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilterFactory.cs
new file mode 100644
index 0000000..b6967d8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilterFactory.cs
@@ -0,0 +1,115 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Synonym;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.synonym
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using Version = org.apache.lucene.util.Version;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+
+	/// <summary>
+	/// Factory for <seealso cref="SynonymFilter"/>.
+	/// <pre class="prettyprint" >
+	/// &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
+	///             format="solr" ignoreCase="false" expand="true" 
+	///             tokenizerFactory="solr.WhitespaceTokenizerFactory"
+	///             [optional tokenizer factory parameters]/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// <para>
+	/// An optional param name prefix of "tokenizerFactory." may be used for any 
+	/// init params that the SynonymFilterFactory needs to pass to the specified 
+	/// TokenizerFactory.  If the TokenizerFactory expects an init parameter with
+	/// the same name as an init param used by the SynonymFilterFactory, the prefix 
+	/// is mandatory.
+	/// </para>
+	/// <para>
+	/// The optional {@code format} parameter controls how the synonyms will be parsed:
+	/// It supports the short names of {@code solr} for <seealso cref="SolrSynonymParser"/> 
+	/// and {@code wordnet} for <seealso cref="WordnetSynonymParser"/>, or your own
+	/// {@code SynonymMap.Parser} class name. The default is {@code solr}.
+	/// A custom <seealso cref="SynonymMap.Parser"/> is expected to have a constructor taking:
+	/// <ul>
+	///   <li><code>boolean dedup</code> - true if duplicates should be ignored, false otherwise</li>
+	///   <li><code>boolean expand</code> - true if conflation groups should be expanded, false if they are one-directional</li>
+	///   <li><code><seealso cref="Analyzer"/> analyzer</code> - an analyzer used for each raw synonym</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public class SynonymFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  private readonly TokenFilterFactory delegator;
+
+	  public SynonymFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		if (luceneMatchVersion.onOrAfter(Version.LUCENE_34))
+		{
+		  delegator = new FSTSynonymFilterFactory(new Dictionary<string, string>(OriginalArgs));
+		}
+		else
+		{
+		  // Check whether the caller used the new optional arg "format"; that makes no sense for the
+		  // old factory, as it's wired to Solr's synonyms format only.
+		  if (args.ContainsKey("format") && !args["format"].Equals("solr"))
+		  {
+			throw new System.ArgumentException("You must specify luceneMatchVersion >= 3.4 to use alternate synonyms formats");
+		  }
+		  delegator = new SlowSynonymFilterFactory(new Dictionary<string, string>(OriginalArgs));
+		}
+	  }
+
+	  public override TokenStream create(TokenStream input)
+	  {
+		return delegator.create(input);
+	  }
+
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		((ResourceLoaderAware) delegator).inform(loader);
+	  }
+
+	  /// <summary>
+	  /// Access to the delegator TokenFilterFactory for test verification
+	  /// </summary>
+	  /// @deprecated Method exists only for testing 4x, will be removed in 5.0
+	  /// @lucene.internal 
+	  [Obsolete("Method exists only for testing 4x, will be removed in 5.0")]
+	  internal virtual TokenFilterFactory Delegator
+	  {
+		  get
+		  {
+			return delegator;
+		  }
+	  }
+	}
+
+}
\ No newline at end of file
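
A sketch of driving the factory directly, mirroring the XML example in the class comment. The "luceneMatchVersion" key follows the usual TokenFilterFactory convention, and "loader" and "input" are assumed to exist:

    var args = new Dictionary<string, string>
    {
        { "luceneMatchVersion", "LUCENE_48" },   // >= 3.4 selects the FST-based implementation
        { "synonyms", "synonyms.txt" },
        { "format", "solr" },
        { "ignoreCase", "false" },
        { "expand", "true" }
    };
    var factory = new SynonymFilterFactory(args);
    factory.inform(loader);                      // loader: a ResourceLoader that can open synonyms.txt
    TokenStream filtered = factory.create(input);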

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymMap.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymMap.cs
new file mode 100644
index 0000000..004572d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymMap.cs
@@ -0,0 +1,430 @@
+using System;
+using System.IO;
+using System.Diagnostics;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.synonym
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using ByteArrayDataOutput = org.apache.lucene.store.ByteArrayDataOutput;
+	using BytesRef = org.apache.lucene.util.BytesRef;
+	using BytesRefHash = org.apache.lucene.util.BytesRefHash;
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using IOUtils = org.apache.lucene.util.IOUtils;
+	using IntsRef = org.apache.lucene.util.IntsRef;
+	using UnicodeUtil = org.apache.lucene.util.UnicodeUtil;
+	using ByteSequenceOutputs = org.apache.lucene.util.fst.ByteSequenceOutputs;
+	using FST = org.apache.lucene.util.fst.FST;
+	using Util = org.apache.lucene.util.fst.Util;
+
+	/// <summary>
+	/// A map of synonyms, keys and values are phrases.
+	/// @lucene.experimental
+	/// </summary>
+	public class SynonymMap
+	{
+	  /// <summary>
+	  /// for multiword support, you must separate words with this separator </summary>
+	  public const char WORD_SEPARATOR = (char)0;
+	  /// <summary>
+	  /// map&lt;input word, list&lt;ord&gt;&gt; </summary>
+	  public readonly FST<BytesRef> fst;
+	  /// <summary>
+	  /// map&lt;ord, outputword&gt; </summary>
+	  public readonly BytesRefHash words;
+	  /// <summary>
+	  /// maxHorizontalContext: maximum context we need on the tokenstream </summary>
+	  public readonly int maxHorizontalContext;
+
+	  public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext)
+	  {
+		this.fst = fst;
+		this.words = words;
+		this.maxHorizontalContext = maxHorizontalContext;
+	  }
+
+	  /// <summary>
+	  /// Builds a SynonymMap.
+	  /// <para>
+	  /// Call add() until you have added all the mappings, then call build() to get a SynonymMap.
+	  /// @lucene.experimental
+	  /// </para>
+	  /// </summary>
+	  public class Builder
+	  {
+		internal readonly Dictionary<CharsRef, MapEntry> workingSet = new Dictionary<CharsRef, MapEntry>();
+		internal readonly BytesRefHash words = new BytesRefHash();
+		internal readonly BytesRef utf8Scratch = new BytesRef(8);
+		internal int maxHorizontalContext;
+		internal readonly bool dedup;
+
+		/// <summary>
+		/// If dedup is true then identical rules (same input,
+		///  same output) will be added only once. 
+		/// </summary>
+		public Builder(bool dedup)
+		{
+		  this.dedup = dedup;
+		}
+
+		private class MapEntry
+		{
+		  internal bool includeOrig;
+		  // we could sort for better sharing ultimately, but it could confuse people
+		  internal List<int?> ords = new List<int?>();
+		}
+
+		/// <summary>
+		/// Sugar: just joins the provided terms with {@link
+		///  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
+		///  must not be null. 
+		/// </summary>
+		public static CharsRef join(string[] words, CharsRef reuse)
+		{
+		  int upto = 0;
+		  char[] buffer = reuse.chars;
+		  foreach (string word in words)
+		  {
+			int wordLen = word.Length;
+			int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
+			if (needed > buffer.Length)
+			{
+			  reuse.grow(needed);
+			  buffer = reuse.chars;
+			}
+			if (upto > 0)
+			{
+			  buffer[upto++] = SynonymMap.WORD_SEPARATOR;
+			}
+
+			word.CopyTo(0, buffer, upto, wordLen);
+			upto += wordLen;
+		  }
+		  reuse.length = upto;
+		  return reuse;
+		}
+
+
+
+		/// <summary>
+		/// only used for asserting! </summary>
+		internal virtual bool hasHoles(CharsRef chars)
+		{
+		  int end = chars.offset + chars.length;
+		  for (int idx = chars.offset + 1;idx < end;idx++)
+		  {
+			if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx - 1] == SynonymMap.WORD_SEPARATOR)
+			{
+			  return true;
+			}
+		  }
+		  if (chars.chars[chars.offset] == '\u0000')
+		  {
+			return true;
+		  }
+		  if (chars.chars[chars.offset + chars.length - 1] == '\u0000')
+		  {
+			return true;
+		  }
+
+		  return false;
+		}
+
+		// NOTE: while it's tempting to make this public, since
+		// the caller's parser likely knows the
+		// numInputWords/numOutputWords, sneaky exceptions much later
+		// on will result if these values are wrong; so we always
+		// recompute them ourselves to be safe:
+		internal virtual void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
+		{
+		  // first convert to UTF-8
+		  if (numInputWords <= 0)
+		  {
+			throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
+		  }
+		  if (input.length <= 0)
+		  {
+			throw new System.ArgumentException("input.length must be > 0 (got " + input.length + ")");
+		  }
+		  if (numOutputWords <= 0)
+		  {
+			throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
+		  }
+		  if (output.length <= 0)
+		  {
+			throw new System.ArgumentException("output.length must be > 0 (got " + output.length + ")");
+		  }
+
+		  Debug.Assert(!hasHoles(input), "input has holes: " + input);
+		  Debug.Assert(!hasHoles(output), "output has holes: " + output);
+
+		  //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
+		  UnicodeUtil.UTF16toUTF8(output.chars, output.offset, output.length, utf8Scratch);
+		  // lookup in hash
+		  int ord = words.add(utf8Scratch);
+		  if (ord < 0)
+		  {
+			// already exists in our hash
+			ord = (-ord) - 1;
+			//System.out.println("  output=" + output + " old ord=" + ord);
+		  }
+		  else
+		  {
+			//System.out.println("  output=" + output + " new ord=" + ord);
+		  }
+
+		  MapEntry e;
+		  if (!workingSet.TryGetValue(input, out e)) // the indexer would throw KeyNotFoundException for a new input
+		  {
+			e = new MapEntry();
+			workingSet[CharsRef.deepCopyOf(input)] = e; // make a copy, since we will keep around in our map
+		  }
+
+		  e.ords.Add(ord);
+		  e.includeOrig |= includeOrig;
+		  maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
+		  maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
+		}
+
+		internal virtual int countWords(CharsRef chars)
+		{
+		  int wordCount = 1;
+		  int upto = chars.offset;
+		  int limit = chars.offset + chars.length;
+		  while (upto < limit)
+		  {
+			if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR)
+			{
+			  wordCount++;
+			}
+		  }
+		  return wordCount;
+		}
+
+		/// <summary>
+		/// Add a phrase->phrase synonym mapping.
+		/// Phrases are character sequences where words are
+		/// separated with character zero (U+0000).  Empty words
+		/// (two U+0000s in a row) are not allowed in the input nor
+		/// the output!
+		/// </summary>
+		/// <param name="input"> input phrase </param>
+		/// <param name="output"> output phrase </param>
+		/// <param name="includeOrig"> true if the original should be included </param>
+		public virtual void add(CharsRef input, CharsRef output, bool includeOrig)
+		{
+		  add(input, countWords(input), output, countWords(output), includeOrig);
+		}
+
+		/// <summary>
+		/// Builds an <seealso cref="SynonymMap"/> and returns it.
+		/// </summary>
+		public virtual SynonymMap build()
+		{
+		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
+		  // TODO: are we using the best sharing options?
+		  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
+
+		  BytesRef scratch = new BytesRef(64);
+		  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
+
+		  HashSet<int?> dedupSet;
+
+		  if (dedup)
+		  {
+			dedupSet = new HashSet<int?>();
+		  }
+		  else
+		  {
+			dedupSet = null;
+		  }
+
+		  sbyte[] spare = new sbyte[5];
+
+		  CharsRef[] sortedKeys = new CharsRef[workingSet.Count];
+		  workingSet.Keys.CopyTo(sortedKeys, 0);
+		  Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);
+
+		  IntsRef scratchIntsRef = new IntsRef();
+
+		  //System.out.println("fmap.build");
+		  for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
+		  {
+			CharsRef input = sortedKeys[keyIdx];
+			MapEntry output = workingSet[input];
+
+			int numEntries = output.ords.Count;
+			// output size, assume the worst case
+			int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
+
+			scratch.grow(estimatedSize);
+			scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.Length);
+			Debug.Assert(scratch.offset == 0);
+
+			// now write our output data:
+			int count = 0;
+			for (int i = 0; i < numEntries; i++)
+			{
+			  if (dedupSet != null)
+			  {
+				// box once
+				int? ent = output.ords[i];
+				if (dedupSet.Contains(ent))
+				{
+				  continue;
+				}
+				dedupSet.Add(ent);
+			  }
+			  scratchOutput.writeVInt(output.ords[i].Value); // ords holds int?; unwrap for the vInt write
+			  count++;
+			}
+
+			int pos = scratchOutput.Position;
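+			// Pack the header: upper bits carry the ord count; a set low bit means the original token is dropped.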
+			scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
+			int pos2 = scratchOutput.Position;
+			int vIntLen = pos2 - pos;
+
+			// Move the count + includeOrig to the front of the byte[]:
+			Array.Copy(scratch.bytes, pos, spare, 0, vIntLen);
+			Array.Copy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
+			Array.Copy(spare, 0, scratch.bytes, 0, vIntLen);
+
+			if (dedupSet != null)
+			{
+			  dedupSet.Clear();
+			}
+
+			scratch.length = scratchOutput.Position - scratch.offset;
+			//System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
+			builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
+		  }
+
+		  FST<BytesRef> fst = builder.finish();
+		  return new SynonymMap(fst, words, maxHorizontalContext);
+		}
+	  }
+
+	  /// <summary>
+	  /// Abstraction for parsing synonym files.
+	  /// 
+	  /// @lucene.experimental
+	  /// </summary>
+	  public abstract class Parser : Builder
+	  {
+
+		internal readonly Analyzer analyzer;
+
+		public Parser(bool dedup, Analyzer analyzer) : base(dedup)
+		{
+		  this.analyzer = analyzer;
+		}
+
+		/// <summary>
+		/// Parse the given input, adding synonyms to the inherited <seealso cref="Builder"/>. </summary>
+		/// <param name="in"> The input to parse </param>
+		public abstract void parse(Reader @in);
+
+		/// <summary>
+		/// Sugar: analyzes the text with the analyzer and
+		///  separates by <seealso cref="SynonymMap#WORD_SEPARATOR"/>.
+		///  reuse and its chars must not be null. 
+		/// </summary>
+		public virtual CharsRef analyze(string text, CharsRef reuse)
+		{
+		  IOException priorException = null;
+		  TokenStream ts = analyzer.tokenStream("", text);
+		  try
+		  {
+			CharTermAttribute termAtt = ts.addAttribute(typeof(CharTermAttribute));
+			PositionIncrementAttribute posIncAtt = ts.addAttribute(typeof(PositionIncrementAttribute));
+			ts.reset();
+			reuse.length = 0;
+			while (ts.incrementToken())
+			{
+			  int length = termAtt.length();
+			  if (length == 0)
+			  {
+				throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
+			  }
+			  if (posIncAtt.PositionIncrement != 1)
+			  {
+				throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
+			  }
+			  reuse.grow(reuse.length + length + 1); // current + word + separator
+			  int end = reuse.offset + reuse.length;
+			  if (reuse.length > 0)
+			  {
+				reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
+				reuse.length++;
+			  }
+			  Array.Copy(termAtt.buffer(), 0, reuse.chars, end, length);
+			  reuse.length += length;
+			}
+			ts.end();
+		  }
+		  catch (IOException e)
+		  {
+			priorException = e;
+		  }
+		  finally
+		  {
+			IOUtils.closeWhileHandlingException(priorException, ts);
+		  }
+		  if (reuse.length == 0)
+		  {
+			throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
+		  }
+		  return reuse;
+		}
+	  }
+
+	}
+
+}
\ No newline at end of file
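
The byte layout written by Builder.build() is the mirror image of what SynonymFilter.addOutput() decodes: one header vInt packing the synonym count together with the includeOrig flag, followed by one word ord per synonym. A standalone sketch of just the header arithmetic:

    // Writer side, as in Builder.build(): a clear low bit means keep the original token.
    static int EncodeHeader(int count, bool includeOrig)
    {
        return count << 1 | (includeOrig ? 0 : 1);
    }

    // Reader side, as in SynonymFilter.addOutput():
    static void DecodeHeader(int code, out int count, out bool keepOrig)
    {
        keepOrig = (code & 0x1) == 0;
        count = (int)((uint)code >> 1);          // Java's code >>> 1
    }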

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Synonym/WordnetSynonymParser.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/WordnetSynonymParser.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/WordnetSynonymParser.cs
new file mode 100644
index 0000000..0bf9890
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/WordnetSynonymParser.cs
@@ -0,0 +1,135 @@
+using System;
+
+namespace org.apache.lucene.analysis.synonym
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharsRef = org.apache.lucene.util.CharsRef;
+
+	/// <summary>
+	/// Parser for wordnet prolog format
+	/// <para>
+	/// See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
+	/// @lucene.experimental
+	/// </para>
+	/// </summary>
+	// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
+	public class WordnetSynonymParser : SynonymMap.Parser
+	{
+	  private readonly bool expand;
+
+	  public WordnetSynonymParser(bool dedup, bool expand, Analyzer analyzer) : base(dedup, analyzer)
+	  {
+		this.expand = expand;
+	  }
+
+	  public override void parse(Reader @in)
+	  {
+		LineNumberReader br = new LineNumberReader(@in);
+		try
+		{
+		  string line = null;
+		  string lastSynSetID = "";
+		  CharsRef[] synset = new CharsRef[8];
+		  int synsetSize = 0;
+
+		  while ((line = br.readLine()) != null)
+		  {
+			string synSetID = line.Substring(2, 9);
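+			// e.g. "s(100001740,1,'entity',n,1,11)." yields synset id "100001740"; a change of id closes the current synset.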
+
+			if (!synSetID.Equals(lastSynSetID))
+			{
+			  addInternal(synset, synsetSize);
+			  synsetSize = 0;
+			}
+
+			if (synset.Length <= synsetSize+1)
+			{
+			  CharsRef[] larger = new CharsRef[synset.Length * 2];
+			  Array.Copy(synset, 0, larger, 0, synsetSize);
+			  synset = larger;
+			}
+
+			synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
+			synsetSize++;
+			lastSynSetID = synSetID;
+		  }
+
+		  // final synset in the file
+		  addInternal(synset, synsetSize);
+		}
+		catch (System.ArgumentException e)
+		{
+		  ParseException ex = new ParseException("Invalid synonym rule at line " + br.LineNumber, 0);
+		  ex.initCause(e);
+		  throw ex;
+		}
+		finally
+		{
+		  br.close();
+		}
+	  }
+
+	  private CharsRef parseSynonym(string line, CharsRef reuse)
+	  {
+		if (reuse == null)
+		{
+		  reuse = new CharsRef(8);
+		}
+
+		int start = line.IndexOf('\'') + 1;
+		int end = line.LastIndexOf('\'');
+
+		string text = line.Substring(start, end - start).Replace("''", "'");
+		return analyze(text, reuse);
+	  }
+
+	  private void addInternal(CharsRef[] synset, int size)
+	  {
+		if (size <= 1)
+		{
+		  return; // nothing to do
+		}
+
+		if (expand)
+		{
+		  for (int i = 0; i < size; i++)
+		  {
+			for (int j = 0; j < size; j++)
+			{
+			  add(synset[i], synset[j], false);
+			}
+		  }
+		}
+		else
+		{
+		  for (int i = 0; i < size; i++)
+		  {
+			add(synset[i], synset[0], false);
+		  }
+		}
+	  }
+	}
+
+}
\ No newline at end of file
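
For context, a line in the wordnet prolog database (wn_s.pl) looks like s(100001740,1,'entity',n,1,11). Consecutive lines sharing the nine-digit synset id form one synset, and a doubled single quote escapes a literal quote inside the term. A usage sketch, assuming "reader" wraps the prolog file and "analyzer" tokenizes the raw terms:

    var parser = new WordnetSynonymParser(true, true, analyzer); // dedup, expand
    parser.parse(reader);
    SynonymMap map = parser.build();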

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs
new file mode 100644
index 0000000..86c0811
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs
@@ -0,0 +1,143 @@
+using System;
+using System.IO;
+
+namespace org.apache.lucene.analysis.th
+{
+
+	/// <summary>
+	/// Copyright 2006 The Apache Software Foundation
+	/// 
+	/// Licensed under the Apache License, Version 2.0 (the "License");
+	/// you may not use this file except in compliance with the License.
+	/// You may obtain a copy of the License at
+	/// 
+	///     http://www.apache.org/licenses/LICENSE-2.0
+	/// 
+	/// Unless required by applicable law or agreed to in writing, software
+	/// distributed under the License is distributed on an "AS IS" BASIS,
+	/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	/// See the License for the specific language governing permissions and
+	/// limitations under the License.
+	/// </summary>
+
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using StopAnalyzer = org.apache.lucene.analysis.core.StopAnalyzer;
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// <seealso cref="Analyzer"/> for Thai language. It uses <seealso cref="java.text.BreakIterator"/> to break words.
+	/// <para>
+	/// <a name="version"/>
+	/// </para>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating ThaiAnalyzer:
+	/// <ul>
+	///   <li> As of 3.6, a set of Thai stopwords is used by default
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class ThaiAnalyzer : StopwordAnalyzerBase
+	{
+
+	  /// <summary>
+	  /// File containing default Thai stopwords. </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+	  /// <summary>
+	  /// The comment character in the stopwords file.  
+	  /// All lines prefixed with this will be ignored.
+	  /// </summary>
+	  private const string STOPWORDS_COMMENT = "#";
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop words set. </summary>
+	  /// <returns> default stop words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+	  /// accesses the static final set the first time.
+	  /// </summary>
+	  private class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			DEFAULT_STOP_SET = loadStopwordSet(false, typeof(ThaiAnalyzer), DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+		  }
+		  catch (IOException)
+		  {
+			// default set should always be present as it is part of the
+			// distribution (JAR)
+			throw new Exception("Unable to load default stopword set");
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the default stop words.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  public ThaiAnalyzer(Version matchVersion) : this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) : base(matchVersion, stopwords)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a
+	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
+	  /// </summary>
+	  /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  ///         built from a <seealso cref="StandardTokenizer"/> filtered with
+	  ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="ThaiWordFilter"/>, and
+	  ///         <seealso cref="StopFilter"/> </returns>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+		if (matchVersion.onOrAfter(Version.LUCENE_48))
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new ThaiTokenizer(reader);
+		  Tokenizer source = new ThaiTokenizer(reader);
+		  TokenStream result = new LowerCaseFilter(matchVersion, source);
+		  result = new StopFilter(matchVersion, result, stopwords);
+		  return new TokenStreamComponents(source, result);
+		}
+		else
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
+		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
+		  TokenStream result = new StandardFilter(matchVersion, source);
+		  if (matchVersion.onOrAfter(Version.LUCENE_31))
+		  {
+			result = new LowerCaseFilter(matchVersion, result);
+		  }
+		  result = new ThaiWordFilter(matchVersion, result);
+		  return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+		}
+	  }
+	}
+
+}
\ No newline at end of file
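
The createComponents method above selects between two chains depending on matchVersion. Summarized,
with a hedged usage sketch in the same Java-flavored style as this raw port (BreakIterator, Reader
and friends are still the Java types here, so this will not compile until the support shims land):

    // matchVersion >= LUCENE_48: ThaiTokenizer -> LowerCaseFilter -> StopFilter
    // matchVersion <  LUCENE_48: StandardTokenizer -> StandardFilter
    //                            [-> LowerCaseFilter if >= LUCENE_31] -> ThaiWordFilter -> StopFilter
    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_48);
    // tokenStream(...) would then produce Thai word tokens, lowercased, with default stopwords removed.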

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
new file mode 100644
index 0000000..3b472ae
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -0,0 +1,116 @@
+namespace org.apache.lucene.analysis.th
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using CharArrayIterator = org.apache.lucene.analysis.util.CharArrayIterator;
+	using SegmentingTokenizerBase = org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+
+	/// <summary>
+	/// Tokenizer that uses <seealso cref="BreakIterator"/> to tokenize Thai text.
+	/// <para>WARNING: this tokenizer may not be supported by all JREs.
+	///    It is known to work with Sun/Oracle and Harmony JREs.
+	///    If your application needs to be fully portable, consider using ICUTokenizer instead,
+	///    which uses an ICU Thai BreakIterator that will always be available.
+	/// </para>
+	/// </summary>
+	public class ThaiTokenizer : SegmentingTokenizerBase
+	{
+	  /// <summary>
+	  /// True if the JRE supports a working dictionary-based breakiterator for Thai.
+	  /// If this is false, this tokenizer will not work at all!
+	  /// </summary>
+	  public static readonly bool DBBI_AVAILABLE;
+	  private static readonly BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
+	  static ThaiTokenizer()
+	  {
+		// check that we have a working dictionary-based break iterator for thai
+		proto.Text = "ภาษาไทย";
+		DBBI_AVAILABLE = proto.isBoundary(4);
+	  }
+
+	  /// <summary>
+	  /// used for breaking the text into sentences </summary>
+	  private static readonly BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT);
+
+	  private readonly BreakIterator wordBreaker;
+	  private readonly CharArrayIterator wrapper = CharArrayIterator.newWordInstance();
+
+	  internal int sentenceStart;
+	  internal int sentenceEnd;
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+	  /// <summary>
+	  /// Creates a new ThaiTokenizer </summary>
+	  public ThaiTokenizer(Reader reader) : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
+	  public ThaiTokenizer(AttributeFactory factory, Reader reader) : base(factory, reader, (BreakIterator)sentenceProto.clone())
+	  {
+		if (!DBBI_AVAILABLE)
+		{
+		  throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
+		}
+		wordBreaker = (BreakIterator)proto.clone();
+	  }
+
+	  protected internal override void setNextSentence(int sentenceStart, int sentenceEnd)
+	  {
+		this.sentenceStart = sentenceStart;
+		this.sentenceEnd = sentenceEnd;
+		wrapper.setText(buffer, sentenceStart, sentenceEnd - sentenceStart);
+		wordBreaker.Text = wrapper;
+	  }
+
+	  protected internal override bool incrementWord()
+	  {
+		int start = wordBreaker.current();
+		if (start == BreakIterator.DONE)
+		{
+		  return false; // BreakIterator exhausted
+		}
+
+		// find the next set of boundaries, skipping over non-tokens
+		int end_Renamed = wordBreaker.next();
+		// assumes a Java-style Character helper for code-point access (not yet part of this raw port)
+		while (end_Renamed != BreakIterator.DONE && !Character.isLetterOrDigit(Character.codePointAt(buffer, sentenceStart + start, sentenceEnd)))
+		{
+		  start = end_Renamed;
+		  end_Renamed = wordBreaker.next();
+		}
+
+		if (end_Renamed == BreakIterator.DONE)
+		{
+		  return false; // BreakIterator exhausted
+		}
+
+		clearAttributes();
+		termAtt.copyBuffer(buffer, sentenceStart + start, end_Renamed - start);
+		offsetAtt.setOffset(correctOffset(offset + sentenceStart + start), correctOffset(offset + sentenceStart + end_Renamed));
+		return true;
+	  }
+	}
+
+}
\ No newline at end of file
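
Since the constructor throws NotSupportedException when no dictionary-based Thai BreakIterator is
available, callers are expected to gate on DBBI_AVAILABLE first. A hedged sketch (reader stands in
for the java.io.Reader-style source this raw port still expects):

    if (ThaiTokenizer.DBBI_AVAILABLE)
    {
        Tokenizer tokenizer = new ThaiTokenizer(reader); // sentence-splits, then word-breaks each sentence
        // ... consume tokens via incrementToken() ...
    }
    else
    {
        // fall back to ICUTokenizer, as the class documentation suggests
    }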

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs
new file mode 100644
index 0000000..97ba897
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs
@@ -0,0 +1,56 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.th
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenizerFactory = org.apache.lucene.analysis.util.TokenizerFactory;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+	/// <summary>
+	/// Factory for <seealso cref="ThaiTokenizer"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_thai" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.ThaiTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class ThaiTokenizerFactory : TokenizerFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new ThaiTokenizerFactory </summary>
+	  public ThaiTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  public override Tokenizer create(AttributeSource.AttributeFactory factory, Reader reader)
+	  {
+		return new ThaiTokenizer(factory, reader);
+	  }
+	}
+
+
+}
\ No newline at end of file
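
The args dictionary is consumed by the base TokenizerFactory constructor, so any key left over is
unknown and triggers the ArgumentException above. A minimal sketch of direct instantiation (normally
Solr builds the factory from the fieldType XML shown in the doc comment):

    var args = new Dictionary<string, string>(); // ThaiTokenizerFactory accepts no extra parameters
    var factory = new ThaiTokenizerFactory(args);
    // factory.create(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
    // then returns a new ThaiTokenizer over the given reader.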

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
new file mode 100644
index 0000000..ae7fa96
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -0,0 +1,172 @@
+using System;
+
+namespace org.apache.lucene.analysis.th
+{
+
+	/// <summary>
+	/// Copyright 2006 The Apache Software Foundation
+	/// 
+	/// Licensed under the Apache License, Version 2.0 (the "License");
+	/// you may not use this file except in compliance with the License.
+	/// You may obtain a copy of the License at
+	/// 
+	///     http://www.apache.org/licenses/LICENSE-2.0
+	/// 
+	/// Unless required by applicable law or agreed to in writing, software
+	/// distributed under the License is distributed on an "AS IS" BASIS,
+	/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	/// See the License for the specific language governing permissions and
+	/// limitations under the License.
+	/// </summary>
+
+	using UnicodeBlock = Character.UnicodeBlock;
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using CharArrayIterator = org.apache.lucene.analysis.util.CharArrayIterator;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// <seealso cref="TokenFilter"/> that use <seealso cref="java.text.BreakIterator"/> to break each 
+	/// Token that is Thai into separate Token(s) for each Thai word.
+	/// <para>Please note: Since matchVersion 3.1 on, this filter no longer lowercases non-thai text.
+	/// <seealso cref="ThaiAnalyzer"/> will insert a <seealso cref="LowerCaseFilter"/> before this filter
+	/// so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
+	/// position increments correctly.
+	/// </para>
+	/// <para>WARNING: this filter may not be supported by all JREs.
+	///    It is known to work with Sun/Oracle and Harmony JREs.
+	///    If your application needs to be fully portable, consider using ICUTokenizer instead,
+	///    which uses an ICU Thai BreakIterator that will always be available.
+	/// </para>
+	/// </summary>
+	/// @deprecated Use <seealso cref="ThaiTokenizer"/> instead. 
+	[Obsolete("Use <seealso cref="ThaiTokenizer"/> instead.")]
+	public sealed class ThaiWordFilter : TokenFilter
+	{
+	  /// <summary>
+	  /// True if the JRE supports a working dictionary-based breakiterator for Thai.
+	  /// If this is false, this filter will not work at all!
+	  /// </summary>
+	  public static readonly bool DBBI_AVAILABLE = ThaiTokenizer.DBBI_AVAILABLE;
+	  private static readonly BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
+	  private readonly BreakIterator breaker = (BreakIterator) proto.clone();
+	  private readonly CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
+
+	  private readonly bool handlePosIncr;
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+	  private readonly PositionIncrementAttribute posAtt = addAttribute(typeof(PositionIncrementAttribute));
+
+	  private AttributeSource clonedToken = null;
+	  private CharTermAttribute clonedTermAtt = null;
+	  private OffsetAttribute clonedOffsetAtt = null;
+	  private bool hasMoreTokensInClone = false;
+	  private bool hasIllegalOffsets = false; // only if the length changed before this filter
+
+	  /// <summary>
+	  /// Creates a new ThaiWordFilter with the specified match version. </summary>
+	  public ThaiWordFilter(Version matchVersion, TokenStream input) : base(matchVersion.onOrAfter(Version.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
+	  {
+		if (!DBBI_AVAILABLE)
+		{
+		  throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
+		}
+		handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		if (hasMoreTokensInClone)
+		{
+		  int start = breaker.current();
+		  int end = breaker.next();
+		  if (end != BreakIterator.DONE)
+		  {
+			clonedToken.copyTo(this);
+			termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
+			if (hasIllegalOffsets)
+			{
+			  offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+			}
+			else
+			{
+			  offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+			}
+			if (handlePosIncr)
+			{
+				posAtt.PositionIncrement = 1;
+			}
+			return true;
+		  }
+		  hasMoreTokensInClone = false;
+		}
+
+		if (!input.incrementToken())
+		{
+		  return false;
+		}
+
+		if (termAtt.length() == 0 || UnicodeBlock.of(termAtt.charAt(0)) != UnicodeBlock.THAI)
+		{
+		  return true;
+		}
+
+		hasMoreTokensInClone = true;
+
+		// if the length implied by the start and end offsets doesn't match the
+		// term text's length, assume this token is a synonym and don't adjust the offsets.
+		hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
+
+		// we lazy init the cloned token, as in ctor not all attributes may be added
+		if (clonedToken == null)
+		{
+		  clonedToken = cloneAttributes();
+		  clonedTermAtt = clonedToken.getAttribute(typeof(CharTermAttribute));
+		  clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
+		}
+		else
+		{
+		  this.copyTo(clonedToken);
+		}
+
+		// reinit CharacterIterator
+		charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
+		breaker.Text = charIterator;
+		int end = breaker.next();
+		if (end != BreakIterator.DONE)
+		{
+		  termAtt.Length = end;
+		  if (hasIllegalOffsets)
+		  {
+			offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+		  }
+		  else
+		  {
+			offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+		  }
+		  // the position increment is kept as-is for the first token
+		  return true;
+		}
+		return false;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		hasMoreTokensInClone = false;
+		clonedToken = null;
+		clonedTermAtt = null;
+		clonedOffsetAtt = null;
+	  }
+	}
+
+}
\ No newline at end of file
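
The hasIllegalOffsets flag above deserves a concrete illustration: when an upstream stage (a synonym
filter, say) emits a token whose offset span no longer equals its text length, slicing offsets per
sub-word would corrupt them, so every sub-word keeps the original span. Schematically (values are
hypothetical):

    // normal token:   term "การทำงาน" (8 chars), offsets [10,18) -> span 8 == length 8
    //   sub-words get sliced offsets, e.g. "การ" [10,13) and "ทำงาน" [13,18)
    // injected token: term "การทำงาน" (8 chars), offsets [10,12) -> span 2 != length 8
    //   hasIllegalOffsets = true, so every sub-word keeps [10,12)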

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs
new file mode 100644
index 0000000..0fa779c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs
@@ -0,0 +1,59 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.th
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="ThaiWordFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_thai" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.ThaiWordFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre> </summary>
+	/// @deprecated Use <seealso cref="ThaiTokenizerFactory"/> instead 
+	[Obsolete("Use <seealso cref="ThaiTokenizerFactory"/> instead")]
+	public class ThaiWordFilterFactory : TokenFilterFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new ThaiWordFilterFactory </summary>
+	  public ThaiWordFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new ThaiWordFilter(luceneMatchVersion, input);
+	  }
+	}
+
+
+}
\ No newline at end of file
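
Unlike ThaiTokenizerFactory, this factory calls assureMatchVersion(), so the configuration must
supply a luceneMatchVersion entry; the base constructor consumes that key before the
leftover-argument check runs. A minimal sketch (direct instantiation, for illustration):

    var args = new Dictionary<string, string> { { "luceneMatchVersion", "4.8" } };
    var factory = new ThaiWordFilterFactory(args); // would throw without luceneMatchVersion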

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilter.cs
new file mode 100644
index 0000000..3c2d66d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilter.cs
@@ -0,0 +1,70 @@
+namespace org.apache.lucene.analysis.tr
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+	/// <summary>
+	/// Strips all characters after an apostrophe (including the apostrophe itself).
+	/// <para>
+	/// In Turkish, the apostrophe is used to separate suffixes from proper names
+	/// (continents, seas, rivers, lakes, mountains, uplands, and proper names related
+	/// to religion and mythology). This filter is intended to be used before stem filters.
+	/// For more information, see <a href="http://www.ipcsit.com/vol57/015-ICNI2012-M021.pdf">
+	/// Role of Apostrophes in Turkish Information Retrieval</a>
+	/// </para>
+	/// </summary>
+	public sealed class ApostropheFilter : TokenFilter
+	{
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+
+	  public ApostropheFilter(TokenStream @in) : base(@in)
+	  {
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		if (!input.incrementToken())
+		{
+		  return false;
+		}
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final char[] buffer = termAtt.buffer();
+		char[] buffer = termAtt.buffer();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int length = termAtt.length();
+		int length = termAtt.length();
+
+		for (int i = 0; i < length; i++)
+		{
+		  if (buffer[i] == '\'' || buffer[i] == '\u2019')
+		  {
+			termAtt.Length = i;
+			return true;
+		  }
+		}
+		return true;
+	  }
+	}
+
+}
\ No newline at end of file
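
The loop above truncates the term at the first ASCII apostrophe (U+0027) or right single quotation
mark (U+2019) and passes everything else through unchanged. The same rule on a plain string, as a
standalone sketch (the example word is illustrative):

    static string StripApostrophe(string token)
    {
        for (int i = 0; i < token.Length; i++)
        {
            if (token[i] == '\'' || token[i] == '\u2019')
            {
                return token.Substring(0, i); // drop the apostrophe and the suffix after it
            }
        }
        return token;
    }
    // StripApostrophe("Türkiye'den") -> "Türkiye"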

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilterFactory.cs
new file mode 100644
index 0000000..b3e0fea
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/ApostropheFilterFactory.cs
@@ -0,0 +1,52 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.tr
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="ApostropheFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_tr_lower_apostrophes" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.ApostropheFilterFactory"/&gt;
+	///     &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class ApostropheFilterFactory : TokenFilterFactory
+	{
+
+	  public ApostropheFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameter(s): " + args);
+		}
+	  }
+
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new ApostropheFilter(input);
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishAnalyzer.cs
new file mode 100644
index 0000000..836782a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishAnalyzer.cs
@@ -0,0 +1,145 @@
+using System;
+using System.IO;
+
+namespace org.apache.lucene.analysis.tr
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+	using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+	using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+	using Version = org.apache.lucene.util.Version;
+	using TurkishStemmer = org.tartarus.snowball.ext.TurkishStemmer;
+
+	/// <summary>
+	/// <seealso cref="Analyzer"/> for Turkish.
+	/// </summary>
+	public sealed class TurkishAnalyzer : StopwordAnalyzerBase
+	{
+	  private readonly CharArraySet stemExclusionSet;
+
+	  /// <summary>
+	  /// File containing default Turkish stopwords. </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+	  /// <summary>
+	  /// The comment character in the stopwords file.  
+	  /// All lines prefixed with this will be ignored.
+	  /// </summary>
+	  private const string STOPWORDS_COMMENT = "#";
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop words set. </summary>
+	  /// <returns> default stop words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+	  /// accesses the static final set the first time.
+	  /// </summary>
+	  private class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			DEFAULT_STOP_SET = loadStopwordSet(false, typeof(TurkishAnalyzer), DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+		  }
+		  catch (IOException)
+		  {
+			// default set should always be present as it is part of the
+			// distribution (JAR)
+			throw new Exception("Unable to load default stopword set");
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
+	  /// </summary>
+	  public TurkishAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+	  /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
+	  /// stemming.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
+	  public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+	  {
+		this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+	  }
+
+	  /// <summary>
+	  /// Creates a
+	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
+	  /// </summary>
+	  /// <returns> A
+	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  ///         built from a <seealso cref="StandardTokenizer"/> filtered with
+	  ///         <seealso cref="StandardFilter"/>, <seealso cref="TurkishLowerCaseFilter"/>,
+	  ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> (if a stem
+	  ///         exclusion set is provided), and <seealso cref="SnowballFilter"/>. </returns>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
+		Tokenizer source = new StandardTokenizer(matchVersion, reader);
+		TokenStream result = new StandardFilter(matchVersion, source);
+		if (matchVersion.onOrAfter(Version.LUCENE_48))
+		{
+		  result = new ApostropheFilter(result);
+		}
+		result = new TurkishLowerCaseFilter(result);
+		result = new StopFilter(matchVersion, result, stopwords);
+		if (!stemExclusionSet.Empty)
+		{
+		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+		}
+		result = new SnowballFilter(result, new TurkishStemmer());
+		return new TokenStreamComponents(source, result);
+	  }
+	}
+
+}
\ No newline at end of file
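
Note the ordering in createComponents above: on 4.8+ the ApostropheFilter runs before
TurkishLowerCaseFilter, so apostrophe suffixes are stripped before the Turkish dotted/dotless-i
lowercasing rewrites the term, and stopword removal then sees the lowercased form. The resulting
chain, summarized (comment form only, since the surrounding raw port does not yet compile):

    // StandardTokenizer -> StandardFilter
    //   [-> ApostropheFilter       if matchVersion >= LUCENE_48]
    //   -> TurkishLowerCaseFilter -> StopFilter
    //   [-> SetKeywordMarkerFilter if stemExclusionSet is non-empty]
    //   -> SnowballFilter(new TurkishStemmer())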

