lucenenet-commits mailing list archives

From lai...@apache.org
Subject lucenenet git commit: closes apache/lucenenet#157
Date Wed, 06 Jan 2016 03:09:06 GMT
Repository: lucenenet
Updated Branches:
  refs/heads/master 8cef92b2f -> be39dfd47


closes apache/lucenenet#157


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/be39dfd4
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/be39dfd4
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/be39dfd4

Branch: refs/heads/master
Commit: be39dfd47f8cbb97249571bc22fee278cbb680fd
Parents: 8cef92b
Author: Laimonas Simutis <laimis@gmail.com>
Authored: Tue Jan 5 22:08:09 2016 -0500
Committer: Laimonas Simutis <laimis@gmail.com>
Committed: Tue Jan 5 22:08:09 2016 -0500

----------------------------------------------------------------------
 .../Analysis/Util/CharArrayIterator.cs          | 478 ++++++++-----------
 .../Analysis/Util/CharacterIterator.cs          |  19 +
 .../Analysis/Util/ICharacterIterator.cs         |  17 +
 .../Analysis/Util/SegmentingTokenizerBase.cs    | 468 +++++++++---------
 .../Lucene.Net.Analysis.Common.csproj           |  60 +++
 src/Lucene.Net.Analysis.Common/packages.config  |   4 +
 .../Analysis/Util/TestCharArrayIterator.cs      | 354 +++++++-------
 .../Analysis/Util/TestRollingCharBuffer.cs      | 245 +++++-----
 .../Util/TestSegmentingTokenizerBase.cs         | 472 +++++++++---------
 .../Lucene.Net.Tests.Analysis.Common.csproj     |  58 +++
 .../packages.config                             |   4 +
 11 files changed, 1127 insertions(+), 1052 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs
index b4f4b96..7d3820a 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs
@@ -1,276 +1,212 @@
 using System;
+using ICU4NET;
 
-namespace org.apache.lucene.analysis.util
+namespace Lucene.Net.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-
-	/// <summary>
-	/// A CharacterIterator used internally for use with <seealso cref="BreakIterator"/>
-	/// @lucene.internal
-	/// </summary>
-	public abstract class CharArrayIterator //: CharacterIterator
-	{
-	  private char[] array;
-	  private int start;
-	  private int index;
-	  private int length;
-	  private int limit;
-
-	  public virtual char [] Text
-	  {
-		  get
-		  {
-			return array;
-		  }
-	  }
-
-	  public virtual int Start
-	  {
-		  get
-		  {
-			return start;
-		  }
-	  }
-
-	  public virtual int Length
-	  {
-		  get
-		  {
-			return length;
-		  }
-	  }
-
-	  /// <summary>
-	  /// Set a new region of text to be examined by this iterator
-	  /// </summary>
-	  /// <param name="array"> text buffer to examine </param>
-	  /// <param name="start"> offset into buffer </param>
-	  /// <param name="length"> maximum length to examine </param>
-	  public virtual void setText(char[] array, int start, int length)
-	  {
-		this.array = array;
-		this.start = start;
-		this.index = start;
-		this.length = length;
-		this.limit = start + length;
-	  }
-
-	  public override char Current()
-	  {
-		return (index == limit) ? DONE : jreBugWorkaround(array[index]);
-	  }
-
-	  protected internal abstract char jreBugWorkaround(char ch);
-
-	  public override char First()
-	  {
-		index = start;
-		return Current();
-	  }
-
-	  public override int BeginIndex
-	  {
-		  get
-		  {
-			return 0;
-		  }
-	  }
-
-	  public override int EndIndex
-	  {
-		  get
-		  {
-			return length;
-		  }
-	  }
-
-	  public override int Index
-	  {
-		  get
-		  {
-			return index - start;
-		  }
-	  }
-
-	  public override char Last()
-	  {
-		index = (limit == start) ? limit : limit - 1;
-		return current();
-	  }
-
-	  public override char Next()
-	  {
-		if (++index >= limit)
-		{
-		  index = limit;
-		  return DONE;
-		}
-		else
-		{
-		  return current();
-		}
-	  }
-
-	  public override char Previous()
-	  {
-		if (--index < start)
-		{
-		  index = start;
-		  return DONE;
-		}
-		else
-		{
-		  return current();
-		}
-	  }
-
-	  public override char SetIndex(int position)
-	  {
-		if (position < BeginIndex || position > EndIndex)
-		{
-		  throw new System.ArgumentException("Illegal Position: " + position);
-		}
-		index = start + position;
-		return current();
-	  }
-
-	  public override CharArrayIterator Clone()
-	  {
-		try
-		{
-		  return (CharArrayIterator)base.clone();
-		}
-		catch (CloneNotSupportedException e)
-		{
-		  // CharacterIterator does not allow you to throw CloneNotSupported
-		  throw new Exception(e);
-		}
-	  }
-
-	  /// <summary>
-	  /// Create a new CharArrayIterator that works around JRE bugs
-	  /// in a manner suitable for <seealso cref="BreakIterator#getSentenceInstance()"/>
-	  /// </summary>
-	  public static CharArrayIterator newSentenceInstance()
-	  {
-		if (HAS_BUGGY_BREAKITERATORS)
-		{
-		  return new CharArrayIteratorAnonymousInnerClassHelper();
-		}
-		else
-		{
-		  return new CharArrayIteratorAnonymousInnerClassHelper2();
-		}
-	  }
-
-	  private class CharArrayIteratorAnonymousInnerClassHelper : CharArrayIterator
-	  {
-		  public CharArrayIteratorAnonymousInnerClassHelper()
-		  {
-		  }
-
-			  // work around this for now by lying about all surrogates to 
-			  // the sentence tokenizer, instead we treat them all as 
-			  // SContinue so we won't break around them.
-		  protected internal override char jreBugWorkaround(char ch)
-		  {
-			return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
-		  }
-	  }
-
-	  private class CharArrayIteratorAnonymousInnerClassHelper2 : CharArrayIterator
-	  {
-		  public CharArrayIteratorAnonymousInnerClassHelper2()
-		  {
-		  }
-
-			  // no bugs
-		  protected internal override char jreBugWorkaround(char ch)
-		  {
-			return ch;
-		  }
-	  }
-
-	  /// <summary>
-	  /// Create a new CharArrayIterator that works around JRE bugs
-	  /// in a manner suitable for <seealso cref="BreakIterator#getWordInstance()"/>
-	  /// </summary>
-	  public static CharArrayIterator newWordInstance()
-	  {
-		if (HAS_BUGGY_BREAKITERATORS)
-		{
-		  return new CharArrayIteratorAnonymousInnerClassHelper3();
-		}
-		else
-		{
-		  return new CharArrayIteratorAnonymousInnerClassHelper4();
-		}
-	  }
-
-	  private class CharArrayIteratorAnonymousInnerClassHelper3 : CharArrayIterator
-	  {
-		  public CharArrayIteratorAnonymousInnerClassHelper3()
-		  {
-		  }
-
-			  // work around this for now by lying about all surrogates to the word, 
-			  // instead we treat them all as ALetter so we won't break around them.
-		  protected internal override char jreBugWorkaround(char ch)
-		  {
-			return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
-		  }
-	  }
-
-	  private class CharArrayIteratorAnonymousInnerClassHelper4 : CharArrayIterator
-	  {
-		  public CharArrayIteratorAnonymousInnerClassHelper4()
-		  {
-		  }
-
-			  // no bugs
-		  protected internal override char jreBugWorkaround(char ch)
-		  {
-			return ch;
-		  }
-	  }
-
-	  /// <summary>
-	  /// True if this JRE has a buggy BreakIterator implementation
-	  /// </summary>
-	  public static readonly bool HAS_BUGGY_BREAKITERATORS;
-	  static CharArrayIterator()
-	  {
-		bool v;
-		try
-		{
-		  BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
-		  bi.Text = "\udb40\udc53";
-		  bi.next();
-		  v = false;
-		}
-		catch (Exception)
-		{
-		  v = true;
-		}
-		HAS_BUGGY_BREAKITERATORS = v;
-	  }
-	}
-
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+
+    /// <summary>
+    /// A CharacterIterator used internally for use with <seealso cref="BreakIterator"/>
+    /// @lucene.internal
+    /// </summary>
+    public abstract class CharArrayIterator : CharacterIterator
+    {
+        private char[] array;
+        private int start;
+        private int index;
+        private int length;
+        private int limit;
+
+        public virtual char[] Text
+        {
+            get
+            {
+                return array;
+            }
+        }
+
+        public virtual int Start
+        {
+            get
+            {
+                return start;
+            }
+        }
+
+        public virtual int Length
+        {
+            get
+            {
+                return length;
+            }
+        }
+
+        /// <summary>
+        /// Set a new region of text to be examined by this iterator
+        /// </summary>
+        /// <param name="array"> text buffer to examine </param>
+        /// <param name="start"> offset into buffer </param>
+        /// <param name="length"> maximum length to examine </param>
+        public virtual void SetText(char[] array, int start, int length)
+        {
+            this.array = array;
+            this.start = start;
+            this.index = start;
+            this.length = length;
+            this.limit = start + length;
+        }
+
+        public override char Current()
+        {
+            return (index == limit) ? DONE : JreBugWorkaround(array[index]);
+        }
+
+        protected internal abstract char JreBugWorkaround(char ch);
+
+        public override char First()
+        {
+            index = start;
+            return Current();
+        }
+
+        public int BeginIndex
+        {
+            get
+            {
+                return 0;
+            }
+        }
+
+        public int EndIndex
+        {
+            get
+            {
+                return length;
+            }
+        }
+
+        public int Index
+        {
+            get
+            {
+                return index - start;
+            }
+        }
+
+        public override int GetBeginIndex()
+        {
+            return 0;
+        }
+
+        public override int GetEndIndex()
+        {
+            return length;
+        }
+
+        public override int GetIndex()
+        {
+            return index - start;
+        }
+
+
+        public override char Last()
+        {
+            index = (limit == start) ? limit : limit - 1;
+            return Current();
+        }
+
+        public override char Next()
+        {
+            if (++index >= limit)
+            {
+                index = limit;
+                return DONE;
+            }
+            else
+            {
+                return Current();
+            }
+        }
+
+        public override char Previous()
+        {
+            if (--index < start)
+            {
+                index = start;
+                return DONE;
+            }
+            else
+            {
+                return Current();
+            }
+        }
+
+        public override char SetIndex(int position)
+        {
+            if (position < BeginIndex || position > EndIndex)
+            {
+                throw new ArgumentException("Illegal Position: " + position);
+            }
+            index = start + position;
+            return Current();
+        }
+
+        public override object Clone()
+        {
+            return this.MemberwiseClone();
+        }
+
+        /// <summary>
+        /// Create a new CharArrayIterator that works around JRE bugs
+        /// in a manner suitable for <seealso cref="BreakIterator#getSentenceInstance()"/>
+        /// </summary>
+        public static CharArrayIterator NewSentenceInstance()
+        {
+            return new CharArrayIteratorAnonymousInnerClassHelper2();
+        }
+
+        private class CharArrayIteratorAnonymousInnerClassHelper2 : CharArrayIterator
+        {
+            // no bugs
+            protected internal override char JreBugWorkaround(char ch)
+            {
+                return ch;
+            }
+        }
+
+        /// <summary>
+        /// Create a new CharArrayIterator that works around JRE bugs
+        /// in a manner suitable for <seealso cref="BreakIterator#getWordInstance()"/>
+        /// </summary>
+        public static CharArrayIterator NewWordInstance()
+        {
+            return new CharArrayIteratorAnonymousInnerClassHelper4();
+        }
+
+        private class CharArrayIteratorAnonymousInnerClassHelper4 : CharArrayIterator
+        {
+            // no bugs
+            protected internal override char JreBugWorkaround(char ch)
+            {
+                return ch;
+            }
+        }
+    }
 }
\ No newline at end of file
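
For readers following the port: the new CharArrayIterator is meant to be driven alongside ICU4NET's BreakIterator, as the ported tests further down do. Below is a minimal sketch of that pairing, using only calls that appear in this commit (CharArrayIterator.NewWordInstance/SetText and CharacterIterator.DONE on the Lucene side; BreakIterator.CreateWordInstance, SetText, Next and BreakIterator.DONE on the ICU4NET side); the console-program scaffolding around them is assumed, not part of the commit.

using System;
using ICU4NET;                      // BreakIterator, Locale (as referenced by this commit)
using Lucene.Net.Analysis.Util;     // CharArrayIterator, CharacterIterator

class BreakIteratorSketch
{
    static void Main()
    {
        char[] text = "Two words".ToCharArray();

        // Lucene-side view of the char[] region, as analysis code sees it.
        CharArrayIterator ci = CharArrayIterator.NewWordInstance();
        ci.SetText(text, 0, text.Length);

        // ICU4NET-side boundary scan over a string view of the same text.
        BreakIterator bi = BreakIterator.CreateWordInstance(Locale.GetUS());
        bi.SetText(new string(text));

        int boundary;
        while ((boundary = bi.Next()) != BreakIterator.DONE)
        {
            Console.WriteLine("word boundary at " + boundary);
        }

        // The char-level iterator can be walked independently.
        for (char c = ci.First(); c != CharacterIterator.DONE; c = ci.Next())
        {
            Console.Write(c);
        }
    }
}

The split mirrors SegmentingTokenizerBase.Refill in this commit: the CharArrayIterator wraps the char[] buffer for Lucene, while ICU4NET's BreakIterator is handed a string built from that same region.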

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterIterator.cs
new file mode 100644
index 0000000..aae2e1d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterIterator.cs
@@ -0,0 +1,19 @@
+namespace Lucene.Net.Analysis.Util
+{
+    public abstract class CharacterIterator : ICharacterIterator
+    {
+        public static readonly char DONE = '\uFFFF';
+
+        public abstract char First();
+        public abstract char Last();
+        public abstract char Current();
+        public abstract char Next();
+        public abstract char Previous();
+        public abstract char SetIndex(int position);
+        public abstract int GetBeginIndex();
+        public abstract int GetEndIndex();
+        public abstract int GetIndex();
+
+        public abstract object Clone();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/Analysis/Util/ICharacterIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/ICharacterIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/ICharacterIterator.cs
new file mode 100644
index 0000000..d12eeb2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/ICharacterIterator.cs
@@ -0,0 +1,17 @@
+using System;
+
+namespace Lucene.Net.Analysis.Util
+{
+    public interface ICharacterIterator : ICloneable
+    {
+        char First();
+        char Last();
+        char Current();
+        char Next();
+        char Previous();
+        char SetIndex(int position);
+        int GetBeginIndex();
+        int GetEndIndex();
+        int GetIndex();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
index 49fb6c4..70b7b5f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs
@@ -1,247 +1,249 @@
 using System;
 using System.Diagnostics;
 using System.IO;
+using ICU4NET;
 using Lucene.Net.Analysis.Tokenattributes;
-using org.apache.lucene.analysis.util;
 using Reader = System.IO.TextReader;
 using Version = Lucene.Net.Util.LuceneVersion;
 
 namespace Lucene.Net.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-
-	/// <summary>
-	/// Breaks text into sentences with a <seealso cref="BreakIterator"/> and
-	/// allows subclasses to decompose these sentences into words.
-	/// <para>
-	/// This can be used by subclasses that need sentence context 
-	/// for tokenization purposes, such as CJK segmenters.
-	/// </para>
-	/// <para>
-	/// Additionally it can be used by subclasses that want to mark
-	/// sentence boundaries (with a custom attribute, extra token, position
-	/// increment, etc) for downstream processing.
-	/// 
-	/// @lucene.experimental
-	/// </para>
-	/// </summary>
-	public abstract class SegmentingTokenizerBase : Tokenizer
-	{
-	  protected internal const int BUFFERMAX = 1024;
-	  protected internal readonly char[] buffer = new char[BUFFERMAX];
-	  /// <summary>
-	  /// true length of text in the buffer </summary>
-	  private int length = 0;
-	  /// <summary>
-	  /// length in buffer that can be evaluated safely, up to a safe end point </summary>
-	  private int usableLength = 0;
-	  /// <summary>
-	  /// accumulated offset of previous buffers for this reader, for offsetAtt </summary>
-	  protected internal int offset = 0;
-
-	  private readonly BreakIterator iterator;
-	  private readonly CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();
-
-	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
-
-	  /// <summary>
-	  /// Construct a new SegmenterBase, using
-	  /// the provided BreakIterator for sentence segmentation.
-	  /// <para>
-	  /// Note that you should never share BreakIterators across different
-	  /// TokenStreams, instead a newly created or cloned one should always
-	  /// be provided to this constructor.
-	  /// </para>
-	  /// </summary>
-	  public SegmentingTokenizerBase(Reader reader, BreakIterator iterator) : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
-	  {
-	  }
-
-	  /// <summary>
-	  /// Construct a new SegmenterBase, also supplying the AttributeFactory
-	  /// </summary>
-	  public SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator) : base(factory, reader)
-	  {
-		this.iterator = iterator;
-	  }
-
-	  public override bool IncrementToken()
-	  {
-		if (length == 0 || !IncrementWord())
-		{
-		  while (!IncrementSentence())
-		  {
-			Refill();
-			if (length <= 0) // no more bytes to read;
-			{
-			  return false;
-			}
-		  }
-		}
-
-		return true;
-	  }
-
-	  public override void Reset()
-	  {
-		base.Reset();
-		wrapper.setText(buffer, 0, 0);
-		iterator.Text = wrapper;
-		length = usableLength = offset = 0;
-	  }
-
-	  public override void End()
-	  {
-		base.End();
-		int finalOffset = CorrectOffset(length < 0 ? offset : offset + length);
-		offsetAtt.SetOffset(finalOffset, finalOffset);
-	  }
-
-	  /// <summary>
-	  /// Returns the last unambiguous break position in the text. </summary>
-	  private int FindSafeEnd()
-	  {
-		for (int i = length - 1; i >= 0; i--)
-		{
-		  if (IsSafeEnd(buffer[i]))
-		  {
-			return i + 1;
-		  }
-		}
-		return -1;
-	  }
-
-	  /// <summary>
-	  /// For sentence tokenization, these are the unambiguous break positions. </summary>
-	  protected internal virtual bool IsSafeEnd(char ch)
-	  {
-		switch ((int)ch)
-		{
-		  case 0x000D:
-		  case 0x000A:
-		  case 0x0085:
-		  case 0x2028:
-		  case 0x2029:
-			return true;
-		  default:
-			return false;
-		}
-	  }
-
-	    /// <summary>
-	    /// Refill the buffer, accumulating the offset and setting usableLength to the
-	    /// last unambiguous break position
-	    /// </summary>
-	    private void Refill()
-	    {
-	        offset += usableLength;
-	        int leftover = length - usableLength;
-	        Array.Copy(buffer, usableLength, buffer, 0, leftover);
-	        int requested = buffer.Length - leftover;
-	        int returned = Read(input, buffer, leftover, requested);
-	        length = returned < 0 ? leftover : returned + leftover;
-	        if (returned < requested) // reader has been emptied, process the rest
-	        {
-	            usableLength = length;
-	        }
-	        else // still more data to be read, find a safe-stopping place
-	        {
-	            usableLength = FindSafeEnd();
-	            if (usableLength < 0)
-	            {
-	                usableLength = length; /*
-		  }
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+
+    /// <summary>
+    /// Breaks text into sentences with a <seealso cref="BreakIterator"/> and
+    /// allows subclasses to decompose these sentences into words.
+    /// <para>
+    /// This can be used by subclasses that need sentence context 
+    /// for tokenization purposes, such as CJK segmenters.
+    /// </para>
+    /// <para>
+    /// Additionally it can be used by subclasses that want to mark
+    /// sentence boundaries (with a custom attribute, extra token, position
+    /// increment, etc) for downstream processing.
+    /// 
+    /// @lucene.experimental
+    /// </para>
+    /// </summary>
+    public abstract class SegmentingTokenizerBase : Tokenizer
+    {
+        protected internal const int BUFFERMAX = 1024;
+        protected internal readonly char[] buffer = new char[BUFFERMAX];
+        /// <summary>
+        /// true length of text in the buffer </summary>
+        private int length = 0;
+        /// <summary>
+        /// length in buffer that can be evaluated safely, up to a safe end point </summary>
+        private int usableLength = 0;
+        /// <summary>
+        /// accumulated offset of previous buffers for this reader, for offsetAtt </summary>
+        protected internal int offset = 0;
+
+        private readonly BreakIterator iterator;
+        private readonly CharArrayIterator wrapper = CharArrayIterator.NewSentenceInstance();
+
+        private readonly IOffsetAttribute offsetAtt;
+
+        /// <summary>
+        /// Construct a new SegmenterBase, using
+        /// the provided BreakIterator for sentence segmentation.
+        /// <para>
+        /// Note that you should never share BreakIterators across different
+        /// TokenStreams, instead a newly created or cloned one should always
+        /// be provided to this constructor.
+        /// </para>
+        /// </summary>
+        protected SegmentingTokenizerBase(Reader reader, BreakIterator iterator)
+            : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
+        {
+        }
+
+        /// <summary>
+        /// Construct a new SegmenterBase, also supplying the AttributeFactory
+        /// </summary>
+        protected SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator)
+            : base(factory, reader)
+        {
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            this.iterator = iterator;
+        }
+
+        public override bool IncrementToken()
+        {
+            if (length == 0 || !IncrementWord())
+            {
+                while (!IncrementSentence())
+                {
+                    Refill();
+                    if (length <= 0) // no more bytes to read;
+                    {
+                        return false;
+                    }
+                }
+            }
+
+            return true;
+        }
+
+        public override void Reset()
+        {
+            base.Reset();
+            wrapper.SetText(buffer, 0, 0);
+            iterator.SetText(new string(buffer, 0, 0));
+            length = usableLength = offset = 0;
+        }
+
+        public override void End()
+        {
+            base.End();
+            int finalOffset = CorrectOffset(length < 0 ? offset : offset + length);
+            offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        /// <summary>
+        /// Returns the last unambiguous break position in the text. </summary>
+        private int FindSafeEnd()
+        {
+            for (int i = length - 1; i >= 0; i--)
+            {
+                if (IsSafeEnd(buffer[i]))
+                {
+                    return i + 1;
+                }
+            }
+            return -1;
+        }
+
+        /// <summary>
+        /// For sentence tokenization, these are the unambiguous break positions. </summary>
+        protected internal virtual bool IsSafeEnd(char ch)
+        {
+            switch ((int)ch)
+            {
+                case 0x000D:
+                case 0x000A:
+                case 0x0085:
+                case 0x2028:
+                case 0x2029:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>
+        /// Refill the buffer, accumulating the offset and setting usableLength to the
+        /// last unambiguous break position
+        /// </summary>
+        private void Refill()
+        {
+            offset += usableLength;
+            int leftover = length - usableLength;
+            Array.Copy(buffer, usableLength, buffer, 0, leftover);
+            int requested = buffer.Length - leftover;
+            int returned = Read(input, buffer, leftover, requested);
+            length = returned < 0 ? leftover : returned + leftover;
+            if (returned < requested) // reader has been emptied, process the rest
+            {
+                usableLength = length;
+            }
+            else // still more data to be read, find a safe-stopping place
+            {
+                usableLength = FindSafeEnd();
+                if (usableLength < 0)
+                {
+                    usableLength = length; /*
 	                                * more than IOBUFFER of text without breaks,
 	                                * gonna possibly truncate tokens
 	                                */
-	            }
-
-	            wrapper.SetText(buffer, 0, Math.Max(0, usableLength));
-	            iterator.Text = wrapper;
-	        }
-	    }
-
-	    // TODO: refactor to a shared readFully somewhere
-	  // (NGramTokenizer does this too):
-	  /// <summary>
-	  /// commons-io's readFully, but without bugs if offset != 0 </summary>
-	  private static int Read(TextReader input, char[] buffer, int offset, int length)
-	  {
-		Debug.Assert(length >= 0, "length must not be negative: " + length);
-
-		int remaining = length;
-		while (remaining > 0)
-		{
-		  int location = length - remaining;
-		  int count = input.read(buffer, offset + location, remaining);
-		  if (-1 == count) // EOF
-		  {
-			break;
-		  }
-		  remaining -= count;
-		}
-		return length - remaining;
-	  }
-
-	  /// <summary>
-	  /// return true if there is a token from the buffer, or null if it is
-	  /// exhausted.
-	  /// </summary>
-	  private bool IncrementSentence()
-	  {
-		if (length == 0) // we must refill the buffer
-		{
-		  return false;
-		}
-
-		while (true)
-		{
-		  int start = iterator.Current();
-
-		  if (start == BreakIterator.DONE)
-		  {
-			return false; // BreakIterator exhausted
-		  }
-
-		  // find the next set of boundaries
-		  int end_Renamed = iterator.next();
-
-		  if (end_Renamed == BreakIterator.DONE)
-		  {
-			return false; // BreakIterator exhausted
-		  }
-
-		  setNextSentence(start, end_Renamed);
-		  if (incrementWord())
-		  {
-			return true;
-		  }
-		}
-	  }
-
-	  /// <summary>
-	  /// Provides the next input sentence for analysis </summary>
-	  protected internal abstract void SetNextSentence(int sentenceStart, int sentenceEnd);
-
-	  /// <summary>
-	  /// Returns true if another word is available </summary>
-	  protected internal abstract bool IncrementWord();
-	}
+                }
+            }
+
+            wrapper.SetText(buffer, 0, Math.Max(0, usableLength));
+            iterator.SetText(new string(wrapper.Text, 0, Math.Max(0, usableLength)));
+        }
+
+        // TODO: refactor to a shared readFully somewhere
+        // (NGramTokenizer does this too):
+        /// <summary>
+        /// commons-io's readFully, but without bugs if offset != 0 </summary>
+        private static int Read(TextReader input, char[] buffer, int offset, int length)
+        {
+            Debug.Assert(length >= 0, "length must not be negative: " + length);
+
+            int remaining = length;
+            while (remaining > 0)
+            {
+                int location = length - remaining;
+                int count = input.Read(buffer, offset + location, remaining);
+                if (count <= 0) // EOF
+                {
+                    break;
+                }
+                remaining -= count;
+            }
+            return length - remaining;
+        }
+
+        /// <summary>
+        /// return true if there is a token from the buffer, or null if it is
+        /// exhausted.
+        /// </summary>
+        private bool IncrementSentence()
+        {
+            if (length == 0) // we must refill the buffer
+            {
+                return false;
+            }
+
+            while (true)
+            {
+                int start = iterator.Current();
+
+                if (start == BreakIterator.DONE)
+                {
+                    return false; // BreakIterator exhausted
+                }
+
+                // find the next set of boundaries
+                int end_Renamed = iterator.Next();
+
+                if (end_Renamed == BreakIterator.DONE)
+                {
+                    return false; // BreakIterator exhausted
+                }
+
+                SetNextSentence(start, end_Renamed);
+                if (IncrementWord())
+                {
+                    return true;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Provides the next input sentence for analysis </summary>
+        protected internal abstract void SetNextSentence(int sentenceStart, int sentenceEnd);
+
+        /// <summary>
+        /// Returns true if another word is available </summary>
+        protected internal abstract bool IncrementWord();
+    }
 }
\ No newline at end of file
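
SegmentingTokenizerBase leaves two hooks to subclasses, SetNextSentence and IncrementWord. Below is a minimal sketch of a subclass that emits each detected sentence as a single token, in the spirit of the helper used by the ported TestSegmentingTokenizerBase; ICharTermAttribute, CopyBuffer, ClearAttributes and SetOffset are assumed from the core Lucene.Net attribute API rather than taken from this diff, so treat this as illustration only.

using ICU4NET;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Reader = System.IO.TextReader;

// Emits every sentence found by the BreakIterator as one token.
public sealed class WholeSentenceTokenizer : SegmentingTokenizerBase
{
    private readonly ICharTermAttribute termAtt;   // assumed core attribute
    private readonly IOffsetAttribute offsetAtt;
    private int sentenceStart, sentenceEnd;
    private bool hasSentence;

    public WholeSentenceTokenizer(Reader reader)
        : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    protected internal override void SetNextSentence(int start, int end)
    {
        // Boundaries are relative to the base class's buffer.
        sentenceStart = start;
        sentenceEnd = end;
        hasSentence = true;
    }

    protected internal override bool IncrementWord()
    {
        if (!hasSentence)
        {
            return false;           // sentence already consumed, ask for the next one
        }
        hasSentence = false;
        ClearAttributes();
        termAtt.CopyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
        offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart),
                            CorrectOffset(offset + sentenceEnd));
        return true;
    }
}

The base class drives the loop: IncrementToken refills the buffer, asks the BreakIterator for the next sentence, hands the boundaries to SetNextSentence, and then calls IncrementWord until it returns false.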

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
index ff84887..84b1c6a 100644
--- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
+++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
@@ -30,14 +30,29 @@
     <WarningLevel>4</WarningLevel>
   </PropertyGroup>
   <ItemGroup>
+    <Reference Include="ICU4NET, Version=1.0.5593.31013, Culture=neutral, processorArchitecture=x86">
+      <HintPath>..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\lib\net45\ICU4NET.dll</HintPath>
+      <Private>True</Private>
+    </Reference>
+    <Reference Include="ICU4NETExtension, Version=1.0.0.0, Culture=neutral, processorArchitecture=x86">
+      <HintPath>..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\lib\net45\ICU4NETExtension.dll</HintPath>
+      <Private>True</Private>
+    </Reference>
     <Reference Include="System" />
     <Reference Include="System.Core" />
     <Reference Include="Microsoft.CSharp" />
   </ItemGroup>
   <ItemGroup>
+    <Compile Include="Analysis\Util\CharacterIterator.cs" />
     <Compile Include="Analysis\Util\CharacterUtils.cs">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="Analysis\Util\CharArrayIterator.cs">
+      <SubType>Code</SubType>
+    </Compile>
+    <Compile Include="Analysis\Util\ICharacterIterator.cs" />
+    <Compile Include="Analysis\Util\RollingCharBuffer.cs" />
+    <Compile Include="Analysis\Util\SegmentingTokenizerBase.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
   </ItemGroup>
   <ItemGroup>
@@ -46,6 +61,51 @@
       <Name>Lucene.Net</Name>
     </ProjectReference>
   </ItemGroup>
+  <ItemGroup>
+    <None Include="packages.config" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icudt55.dll">
+      <Link>icudt55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuin55.dll">
+      <Link>icuin55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuio55.dll">
+      <Link>icuio55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icule55.dll">
+      <Link>icule55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\iculx55.dll">
+      <Link>iculx55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icutu55.dll">
+      <Link>icutu55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuuc55.dll">
+      <Link>icuuc55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
        Other similar extension points exist, see Microsoft.Common.targets.

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Analysis.Common/packages.config
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/packages.config b/src/Lucene.Net.Analysis.Common/packages.config
new file mode 100644
index 0000000..7685c3b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/packages.config
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<packages>
+  <package id="ICU4NET-ICU4C55.1-bin32" version="1.0.0" targetFramework="net451" />
+</packages>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArrayIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArrayIterator.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArrayIterator.cs
index e0f13e1..417f32b 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArrayIterator.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArrayIterator.cs
@@ -1,181 +1,185 @@
 using System;
+using ICU4NET;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using CharacterIterator = Lucene.Net.Analysis.Util.CharacterIterator;
 
-namespace org.apache.lucene.analysis.util
+namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-
-	using LuceneTestCase = org.apache.lucene.util.LuceneTestCase;
-	using TestUtil = org.apache.lucene.util.TestUtil;
-
-	public class TestCharArrayIterator : LuceneTestCase
-	{
-
-	  public virtual void testWordInstance()
-	  {
-		doTests(CharArrayIterator.newWordInstance());
-	  }
-
-	  public virtual void testConsumeWordInstance()
-	  {
-		// we use the default locale, as its randomized by LuceneTestCase
-		BreakIterator bi = BreakIterator.getWordInstance(Locale.Default);
-		CharArrayIterator ci = CharArrayIterator.newWordInstance();
-		for (int i = 0; i < 10000; i++)
-		{
-		  char[] text = TestUtil.randomUnicodeString(random()).toCharArray();
-		  ci.setText(text, 0, text.Length);
-		  consume(bi, ci);
-		}
-	  }
-
-	  /* run this to test if your JRE is buggy
-	  public void testWordInstanceJREBUG() {
-	    // we use the default locale, as its randomized by LuceneTestCase
-	    BreakIterator bi = BreakIterator.getWordInstance(Locale.getDefault());
-	    Segment ci = new Segment();
-	    for (int i = 0; i < 10000; i++) {
-	      char text[] = TestUtil.randomUnicodeString(random).toCharArray();
-	      ci.array = text;
-	      ci.offset = 0;
-	      ci.count = text.length;
-	      consume(bi, ci);
-	    }
-	  }
-	  */
-
-	  public virtual void testSentenceInstance()
-	  {
-		doTests(CharArrayIterator.newSentenceInstance());
-	  }
-
-	  public virtual void testConsumeSentenceInstance()
-	  {
-		// we use the default locale, as its randomized by LuceneTestCase
-		BreakIterator bi = BreakIterator.getSentenceInstance(Locale.Default);
-		CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
-		for (int i = 0; i < 10000; i++)
-		{
-		  char[] text = TestUtil.randomUnicodeString(random()).toCharArray();
-		  ci.setText(text, 0, text.Length);
-		  consume(bi, ci);
-		}
-	  }
-
-	  /* run this to test if your JRE is buggy
-	  public void testSentenceInstanceJREBUG() {
-	    // we use the default locale, as its randomized by LuceneTestCase
-	    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault());
-	    Segment ci = new Segment();
-	    for (int i = 0; i < 10000; i++) {
-	      char text[] = TestUtil.randomUnicodeString(random).toCharArray();
-	      ci.array = text;
-	      ci.offset = 0;
-	      ci.count = text.length;
-	      consume(bi, ci);
-	    }
-	  }
-	  */
-
-	  private void doTests(CharArrayIterator ci)
-	  {
-		// basics
-		ci.setText("testing".ToCharArray(), 0, "testing".Length);
-		assertEquals(0, ci.BeginIndex);
-		assertEquals(7, ci.EndIndex);
-		assertEquals(0, ci.Index);
-		assertEquals('t', ci.current());
-		assertEquals('e', ci.next());
-		assertEquals('g', ci.last());
-		assertEquals('n', ci.previous());
-		assertEquals('t', ci.first());
-		assertEquals(CharacterIterator.DONE, ci.previous());
-
-		// first()
-		ci.setText("testing".ToCharArray(), 0, "testing".Length);
-		ci.next();
-		// Sets the position to getBeginIndex() and returns the character at that position. 
-		assertEquals('t', ci.first());
-		assertEquals(ci.BeginIndex, ci.Index);
-		// or DONE if the text is empty
-		ci.setText(new char[] {}, 0, 0);
-		assertEquals(CharacterIterator.DONE, ci.first());
-
-		// last()
-		ci.setText("testing".ToCharArray(), 0, "testing".Length);
-		// Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
-		// and returns the character at that position. 
-		assertEquals('g', ci.last());
-		assertEquals(ci.Index, ci.EndIndex - 1);
-		// or DONE if the text is empty
-		ci.setText(new char[] {}, 0, 0);
-		assertEquals(CharacterIterator.DONE, ci.last());
-		assertEquals(ci.EndIndex, ci.Index);
-
-		// current()
-		// Gets the character at the current position (as returned by getIndex()). 
-		ci.setText("testing".ToCharArray(), 0, "testing".Length);
-		assertEquals('t', ci.current());
-		ci.last();
-		ci.next();
-		// or DONE if the current position is off the end of the text.
-		assertEquals(CharacterIterator.DONE, ci.current());
-
-		// next()
-		ci.setText("te".ToCharArray(), 0, 2);
-		// Increments the iterator's index by one and returns the character at the new index.
-		assertEquals('e', ci.next());
-		assertEquals(1, ci.Index);
-		// or DONE if the new position is off the end of the text range.
-		assertEquals(CharacterIterator.DONE, ci.next());
-		assertEquals(ci.EndIndex, ci.Index);
-
-		// setIndex()
-		ci.setText("test".ToCharArray(), 0, "test".Length);
-		try
-		{
-		  ci.Index = 5;
-		  fail();
-		}
-		catch (Exception e)
-		{
-		  assertTrue(e is System.ArgumentException);
-		}
-
-		// clone()
-		char[] text = "testing".ToCharArray();
-		ci.setText(text, 0, text.Length);
-		ci.next();
-		CharArrayIterator ci2 = ci.clone();
-		assertEquals(ci.Index, ci2.Index);
-		assertEquals(ci.next(), ci2.next());
-		assertEquals(ci.last(), ci2.last());
-	  }
-
-	  private void consume(BreakIterator bi, CharacterIterator ci)
-	  {
-		bi.Text = ci;
-		while (bi.next() != BreakIterator.DONE)
-		{
-		  ;
-		}
-	  }
-	}
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    [TestFixture]
+    public class TestCharArrayIterator : LuceneTestCase
+    {
+        [Test]
+        public virtual void TestWordInstance()
+        {
+            DoTests(CharArrayIterator.NewWordInstance());
+        }
+
+        [Test]
+        public virtual void TestConsumeWordInstance()
+        {
+            // we use the default locale, as its randomized by LuceneTestCase
+            var bi = BreakIterator.CreateWordInstance(Locale.GetUS());
+            var ci = CharArrayIterator.NewWordInstance();
+            for (var i = 0; i < 10000; i++)
+            {
+                var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
+                ci.SetText(text, 0, text.Length);
+                Consume(bi, ci);
+            }
+        }
+
+        /* run this to test if your JRE is buggy
+        public void testWordInstanceJREBUG() {
+          // we use the default locale, as its randomized by LuceneTestCase
+          BreakIterator bi = BreakIterator.getWordInstance(Locale.getDefault());
+          Segment ci = new Segment();
+          for (int i = 0; i < 10000; i++) {
+            char text[] = TestUtil.randomUnicodeString(random).toCharArray();
+            ci.array = text;
+            ci.offset = 0;
+            ci.count = text.length;
+            consume(bi, ci);
+          }
+        }
+        */
+
+        [Test]
+        public virtual void TestSentenceInstance()
+        {
+            DoTests(CharArrayIterator.NewSentenceInstance());
+        }
+
+        [Test]
+        public virtual void TestConsumeSentenceInstance()
+        {
+            // we use the default locale, as its randomized by LuceneTestCase
+            var bi = BreakIterator.CreateSentenceInstance(Locale.GetUS());
+            var ci = CharArrayIterator.NewSentenceInstance();
+            for (var i = 0; i < 10000; i++)
+            {
+                var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
+                ci.SetText(text, 0, text.Length);
+                Consume(bi, ci);
+            }
+        }
+
+        /* run this to test if your JRE is buggy
+        public void testSentenceInstanceJREBUG() {
+          // we use the default locale, as its randomized by LuceneTestCase
+          BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault());
+          Segment ci = new Segment();
+          for (int i = 0; i < 10000; i++) {
+            char text[] = TestUtil.randomUnicodeString(random).toCharArray();
+            ci.array = text;
+            ci.offset = 0;
+            ci.count = text.length;
+            consume(bi, ci);
+          }
+        }
+        */
+
+        private void DoTests(CharArrayIterator ci)
+        {
+            // basics
+            ci.SetText("testing".ToCharArray(), 0, "testing".Length);
+            assertEquals(0, ci.BeginIndex);
+            assertEquals(7, ci.EndIndex);
+            assertEquals(0, ci.Index);
+            assertEquals('t', ci.Current());
+            assertEquals('e', ci.Next());
+            assertEquals('g', ci.Last());
+            assertEquals('n', ci.Previous());
+            assertEquals('t', ci.First());
+            assertEquals(CharacterIterator.DONE, ci.Previous());
+
+            // first()
+            ci.SetText("testing".ToCharArray(), 0, "testing".Length);
+            ci.Next();
+            // Sets the position to getBeginIndex() and returns the character at that position. 
+            assertEquals('t', ci.First());
+            assertEquals(ci.BeginIndex, ci.Index);
+            // or DONE if the text is empty
+            ci.SetText(new char[] { }, 0, 0);
+            assertEquals(CharacterIterator.DONE, ci.First());
+
+            // last()
+            ci.SetText("testing".ToCharArray(), 0, "testing".Length);
+            // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
+            // and returns the character at that position. 
+            assertEquals('g', ci.Last());
+            assertEquals(ci.Index, ci.EndIndex - 1);
+            // or DONE if the text is empty
+            ci.SetText(new char[] { }, 0, 0);
+            assertEquals(CharacterIterator.DONE, ci.Last());
+            assertEquals(ci.EndIndex, ci.Index);
+
+            // current()
+            // Gets the character at the current position (as returned by getIndex()). 
+            ci.SetText("testing".ToCharArray(), 0, "testing".Length);
+            assertEquals('t', ci.Current());
+            ci.Last();
+            ci.Next();
+            // or DONE if the current position is off the end of the text.
+            assertEquals(CharacterIterator.DONE, ci.Current());
+
+            // next()
+            ci.SetText("te".ToCharArray(), 0, 2);
+            // Increments the iterator's index by one and returns the character at the new index.
+            assertEquals('e', ci.Next());
+            assertEquals(1, ci.Index);
+            // or DONE if the new position is off the end of the text range.
+            assertEquals(CharacterIterator.DONE, ci.Next());
+            assertEquals(ci.EndIndex, ci.Index);
+
+            // setIndex()
+            ci.SetText("test".ToCharArray(), 0, "test".Length);
+            try
+            {
+                ci.SetIndex(5);
+                fail();
+            }
+            catch (Exception e)
+            {
+                assertTrue(e is System.ArgumentException);
+            }
+
+            // clone()
+            var text = "testing".ToCharArray();
+            ci.SetText(text, 0, text.Length);
+            ci.Next();
+            var ci2 = ci.Clone() as CharArrayIterator;
+            assertEquals(ci.Index, ci2.Index);
+            assertEquals(ci.Next(), ci2.Next());
+            assertEquals(ci.Last(), ci2.Last());
+        }
+
+        private void Consume(BreakIterator bi, CharacterIterator ci)
+        {
+            bi.SetText(ci.toString());
+            while (bi.Next() != BreakIterator.DONE)
+            {
+                ;
+            }
+        }
+    }
 
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestRollingCharBuffer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestRollingCharBuffer.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestRollingCharBuffer.cs
index 5e9b2b6..fc34a7a 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestRollingCharBuffer.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestRollingCharBuffer.cs
@@ -1,136 +1,129 @@
 using System;
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Randomized.Generators;
+using Lucene.Net.Util;
+using NUnit.Framework;
 
-namespace org.apache.lucene.analysis.util
+namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
 
+    [TestFixture]
+    public class TestRollingCharBuffer : LuceneTestCase
+    {
+        [Test]
+        public virtual void Test()
+        {
+            var ITERS = AtLeast(1000);
 
-	using LuceneTestCase = org.apache.lucene.util.LuceneTestCase;
-	using TestUtil = org.apache.lucene.util.TestUtil;
+            var buffer = new RollingCharBuffer();
 
-	public class TestRollingCharBuffer : LuceneTestCase
-	{
+            var random = Random();
+            for (var iter = 0; iter < ITERS; iter++)
+            {   
+                var stringLen = random.NextBoolean() ? random.Next(50) : random.Next(20000);
+                
+                string s;
+                if (stringLen == 0)
+                {
+                    s = "";
+                }
+                else
+                {
+                    s = TestUtil.RandomUnicodeString(random, stringLen);
+                }
+                if (VERBOSE)
+                {
+                    Console.WriteLine("\nTEST: iter=" + iter + " s.length()=" + s.Length);
+                }
+                buffer.Reset(new StringReader(s));
+                var nextRead = 0;
+                var availCount = 0;
+                while (nextRead < s.Length)
+                {
+                    if (VERBOSE)
+                    {
+                        Console.WriteLine("  cycle nextRead=" + nextRead + " avail=" + availCount);
+                    }
+                    if (availCount == 0 || random.NextBoolean())
+                    {
+                        // Read next char
+                        if (VERBOSE)
+                        {
+                            Console.WriteLine("    new char");
+                        }
+                        assertEquals(s[nextRead], buffer.Get(nextRead));
+                        nextRead++;
+                        availCount++;
+                    }
+                    else if (random.NextBoolean())
+                    {
+                        // Read previous char
+                        var pos = TestUtil.NextInt(random, nextRead - availCount, nextRead - 1);
+                        if (VERBOSE)
+                        {
+                            Console.WriteLine("    old char pos=" + pos);
+                        }
+                        assertEquals(s[pos], buffer.Get(pos));
+                    }
+                    else
+                    {
+                        // Read slice
+                        int length;
+                        if (availCount == 1)
+                        {
+                            length = 1;
+                        }
+                        else
+                        {
+                            length = TestUtil.NextInt(random, 1, availCount);
+                        }
+                        int start;
+                        if (length == availCount)
+                        {
+                            start = nextRead - availCount;
+                        }
+                        else
+                        {
+                            start = nextRead - availCount + random.Next(availCount - length);
+                        }
+                        if (VERBOSE)
+                        {
+                            Console.WriteLine("    slice start=" + start + " length=" + length);
+                        }
+                        assertEquals(s.Substring(start, length), new string(buffer.Get(start, length)));
+                    }
 
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void test() throws Exception
-	  public virtual void test()
-	  {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int ITERS = atLeast(1000);
-		int ITERS = atLeast(1000);
-
-		RollingCharBuffer buffer = new RollingCharBuffer();
-
-		Random random = random();
-		for (int iter = 0;iter < ITERS;iter++)
-		{
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int stringLen = random.nextBoolean() ? random.nextInt(50) : random.nextInt(20000);
-		  int stringLen = random.nextBoolean() ? random.Next(50) : random.Next(20000);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String s;
-		  string s;
-		  if (stringLen == 0)
-		  {
-			s = "";
-		  }
-		  else
-		  {
-			s = TestUtil.randomUnicodeString(random, stringLen);
-		  }
-		  if (VERBOSE)
-		  {
-			Console.WriteLine("\nTEST: iter=" + iter + " s.length()=" + s.Length);
-		  }
-		  buffer.reset(new StringReader(s));
-		  int nextRead = 0;
-		  int availCount = 0;
-		  while (nextRead < s.Length)
-		  {
-			if (VERBOSE)
-			{
-			  Console.WriteLine("  cycle nextRead=" + nextRead + " avail=" + availCount);
-			}
-			if (availCount == 0 || random.nextBoolean())
-			{
-			  // Read next char
-			  if (VERBOSE)
-			  {
-				Console.WriteLine("    new char");
-			  }
-			  assertEquals(s[nextRead], buffer.get(nextRead));
-			  nextRead++;
-			  availCount++;
-			}
-			else if (random.nextBoolean())
-			{
-			  // Read previous char
-			  int pos = TestUtil.Next(random, nextRead - availCount, nextRead - 1);
-			  if (VERBOSE)
-			  {
-				Console.WriteLine("    old char pos=" + pos);
-			  }
-			  assertEquals(s[pos], buffer.get(pos));
-			}
-			else
-			{
-			  // Read slice
-			  int length;
-			  if (availCount == 1)
-			  {
-				length = 1;
-			  }
-			  else
-			  {
-				length = TestUtil.Next(random, 1, availCount);
-			  }
-			  int start;
-			  if (length == availCount)
-			  {
-				start = nextRead - availCount;
-			  }
-			  else
-			  {
-				start = nextRead - availCount + random.Next(availCount - length);
-			  }
-			  if (VERBOSE)
-			  {
-				Console.WriteLine("    slice start=" + start + " length=" + length);
-			  }
-			  assertEquals(s.Substring(start, length), new string(buffer.get(start, length)));
-			}
-
-			if (availCount > 0 && random.Next(20) == 17)
-			{
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int toFree = random.nextInt(availCount);
-			  int toFree = random.Next(availCount);
-			  if (VERBOSE)
-			  {
-				Console.WriteLine("    free " + toFree + " (avail=" + (availCount - toFree) + ")");
-			  }
-			  buffer.freeBefore(nextRead - (availCount - toFree));
-			  availCount -= toFree;
-			}
-		  }
-		}
-	  }
-	}
+                    if (availCount > 0 && random.Next(20) == 17)
+                    {
+                        var toFree = random.Next(availCount);
+                        if (VERBOSE)
+                        {
+                            Console.WriteLine("    free " + toFree + " (avail=" + (availCount - toFree) + ")");
+                        }
+                        buffer.FreeBefore(nextRead - (availCount - toFree));
+                        availCount -= toFree;
+                    }
+                }
+            }
+        }
+    }
 
 }
\ No newline at end of file
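
A minimal standalone sketch of the RollingCharBuffer calls exercised by the test above (Reset, Get for single positions and slices, FreeBefore). The wrapper class, Main method and using directives are illustrative assumptions; only the buffer members come from the code in this commit.

    using System;
    using System.IO;
    using Lucene.Net.Analysis.Util;

    internal static class RollingCharBufferSketch
    {
        public static void Main()
        {
            var buffer = new RollingCharBuffer();
            buffer.Reset(new StringReader("hello world"));

            // As in the test, each new position is read in order before older ones are revisited.
            for (var pos = 0; pos < 5; pos++)
            {
                buffer.Get(pos);
            }

            // Anything already read (and not yet freed) can be re-read, singly or as a slice.
            var slice = buffer.Get(0, 5);
            Console.WriteLine(new string(slice)); // "hello"

            // After this call, positions before 5 may no longer be requested.
            buffer.FreeBefore(5);
        }
    }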

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestSegmentingTokenizerBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestSegmentingTokenizerBase.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestSegmentingTokenizerBase.cs
index 0ea4c96..6d79d21 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestSegmentingTokenizerBase.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestSegmentingTokenizerBase.cs
@@ -1,251 +1,229 @@
-using System.Text;
-
-namespace org.apache.lucene.analysis.util
+using System.IO;
+using System.Text;
+using ICU4NET;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using NUnit.Framework;
+
+namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-
-	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-	/// <summary>
-	/// Basic tests for <seealso cref="SegmentingTokenizerBase"/> </summary>
-	public class TestSegmentingTokenizerBase : BaseTokenStreamTestCase
-	{
-	  private Analyzer sentence = new AnalyzerAnonymousInnerClassHelper();
-
-	  private class AnalyzerAnonymousInnerClassHelper : Analyzer
-	  {
-		  public AnalyzerAnonymousInnerClassHelper()
-		  {
-		  }
-
-		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
-		  {
-			return new TokenStreamComponents(new WholeSentenceTokenizer(reader));
-		  }
-	  }
-
-	  private Analyzer sentenceAndWord = new AnalyzerAnonymousInnerClassHelper2();
-
-	  private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
-	  {
-		  public AnalyzerAnonymousInnerClassHelper2()
-		  {
-		  }
-
-		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
-		  {
-			return new TokenStreamComponents(new SentenceAndWordTokenizer(reader));
-		  }
-	  }
-
-	  /// <summary>
-	  /// Some simple examples, just outputting the whole sentence boundaries as "terms" </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testBasics() throws java.io.IOException
-	  public virtual void testBasics()
-	  {
-		assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence", new string[] {"The acronym for United States is U.S. but this doesn't end a sentence"});
-		assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.", new string[] {"He said, \"Are you going?\" ", "John shook his head."});
-	  }
-
-	  /// <summary>
-	  /// Test a subclass that sets some custom attribute values </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testCustomAttributes() throws java.io.IOException
-	  public virtual void testCustomAttributes()
-	  {
-		assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.", new string[] {"He", "said", "Are", "you", "going", "John", "shook", "his", "head"}, new int[] {0, 3, 10, 14, 18, 26, 31, 37, 41}, new int[] {2, 7, 13, 17, 23, 30, 36, 40, 45}, new int[] {1, 1, 1, 1, 1, 2, 1, 1, 1});
-	  }
-
-	  /// <summary>
-	  /// Tests tokenstream reuse </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testReuse() throws java.io.IOException
-	  public virtual void testReuse()
-	  {
-		assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\"", new string[] {"He", "said", "Are", "you", "going"}, new int[] {0, 3, 10, 14, 18}, new int[] {2, 7, 13, 17, 23}, new int[] {1, 1, 1, 1, 1});
-		assertAnalyzesTo(sentenceAndWord, "John shook his head.", new string[] {"John", "shook", "his", "head"}, new int[] {0, 5, 11, 15}, new int[] {4, 10, 14, 19}, new int[] {1, 1, 1, 1});
-	  }
-
-	  /// <summary>
-	  /// Tests TokenStream.end() </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testEnd() throws java.io.IOException
-	  public virtual void testEnd()
-	  {
-		// BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
-		// we add some junk whitespace to the end just to test it.
-		assertAnalyzesTo(sentenceAndWord, "John shook his head          ", new string[] {"John", "shook", "his", "head"});
-		assertAnalyzesTo(sentenceAndWord, "John shook his head.          ", new string[] {"John", "shook", "his", "head"});
-	  }
-
-	  /// <summary>
-	  /// Tests terms which span across boundaries </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHugeDoc() throws java.io.IOException
-	  public virtual void testHugeDoc()
-	  {
-		StringBuilder sb = new StringBuilder();
-		char[] whitespace = new char[4094];
-		Arrays.fill(whitespace, '\n');
-		sb.Append(whitespace);
-		sb.Append("testing 1234");
-		string input = sb.ToString();
-		assertAnalyzesTo(sentenceAndWord, input, new string[] {"testing", "1234"});
-	  }
-
-	  /// <summary>
-	  /// Tests the handling of binary/malformed data </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testHugeTerm() throws java.io.IOException
-	  public virtual void testHugeTerm()
-	  {
-		StringBuilder sb = new StringBuilder();
-		for (int i = 0; i < 10240; i++)
-		{
-		  sb.Append('a');
-		}
-		string input = sb.ToString();
-		char[] token = new char[1024];
-		Arrays.fill(token, 'a');
-		string expectedToken = new string(token);
-		string[] expected = new string[] {expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken};
-		assertAnalyzesTo(sentence, input, expected);
-	  }
-
-	  /// <summary>
-	  /// blast some random strings through the analyzer </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testRandomStrings() throws Exception
-	  public virtual void testRandomStrings()
-	  {
-		checkRandomData(random(), sentence, 10000 * RANDOM_MULTIPLIER);
-		checkRandomData(random(), sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
-	  }
-
-	  // some tokenizers for testing
-
-	  /// <summary>
-	  /// silly tokenizer that just returns whole sentences as tokens </summary>
-	  internal class WholeSentenceTokenizer : SegmentingTokenizerBase
-	  {
-		internal int sentenceStart, sentenceEnd;
-		internal bool hasSentence;
-
-		internal CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
-		internal OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
-
-		public WholeSentenceTokenizer(Reader reader) : base(reader, BreakIterator.getSentenceInstance(Locale.ROOT))
-		{
-		}
-
-		protected internal override void setNextSentence(int sentenceStart, int sentenceEnd)
-		{
-		  this.sentenceStart = sentenceStart;
-		  this.sentenceEnd = sentenceEnd;
-		  hasSentence = true;
-		}
-
-		protected internal override bool incrementWord()
-		{
-		  if (hasSentence)
-		  {
-			hasSentence = false;
-			clearAttributes();
-			termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
-			offsetAtt.setOffset(correctOffset(offset + sentenceStart), correctOffset(offset + sentenceEnd));
-			return true;
-		  }
-		  else
-		  {
-			return false;
-		  }
-		}
-	  }
-
-	  /// <summary>
-	  /// simple tokenizer, that bumps posinc + 1 for tokens after a 
-	  /// sentence boundary to inhibit phrase queries without slop.
-	  /// </summary>
-	  internal class SentenceAndWordTokenizer : SegmentingTokenizerBase
-	  {
-		internal int sentenceStart, sentenceEnd;
-		internal int wordStart, wordEnd;
-		internal int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
-
-		internal CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
-		internal OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
-		internal PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
-
-		public SentenceAndWordTokenizer(Reader reader) : base(reader, BreakIterator.getSentenceInstance(Locale.ROOT))
-		{
-		}
-
-		protected internal override void setNextSentence(int sentenceStart, int sentenceEnd)
-		{
-		  this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
-		  this.sentenceEnd = sentenceEnd;
-		  posBoost++;
-		}
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
-		public override void reset()
-		{
-		  base.reset();
-		  posBoost = -1;
-		}
-
-		protected internal override bool incrementWord()
-		{
-		  wordStart = wordEnd;
-		  while (wordStart < sentenceEnd)
-		  {
-			if (char.IsLetterOrDigit(buffer[wordStart]))
-			{
-			  break;
-			}
-			wordStart++;
-		  }
-
-		  if (wordStart == sentenceEnd)
-		  {
-			  return false;
-		  }
-
-		  wordEnd = wordStart + 1;
-		  while (wordEnd < sentenceEnd && char.IsLetterOrDigit(buffer[wordEnd]))
-		  {
-			wordEnd++;
-		  }
-
-		  clearAttributes();
-		  termAtt.copyBuffer(buffer, wordStart, wordEnd - wordStart);
-		  offsetAtt.setOffset(correctOffset(offset + wordStart), correctOffset(offset + wordEnd));
-		  posIncAtt.PositionIncrement = posIncAtt.PositionIncrement + posBoost;
-		  posBoost = 0;
-		  return true;
-		}
-	  }
-	}
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Basic tests for <seealso cref="SegmentingTokenizerBase"/> </summary>
+    [TestFixture]
+    public class TestSegmentingTokenizerBase : BaseTokenStreamTestCase
+    {
+        private Analyzer sentence = new AnalyzerAnonymousInnerClassHelper();
+
+        private class AnalyzerAnonymousInnerClassHelper : Analyzer
+        {
+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                return new TokenStreamComponents(new WholeSentenceTokenizer(reader));
+            }
+        }
+
+        private Analyzer sentenceAndWord = new AnalyzerAnonymousInnerClassHelper2();
+
+        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+        {
+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                return new TokenStreamComponents(new SentenceAndWordTokenizer(reader));
+            }
+        }
+
+        [Test]
+        public virtual void TestBasics()
+        {
+            AssertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence", new[] { "The acronym for United States is U.S. but this doesn't end a sentence" });
+            AssertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.", new[] { "He said, \"Are you going?\" ", "John shook his head." });
+        }
+
+        [Test]
+        public virtual void TestCustomAttributes()
+        {
+            AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.", new[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" }, new[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 }, new[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 }, new[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 });
+        }
+
+        [Test]
+        public virtual void TestReuse()
+        {
+            AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\"", new[] { "He", "said", "Are", "you", "going" }, new[] { 0, 3, 10, 14, 18 }, new[] { 2, 7, 13, 17, 23 }, new[] { 1, 1, 1, 1, 1 });
+            AssertAnalyzesTo(sentenceAndWord, "John shook his head.", new[] { "John", "shook", "his", "head" }, new[] { 0, 5, 11, 15 }, new[] { 4, 10, 14, 19 }, new[] { 1, 1, 1, 1 });
+        }
+
+        [Test]
+        public virtual void TestEnd()
+        {
+            // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
+            // we add some junk whitespace to the end just to test it.
+            AssertAnalyzesTo(sentenceAndWord, "John shook his head          ", new[] { "John", "shook", "his", "head" });
+            AssertAnalyzesTo(sentenceAndWord, "John shook his head.          ", new[] { "John", "shook", "his", "head" });
+        }
+
+        [Test]
+        public virtual void TestHugeDoc()
+        {
+            var sb = new StringBuilder();
+            var whitespace = new char[4094];
+            Arrays.Fill(whitespace, '\n');
+            sb.Append(whitespace);
+            sb.Append("testing 1234");
+            var input = sb.ToString();
+            AssertAnalyzesTo(sentenceAndWord, input, new[] { "testing", "1234" });
+        }
+
+        [Test]
+        public virtual void TestHugeTerm()
+        {
+            var sb = new StringBuilder();
+            for (int i = 0; i < 10240; i++)
+            {
+                sb.Append('a');
+            }
+            var input = sb.ToString();
+            var token = new char[1024];
+            Arrays.Fill(token, 'a');
+            var expectedToken = new string(token);
+            var expected = new[] { expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken };
+            AssertAnalyzesTo(sentence, input, expected);
+        }
+
+        [Test]
+        public virtual void TestRandomStrings()
+        {
+            CheckRandomData(Random(), sentence, 10000 * RANDOM_MULTIPLIER);
+            CheckRandomData(Random(), sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
+        }
+
+        // some tokenizers for testing
+
+        /// <summary>
+        /// silly tokenizer that just returns whole sentences as tokens </summary>
+        sealed class WholeSentenceTokenizer : SegmentingTokenizerBase
+        {
+            internal int sentenceStart, sentenceEnd;
+            internal bool hasSentence;
+
+            internal ICharTermAttribute termAtt;
+            internal IOffsetAttribute offsetAtt;
+
+            public WholeSentenceTokenizer(TextReader reader)
+                : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
+            {
+                termAtt = AddAttribute<ICharTermAttribute>();
+                offsetAtt = AddAttribute<IOffsetAttribute>();
+            }
+
+            protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
+            {
+                this.sentenceStart = sentenceStart;
+                this.sentenceEnd = sentenceEnd;
+                hasSentence = true;
+            }
+
+            protected override bool IncrementWord()
+            {
+                if (hasSentence)
+                {
+                    hasSentence = false;
+                    ClearAttributes();
+                    termAtt.CopyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
+                    offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart), CorrectOffset(offset + sentenceEnd));
+                    return true;
+                }
+                else
+                {
+                    return false;
+                }
+            }
+        }
+
+        /// <summary>
+        /// simple tokenizer, that bumps posinc + 1 for tokens after a 
+        /// sentence boundary to inhibit phrase queries without slop.
+        /// </summary>
+        sealed class SentenceAndWordTokenizer : SegmentingTokenizerBase
+        {
+            internal int sentenceStart, sentenceEnd;
+            internal int wordStart, wordEnd;
+            internal int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
+
+            internal ICharTermAttribute termAtt;
+            internal IOffsetAttribute offsetAtt;
+            internal IPositionIncrementAttribute posIncAtt;
+
+            public SentenceAndWordTokenizer(TextReader reader)
+                : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
+            {
+                termAtt = AddAttribute<ICharTermAttribute>();
+                offsetAtt = AddAttribute<IOffsetAttribute>();
+                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+            }
+
+            protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
+            {
+                this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
+                this.sentenceEnd = sentenceEnd;
+                posBoost++;
+            }
+
+            public override void Reset()
+            {
+                base.Reset();
+                posBoost = -1;
+            }
+
+            protected override bool IncrementWord()
+            {
+                wordStart = wordEnd;
+                while (wordStart < sentenceEnd)
+                {
+                    if (char.IsLetterOrDigit(buffer[wordStart]))
+                    {
+                        break;
+                    }
+                    wordStart++;
+                }
+
+                if (wordStart == sentenceEnd)
+                {
+                    return false;
+                }
+
+                wordEnd = wordStart + 1;
+                while (wordEnd < sentenceEnd && char.IsLetterOrDigit(buffer[wordEnd]))
+                {
+                    wordEnd++;
+                }
+
+                ClearAttributes();
+                termAtt.CopyBuffer(buffer, wordStart, wordEnd - wordStart);
+                offsetAtt.SetOffset(CorrectOffset(offset + wordStart), CorrectOffset(offset + wordEnd));
+                posIncAtt.PositionIncrement = posIncAtt.PositionIncrement + posBoost;
+                posBoost = 0;
+                return true;
+            }
+        }
+    }
 
 }
\ No newline at end of file
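
A rough sketch of how the WholeSentenceTokenizer defined above could be consumed outside the test framework, assuming the standard Lucene.Net TokenStream contract (Reset, IncrementToken, End, Dispose) and treating the nested class as accessible for illustration; only the tokenizer itself and ICharTermAttribute come from this commit.

    using System;
    using System.IO;
    using Lucene.Net.Analysis.Tokenattributes;

    internal static class SentenceTokenizerSketch
    {
        public static void Main()
        {
            var reader = new StringReader("He said, \"Are you going?\" John shook his head.");
            var tokenizer = new WholeSentenceTokenizer(reader);
            var termAtt = tokenizer.AddAttribute<ICharTermAttribute>();

            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                // Emits one whole sentence per token, as asserted by TestBasics above.
                Console.WriteLine(termAtt.ToString());
            }
            tokenizer.End();
            tokenizer.Dispose();
        }
    }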

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
index bbb9b68..2475ab5 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
+++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
@@ -20,6 +20,7 @@
     <DefineConstants>DEBUG;TRACE</DefineConstants>
     <ErrorReport>prompt</ErrorReport>
     <WarningLevel>4</WarningLevel>
+    <PlatformTarget>x86</PlatformTarget>
   </PropertyGroup>
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
     <DebugType>pdbonly</DebugType>
@@ -28,8 +29,17 @@
     <DefineConstants>TRACE</DefineConstants>
     <ErrorReport>prompt</ErrorReport>
     <WarningLevel>4</WarningLevel>
+    <PlatformTarget>x86</PlatformTarget>
   </PropertyGroup>
   <ItemGroup>
+    <Reference Include="ICU4NET, Version=1.0.5593.31013, Culture=neutral, processorArchitecture=x86">
+      <HintPath>..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\lib\net45\ICU4NET.dll</HintPath>
+      <Private>True</Private>
+    </Reference>
+    <Reference Include="ICU4NETExtension, Version=1.0.0.0, Culture=neutral, processorArchitecture=x86">
+      <HintPath>..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\lib\net45\ICU4NETExtension.dll</HintPath>
+      <Private>True</Private>
+    </Reference>
     <Reference Include="nunit.framework, Version=2.6.3.13283, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL">
       <SpecificVersion>False</SpecificVersion>
       <HintPath>..\..\packages\NUnit.2.6.3\lib\nunit.framework.dll</HintPath>
@@ -41,6 +51,9 @@
   </ItemGroup>
   <ItemGroup>
     <Compile Include="Analysis\Util\TestCharacterUtils.cs" />
+    <Compile Include="Analysis\Util\TestCharArrayIterator.cs" />
+    <Compile Include="Analysis\Util\TestRollingCharBuffer.cs" />
+    <Compile Include="Analysis\Util\TestSegmentingTokenizerBase.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
   </ItemGroup>
   <ItemGroup>
@@ -57,6 +70,51 @@
       <Name>Lucene.Net.TestFramework</Name>
     </ProjectReference>
   </ItemGroup>
+  <ItemGroup>
+    <None Include="packages.config" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icudt55.dll">
+      <Link>icudt55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuin55.dll">
+      <Link>icuin55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuio55.dll">
+      <Link>icuio55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icule55.dll">
+      <Link>icule55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\iculx55.dll">
+      <Link>iculx55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icutu55.dll">
+      <Link>icutu55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\packages\ICU4NET-ICU4C55.1-bin32.1.0.0\ref\icuuc55.dll">
+      <Link>icuuc55.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
        Other similar extension points exist, see Microsoft.Common.targets.
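
The project file changes above pin the test project to x86 and copy the 32-bit ICU4C 55.1 native libraries (icudt55.dll, icuin55.dll, and so on) into the test output directory, since ICU4NET wraps the native ICU break iterators. For illustration only, restoring the same dependency by hand would look roughly like the following NuGet Package Manager Console command, with the package id and version taken from packages.config below; treat the exact feed and command form as an assumption rather than part of this commit.

    PM> Install-Package ICU4NET-ICU4C55.1-bin32 -Version 1.0.0 -ProjectName Lucene.Net.Tests.Analysis.Common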

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/be39dfd4/src/Lucene.Net.Tests.Analysis.Common/packages.config
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/packages.config b/src/Lucene.Net.Tests.Analysis.Common/packages.config
new file mode 100644
index 0000000..7685c3b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/packages.config
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<packages>
+  <package id="ICU4NET-ICU4C55.1-bin32" version="1.0.0" targetFramework="net451" />
+</packages>
\ No newline at end of file

