lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject lucenenet git commit: port of CharTokenizer and other code it depends on, and corresponding unit tests
Date Thu, 14 Jan 2016 09:41:49 GMT
Repository: lucenenet
Updated Branches:
  refs/heads/master be39dfd47 -> 4dcbcd1ad


port of CharTokenizer and other code it depends on, and corresponding unit tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/4dcbcd1a
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/4dcbcd1a
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/4dcbcd1a

Branch: refs/heads/master
Commit: 4dcbcd1adb698a5eb18616e67ff115e00fcd158b
Parents: be39dfd
Author: Laimonas Simutis <laimis@gmail.com>
Authored: Fri Jan 8 16:30:47 2016 -0500
Committer: Laimonas Simutis <laimis@gmail.com>
Committed: Fri Jan 8 16:30:47 2016 -0500

----------------------------------------------------------------------
 .../Analysis/Core/LetterTokenizer.cs            |   6 +-
 .../Analysis/Core/LowerCaseTokenizer.cs         |   2 +-
 .../Analysis/Util/CharTokenizer.cs              |   2 +-
 .../Analysis/Util/CharacterUtils.cs             |   2 +-
 .../Lucene.Net.Analysis.Common.csproj           |   3 +
 src/Lucene.Net.Core/Analysis/Analyzer.cs        |  22 +
 src/Lucene.Net.Core/Support/Character.cs        |  15 +
 .../Analysis/Util/TestCharTokenizers.cs         | 504 +++++++++----------
 .../Lucene.Net.Tests.Analysis.Common.csproj     |   1 +
 9 files changed, 287 insertions(+), 270 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
index a1f80ea..de59e18 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
@@ -75,10 +75,10 @@ namespace Lucene.Net.Analysis.Core
         /// <summary>
         /// Collects only characters which satisfy
         /// <seealso cref="Character#isLetter(int)"/>.
-        /// </summary>	  
-        protected override bool IsTokenChar(char c)
+        /// </summary>
+        protected override bool IsTokenChar(int c)
         {
-            return char.IsLetter(c);
+            return Character.IsLetter(c);
         }
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
index b2ff2a5..94cfbb4 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
@@ -69,7 +69,7 @@ namespace Lucene.Net.Analysis.Core
         ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
         /// <param name="in">
         ///          the input to split up into tokens </param>
-        public LowerCaseTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader @in)
+        public LowerCaseTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeFactory factory, TextReader @in)
             : base(matchVersion, factory, @in)
         {
         }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
index 68c0d47..19772ee 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharTokenizer.cs
@@ -114,7 +114,7 @@ namespace Lucene.Net.Analysis.Util
         /// predicate. Codepoints for which this is false are used to define token
         /// boundaries and are not included in tokens.
         /// </summary>
-        protected abstract bool IsTokenChar(char c);
+        protected abstract bool IsTokenChar(int c);
 
         /// <summary>
         /// Called on each token character to normalize it before it is added to the

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
index 004b368..7237f9b 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
@@ -248,7 +248,7 @@ namespace Lucene.Net.Analysis.Util
             while (read < len)
             {
                 int r = reader.Read(dest, offset + read, len - read);
-                if (r == 0)
+                if (r <= 0)
                 {
                     break;
                 }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
index 84b1c6a..e0c7b3b 100644
--- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
+++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
@@ -43,6 +43,8 @@
     <Reference Include="Microsoft.CSharp" />
   </ItemGroup>
   <ItemGroup>
+    <Compile Include="Analysis\Core\LetterTokenizer.cs" />
+    <Compile Include="Analysis\Core\LowerCaseTokenizer.cs" />
     <Compile Include="Analysis\Util\CharacterIterator.cs" />
     <Compile Include="Analysis\Util\CharacterUtils.cs">
       <SubType>Code</SubType>
@@ -50,6 +52,7 @@
     <Compile Include="Analysis\Util\CharArrayIterator.cs">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="Analysis\Util\CharTokenizer.cs" />
     <Compile Include="Analysis\Util\ICharacterIterator.cs" />
     <Compile Include="Analysis\Util\RollingCharBuffer.cs" />
     <Compile Include="Analysis\Util\SegmentingTokenizerBase.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Core/Analysis/Analyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Analysis/Analyzer.cs b/src/Lucene.Net.Core/Analysis/Analyzer.cs
index ce6d98e..0d0e250 100644
--- a/src/Lucene.Net.Core/Analysis/Analyzer.cs
+++ b/src/Lucene.Net.Core/Analysis/Analyzer.cs
@@ -143,6 +143,28 @@ namespace Lucene.Net.Analysis
             return components.TokenStream;
         }
 
+        public TokenStream TokenStream(string fieldName, string text)
+        {
+            TokenStreamComponents components = _reuseStrategy.GetReusableComponents(this, fieldName);
+            ReusableStringReader strReader =
+                (components == null || components.ReusableStringReader == null)
+                    ? new ReusableStringReader()
+                    : components.ReusableStringReader;
+            strReader.Value = text;
+            var r = InitReader(fieldName, strReader);
+            if (components == null)
+            {
+                components = CreateComponents(fieldName, r);
+                _reuseStrategy.SetReusableComponents(this, fieldName, components);
+            }
+            else
+            {
+                components.Reader = r;
+            }
+            components.ReusableStringReader = strReader;
+            return components.TokenStream;
+        }
+
         /// <summary>
         /// Override this if you want to add a CharFilter chain.
         /// <p>

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Core/Support/Character.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Support/Character.cs b/src/Lucene.Net.Core/Support/Character.cs
index 134ded7..e104aef 100644
--- a/src/Lucene.Net.Core/Support/Character.cs
+++ b/src/Lucene.Net.Core/Support/Character.cs
@@ -20,6 +20,7 @@
 */
 
 using System;
+using System.Globalization;
 using Lucene.Net.Util;
 
 namespace Lucene.Net.Support
@@ -263,5 +264,19 @@ namespace Lucene.Net.Support
             }
             return x;
         }
+
+        public static bool IsLetter(int c)
+        {
+            var str = Char.ConvertFromUtf32(c);
+
+            var unicodeCategory = Char.GetUnicodeCategory(str, 0);
+
+            return unicodeCategory == UnicodeCategory.LowercaseLetter ||
+                   unicodeCategory == UnicodeCategory.UppercaseLetter ||
+                   unicodeCategory == UnicodeCategory.TitlecaseLetter ||
+                   unicodeCategory == UnicodeCategory.ModifierLetter ||
+                   unicodeCategory== UnicodeCategory.OtherLetter;
+
+        }
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
index 3581c7b..4bf57e4 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
@@ -1,268 +1,244 @@
-using System.Text;
-
-namespace org.apache.lucene.analysis.util
+using System;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
 {
 
-	/*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-
-	using LetterTokenizer = org.apache.lucene.analysis.core.LetterTokenizer;
-	using LowerCaseTokenizer = org.apache.lucene.analysis.core.LowerCaseTokenizer;
-	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-	using IOUtils = org.apache.lucene.util.IOUtils;
-	using TestUtil = org.apache.lucene.util.TestUtil;
-
-
-	/// <summary>
-	/// Testcase for <seealso cref="CharTokenizer"/> subclasses
-	/// </summary>
-	public class TestCharTokenizers : BaseTokenStreamTestCase
-	{
-
-	  /*
-	   * test to read surrogate pairs without loosing the pairing 
-	   * if the surrogate pair is at the border of the internal IO buffer
-	   */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testReadSupplementaryChars() throws java.io.IOException
-	  public virtual void testReadSupplementaryChars()
-	  {
-		StringBuilder builder = new StringBuilder();
-		// create random input
-		int num = 1024 + random().Next(1024);
-		num *= RANDOM_MULTIPLIER;
-		for (int i = 1; i < num; i++)
-		{
-		  builder.Append("\ud801\udc1cabc");
-		  if ((i % 10) == 0)
-		  {
-			builder.Append(" ");
-		  }
-		}
-		// internal buffer size is 1024 make sure we have a surrogate pair right at the border
-		builder.Insert(1023, "\ud801\udc1c");
-		Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString()));
-		assertTokenStreamContents(tokenizer, builder.ToString().ToLower(Locale.ROOT).split(" "));
-	  }
-
-	  /*
-	   * test to extend the buffer TermAttribute buffer internally. If the internal
-	   * alg that extends the size of the char array only extends by 1 char and the
-	   * next char to be filled in is a supplementary codepoint (using 2 chars) an
-	   * index out of bound exception is triggered.
-	   */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testExtendCharBuffer() throws java.io.IOException
-	  public virtual void testExtendCharBuffer()
-	  {
-		for (int i = 0; i < 40; i++)
-		{
-		  StringBuilder builder = new StringBuilder();
-		  for (int j = 0; j < 1 + i; j++)
-		  {
-			builder.Append("a");
-		  }
-		  builder.Append("\ud801\udc1cabc");
-		  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString()));
-		  assertTokenStreamContents(tokenizer, new string[] {builder.ToString().ToLower(Locale.ROOT)});
-		}
-	  }
-
-	  /*
-	   * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens
-	   */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testMaxWordLength() throws java.io.IOException
-	  public virtual void testMaxWordLength()
-	  {
-		StringBuilder builder = new StringBuilder();
-
-		for (int i = 0; i < 255; i++)
-		{
-		  builder.Append("A");
-		}
-		Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString() + builder.ToString()));
-		assertTokenStreamContents(tokenizer, new string[] {builder.ToString().ToLower(Locale.ROOT), builder.ToString().ToLower(Locale.ROOT)});
-	  }
-
-	  /*
-	   * tests the max word length of 255 with a surrogate pair at position 255
-	   */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testMaxWordLengthWithSupplementary() throws java.io.IOException
-	  public virtual void testMaxWordLengthWithSupplementary()
-	  {
-		StringBuilder builder = new StringBuilder();
-
-		for (int i = 0; i < 254; i++)
-		{
-		  builder.Append("A");
-		}
-		builder.Append("\ud801\udc1c");
-		Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString() + builder.ToString()));
-		assertTokenStreamContents(tokenizer, new string[] {builder.ToString().ToLower(Locale.ROOT), builder.ToString().ToLower(Locale.ROOT)});
-	  }
-
-	  // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testCrossPlaneNormalization() throws java.io.IOException
-	  public virtual void testCrossPlaneNormalization()
-	  {
-		Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
-		int num = 1000 * RANDOM_MULTIPLIER;
-		for (int i = 0; i < num; i++)
-		{
-		  string s = TestUtil.randomUnicodeString(random());
-		  TokenStream ts = analyzer.tokenStream("foo", s);
-		  try
-		  {
-			ts.reset();
-			OffsetAttribute offsetAtt = ts.addAttribute(typeof(OffsetAttribute));
-			while (ts.incrementToken())
-			{
-			  string highlightedText = StringHelperClass.SubstringSpecial(s, offsetAtt.startOffset(), offsetAtt.endOffset());
-			  for (int j = 0, cp = 0; j < highlightedText.Length; j += char.charCount(cp))
-			  {
-				cp = char.ConvertToUtf32(highlightedText, j);
-				assertTrue("non-letter:" + cp.ToString("x"), char.IsLetter(cp));
-			  }
-			}
-			ts.end();
-		  }
-		  finally
-		  {
-			IOUtils.closeWhileHandlingException(ts);
-		  }
-		}
-		// just for fun
-		checkRandomData(random(), analyzer, num);
-	  }
-
-	  private class AnalyzerAnonymousInnerClassHelper : Analyzer
-	  {
-		  private readonly TestCharTokenizers outerInstance;
-
-		  public AnalyzerAnonymousInnerClassHelper(TestCharTokenizers outerInstance)
-		  {
-			  this.outerInstance = outerInstance;
-		  }
-
-		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
-		  {
-			Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper(this, TEST_VERSION_CURRENT, reader);
-			return new TokenStreamComponents(tokenizer, tokenizer);
-		  }
-
-		  private class LetterTokenizerAnonymousInnerClassHelper : LetterTokenizer
-		  {
-			  private readonly AnalyzerAnonymousInnerClassHelper outerInstance;
-
-			  public LetterTokenizerAnonymousInnerClassHelper(AnalyzerAnonymousInnerClassHelper outerInstance, UnknownType TEST_VERSION_CURRENT, Reader reader) : base(TEST_VERSION_CURRENT, reader)
-			  {
-				  this.outerInstance = outerInstance;
-			  }
-
-			  protected internal override int normalize(int c)
-			  {
-				if (c > 0xffff)
-				{
-				  return 'δ';
-				}
-				else
-				{
-				  return c;
-				}
-			  }
-		  }
-	  }
-
-	  // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public void testCrossPlaneNormalization2() throws java.io.IOException
-	  public virtual void testCrossPlaneNormalization2()
-	  {
-		Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);
-		int num = 1000 * RANDOM_MULTIPLIER;
-		for (int i = 0; i < num; i++)
-		{
-		  string s = TestUtil.randomUnicodeString(random());
-		  TokenStream ts = analyzer.tokenStream("foo", s);
-		  try
-		  {
-			ts.reset();
-			OffsetAttribute offsetAtt = ts.addAttribute(typeof(OffsetAttribute));
-			while (ts.incrementToken())
-			{
-			  string highlightedText = StringHelperClass.SubstringSpecial(s, offsetAtt.startOffset(), offsetAtt.endOffset());
-			  for (int j = 0, cp = 0; j < highlightedText.Length; j += char.charCount(cp))
-			  {
-				cp = char.ConvertToUtf32(highlightedText, j);
-				assertTrue("non-letter:" + cp.ToString("x"), char.IsLetter(cp));
-			  }
-			}
-			ts.end();
-		  }
-		  finally
-		  {
-			IOUtils.closeWhileHandlingException(ts);
-		  }
-		}
-		// just for fun
-		checkRandomData(random(), analyzer, num);
-	  }
-
-	  private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
-	  {
-		  private readonly TestCharTokenizers outerInstance;
-
-		  public AnalyzerAnonymousInnerClassHelper2(TestCharTokenizers outerInstance)
-		  {
-			  this.outerInstance = outerInstance;
-		  }
-
-		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
-		  {
-			Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper2(this, TEST_VERSION_CURRENT, reader);
-			return new TokenStreamComponents(tokenizer, tokenizer);
-		  }
-
-		  private class LetterTokenizerAnonymousInnerClassHelper2 : LetterTokenizer
-		  {
-			  private readonly AnalyzerAnonymousInnerClassHelper2 outerInstance;
-
-			  public LetterTokenizerAnonymousInnerClassHelper2(AnalyzerAnonymousInnerClassHelper2 outerInstance, UnknownType TEST_VERSION_CURRENT, Reader reader) : base(TEST_VERSION_CURRENT, reader)
-			  {
-				  this.outerInstance = outerInstance;
-			  }
-
-			  protected internal override int normalize(int c)
-			  {
-				if (c <= 0xffff)
-				{
-				  return 0x1043C;
-				}
-				else
-				{
-				  return c;
-				}
-			  }
-		  }
-	  }
-	}
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Testcase for <seealso cref="CharTokenizer"/> subclasses
+    /// </summary>
+    [TestFixture]
+    public class TestCharTokenizers : BaseTokenStreamTestCase
+    {
+
+        /*
+         * test to read surrogate pairs without loosing the pairing 
+         * if the surrogate pair is at the border of the internal IO buffer
+         */
+        [Test]
+        public virtual void TestReadSupplementaryChars()
+        {
+            var builder = new StringBuilder();
+            // create random input
+            var num = 1024 + Random().Next(1024);
+            num *= RANDOM_MULTIPLIER;
+            for (var i = 1; i < num; i++)
+            {
+                builder.Append("\ud801\udc1cabc");
+                if ((i % 10) == 0)
+                {
+                    builder.Append(" ");
+                }
+            }
+            // internal buffer size is 1024 make sure we have a surrogate pair right at the border
+            builder.Insert(1023, "\ud801\udc1c");
+            var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString()));
+            AssertTokenStreamContents(tokenizer, builder.ToString().ToLowerInvariant().Split(' '));
+        }
+
+        /*
+       * test to extend the buffer TermAttribute buffer internally. If the internal
+       * alg that extends the size of the char array only extends by 1 char and the
+       * next char to be filled in is a supplementary codepoint (using 2 chars) an
+       * index out of bound exception is triggered.
+       */
+        [Test]
+        public virtual void TestExtendCharBuffer()
+        {
+            for (var i = 0; i < 40; i++)
+            {
+                var builder = new StringBuilder();
+                for (int j = 0; j < 1 + i; j++)
+                {
+                    builder.Append("a");
+                }
+                builder.Append("\ud801\udc1cabc");
+                var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString()));
+                AssertTokenStreamContents(tokenizer, new[] { builder.ToString().ToLowerInvariant() });
+            }
+        }
+
+        /*
+         * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens
+         */
+        [Test]
+        public virtual void TestMaxWordLength()
+        {
+            var builder = new StringBuilder();
+
+            for (var i = 0; i < 255; i++)
+            {
+                builder.Append("A");
+            }
+            var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString() + builder.ToString()));
+            AssertTokenStreamContents(tokenizer, new[] { builder.ToString().ToLowerInvariant(), builder.ToString().ToLowerInvariant() });
+        }
+
+        /*
+         * tests the max word length of 255 with a surrogate pair at position 255
+         */
+        [Test]
+        public virtual void TestMaxWordLengthWithSupplementary()
+        {
+            var builder = new StringBuilder();
+
+            for (var i = 0; i < 254; i++)
+            {
+                builder.Append("A");
+            }
+            builder.Append("\ud801\udc1c");
+            var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.ToString() + builder.ToString()));
+            AssertTokenStreamContents(tokenizer, new[] { builder.ToString().ToLowerInvariant(), builder.ToString().ToLowerInvariant() });
+        }
+
+        // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
+        [Test]
+        public virtual void TestCrossPlaneNormalization()
+        {
+            var analyzer = new AnalyzerAnonymousInnerClassHelper();
+            var num = 1000 * RANDOM_MULTIPLIER;
+            for (var i = 0; i < num; i++)
+            {
+                var s = TestUtil.RandomUnicodeString(Random());
+                var ts = analyzer.TokenStream("foo", s);
+                try
+                {
+                    ts.Reset();
+                    var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
+                    while (ts.IncrementToken())
+                    {
+                        var highlightedText = s.Substring(offsetAtt.StartOffset(), offsetAtt.EndOffset() - offsetAtt.StartOffset());
+                        for (int j = 0, cp = 0; j < highlightedText.Length; j += Character.CharCount(cp))
+                        {
+                            cp = char.ConvertToUtf32(highlightedText, j);
+                            assertTrue("non-letter:" + cp.ToString("x"), Character.IsLetter(cp));
+                        }
+                    }
+                    ts.End();
+                }
+                finally
+                {
+                    IOUtils.CloseWhileHandlingException(ts);
+                }
+            }
+            // just for fun
+            CheckRandomData(Random(), analyzer, num);
+        }
+
+        private sealed class AnalyzerAnonymousInnerClassHelper : Analyzer
+        {
+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, reader);
+                return new TokenStreamComponents(tokenizer, tokenizer);
+            }
+
+            private sealed class LetterTokenizerAnonymousInnerClassHelper : LetterTokenizer
+            {
+                public LetterTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, TextReader reader)
+                    : base(TEST_VERSION_CURRENT, reader)
+                {
+                }
+
+                protected override int Normalize(int c)
+                {
+                    if (c > 0xffff)
+                    {
+                        return 'δ';
+                    }
+                    else
+                    {
+                        return c;
+                    }
+                }
+            }
+        }
+
+        // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
+        [Test]
+        public virtual void TestCrossPlaneNormalization2()
+        {
+            var analyzer = new AnalyzerAnonymousInnerClassHelper2();
+            var num = 1000 * RANDOM_MULTIPLIER;
+            for (var i = 0; i < num; i++)
+            {
+                var s = TestUtil.RandomUnicodeString(Random());
+                var ts = analyzer.TokenStream("foo", s);
+                try
+                {
+                    ts.Reset();
+                    var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
+                    while (ts.IncrementToken())
+                    {
+                        string highlightedText = s.Substring(offsetAtt.StartOffset(), offsetAtt.EndOffset() - offsetAtt.StartOffset());
+                        for (int j = 0, cp = 0; j < highlightedText.Length; j += Character.CharCount(cp))
+                        {
+                            cp = char.ConvertToUtf32(highlightedText, j);
+                            assertTrue("non-letter:" + cp.ToString("x"), Character.IsLetter(cp));
+                        }
+                    }
+                    ts.End();
+                }
+                finally
+                {
+                    IOUtils.CloseWhileHandlingException(ts);
+                }
+            }
+            // just for fun
+            CheckRandomData(Random(), analyzer, num);
+        }
+
+        private sealed class AnalyzerAnonymousInnerClassHelper2 : Analyzer
+        {
+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper2(TEST_VERSION_CURRENT, reader);
+                return new TokenStreamComponents(tokenizer, tokenizer);
+            }
+
+            private sealed class LetterTokenizerAnonymousInnerClassHelper2 : LetterTokenizer
+            {
+                public LetterTokenizerAnonymousInnerClassHelper2(LuceneVersion TEST_VERSION_CURRENT, TextReader reader)
+                    : base(TEST_VERSION_CURRENT, reader)
+                {
+                }
+
+                protected override int Normalize(int c)
+                {
+                    if (c <= 0xffff)
+                    {
+                        return 0x1043C;
+                    }
+                    else
+                    {
+                        return c;
+                    }
+                }
+            }
+        }
+    }
 
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/4dcbcd1a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
index 2475ab5..722b345 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
+++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj
@@ -52,6 +52,7 @@
   <ItemGroup>
     <Compile Include="Analysis\Util\TestCharacterUtils.cs" />
     <Compile Include="Analysis\Util\TestCharArrayIterator.cs" />
+    <Compile Include="Analysis\Util\TestCharTokenizers.cs" />
     <Compile Include="Analysis\Util\TestRollingCharBuffer.cs" />
     <Compile Include="Analysis\Util\TestSegmentingTokenizerBase.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />


Mime
View raw message