lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [27/52] [abbrv] lucenenet git commit: Added test to demonstrate a problem with making the CharTokenizer.IsTokenChar() parameter a char rather than an int.
Date Thu, 01 Sep 2016 14:39:48 GMT
Added test to demonstrate a problem with making the CharTokenizer.IsTokenChar() parameter a
char rather than an int.


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c36a0bd1
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c36a0bd1
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c36a0bd1

Branch: refs/heads/master
Commit: c36a0bd1239061a07756b7735dcdd7f3dab016a8
Parents: 56cdc04
Author: Shad Storhaug <shad@shadstorhaug.com>
Authored: Tue Aug 23 15:39:52 2016 +0700
Committer: Shad Storhaug <shad@shadstorhaug.com>
Committed: Tue Aug 23 15:55:19 2016 +0700

----------------------------------------------------------------------
 .../Analysis/Util/TestCharTokenizers.cs         | 46 +++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/c36a0bd1/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
index 0d28101..d452d83 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharTokenizers.cs
@@ -240,6 +240,50 @@ namespace Lucene.Net.Tests.Analysis.Common.Analysis.Util
                 }
             }
         }
-    }
 
+        /// <summary>
+        /// LUCENENET: Added this test as proof that making the IsTokenChar parameter a char
+        /// is not going to work 100% of the time because of surrogate pairs.
+        /// </summary>
+
+        [Test]
+        public virtual void TestSurrogates()
+        {
+            var analyzer = new AnalyzerAnonymousInnerClassHelper3();
+
+            AssertAnalyzesTo(analyzer, "bar 123" + (char)55404 + (char)56321 + "34 5te 987", new string[] { "123𫀁34", "5", "987" });
+            AssertAnalyzesTo(analyzer, "787 " + (char)55297 + (char)56388 + "6" + (char)55404 + (char)56321 + " art true 734", new string[] { "787", "𐑄6𫀁", "734" });
+        }
+
+        private sealed class AnalyzerAnonymousInnerClassHelper3 : Analyzer
+        {
+            public AnalyzerAnonymousInnerClassHelper3()
+            { }
+
+            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+            {
+                Tokenizer tokenizer = new NumberAndSurrogatePairTokenizer(TEST_VERSION_CURRENT, reader);
+                return new TokenStreamComponents(tokenizer, tokenizer);
+            }
+
+            private sealed class NumberAndSurrogatePairTokenizer : CharTokenizer
+            {
+                public NumberAndSurrogatePairTokenizer(LuceneVersion matchVersion, TextReader reader)
+                    : base(matchVersion, reader)
+                {
+                }
+
+                protected override bool IsTokenChar(char c)
+                {
+                    if (char.IsNumber((char)c))
+                    {
+                        return true;
+                    }
+
+                    string character = char.ConvertFromUtf32(c);
+                    return char.IsSurrogatePair(character, 0);
+                }
+            }
+        }
+    }
 }
\ No newline at end of file


Mime
View raw message