lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r881850 - /incubator/lucene.net/trunk/C#/src/Test/QueryParser/TestQueryParser.cs
Date Wed, 18 Nov 2009 17:51:28 GMT
Author: digy
Date: Wed Nov 18 17:51:28 2009
New Revision: 881850

URL: http://svn.apache.org/viewvc?rev=881850&view=rev
Log:
LUCENENET-281 TestCJK on TestQueryParser fails

Modified:
    incubator/lucene.net/trunk/C#/src/Test/QueryParser/TestQueryParser.cs

Modified: incubator/lucene.net/trunk/C#/src/Test/QueryParser/TestQueryParser.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Test/QueryParser/TestQueryParser.cs?rev=881850&r1=881849&r2=881850&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Test/QueryParser/TestQueryParser.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Test/QueryParser/TestQueryParser.cs Wed Nov 18 17:51:28 2009
@@ -294,13 +294,29 @@
 		}
 		
 		[Test]
-		public virtual void  TestCJK()
-		{
-			// Test Ideographic Space - As wide as a CJK character cell (fullwidth)
-			// used google to translate the word "term" to japanese -> 用語
-			AssertQueryEquals("term\u3000term\u3000term", null, "term\u0020term\u0020term");
-			AssertQueryEquals("用語\u3000用語\u3000用語",
null, "用語\u0020用語\u0020用語");
-		}
+        public virtual void TestCJK()
+        {
+            // Test Ideographic Space - As wide as a CJK character cell (fullwidth)
+            // used google to translate the word "term" to japanese -> 用語
+            //
+            // NOTE: What is printed above is not the translation of "term" into
+            // Japanese.  Google translate currently gives:
+            //
+            // 期間
+            //
+            // Which translates to unicode characters 26399 and 38291, or
+            // the literals '\u671f' and '\u9593'.
+            //
+            // These two characters are letters as defined by Unicode, unlike the second and
+            // third characters in the previous string ('\u201d' and '\u00a8'). Those fail the
+            // IsLetter test when tokenized by LetterTokenizer (as they should in Java), which
+            // causes the word to be split differently than if it actually used letters as
+            // defined by Unicode.
+            //
+            // Using the string "\u671f\u9593\u3000\u671f\u9593\u3000\u671f\u9593" with just the two
+            // characters is enough, as it uses two characters with the full width of a CJK character cell.
+            AssertQueryEquals("term\u3000term\u3000term", null, "term\u0020term\u0020term");
+            AssertQueryEquals("\u671f\u9593\u3000\u671f\u9593\u3000\u671f\u9593", null, "\u671f\u9593\u0020\u671f\u9593\u0020\u671f\u9593");
+        }
 		
 		[Test]
 		public virtual void  TestSimple()



Mime
View raw message