lucene-pylucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From va...@apache.org
Subject svn commit: r1032428 - /lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
Date Mon, 08 Nov 2010 00:13:02 GMT
Author: vajda
Date: Mon Nov  8 00:13:02 2010
New Revision: 1032428

URL: http://svn.apache.org/viewvc?rev=1032428&view=rev
Log:
refreshed test_ThaiAnalyzer.py

Modified:
    lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py

Modified: lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py?rev=1032428&r1=1032427&r2=1032428&view=diff
==============================================================================
--- lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py (original)
+++ lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py Mon Nov  8 00:13:02 2010
@@ -14,15 +14,64 @@
 # ====================================================================
 
 from unittest import TestCase, main
-from lucene import ThaiAnalyzer, StringReader, Version
+from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
 
 
 class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
 
-    def testAnalyzer(self):
+    def testOffsets(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
+
+        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+                               u"การที่ได้ต้องแสดงว่างานดี",
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 13, 17, 20, 23 ],
+                               [ 3, 6, 9, 13, 17, 20, 23, 25 ])
+
+    def testTokenType(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
+
+        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+                               u"การที่ได้ต้องแสดงว่างานดี ๑๒๓", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
+                               None, None,
+                               [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                                 "<NUM>" ])
+
+    def testPositionIncrements(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
 
         analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
+
+        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 18, 22, 25, 28 ],
+                               [ 3, 6, 9, 13, 22, 25, 28, 30 ],
+                               None,
+                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+	 
+        # case that a stopword is adjacent to thai text, with no whitespace
+        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 17, 21, 24, 27 ],
+                               [ 3, 6, 9, 13, 21, 24, 27, 29 ],
+                               None,
+                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+
+    def testAnalyzer30(self):
+
+        analyzer = ThaiAnalyzer(Version.LUCENE_30)
     
         self._assertAnalyzesTo(analyzer, u"", [])
 



Mime
View raw message