lucene-pylucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From va...@apache.org
Subject svn commit: r935265 - in /lucene/pylucene/trunk: CHANGES Makefile python/ICUNormalizer2Filter.py test/test_Analyzers.py test/test_ICUNormalizer2Filter.py
Date Sat, 17 Apr 2010 22:07:43 GMT
Author: vajda
Date: Sat Apr 17 22:07:43 2010
New Revision: 935265

URL: http://svn.apache.org/viewvc?rev=935265&view=rev
Log:
 - added port of ICUNormalizer2Filter that uses C++ ICU's Normalizer2 via PyICU

Added:
    lucene/pylucene/trunk/python/ICUNormalizer2Filter.py   (with props)
    lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py   (with props)
Modified:
    lucene/pylucene/trunk/CHANGES
    lucene/pylucene/trunk/Makefile
    lucene/pylucene/trunk/test/test_Analyzers.py

Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=935265&r1=935264&r2=935265&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Sat Apr 17 22:07:43 2010
@@ -3,6 +3,7 @@ Version 3.0.0 ->
 ----------------
  - improved support for building on Windows with mingw32
  - added wininst target to Makefile
+ - added port of ICUNormalizer2Filter that uses C++ ICU's Normalizer2 via PyICU
  - 
 
 Version 2.9.0 -> 3.0.0

Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=935265&r1=935264&r2=935265&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Sat Apr 17 22:07:43 2010
@@ -215,6 +215,7 @@ GENERATE=$(JCC) $(foreach jar,$(JARS),--
            --rename org.apache.lucene.search.highlight.SpanScorer=HighlighterSpanScorer \
            --version $(LUCENE_VER) \
            --module python/collections.py \
+           --module python/ICUNormalizer2Filter.py \
            --files $(NUM_FILES)
 
 generate: jars

Added: lucene/pylucene/trunk/python/ICUNormalizer2Filter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/python/ICUNormalizer2Filter.py?rev=935265&view=auto
==============================================================================
--- lucene/pylucene/trunk/python/ICUNormalizer2Filter.py (added)
+++ lucene/pylucene/trunk/python/ICUNormalizer2Filter.py Sat Apr 17 22:07:43 2010
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+#
+#  Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
+#
+#  With this filter, you can normalize text in the following ways:
+#   - NFKC Normalization, Case Folding, and removing Ignorables (the default)
+#   - Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
+#   - Based on rules from a custom normalization mapping.
+#
+#  If you use the defaults, this filter is a simple way to standardize
+#  Unicode text in a language-independent way for search:
+#   - The case folding that it does can be seen as a replacement for
+#     LowerCaseFilter: For example, it handles cases such as the Greek
+#     sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
+#   - The normalization will standardize different forms of the same
+#     character in Unicode. For example, CJK full-width numbers will be
+#     standardized to their ASCII forms.
+#   - Ignorables such as Zero-Width Joiner and Variation Selectors are
+#     removed. These are typically modifier characters that affect display.
+#
+# ====================================================================
+
+from lucene import PythonTokenFilter, CharTermAttribute
+from icu import Normalizer2, UNormalizationMode2, UNormalizationCheckResult
+
+
+class ICUNormalizer2Filter(PythonTokenFilter):
+
+    def __init__(self, input, normalizer=None):
+        super(ICUNormalizer2Filter, self).__init__(input)
+
+        self.input = input
+        self.termAtt = self.addAttribute(CharTermAttribute.class_);
+
+        if normalizer is None:
+            normalizer = Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.UNORM2_COMPOSE)
+        self.normalizer = normalizer
+
+    def incrementToken(self):
+
+        if self.input.incrementToken():
+            text = self.termAtt.toString()
+
+            if self.normalizer.quickCheck(text) != UNormalizationCheckResult.UNORM_YES:
+                self.termAtt.setEmpty()
+                self.termAtt.append(self.normalizer.normalize(text))
+                
+            return True
+
+        return False

Propchange: lucene/pylucene/trunk/python/ICUNormalizer2Filter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/python/ICUNormalizer2Filter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: lucene/pylucene/trunk/test/test_Analyzers.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_Analyzers.py?rev=935265&r1=935264&r2=935265&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_Analyzers.py (original)
+++ lucene/pylucene/trunk/test/test_Analyzers.py Sat Apr 17 22:07:43 2010
@@ -13,7 +13,7 @@
 # ====================================================================
 
 from unittest import main
-from BaseTokenStreamTestCase import BaseTokenStreamTestCase;
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
 from lucene import *
 
 

Added: lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py?rev=935265&view=auto
==============================================================================
--- lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py (added)
+++ lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py Sat Apr 17 22:07:43 2010
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+
+try:
+    from icu import Normalizer2, UNormalizationMode2
+except ImportError, e:
+    pass
+
+from unittest import main
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
+
+from lucene import *
+from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
+
+
+class TestICUNormalizer2Filter(BaseTokenStreamTestCase):
+
+    def testDefaults(self):
+
+        class analyzer(PythonAnalyzer):
+            def tokenStream(_self, fieldName, reader):
+                return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))
+
+        a = analyzer()
+
+        # case folding
+        self._assertAnalyzesTo(a, "This is a test",
+                               [ "this", "is", "a", "test" ])
+
+        # case folding
+        self._assertAnalyzesTo(a, "Ruß", [ "russ" ])
+    
+        # case folding
+        self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μάϊοσ" ])
+        self._assertAnalyzesTo(a, u"Μάϊος", [ u"μάϊοσ" ])
+
+        # supplementary case folding
+        self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])
+    
+        # normalization
+        self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])
+
+        # removal of default ignorables
+        self._assertAnalyzesTo(a, u"क्‍ष", [ u"क्ष" ])
+  
+    def testAlternate(self):
+
+        class analyzer(PythonAnalyzer):
+            # specify nfc with decompose to get nfd
+            def tokenStream(_self, fieldName, reader):
+                return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
+                                            Normalizer2.getInstance(None, "nfc", UNormalizationMode2.UNORM2_DECOMPOSE))
+
+        a = analyzer()
+        # decompose EAcute into E + combining Acute
+        self._assertAnalyzesTo(a, u"\u00E9", [ u"\u0065\u0301" ])
+
+
+if __name__ == "__main__":
+    import sys, lucene
+    try:
+        import icu
+    except ImportError:
+        pass
+    else:
+        lucene.initVM()
+        if '-loop' in sys.argv:
+            sys.argv.remove('-loop')
+            while True:
+                try:
+                    main()
+                except:
+                    pass
+        else:
+             main()

Propchange: lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/test/test_ICUNormalizer2Filter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message