lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bugzi...@apache.org
Subject DO NOT REPLY [Bug 23466] New: - StandardTokenizer with CJK support(sigram)
Date Sun, 28 Sep 2003 15:10:23 GMT
DO NOT REPLY TO THIS EMAIL, BUT PLEASE POST YOUR BUG 
RELATED COMMENTS THROUGH THE WEB INTERFACE AVAILABLE AT
<http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23466>.
ANY REPLY MADE TO THIS MESSAGE WILL NOT BE COLLECTED AND 
INSERTED IN THE BUG DATABASE.

http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23466

StandardTokenizer with CJK support(sigram)

           Summary: StandardTokenizer with CJK support(sigram)
           Product: Lucene
           Version: CVS Nightly - Specify date in submission
          Platform: All
               URL: http://www.chedong.com/
        OS/Version: All
            Status: NEW
          Severity: Enhancement
          Priority: Other
         Component: Analysis
        AssignedTo: lucene-dev@jakarta.apache.org
        ReportedBy: chedong@hotmail.com


diff -ub StandardTokenizer.jj StandardTokenizer.jj.orig 
--- StandardTokenizer.jj        Sun Sep 28 01:52:18 2003
+++ StandardTokenizer.jj.orig   Sun Sep 28 01:51:57 2003
@@ -54,12 +54,12 @@
 
 options {
     STATIC = false;
-    //IGNORE_CASE = true;
-    //BUILD_PARSER = false;
-    UNICODE_INPUT = true;
+//IGNORE_CASE = true;
+//BUILD_PARSER = false;
+//UNICODE_INPUT = true;
     USER_CHAR_STREAM = true;
     OPTIMIZE_TOKEN_MANAGER = true;
-    //DEBUG_TOKEN_MANAGER = true;
+//DEBUG_TOKEN_MANAGER = true;
 }
 PARSER_BEGIN(StandardTokenizer)
 
@@ -89,7 +89,7 @@
 TOKEN : {                                        // token patterns
 
     // basic word: a sequence of digits & letters
-<ALPHANUM: (<LETTER>|<DIGIT>)+ >
+  <ALPHANUM: (<LETTER>|<DIGIT>)+ >
 
     // internal apostrophes: O'Reilly, you're, O'Reilly's
     // use a post-filter to remove possesives
@@ -118,7 +118,6 @@
              | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
             )
     >
-| <SIGRAM: (<CJK>) >
 | <#P: ("_"|"-"|"/"|"."|",") >
 | <#HAS_DIGIT:                                   // at least one digit
     (<LETTER>|<DIGIT>)*
@@ -127,18 +126,14 @@
     >
 
 | < #ALPHA: (<LETTER>)+>
-| < #LETTER:                                     // alphabets
+| < #LETTER:                                     // unicode letters
     [
         "\u0041"-"\u005a",
         "\u0061"-"\u007a",
         "\u00c0"-"\u00d6",
         "\u00d8"-"\u00f6",
         "\u00f8"-"\u00ff",
-        "\u0100"-"\u1fff"
-    ]
-    >
-|  < #CJK:       // non-alphabets
-      [
+       "\u0100"-"\u1fff",
        "\u3040"-"\u318f",
        "\u3300"-"\u337f",
        "\u3400"-"\u3d2d",
@@ -168,7 +163,7 @@
 }
 
 SKIP : {                                         // skip unrecognized chars
-<NOISE: ~[] >
+ <NOISE: ~[] >
 }
 
 /** Returns the next token in the stream, or null at EOS.
@@ -187,7 +182,6 @@
         token = <EMAIL> |
         token = <HOST> |
         token = <NUM> |
-        token = <SIGRAM> |
         token = <EOF>
     )
     {

Mime
View raw message