jakarta-oro-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Takashi Okamoto" <tokam...@rd.nttdata.co.jp>
Subject [PATCH] for unicode problem over 0xff characters
Date Fri, 10 Nov 2000 19:26:54 GMT
Hello, jakarta-oro developers.

I made a patch for a Unicode problem in Perl5Compiler.java and
Perl5Matcher.java.
Currently jakarta-oro has the following problems:

[Problem 1]

 Perl5Util perl = new Perl5Util();
 boolean result = perl.match("m![a-z]+!", "abcdX");


 'X' is a Unicode character above 0xff.

 This match throws a fatal exception!!

[Problem 2]

 Perl5Util perl = new Perl5Util();
 boolean result = perl.match("m![X-Y]+!", "ABCDEF");


 'X' and 'Y' are Unicode characters above 0xff.
 'ABCDEF' are also Unicode characters, all lying between 'X' and 'Y'.
 The result of this match is false!! (true is the correct result)


These problems no longer occur after applying my patch to
Perl5Compiler.java and Perl5Matcher.java.

This patch may not be the best approach, because I don't know the
jakarta-oro code very well.

But I hope the next jakarta-oro release resolves these Unicode problems.


Regards.


PS.
This patch is against the CVS source as of 2000/11/10.

install memo

[1] download jakarta-oro from CVS
[2] cd jakarta-oro/src/java/org/apache/oro/text/regexp
[3] patch -p1 < [the patch at the end of this mail]
[4] build jakarta-oro
------------------------
Takashi Okamoto



------- patch for Perl5Compiler.java and Perl5Matcher.java -------

*** Perl5Compiler.java.org Fri Nov 10 09:55:15 2000
--- Perl5Compiler.java Fri Nov 10 09:09:34 2000
***************
*** 925,934 ****
    private void __setCharacterClassBits(char[] bits, int offset, char deflt,
             char ch)
    {
!     if(__program== null || ch >= 256)
        return;
-     ch &= 0xffff;

      if(deflt == 0) {
        bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
      } else {
--- 925,935 ----
    private void __setCharacterClassBits(char[] bits, int offset, char deflt,
             char ch)
    {
!     if(__program == null)
        return;

+     __extendProgramSize( offset + (ch >> 4) );
+     ch &= 0xffff;
      if(deflt == 0) {
        bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
      } else {
***************
*** 936,942 ****
      }
    }

!
    private int __parseCharacterClass() throws MalformedPatternException {
      boolean range = false, skipTest;
      char clss, deflt, lastclss = Character.MAX_VALUE;
--- 937,949 ----
      }
    }

!   private void __extendProgramSize (int  max)
!   {
!       if( max > __programSize ) {
!    __programSize = max + 1;
!       }
!   }
!
    private int __parseCharacterClass() throws MalformedPatternException {
      boolean range = false, skipTest;
      char clss, deflt, lastclss = Character.MAX_VALUE;
***************
*** 1468,1475 ****
      if(__programSize >= Character.MAX_VALUE - 1)
        throw new MalformedPatternException("Expression is too large.");


-     __program= new char[__programSize];
      regexp = new Perl5Pattern();

      regexp._program    = __program;
--- 1475,1486 ----
      if(__programSize >= Character.MAX_VALUE - 1)
        throw new MalformedPatternException("Expression is too large.");

+     __program = new char[Character.MAX_VALUE >> 4];
+
+     for(int i = 0 ;i < Character.MAX_VALUE >> 4 ;i++){
+        __program[i] = Character.MAX_VALUE;
+     }

      regexp = new Perl5Pattern();

      regexp._program    = __program;
*** Perl5Matcher.java.org Fri Nov 10 09:55:37 2000
--- Perl5Matcher.java Fri Nov 10 09:29:51 2000
***************
*** 412,418 ****
     while(__currentOffset < endOffset) {
       ch = __input[__currentOffset];

!      if(ch < 256 &&
          (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
         if(tmp && __tryExpression(expression, __currentOffset)) {
    success = true;
--- 412,418 ----
     while(__currentOffset < endOffset) {
       ch = __input[__currentOffset];

!      if(
          (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
         if(tmp && __tryExpression(expression, __currentOffset)) {
    success = true;
***************
*** 655,661 ****
        break;

      case OpCode._ANYOF:
!       if(scan < eol && (ch = __input[scan]) < 256) {
   while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
     if(++scan < eol)
       ch = __input[scan];
--- 655,662 ----
        break;

      case OpCode._ANYOF:
!       if(scan < eol ) {
!       ch = __input[scan];
   while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
     if(++scan < eol)
       ch = __input[scan];
***************
*** 805,811 ****
   if(nextChar == __EOS && inputRemains)
     nextChar = __input[input];

!  if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
       (1 << (nextChar & 0xf))) != 0)
     return false;

--- 806,812 ----
   if(nextChar == __EOS && inputRemains)
     nextChar = __input[input];

!  if((__program[current + (nextChar >> 4)] &
       (1 << (nextChar & 0xf))) != 0)
     return false;




Mime
View raw message