Return-Path: Mailing-List: contact oro-dev-help@jakarta.apache.org; run by ezmlm Delivered-To: mailing list oro-dev@jakarta.apache.org Received: (qmail 59809 invoked from network); 12 Nov 2000 09:01:38 -0000 Received: from ms0.nttdata.co.jp (HELO ms.nttdata.co.jp) (163.135.193.231) by locus.apache.org with SMTP; 12 Nov 2000 09:01:38 -0000 Received: from mail1.nttdata.co.jp ([163.135.10.21]) by ms.nttdata.co.jp (8.9.3/3.7W-NTTDATA-TOP-08/31/00) with ESMTP id SAA14519 for ; Sun, 12 Nov 2000 18:01:37 +0900 (JST) Received: from geb.rd.nttdata.co.jp (localhost [127.0.0.1]) by mail1.nttdata.co.jp (8.9.1/3.7W-NTTDATA-TOP-09/22/00) with ESMTP id SAA19994 for ; Sun, 12 Nov 2000 18:02:09 +0900 (JST) Received: from osiris.rd.nttdata.co.jp (osiris.rd.nttdata.co.jp [10.8.144.16]) by geb.rd.nttdata.co.jp (8.9.3/3.7W/R8V8) with ESMTP id SAA19245 for ; Sun, 12 Nov 2000 18:01:38 +0900 (JST) Received: from piccolo (RAS-10-8-157-15.rd.nttdata.co.jp [10.8.157.15]) by osiris.rd.nttdata.co.jp (8.10.1/3.7W/R8V8) with ESMTP id eAC91Vt00075 for ; Sun, 12 Nov 2000 18:01:32 +0900 (JST) Message-ID: <002c01c04b4c$8648f5a0$5919fea9@rd.nttdata.co.jp> From: "Takashi Okamoto" To: Subject: [PATCH] for unicode problem over 0xff characters Date: Sat, 11 Nov 2000 04:26:54 +0900 MIME-Version: 1.0 Content-Type: text/plain; charset="iso-2022-jp" Content-Transfer-Encoding: 7bit X-Priority: 3 X-MSMail-Priority: Normal X-Mailer: Microsoft Outlook Express 5.00.2919.6600 X-MimeOLE: Produced By Microsoft MimeOLE V5.00.2919.6600 X-Spam-Rating: locus.apache.org 1.6.2 0/1000/N Hello, jakarta-oro developers. I made a patch for a Unicode problem in Perl5Compiler.java and Perl5Matcher.java. jakarta-oro currently has the following problems: [Problem 1] Perl5Util perl = new Perl5Util(); boolean result = perl.match("m![a-z]+!", "abcdX"); Here 'X' is a Unicode character above 0xff. This match throws a fatal exception!
[Problem 2] Perl5Util perl = new Perl5Util(); boolean result = perl.match("m![X-Y]+!", "ABCDEF"); Here 'X' and 'Y' are Unicode characters above 0xff, and the characters in 'ABCDEF' are also Unicode characters that fall between 'X' and 'Y'. The result of this match is false! (true is the correct result.) These problems no longer occur after applying my patch to Perl5Compiler.java and Perl5Matcher.java. This patch may not be the best approach, because I don't know the jakarta-oro code very well, but I hope the next jakarta-oro release resolves these Unicode problems. Regards. PS. This patch is against the CVS source as of 2000/11/10. Install memo: [1] download jakarta-oro from CVS [2] cd jakarta-oro/src/java/org/apache/oro/text/regexp [3] patch -p1 < [the patch at the end of this mail] [4] build jakarta-oro ------------------------ Takashi Okamoto ------- patch for Perl5Compiler.java and Perl5Matcher.java ------- *** Perl5Compiler.java.org Fri Nov 10 09:55:15 2000 --- Perl5Compiler.java Fri Nov 10 09:09:34 2000 *************** *** 925,934 **** private void __setCharacterClassBits(char[] bits, int offset, char deflt, char ch) { ! if(__program== null || ch >= 256) return; - ch &= 0xffff; if(deflt == 0) { bits[offset + (ch >> 4)] |= (1 << (ch & 0xf)); } else { --- 925,935 ---- private void __setCharacterClassBits(char[] bits, int offset, char deflt, char ch) { ! if(__program == null) return; + __extendProgramSize( offset + (ch >> 4) ); + ch &= 0xffff; if(deflt == 0) { bits[offset + (ch >> 4)] |= (1 << (ch & 0xf)); } else { *************** *** 936,942 **** } } ! private int __parseCharacterClass() throws MalformedPatternException { boolean range = false, skipTest; char clss, deflt, lastclss = Character.MAX_VALUE; --- 937,949 ---- } } ! private void __extendProgramSize (int max) ! { ! if( max > __programSize ) { ! __programSize = max + 1; ! } ! } ! 
private int __parseCharacterClass() throws MalformedPatternException { boolean range = false, skipTest; char clss, deflt, lastclss = Character.MAX_VALUE; *************** *** 1468,1475 **** if(__programSize >= Character.MAX_VALUE - 1) throw new MalformedPatternException("Expression is too large."); - __program= new char[__programSize]; regexp = new Perl5Pattern(); regexp._program = __program; --- 1475,1486 ---- if(__programSize >= Character.MAX_VALUE - 1) throw new MalformedPatternException("Expression is too large."); + __program = new char[Character.MAX_VALUE >> 4]; + + for(int i = 0 ;i < Character.MAX_VALUE >> 4 ;i++){ + __program[i] = Character.MAX_VALUE; + } regexp = new Perl5Pattern(); regexp._program = __program; *** Perl5Matcher.java.org Fri Nov 10 09:55:37 2000 --- Perl5Matcher.java Fri Nov 10 09:29:51 2000 *************** *** 412,418 **** while(__currentOffset < endOffset) { ch = __input[__currentOffset]; ! if(ch < 256 && (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; --- 412,418 ---- while(__currentOffset < endOffset) { ch = __input[__currentOffset]; ! if( (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; *************** *** 655,661 **** break; case OpCode._ANYOF: ! if(scan < eol && (ch = __input[scan]) < 256) { while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(++scan < eol) ch = __input[scan]; --- 655,662 ---- break; case OpCode._ANYOF: ! if(scan < eol ) { ! ch = __input[scan]; while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(++scan < eol) ch = __input[scan]; *************** *** 805,811 **** if(nextChar == __EOS && inputRemains) nextChar = __input[input]; ! if(nextChar >= 256 || (__program[current + (nextChar >> 4)] & (1 << (nextChar & 0xf))) != 0) return false; --- 806,812 ---- if(nextChar == __EOS && inputRemains) nextChar = __input[input]; ! 
if((__program[current + (nextChar >> 4)] & (1 << (nextChar & 0xf))) != 0) return false;