Return-Path: Delivered-To: apmail-xml-xerces-cvs-archive@xml.apache.org Received: (qmail 43245 invoked by uid 500); 3 Jul 2002 22:21:29 -0000 Mailing-List: contact xerces-cvs-help@xml.apache.org; run by ezmlm Precedence: bulk list-help: list-unsubscribe: list-post: Reply-To: Delivered-To: mailing list xerces-cvs@xml.apache.org Received: (qmail 43234 invoked from network); 3 Jul 2002 22:21:29 -0000 Date: 3 Jul 2002 22:21:32 -0000 Message-ID: <20020703222132.45570.qmail@icarus.apache.org> From: neilg@apache.org To: xml-xerces-cvs@apache.org Subject: cvs commit: xml-xerces/java/src/org/apache/xerces/impl/xpath/regex RangeToken.java REUtil.java RegexParser.java RegularExpression.java Token.java X-Spam-Rating: daedalus.apache.org 1.6.2 0/1000/N neilg 2002/07/03 15:21:31 Modified: java/src/org/apache/xerces/impl/xpath/regex RangeToken.java REUtil.java RegexParser.java RegularExpression.java Token.java Log: applying some fixes from Kent Tamura to the regular expression code used in Schema validation. This should fix bugzilla bugs 3560, 7752, 7806. 
Revision Changes Path 1.3 +0 -2 xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RangeToken.java Index: RangeToken.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RangeToken.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- RangeToken.java 29 Jan 2002 01:15:14 -0000 1.2 +++ RangeToken.java 3 Jul 2002 22:21:31 -0000 1.3 @@ -228,8 +228,6 @@ } protected void mergeRanges(Token token) { - if (token.type != this.type) - throw new IllegalArgumentException("Token#mergeRanges(): Mismatched Type: "+token.type); RangeToken tok = (RangeToken)token; this.sortRanges(); tok.sortRanges(); 1.4 +6 -3 xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/REUtil.java Index: REUtil.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/REUtil.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- REUtil.java 29 Jan 2002 01:15:14 -0000 1.3 +++ REUtil.java 3 Jul 2002 22:21:31 -0000 1.4 @@ -293,13 +293,15 @@ synchronized (REUtil.regexCache) { int i; for (i = 0; i < REUtil.CACHESIZE; i ++) { - re = REUtil.regexCache[i]; - if (re == null) { + RegularExpression cached = REUtil.regexCache[i]; + if (cached == null) { i = -1; break; } - if (re.equals(pattern, intOptions)) + if (cached.equals(pattern, intOptions)) { + re = cached; break; + } } if (re != null) { if (i != 0) { @@ -347,6 +349,7 @@ if (i > 0) buffer.append(literal.substring(0, i)); } buffer.append((char)'\\'); + buffer.append((char)ch); } else if (buffer != null) buffer.append((char)ch); } 1.3 +14 -152 xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegexParser.java Index: RegexParser.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegexParser.java,v retrieving revision 1.2 
retrieving revision 1.3 diff -u -r1.2 -r1.3 --- RegexParser.java 29 Jan 2002 01:15:14 -0000 1.2 +++ RegexParser.java 3 Jul 2002 22:21:31 -0000 1.3 @@ -301,9 +301,6 @@ default: ret = T_CHAR; - if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen) - this.chardata = REUtil.composeFromSurrogates(this.chardata, - this.regex.charAt(this.offset++)); } this.nexttoken = ret; } @@ -775,7 +772,16 @@ case T_CHAR: tok = Token.createChar(this.chardata); + int high = this.chardata; this.next(); + if (REUtil.isHighSurrogate(high) + && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { + char[] sur = new char[2]; + sur[0] = (char)high; + sur[1] = (char)this.chardata; + tok = Token.createParen(Token.createString(new String(sur)), 0); + this.next(); + } break; default: @@ -810,8 +816,10 @@ int namestart = this.offset; int nameend = this.regex.indexOf('}', namestart); if (nameend < 0) throw this.ex("parser.atom.3", this.offset); + String pname = this.regex.substring(namestart, nameend); this.offset = nameend+1; - tok = Token.getRange(this.regex.substring(namestart, nameend), positive); + tok = Token.getRange(pname, positive, + this.isSet(RegularExpression.XMLSCHEMA_MODE)); /* if (this.isSet(RegularExpression.IGNORE_CASE)) tok = RangeToken.createCaseInsensitiveToken(tok); @@ -900,7 +908,8 @@ positive = false; } String name = this.regex.substring(this.offset, nameend); - RangeToken range = Token.getRange(name, positive); + RangeToken range = Token.getRange(name, positive, + this.isSet(RegularExpression.XMLSCHEMA_MODE)); if (range == null) throw this.ex("parser.cc.3", this.offset); tok.mergeRanges(range); end = true; @@ -933,153 +942,6 @@ } if (this.read() == T_EOF) throw this.ex("parser.cc.2", this.offset); - if (!useNrange && nrange) { - base.subtractRanges(tok); - tok = base; - } - tok.sortRanges(); - tok.compactRanges(); - //tok.dumpRanges(); - /* - if (this.isSet(RegularExpression.IGNORE_CASE)) - tok = RangeToken.createCaseInsensitiveToken(tok); - 
*/ - this.setContext(S_NORMAL); - this.next(); // Skips ']' - - return tok; - } - private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException { - this.setContext(S_INBRACKETS); - this.next(); // '[' - boolean nrange = false; - RangeToken base = null; - RangeToken tok; - if (this.read() == T_CHAR && this.chardata == '^') { - nrange = true; - this.next(); // '^' - if (useNrange) { - tok = Token.createNRange(); - } else { - base = Token.createRange(); - base.addRange(0, Token.UTF16_MAX); - tok = Token.createRange(); - } - } else { - tok = Token.createRange(); - } - int type; - while ((type = this.read()) != T_EOF - && !(type == T_CHAR && this.chardata == ']')) { - int c = this.chardata; - /* - if (type == T_CHAR && c == '^') { - this.next(); - type = this.read(); - c = this.chardata; - if (type == T_EOF) break; - - nrange = !nrange; - if (nrange) - tok = Token.createRange(); - else { - base.subtractRanges(tok); - tok = base; - } - } - */ - boolean end = false; - if (type == T_BACKSOLIDUS) { - switch (c) { - case 'd': case 'D': - case 'w': case 'W': - case 's': case 'S': - tok.mergeRanges(this.getTokenForShorthand(c)); - end = true; - break; - - case 'i': case 'I': - case 'c': case 'C': - c = this.processCIinCharacterClass(tok, c); - if (c < 0) end = true; - break; - - case 'p': - case 'P': - boolean positive = c == 'p'; - int pstart = this.offset; - this.next(); - if (this.read() != T_CHAR) throw ex("parser.atom.2", this.offset-1); - RangeToken tok2 = null; - switch (this.chardata) { - case 'L': // Letter - tok2 = Token.getRange("L", positive); break; - case 'M': // Mark - tok2 = Token.getRange("M", positive); break; - case 'N': // Number - tok2 = Token.getRange("N", positive); break; - case 'Z': // Separator - tok2 = Token.getRange("Z", positive); break; - case 'C': // Other - tok2 = Token.getRange("C", positive); break; - case 'P': // Punctuation - tok2 = Token.getRange("P", positive); break; - case 'S': // Symbol - tok2 = Token.getRange("S", 
positive); break; - case '{': - // this.offset points the next of '{'. - pstart = this.offset; - int namestart = this.offset; - int nameend = this.regex.indexOf('}', namestart); - if (nameend < 0) throw ex("parser.atom.3", this.offset); - this.offset = nameend+1; - tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive); - break; - - default: - throw ex("parser.atom.2", this.offset-1); - } - if (tok2 == null) throw ex("parser.atom.5", pstart); - tok.mergeRanges(tok2); - end = true; - break; - - default: - c = this.decodeEscaped(); - } // \ + c - } // backsolidus - // POSIX Character class such as [:alnum:] - else if (type == T_POSIX_CHARCLASS_START) { - int nameend = this.regex.indexOf(':', this.offset); - if (nameend < 0) throw ex("parser.cc.1", this.offset); - String name = this.regex.substring(this.offset, nameend); - RangeToken range = Token.getRange(name, true); - if (range == null) throw ex("parser.cc.3", this.offset); - tok.mergeRanges(range); - end = true; - if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') - throw ex("parser.cc.1", nameend); - this.offset = nameend+2; - } - this.next(); - if (!end) { - if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. 
- tok.addRange(c, c); - } else { - this.next(); // Skips '-' - if ((type = this.read()) == T_EOF) throw ex("parser.cc.2", this.offset); - int rangeend = this.chardata; - if (type == T_BACKSOLIDUS) - rangeend = this.decodeEscaped(); - this.next(); - tok.addRange(c, rangeend); - } - } - if (this.read() == T_CHAR && this.chardata == ',') - this.next(); - } - if (this.read() == T_EOF) - throw ex("parser.cc.2", this.offset); if (!useNrange && nrange) { base.subtractRanges(tok); tok = base; 1.3 +6 -6 xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java Index: RegularExpression.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- RegularExpression.java 29 Jan 2002 01:15:14 -0000 1.2 +++ RegularExpression.java 3 Jul 2002 22:21:31 -0000 1.3 @@ -905,7 +905,7 @@ while (true) { if (op == null) - return offset; + return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; if (offset > con.limit || offset < con.start) return -1; switch (op.type) { @@ -1221,7 +1221,7 @@ if (DEBUG) { System.err.println("UNION: "+i+", ret="+ret); } - if (ret == con.length ) return ret; + if (ret >= 0) return ret; } return -1; @@ -1625,7 +1625,7 @@ while (true) { if (op == null) - return offset; + return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; if (offset > con.limit || offset < con.start) return -1; switch (op.type) { @@ -1940,7 +1940,7 @@ if (DEBUG) { System.err.println("UNION: "+i+", ret="+ret); } - if (ret == con.length ) return ret; + if (ret >= 0) return ret; } return -1; @@ -2272,7 +2272,7 @@ while (true) { if (op == null) - return offset; + return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? 
-1 : offset; if (offset > con.limit || offset < con.start) return -1; switch (op.type) { @@ -2588,7 +2588,7 @@ if (DEBUG) { System.err.println("UNION: "+i+", ret="+ret); } - if (ret == con.length) return ret; + if (ret >= 0) return ret; } return -1; 1.3 +86 -25 xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/Token.java Index: Token.java =================================================================== RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/Token.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Token.java 29 Jan 2002 01:15:14 -0000 1.2 +++ Token.java 3 Jul 2002 22:21:31 -0000 1.3 @@ -744,7 +744,7 @@ /*FEFF..FEFF;*/ "Specials", /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", //missing Specials add manually - /*10300..1032F;*/ "Old Italic", + /*10300..1032F;*/ "Old Italic", // 87 /*10330..1034F;*/ "Gothic", /*10400..1044F;*/ "Deseret", /*1D000..1D0FF;*/ "Byzantine Musical Symbols", @@ -771,10 +771,21 @@ +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" - +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F" - +"\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F"; + +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; + static final int[] nonBMPBlockRanges = { + 0x10300, 0x1032F, // 87 + 0x10330, 0x1034F, + 0x10400, 0x1044F, + 0x1D000, 0x1D0FF, + 0x1D100, 0x1D1FF, + 0x1D400, 0x1D7FF, + 0x20000, 0x2A6D6, + 0x2F800, 0x2FA1F, + 0xE0000, 0xE007F + }; + private static final int NONBMP_BLOCK_START = 87; - static protected RangeToken getRange(String name, boolean positive) { + static protected RangeToken getRange(String 
name, boolean positive) { if (Token.categories.size() == 0) { synchronized (Token.categories) { Token[] ranges = new Token[Token.categoryNames.length]; @@ -864,17 +875,23 @@ // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? // StringBuffer buffer = new StringBuffer(50); - int location = 0; for (int i = 0; i < Token.blockNames.length; i ++) { Token r1 = Token.createRange(); - location = i*2; - int rstart = Token.blockRanges.charAt(location); - int rend = Token.blockRanges.charAt(location+1); + int location; + if (i < NONBMP_BLOCK_START) { + location = i*2; + int rstart = Token.blockRanges.charAt(location); + int rend = Token.blockRanges.charAt(location+1); + //DEBUGING + //System.out.println(n+" " +Integer.toHexString(rstart) + // +"-"+ Integer.toHexString(rend)); + r1.addRange(rstart, rend); + } else { + location = (i - NONBMP_BLOCK_START) * 2; + r1.addRange(Token.nonBMPBlockRanges[location], + Token.nonBMPBlockRanges[location + 1]); + } String n = Token.blockNames[i]; - //DEBUGING - //System.out.println(n+" " +Integer.toHexString(rstart) - // +"-"+ Integer.toHexString(rend)); - r1.addRange(rstart, rend); if (n.equals("Specials")) r1.addRange(0xfff0, 0xfffd); if (n.equals("Private Use")) { @@ -883,7 +900,7 @@ } Token.categories.put(n, r1); Token.categories2.put(n, Token.complementRanges(r1)); - buffer.setLength(0); + buffer.setLength(0); buffer.append("Is"); if (n.indexOf(' ') >= 0) { for (int ci = 0; ci < n.length(); ci ++) @@ -895,11 +912,6 @@ Token.setAlias(buffer.toString(), n, true); } - // REVISIT: remove this code later - // the following does not match the XML Schema definition - // for Regular Expressions - - /* // TR#18 1.2 Token.setAlias("ASSIGNED", "Cn", false); Token.setAlias("UNASSIGNED", "Cn", true); @@ -907,44 +919,51 @@ all.addRange(0, Token.UTF16_MAX); Token.categories.put("ALL", all); Token.categories2.put("ALL", Token.complementRanges(all)); - */ - - /* + Token.registerNonXS("ASSIGNED"); + 
Token.registerNonXS("UNASSIGNED"); + Token.registerNonXS("ALL"); + Token isalpha = Token.createRange(); isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo Token.categories.put("IsAlpha", isalpha); Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); - + Token.registerNonXS("IsAlpha"); + Token isalnum = Token.createRange(); isalnum.mergeRanges(isalpha); // Lu Ll Lo isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd Token.categories.put("IsAlnum", isalnum); Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); + Token.registerNonXS("IsAlnum"); Token isspace = Token.createRange(); isspace.mergeRanges(Token.token_spaces); isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z Token.categories.put("IsSpace", isspace); Token.categories2.put("IsSpace", Token.complementRanges(isspace)); + Token.registerNonXS("IsSpace"); Token isword = Token.createRange(); isword.mergeRanges(isalnum); // Lu Ll Lo Nd isword.addRange('_', '_'); Token.categories.put("IsWord", isword); Token.categories2.put("IsWord", Token.complementRanges(isword)); + Token.registerNonXS("IsWord"); Token isascii = Token.createRange(); isascii.addRange(0, 127); Token.categories.put("IsASCII", isascii); Token.categories2.put("IsASCII", Token.complementRanges(isascii)); + Token.registerNonXS("IsASCII"); Token isnotgraph = Token.createRange(); isnotgraph.mergeRanges(ranges[CHAR_OTHER]); isnotgraph.addRange(' ', ' '); Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); Token.categories2.put("IsGraph", isnotgraph); + Token.registerNonXS("IsGraph"); Token isxdigit = Token.createRange(); isxdigit.addRange('0', '9'); @@ -952,13 +971,20 @@ isxdigit.addRange('a', 'f'); Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); Token.categories2.put("IsXDigit", isxdigit); - + Token.registerNonXS("IsXDigit"); + 
Token.setAlias("IsDigit", "Nd", true); Token.setAlias("IsUpper", "Lu", true); Token.setAlias("IsLower", "Ll", true); Token.setAlias("IsCntrl", "C", true); Token.setAlias("IsPrint", "C", false); Token.setAlias("IsPunct", "P", true); + Token.registerNonXS("IsDigit"); + Token.registerNonXS("IsUpper"); + Token.registerNonXS("IsLower"); + Token.registerNonXS("IsCntrl"); + Token.registerNonXS("IsPrint"); + Token.registerNonXS("IsPunct"); Token.setAlias("alpha", "IsAlpha", true); Token.setAlias("alnum", "IsAlnum", true); @@ -973,13 +999,48 @@ Token.setAlias("upper", "IsUpper", true); Token.setAlias("word", "IsWord", true); // Perl extension Token.setAlias("xdigit", "IsXDigit", true); - */ + Token.registerNonXS("alpha"); + Token.registerNonXS("alnum"); + Token.registerNonXS("ascii"); + Token.registerNonXS("cntrl"); + Token.registerNonXS("digit"); + Token.registerNonXS("graph"); + Token.registerNonXS("lower"); + Token.registerNonXS("print"); + Token.registerNonXS("punct"); + Token.registerNonXS("space"); + Token.registerNonXS("upper"); + Token.registerNonXS("word"); + Token.registerNonXS("xdigit"); } // synchronized } // if null RangeToken tok = positive ? (RangeToken)Token.categories.get(name) : (RangeToken)Token.categories2.get(name); - if (tok == null) System.out.println(name); + //if (tok == null) System.out.println(name); return tok; + } + static protected RangeToken getRange(String name, boolean positive, boolean xs) { + RangeToken range = Token.getRange(name, positive); + if (xs && range != null && Token.isRegisterNonXS(name)) + range = null; + return range; + } + + static Hashtable nonxs = null; + /** + * This method is called by only getRange(). + * So this method need not MT-safe. 
+ */ + static protected void registerNonXS(String name) { + if (Token.nonxs == null) + Token.nonxs = new Hashtable(); + Token.nonxs.put(name, name); + } + static protected boolean isRegisterNonXS(String name) { + if (Token.nonxs == null) + return false; + System.err.println("isRegisterNonXS: "+name); + return Token.nonxs.containsKey(name); } private static void setAlias(String newName, String name, boolean positive) { --------------------------------------------------------------------- To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org For additional commands, e-mail: xerces-cvs-help@xml.apache.org