jakarta-oro-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d..@apache.org
Subject cvs commit: jakarta-oro/src/java/org/apache/oro/text/regex OpCode.java Perl5Compiler.java Perl5Debug.java Perl5Matcher.java
Date Thu, 29 Mar 2001 16:33:19 GMT
dfs         01/03/29 08:33:19

  Modified:    src/java/org/apache/oro/text/regex OpCode.java
                        Perl5Compiler.java Perl5Debug.java
                        Perl5Matcher.java
  Log:
  Applied Takashi's fix for his POSIX character class patch.
  
  Revision  Changes    Path
  1.5       +22 -11    jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java
  
  Index: OpCode.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- OpCode.java	2001/01/29 00:19:00	1.4
  +++ OpCode.java	2001/03/29 16:33:17	1.5
  @@ -63,7 +63,7 @@
    * op-codes used in a compiled regular expression.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: OpCode.java,v 1.4 2001/01/29 00:19:00 dfs Exp $
  + @version $Id: OpCode.java,v 1.5 2001/03/29 16:33:17 dfs Exp $
    */
   final class OpCode {
   
  @@ -91,8 +91,8 @@
        _NOTHING = 15,  // no       Match empty string.
        _STAR    = 16,  // yes      Match this (simple) thing 0 or more times.
        _PLUS    = 17,  // yes      Match this (simple) thing 1 or more times.
  -     _ALNUM   = 18,  // no       Match any alphanumeric character
  -     _NALNUM  = 19,  // no       Match any non-alphanumeric character
  +     _WORD   = 18,   // no       Match any word character
  +     _NWORD  = 19,   // no       Match any non-word character
        _BOUND   = 20,  // no       Match "" at any word boundary
        _NBOUND  = 21,  // no       Match "" at any word non-boundary
        _SPACE   = 22,  // no       Match any whitespace character
  @@ -123,20 +123,29 @@
        _UPPER   = 45,
        _XDIGIT  = 46,
        _OPCODE  = 47,
  -     _ONECHAR = 48;
  +     _NOPCODE = 48,
  +     _ONECHAR = 49,
  +     _ALNUM   = 50,
  +     _ASCII   = 51;
   
     // Lengths of the various operands.
     static final int _operandLength[] = {
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 0-9
  +    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 10-19
  +    0, 0, 0, 0, 0, 0, 1, 1, 1, 0, // OpCode 20-29
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 30-39
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 40-49 
  +    0, 0                          // OpCode 50-51 
     };
   
     static final char _opType[] = {
   	_END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
  -	_CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
  -	_NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
  +	_CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _WORD,
  +	_NWORD, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
   	_OPEN, _CLOSE, _MINMOD,	_BOL, _BRANCH, _BRANCH, _END, _WHILEM,
  -	_ANYOFUN, _NANYOFUN
  +	_ANYOFUN, _NANYOFUN, _RANGE, _ALPHA, _BLANK, _CNTRL, _GRAPH,
  +	_LOWER, _PRINT, _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE,
  +	_ONECHAR, _ALNUM, _ASCII
     };
   
     static final char _opLengthVaries[] = {
  @@ -144,8 +153,10 @@
     };
   
     static final char _opLengthOne[] = {
  -    _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT, 
  -    _ANYOFUN, _NANYOFUN
  +    _ANY, _SANY, _ANYOF, _WORD, _NWORD, _SPACE, _NSPACE, _DIGIT, _NDIGIT, 
  +    _ANYOFUN, _NANYOFUN, _ALPHA, _BLANK, _CNTRL, _GRAPH, _LOWER, _PRINT,
  +    _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE, _ONECHAR, _ALNUM,
  +    _ASCII
     };
   
     static final int  _NULL_OFFSET  = -1;
  
  
  
  1.8       +99 -84    jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java
  
  Index: Perl5Compiler.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Perl5Compiler.java	2001/01/29 00:22:05	1.7
  +++ Perl5Compiler.java	2001/03/29 16:33:17	1.8
  @@ -67,7 +67,7 @@
    * information about Perl5 regular expressions.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Compiler.java,v 1.7 2001/01/29 00:22:05 dfs Exp $
  + @version $Id: Perl5Compiler.java,v 1.8 2001/03/29 16:33:17 dfs Exp $
   
    * @see PatternCompiler
    * @see MalformedPatternException
  @@ -110,18 +110,20 @@
     
     static {
       __hashPOSIX = new HashMap();
  -    __hashPOSIX.put("alnum",     new Character('w'));
  +    __hashPOSIX.put("alnum",     new Character(OpCode._ALNUM));
  +    __hashPOSIX.put("word",      new Character(OpCode._WORD));
       __hashPOSIX.put("alpha",     new Character(OpCode._ALPHA));
       __hashPOSIX.put("blank",     new Character(OpCode._BLANK));
       __hashPOSIX.put("cntrl",     new Character(OpCode._CNTRL));
  -    __hashPOSIX.put("digit",     new Character('d'));
  +    __hashPOSIX.put("digit",     new Character(OpCode._DIGIT));
       __hashPOSIX.put("graph",     new Character(OpCode._GRAPH));
       __hashPOSIX.put("lower",     new Character(OpCode._LOWER));
       __hashPOSIX.put("print",     new Character(OpCode._PRINT));
       __hashPOSIX.put("punct",     new Character(OpCode._PUNCT));
  -    __hashPOSIX.put("space",     new Character('s'));
  +    __hashPOSIX.put("space",     new Character(OpCode._SPACE));
       __hashPOSIX.put("upper",     new Character(OpCode._UPPER));
       __hashPOSIX.put("xdigit",    new Character(OpCode._XDIGIT));
  +    __hashPOSIX.put("ascii",     new Character(OpCode._ASCII));
     }
   
   
  @@ -642,12 +644,12 @@
   	  __getNextChar();
   	  break;
   	case 'w':
  -	  offset = __emitNode(OpCode._ALNUM);
  +	  offset = __emitNode(OpCode._WORD);
   	  retFlags[0] |= (__NONNULL | __SIMPLE);
   	  __getNextChar();
   	  break;
   	case 'W':
  -	  offset = __emitNode(OpCode._NALNUM);
  +	  offset = __emitNode(OpCode._NWORD);
   	  retFlags[0] |= (__NONNULL | __SIMPLE);
   	  __getNextChar();
   	  break;
  @@ -732,7 +734,8 @@
   	  if(__input._isAtEnd())
   	    throw new
   	      MalformedPatternException("Trailing \\ in expression.");
  -	  // fall through to default
  +
  +	// fall through to default
   	default:
   	  doDefault = true;
   	  break tryAgain;
  @@ -864,7 +867,6 @@
   	      break forLoop;
   	    }
   	    break;
  -
   	  case CharStringPointer._END_OF_STRING:
   	  case '\0':
   	    if(pOffset >= maxOffset)
  @@ -876,7 +878,6 @@
   	    break;
   	  } // end backslash switch
   	  break;
  -
   	case '#':
   	  if((__modifierFlags[0] & __EXTENDED) != 0) {
   	    while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
  @@ -1106,7 +1107,9 @@
     private int __parseUnicodeClass() throws MalformedPatternException {
       boolean range = false, skipTest;
       char clss, lastclss = Character.MAX_VALUE;
  +
       int offset, numLength[] = { 0 };
  +    boolean negFlag[] = new boolean[1];
       boolean opcodeFlag; /* clss isn't character when this flag true. */
   
       if(__input._getValue() == '^') {
  @@ -1136,83 +1139,85 @@
   	  clss = __input._postIncrement();
   	} else {
   	  /* try POSIX expression */
  -	  char posixOpCode = __parsePOSIX();
  +	  char posixOpCode = __parsePOSIX(negFlag);
   	  if(posixOpCode != 0){
   	    opcodeFlag = true;
   	    clss = posixOpCode;
   	  }
   	}
  -
  -	switch(clss){
  -	case 'w':
  -	  opcodeFlag = true;
  -	  clss = OpCode._ALNUM;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 'W':
  -	  opcodeFlag = true;
  -	  clss = OpCode._NALNUM;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 's':
  -	  opcodeFlag = true;
  -	  clss = OpCode._SPACE;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 'S':
  -	  opcodeFlag = true;
  -	  clss = OpCode._NSPACE;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 'd':
  -	  opcodeFlag = true;
  -	  clss = OpCode._DIGIT;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 'D':
  -	  opcodeFlag = true;
  -	  clss = OpCode._NDIGIT;
  -	  lastclss = Character.MAX_VALUE;
  -	  break;
  -	case 'n':
  -	  clss = '\n';
  -	  break;
  -	case 'r':
  -	  clss = '\r';
  -	  break;
  -	case 't':
  -	  clss = '\t';
  -	  break;
  -	case 'f':
  -	  clss = '\f';
  -	  break;
  -	case 'b':
  -	  clss = '\b';
  -	  break;
  -	case 'e':
  -	  clss = '\033';
  -	  break;
  -	case 'a':
  -	  clss = '\007';
  -	  break;
  -	case 'x':
  -	  clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
  -				  numLength);
  -	  __input._increment(numLength[0]);
  -	  break;
  -	case 'c':
  -	  clss = __input._postIncrement();
  -	  if(Character.isLowerCase(clss))
  -	    clss = Character.toUpperCase(clss);
  -	  clss ^= 64;
  -	  break;
  -	case '0': case '1': case '2': case '3': case '4':
  -	case '5': case '6': case '7': case '8': case '9':
  -	  clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
  -				    3, numLength);
  -	  __input._increment(numLength[0] - 1);
  -	  break;
  -	default:
  +        if (opcodeFlag != true) {
  +	  switch(clss){
  +	  case 'w':
  +	    opcodeFlag = true;
  +	    clss = OpCode._WORD;
  + 	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 'W':
  +	    opcodeFlag = true;
  +	    clss = OpCode._NWORD;
  +	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 's':
  +	    opcodeFlag = true;
  +	    clss = OpCode._SPACE;
  +	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 'S':
  +	    opcodeFlag = true;
  +	    clss = OpCode._NSPACE;
  +	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 'd':
  +	    opcodeFlag = true;
  +	    clss = OpCode._DIGIT;
  +	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 'D':
  +	    opcodeFlag = true;
  +	    clss = OpCode._NDIGIT;
  +	    lastclss = Character.MAX_VALUE;
  +	    break;
  +	  case 'n':
  +	    clss = '\n';
  +	    break;
  +	  case 'r':
  +	    clss = '\r';
  +	    break;
  +	  case 't':
  +	    clss = '\t';
  +	    break;
  +	  case 'f':
  +	    clss = '\f';
  +	    break;
  +	  case 'b':
  +	    clss = '\b';
  +	    break;
  +	  case 'e':
  +	    clss = '\033';
  +	    break;
  +	  case 'a':
  +	    clss = '\007';
  +	    break;
  +	  case 'x':
  +	    clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
  +				    numLength);
  +	    __input._increment(numLength[0]);
  +	    break;
  +	  case 'c':
  +	    clss = __input._postIncrement();
  +	    if(Character.isLowerCase(clss))
  +	        clss = Character.toUpperCase(clss);
  +	    clss ^= 64;
  +	    break;
  +	  case '0': case '1': case '2': case '3': case '4':
  +	  case '5': case '6': case '7': case '8': case '9':
  +	    clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
  +				      3, numLength);
  +  	    __input._increment(numLength[0] - 1);
  +	    break;
  +	  default:
  +            break;
  +          }
   	}
         }
   
  @@ -1235,7 +1240,10 @@
   
       if(lastclss == clss) {
         if(opcodeFlag == true) {
  -	__emitCode(OpCode._OPCODE);
  +        if(negFlag[0] == false)
  +	  __emitCode(OpCode._OPCODE);
  +        else 
  +	  __emitCode(OpCode._NOPCODE);
         } else {
   	__emitCode(OpCode._ONECHAR);
         }
  @@ -1281,7 +1289,7 @@
      * 
      * @return OpCode. return 0 when fail parsing POSIX expression.
      */
  -  private char __parsePOSIX() throws MalformedPatternException {
  +  private char __parsePOSIX(boolean negFlag[]) throws MalformedPatternException {
       int offset = __input._getOffset();
       int len = __input._getLength();
       int pos = offset;
  @@ -1290,6 +1298,12 @@
       Object opcode;
   
       if( value != ':' ) return 0;
  +    if( __input._getValue(pos) == '^' ) {
  +      negFlag[0] = true;
  +      pos++;
  +    } else {
  +      negFlag[0] = false;
  +    }
   
       buf = new StringBuffer();
       
  @@ -1311,7 +1325,8 @@
         return 0;
   
       __input._setOffset(pos);
  -
  +//    System.out.println("posix="+buf.toString()+":"+((Character)opcode).charValue());
  +    
       return ((Character)opcode).charValue();
     }
   
  
  
  
  1.4       +14 -3     jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java
  
  Index: Perl5Debug.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Perl5Debug.java	2001/01/29 00:19:01	1.3
  +++ Perl5Debug.java	2001/03/29 16:33:17	1.4
  @@ -68,7 +68,7 @@
    * comparison with the program generated by Perl5 with the -r option.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Debug.java,v 1.3 2001/01/29 00:19:01 dfs Exp $
  + @version $Id: Perl5Debug.java,v 1.4 2001/03/29 16:33:17 dfs Exp $
   
    * @see Perl5Pattern
    */
  @@ -199,14 +199,25 @@
       case OpCode._NOTHING: str = "NOTHING"; break;
       case OpCode._BACK  : str = "BACK"; break;
       case OpCode._END   : str = "END"; break;
  -    case OpCode._ALNUM : str = "ALNUM"; break;
  -    case OpCode._NALNUM: str = "NALNUM"; break;
  +    case OpCode._WORD : str = "WORD"; break;
  +    case OpCode._NWORD: str = "NWORD"; break;
       case OpCode._BOUND : str = "BOUND"; break;
       case OpCode._NBOUND: str = "NBOUND"; break;
       case OpCode._SPACE : str = "SPACE"; break;
       case OpCode._NSPACE: str = "NSPACE"; break;
       case OpCode._DIGIT : str = "DIGIT"; break;
       case OpCode._NDIGIT: str = "NDIGIT"; break;
  +    case OpCode._ALPHA : str = "ALPHA"; break;
  +    case OpCode._BLANK : str = "BLANK"; break;
  +    case OpCode._CNTRL : str = "CNTRL"; break;
  +    case OpCode._GRAPH : str = "GRAPH"; break;
  +    case OpCode._LOWER : str = "LOWER"; break;
  +    case OpCode._PRINT : str = "PRINT"; break;
  +    case OpCode._PUNCT : str = "PUNCT"; break;
  +    case OpCode._UPPER : str = "UPPER"; break;
  +    case OpCode._XDIGIT: str = "XDIGIT"; break;
  +    case OpCode._ALNUM : str = "ALNUM"; break;
  +    case OpCode._ASCII : str = "ASCII"; break;
       case OpCode._CURLY :
         buffer.append("CURLY {");
         buffer.append((int)OpCode._getArg1(program, offset));
  
  
  
  1.10      +22 -14    jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
  
  Index: Perl5Matcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- Perl5Matcher.java	2001/01/29 00:22:05	1.9
  +++ Perl5Matcher.java	2001/03/29 16:33:18	1.10
  @@ -66,7 +66,7 @@
    * Perl5Compiler.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Matcher.java,v 1.9 2001/01/29 00:22:05 dfs Exp $
  + @version $Id: Perl5Matcher.java,v 1.10 2001/03/29 16:33:18 dfs Exp $
   
    * @see PatternMatcher
    * @see Perl5Compiler
  @@ -512,7 +512,7 @@
   	  }
   	  break;
   
  -	case OpCode._ALNUM:
  +	case OpCode._WORD:
   	  while(__currentOffset < endOffset) {
   	    ch = __input[__currentOffset];
   	    if(OpCode._isWordCharacter(ch)) {
  @@ -527,7 +527,7 @@
   	  }
   	  break;
   
  -	case OpCode._NALNUM:
  +	case OpCode._NWORD:
   	  while(__currentOffset < endOffset) {
   	    ch = __input[__currentOffset];
   	    if(!OpCode._isWordCharacter(ch)) {
  @@ -637,14 +637,24 @@
   	} else {
   	  offset+=2;
   	}
  +
  +      } else if(__program[offset] == OpCode._ONECHAR) {
  +       	offset++;
  +	if(__program[offset++] == code) return isANYOF;
   
  -      } else if( __program[offset] == OpCode._OPCODE ){
  +      } else {
  +	isANYOF = (__program[offset] == OpCode._OPCODE) 
  +	  ? isANYOF : !isANYOF;
  +
   	offset++;
   	switch ( __program[offset++] ) {
   	case OpCode._ALNUM:
  +	  if(Character.isLetterOrDigit(code)) return isANYOF;
  +	  break;
  +	case OpCode._WORD:
   	  if(OpCode._isWordCharacter(code)) return isANYOF;
   	  break;
  -	case OpCode._NALNUM:
  +	case OpCode._NWORD:
   	  if(!OpCode._isWordCharacter(code)) return isANYOF;
   	  break;
   	case OpCode._SPACE:
  @@ -697,12 +707,10 @@
   	      (code >= 'a' && code <= 'f') ||
   	      (code >= 'A' && code <= 'F')) return isANYOF;
   	  break;
  -	}
  -      } else if((__program[offset++] == OpCode._ONECHAR) &&
  -		(__program[offset++] == code))
  -	{
  -	  return isANYOF;
  +	case OpCode._ASCII:
  +	  if(code < 0x80)return isANYOF;
   	}
  +      } 
       }
       return !isANYOF;
     }
  @@ -785,12 +793,12 @@
         }
         break;
   
  -    case OpCode._ALNUM:
  +    case OpCode._WORD:
         while(scan < eol && OpCode._isWordCharacter(__input[scan]))
   	++scan;
         break;
   
  -    case OpCode._NALNUM:
  +    case OpCode._NWORD:
         while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
   	++scan;
         break;
  @@ -953,7 +961,7 @@
   	nextChar = (inputRemains ? __input[input] : __EOS);
   	break;
   
  -      case OpCode._ALNUM:
  +      case OpCode._WORD:
   	if(!inputRemains)
   	  return false;
   	if(!OpCode._isWordCharacter(nextChar))
  @@ -962,7 +970,7 @@
   	nextChar = (inputRemains ? __input[input] : __EOS);
   	break;
   
  -      case OpCode._NALNUM:
  +      case OpCode._NWORD:
   	if(!inputRemains && input >= __eol)
   	  return false;
   	if(OpCode._isWordCharacter(nextChar))
  
  
  

Mime
View raw message