tomcat-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cos...@apache.org
Subject cvs commit: jakarta-tomcat/src/share/org/apache/tomcat/util/buf UDecoder.java UEncoder.java
Date Sat, 26 May 2001 17:07:31 GMT
costin      01/05/26 10:07:31

  Added:       src/share/org/apache/tomcat/util/buf UDecoder.java
                        UEncoder.java
  Log:
  Added ( refactored ) UTF encoder and decoder.
  
  The code used to be part of Byte/Char Chunk, but had many bugs and it was hard
  to optimize.
  
  Note that we don't implement the Microsoft ( M$ ) encoding scheme, which is
  not standard and may cause many problems, but it could be implemented.
  
  There is still work to be done for decoding char[]: each %xx escape decodes
  to a byte, which then has to be converted ( somehow ) to a char, and that
  can't be done without a byte-to-char ( b->c ) converter.
  
  ( this will happen for RequestDispatchers for example - a workaround is to
  not encode "extended" chars )
  
  Revision  Changes    Path
  1.1                  jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UDecoder.java
  
  Index: UDecoder.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  package org.apache.tomcat.util.buf;
  
  import org.apache.tomcat.util.buf.*;
  
  import java.util.BitSet;
  import java.io.*;
  
  /**
   *  All URL decoding happens here. This way we can reuse, review, optimize
   *  without adding complexity to the buffers.
   *
   *  Decoding maps '+' to a space and each "%xx" escape ( two hex digits )
   *  to the single byte/char with that value. No charset conversion is done.
   *
   *  The chunk-based conversions modify the original buffer in place; the
   *  String conversion returns a new String.
   *
   *  @author Costin Manolache
   */
  public final class UDecoder {

      public UDecoder()
      {
      }

      /** URLDecode, will modify the source.
       *
       *  The decoded data is never longer than the encoded data, so the
       *  result is written over the input and the end of the chunk is moved
       *  back with setEnd().
       *
       *  @param mb chunk to decode in place
       *  @exception CharConversionException if a '%' is not followed by two
       *             hex digits inside the chunk
       */
      public void convert(ByteChunk mb)
          throws IOException
      {
          int start=mb.getOffset();
          byte buff[]=mb.getBytes();
          int end=mb.getEnd();

          int idx= mb.indexOf( buff, start, end, '%' );
          int idx2= mb.indexOf( buff, start, end, '+' );
          if( idx<0 && idx2<0 ) {
              // nothing to decode
              return;
          }

          // Start at the first metacharacter, whichever kind it is.
          // idx<0 means there is no '%' at all: we must then start at the
          // first '+', otherwise the loop below would begin at index -1.
          if( idx2 >= 0 && ( idx < 0 || idx2 < idx ) ) idx=idx2;

          // j is the read position, idx the write position ( idx <= j )
          for( int j=idx; j<end; j++, idx++ ) {
              if( buff[ j ] == '+' ) {
                  buff[idx]= (byte)' ' ;
              } else if( buff[ j ] != '%' ) {
                  buff[idx]= buff[j];
              } else {
                  // read next 2 digits
                  if( j+2 >= end ) {
                      throw new CharConversionException("EOF");
                  }
                  byte b1= buff[j+1];
                  byte b2=buff[j+2];
                  if( !isHexDigit( b1 ) || ! isHexDigit(b2 ))
                      throw new CharConversionException( "isHexDigit");

                  j+=2;
                  int res=x2c( b1, b2 );
                  buff[idx]=(byte)res;
              }
          }

          mb.setEnd( idx );

          return;
      }

      // -------------------- Additional methods --------------------
      // XXX What do we do about charset ????

      /** In-buffer processing - the buffer will be modified.
       *
       *  Each "%xx" escape becomes one char with the value xx; no
       *  byte-to-char conversion is applied.
       *
       *  @param mb chunk to decode in place
       *  @exception CharConversionException if a '%' is not followed by two
       *             hex digits inside the chunk
       */
      public void convert( CharChunk mb )
          throws IOException
      {
          if( debug > 0 ) log( "Converting a char chunk ");
          int start=mb.getOffset();
          char buff[]=mb.getBuffer();
          int cend=mb.getEnd();

          int idx= mb.indexOf( buff, start, cend, '%' );
          int idx2= mb.indexOf( buff, start, cend, '+' );
          if( idx<0 && idx2<0 ) {
              // nothing to decode
              return;
          }

          // Start at the first metacharacter; when there is no '%' ( idx<0 )
          // the first '+' must be used, or the loop would begin at index -1.
          if( idx2 >= 0 && ( idx < 0 || idx2 < idx ) ) idx=idx2;

          // j is the read position, idx the write position ( idx <= j )
          for( int j=idx; j<cend; j++, idx++ ) {
              if( buff[ j ] == '+' ) {
                  buff[idx]=( ' ' );
              } else if( buff[ j ] != '%' ) {
                  buff[idx]=buff[j];
              } else {
                  // read next 2 digits
                  if( j+2 >= cend ) {
                      // invalid
                      throw new CharConversionException("EOF");
                  }
                  char b1= buff[j+1];
                  char b2=buff[j+2];
                  if( !isHexDigit( b1 ) || ! isHexDigit(b2 ))
                      throw new CharConversionException("isHexDigit");

                  j+=2;
                  int res=x2c( b1, b2 );
                  buff[idx]=(char)res;
              }
          }
          mb.setEnd( idx );
      }

      /** URLDecode, will modify the source.
       *
       *  Dispatches on the current type of the MessageBytes ( string, chars
       *  or bytes ). Any other type is left untouched.
       *
       *  @param mb value to decode
       */
      public void convert(MessageBytes mb)
          throws IOException
      {

          switch (mb.getType()) {
          case MessageBytes.T_STR:
              String strValue=mb.toString();
              if( strValue==null ) return;
              mb.setString( convert( strValue ));
              break;
          case MessageBytes.T_CHARS:
              CharChunk charC=mb.getCharChunk();
              convert( charC );
              break;
          case MessageBytes.T_BYTES:
              ByteChunk bytesC=mb.getByteChunk();
              convert( bytesC );
              break;
          }
      }

      // XXX Old code, needs to be replaced !!!!
      //
      /** URLDecode a String.
       *
       *  @param str encoded string, may be null
       *  @return null if str is null, str itself if it contains neither '+'
       *          nor '%', otherwise a new decoded String
       *  @exception StringIndexOutOfBoundsException if a '%' has fewer than
       *             two characters after it
       *  @exception NumberFormatException if a '%' is not followed by two
       *             hex digits
       */
      public final String convert(String str)
      {
          if (str == null)  return  null;

          if( str.indexOf( '+' ) <0 && str.indexOf( '%' ) < 0 )
              return str;

          int strPos = 0;
          int strLen = str.length();
          // decoded string output - never longer than the input
          StringBuffer dec = new StringBuffer(strLen);

          while (strPos < strLen) {
              int laPos;        // lookahead position

              // look ahead to next URLencoded metacharacter, if any
              for (laPos = strPos; laPos < strLen; laPos++) {
                  char laChar = str.charAt(laPos);
                  if ((laChar == '+') || (laChar == '%')) {
                      break;
                  }
              }

              // if there were non-metacharacters, copy them all as a block
              if (laPos > strPos) {
                  dec.append(str.substring(strPos,laPos));
                  strPos = laPos;
              }

              // shortcut out of here if we're at the end of the string
              if (strPos >= strLen) {
                  break;
              }

              // process next metacharacter - it can only be '+' or '%'
              if (str.charAt(strPos) == '+') {
                  dec.append(' ');
                  strPos++;
                  continue;
              }

              // We throw the original exception - the super will deal with
              // it. substring() fails here if the escape is truncated.
              String hex = str.substring(strPos + 1, strPos + 3);
              // parseInt alone would accept a sign ( "%-1" ), so check both
              // digits explicitly, like the chunk versions do.
              if( !isHexDigit( hex.charAt(0) ) || !isHexDigit( hex.charAt(1) ))
                  throw new NumberFormatException( hex );
              dec.append((char)Integer.parseInt(hex,16));
              strPos += 3;
          }

          return dec.toString();
      }



      /** True if c is one of 0-9, a-f, A-F.
       */
      private static boolean isHexDigit( int c ) {
          return ( ( c>='0' && c<='9' ) ||
                   ( c>='a' && c<='f' ) ||
                   ( c>='A' && c<='F' ));
      }

      /** Value ( 0..255 ) of the two hex digits b1 b2.
       *  Both must already be known to be hex digits.
       */
      private static int x2c( byte b1, byte b2 ) {
          // & 0xDF clears bit 0x20, folding 'a'-'f' onto 'A'-'F'
          int digit= (b1>='A') ? ( (b1 & 0xDF)-'A') + 10 :
              (b1 -'0');
          digit*=16;
          digit +=(b2>='A') ? ( (b2 & 0xDF)-'A') + 10 :
              (b2 -'0');
          return digit;
      }

      /** Value ( 0..255 ) of the two hex digits b1 b2.
       *  Both must already be known to be hex digits.
       */
      private static int x2c( char b1, char b2 ) {
          // & 0xDF clears bit 0x20, folding 'a'-'f' onto 'A'-'F'
          int digit= (b1>='A') ? ( (b1 & 0xDF)-'A') + 10 :
              (b1 -'0');
          digit*=16;
          digit +=(b2>='A') ? ( (b2 & 0xDF)-'A') + 10 :
              (b2 -'0');
          return digit;
      }

      // set to a value > 0 to get trace output on System.out
      private final static int debug=0;
      private static void log( String s ) {
          System.out.println("URLDecoder: " + s );
      }

  }
  
  
  
  1.1                  jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UEncoder.java
  
  Index: UEncoder.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  
  package org.apache.tomcat.util.buf;
  
  import org.apache.tomcat.util.buf.*;
  import java.util.BitSet;
  import java.io.*;
  
  /** Efficient implementation for encoders.
   *  This class is not thread safe - you need one encoder per thread.
   *  The encoder will save and recycle the internal objects, avoiding
   *  garbage.
   *
   *  You can add extra characters that you want preserved, for example
   *  while encoding a URL you can add "/".
   *
   *  @author Costin Manolache
   */
  public final class UEncoder {

      // Not static - the set may differ ( it's better than adding
      // an extra check for "/", "+", etc
      private BitSet safeChars=null;
      // Created on first use by urlEncode(), then reused
      private C2BConverter c2b=null;
      private ByteChunk bb=null;

      private String encoding="UTF8";
      // set to a value > 0 to get trace output on System.out
      private static final int debug=0;

      public UEncoder() {
          initSafeChars();
      }

      /** Set the character encoding used to turn unsafe chars into bytes
       *  before they are %-escaped. The default is "UTF8".
       *
       *  @param s encoding name, passed to C2BConverter
       */
      public void setEncoding( String s ) {
          encoding=s;
          // The converter is built lazily with the encoding in effect at
          // that time; drop it so the next urlEncode() uses the new one.
          c2b=null;
          bb=null;
      }

      /** Add a character that will be written as-is instead of being
       *  %-escaped.
       */
      public void addSafeCharacter( char c ) {
          safeChars.set( c );
      }


      /** URL Encode string, using the encoding set on this encoder
       *  ( see setEncoding(), UTF8 if not set ).
       *
       *  Safe characters are copied unchanged. Any other char is converted
       *  to bytes and each byte is written as %xx. Extra characters to
       *  preserve ( "/" - if s is a URL ) are added with addSafeCharacter().
       *
       *  @param buf where the encoded result is written
       *  @param s string to be encoded
       */
      public void urlEncode( Writer buf, String s )
          throws IOException
      {
          if( c2b==null ) {
              bb=new ByteChunk(16); // small enough.
              c2b=new C2BConverter( bb, encoding );
          }

          for (int i = 0; i < s.length(); i++) {
              int c = (int) s.charAt(i);
              if( safeChars.get( c ) ) {
                  if( debug > 0 ) log("Safe: " + (char)c);
                  buf.write((char)c);
              } else {
                  if( debug > 0 ) log("Unsafe:  " + (char)c);
                  c2b.convert( (char)c );

                  // "surrogate" - UTF is _not_ 16 bit, but 21 !!!!
                  // ( while UCS is 31 ). Amazing...
                  // A high surrogate followed by a low surrogate is handed
                  // to the converter as a pair, before flushing.
                  if (c >= 0xD800 && c <= 0xDBFF) {
                      if ( (i+1) < s.length()) {
                          int d = (int) s.charAt(i+1);
                          if (d >= 0xDC00 && d <= 0xDFFF) {
                              if( debug > 0 ) log("Unsafe:  " + c);
                              c2b.convert( (char)d);
                              i++;
                          }
                      }
                  }

                  c2b.flushBuffer();

                  urlEncode( buf, bb.getBuffer(), bb.getOffset(),
                             bb.getLength() );
                  bb.recycle();
              }
          }
      }

      /** Write each of the given bytes as '%' followed by two lowercase
       *  hex digits.
       *
       *  @param buf where the encoded result is written
       *  @param bytes source array
       *  @param off index of the first byte to encode
       *  @param len number of bytes to encode
       */
      public void urlEncode( Writer buf, byte bytes[], int off, int len)
          throws IOException
      {
          // len is a count, not an end index
          int end=off + len;
          for( int j=off; j< end; j++ ) {
              buf.write( '%' );
              char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16);
              if( debug > 0 ) log("Encode:  " + ch);
              buf.write(ch);
              ch = Character.forDigit(bytes[j] & 0xF, 16);
              if( debug > 0 ) log("Encode:  " + ch);
              buf.write(ch);
          }
      }



      // -------------------- Internal implementation --------------------

      // Empty placeholder, not called from this class
      private void init() {

      }

      /** Build the default set of characters that are not escaped:
       *  ASCII letters and digits plus  $ - _ . ! * ' ( ) ,
       */
      private void initSafeChars() {
          safeChars=new BitSet(128);
          int i;
          for (i = 'a'; i <= 'z'; i++) {
              safeChars.set(i);
          }
          for (i = 'A'; i <= 'Z'; i++) {
              safeChars.set(i);
          }
          for (i = '0'; i <= '9'; i++) {
              safeChars.set(i);
          }
          //safe
          safeChars.set('$');
          safeChars.set('-');
          safeChars.set('_');
          safeChars.set('.');

          // Dangerous: someone may treat this as " "
          // RFC1738 does allow it, it's not reserved
          //    safeChars.set('+');
          //extra
          safeChars.set('!');
          safeChars.set('*');
          safeChars.set('\'');
          safeChars.set('(');
          safeChars.set(')');
          safeChars.set(',');
      }

      private static void log( String s ) {
          System.out.println("Encoder: " + s );
      }
  }
  
  
  

Mime
View raw message