costin 01/05/26 10:07:31
Added: src/share/org/apache/tomcat/util/buf UDecoder.java
UEncoder.java
Log:
Added ( refactored ) UTF encoder and decoder.
The code used to be part of Byte/Char Chunk, but had many bugs and it was hard
to optimize.
Note that we don't implement M$ encoding scheme ( which is not standard and
may cause many problems ), but it could be implemented.
There is still work to be done for decoding char[] - the result of the
conversion is byte, and it has to be converted ( somehow ) to char, but
you can't do that without a b->c converter.
( this will happen for RequestDispatchers for example - a workaround is to
not encode "extended" chars )
Revision Changes Path
1.1 jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UDecoder.java
Index: UDecoder.java
===================================================================
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
* [Additional notices, if required by prior licensing conditions]
*
*/
package org.apache.tomcat.util.buf;
import org.apache.tomcat.util.buf.*;
import java.util.BitSet;
import java.io.*;
/**
 * All URL decoding happens here. This way we can reuse, review, optimize
 * without adding complexity to the buffers.
 *
 * Decoding maps '+' to a space and every "%xx" escape ( xx = two ASCII hex
 * digits ) to the single byte / char with that value. No charset handling
 * is done: each escape yields exactly one output unit.
 *
 * The chunk based conversions modify the original buffer.
 *
 * @author Costin Manolache
 */
public final class UDecoder {

    /** Debug level; the diagnostic output is only produced when > 0. */
    private static final int debug = 0;

    public UDecoder() {
    }

    /**
     * URLDecode, will modify the source.
     *
     * The decoded data is written back into the chunk's own array ( the
     * result is never longer than the input ) and the chunk end is moved
     * to the end of the decoded data.
     *
     * @param mb the chunk to decode in place
     * @throws IOException a CharConversionException if a '%' is not followed
     *         by two hex digits inside the chunk
     */
    public void convert(ByteChunk mb)
        throws IOException
    {
        int start = mb.getOffset();
        byte buff[] = mb.getBytes();
        int end = mb.getEnd();

        // idx / idx2 are used directly as indexes into buff below
        int idx = mb.indexOf( buff, start, end, '%' );
        int idx2 = mb.indexOf( buff, start, end, '+' );
        if( idx < 0 && idx2 < 0 ) {
            // nothing to decode
            return;
        }

        // Start at the first metacharacter. If there is no '%' at all idx is
        // negative and the position of the '+' must be used - otherwise the
        // loop below would start at index -1.
        if( idx < 0 || ( idx2 >= 0 && idx2 < idx ) ) {
            idx = idx2;
        }

        // j is the read position, idx the ( never larger ) write position
        for( int j = idx; j < end; j++, idx++ ) {
            byte b = buff[j];
            if( b == '+' ) {
                buff[idx] = (byte)' ';
            } else if( b != '%' ) {
                buff[idx] = b;
            } else {
                // read next 2 digits
                if( j + 2 >= end ) {
                    throw new CharConversionException("EOF");
                }
                byte b1 = buff[j + 1];
                byte b2 = buff[j + 2];
                if( !isHexDigit( b1 ) || !isHexDigit( b2 ) ) {
                    throw new CharConversionException( "isHexDigit");
                }
                j += 2;
                buff[idx] = (byte)x2c( b1, b2 );
            }
        }

        mb.setEnd( idx );
    }

    // -------------------- Additional methods --------------------
    // XXX What do we do about charset ????

    /**
     * In-buffer processing - the buffer will be modified.
     *
     * Same algorithm as the byte version; every "%xx" escape becomes one
     * char with the value xx ( no byte to char conversion is applied ).
     *
     * @param mb the chunk to decode in place
     * @throws IOException a CharConversionException if a '%' is not followed
     *         by two hex digits inside the chunk
     */
    public void convert( CharChunk mb )
        throws IOException
    {
        if( debug > 0 ) log( "Converting a char chunk ");
        int start = mb.getOffset();
        char buff[] = mb.getBuffer();
        int cend = mb.getEnd();

        int idx = mb.indexOf( buff, start, cend, '%' );
        int idx2 = mb.indexOf( buff, start, cend, '+' );
        if( idx < 0 && idx2 < 0 ) {
            return;
        }

        // Start at the first metacharacter; fall back to the '+' position
        // when no '%' is present ( idx negative ).
        if( idx < 0 || ( idx2 >= 0 && idx2 < idx ) ) {
            idx = idx2;
        }

        for( int j = idx; j < cend; j++, idx++ ) {
            char c = buff[j];
            if( c == '+' ) {
                buff[idx] = ' ';
            } else if( c != '%' ) {
                buff[idx] = c;
            } else {
                // read next 2 digits
                if( j + 2 >= cend ) {
                    // invalid
                    throw new CharConversionException("EOF");
                }
                char b1 = buff[j + 1];
                char b2 = buff[j + 2];
                if( !isHexDigit( b1 ) || !isHexDigit( b2 ) ) {
                    throw new CharConversionException("isHexDigit");
                }
                j += 2;
                buff[idx] = (char)x2c( b1, b2 );
            }
        }

        mb.setEnd( idx );
    }

    /**
     * URLDecode, will modify the source.
     *
     * Dispatches on the type reported by the MessageBytes to the String,
     * CharChunk or ByteChunk conversion. Any other type is left untouched.
     *
     * @param mb the value to decode
     * @throws IOException if the char or byte chunk contains a bad escape
     */
    public void convert(MessageBytes mb)
        throws IOException
    {
        switch (mb.getType()) {
        case MessageBytes.T_STR:
            String strValue = mb.toString();
            if( strValue == null ) return;
            mb.setString( convert( strValue ));
            break;
        case MessageBytes.T_CHARS:
            CharChunk charC = mb.getCharChunk();
            convert( charC );
            break;
        case MessageBytes.T_BYTES:
            ByteChunk bytesC = mb.getByteChunk();
            convert( bytesC );
            break;
        }
    }

    // XXX Old code, needs to be replaced !!!!
    //
    /**
     * URL-decodes a String and returns the decoded copy.
     *
     * Unlike the chunk versions this one reports bad input with unchecked
     * exceptions, which are passed on to the caller.
     *
     * @param str the encoded string, may be null
     * @return null for null input, str itself if it contains no '+' or '%',
     *         otherwise a new decoded string
     * @throws NumberFormatException if a '%' is followed by two characters
     *         that are not both ASCII hex digits
     * @throws StringIndexOutOfBoundsException if a '%' is followed by fewer
     *         than two characters
     */
    public final String convert(String str)
    {
        if (str == null) return null;

        if( str.indexOf( '+' ) < 0 && str.indexOf( '%' ) < 0 )
            return str;

        int strLen = str.length();
        StringBuffer dec = new StringBuffer(strLen); // decoded string output
        int strPos = 0;

        while (strPos < strLen) {
            char c = str.charAt(strPos);
            if (c == '+') {
                dec.append(' ');
                strPos++;
            } else if (c == '%') {
                // substring() throws for a truncated escape - the caller
                // will deal with it
                String hex = str.substring(strPos + 1, strPos + 3);
                char h1 = hex.charAt(0);
                char h2 = hex.charAt(1);
                // Integer.parseInt would also accept a sign or non-ASCII
                // digits ( "%-1" ), so the digits are checked explicitly.
                if( !isHexDigit( h1 ) || !isHexDigit( h2 ) ) {
                    throw new NumberFormatException(
                        "For input string: \"" + hex + "\"");
                }
                dec.append((char)x2c( h1, h2 ));
                strPos += 3;
            } else {
                dec.append(c);
                strPos++;
            }
        }

        return dec.toString();
    }

    /** True if c is one of 0-9, a-f, A-F. */
    private static boolean isHexDigit( int c ) {
        return ( ( c>='0' && c<='9' ) ||
                 ( c>='a' && c<='f' ) ||
                 ( c>='A' && c<='F' ));
    }

    /**
     * Value ( 0..15 ) of one hex digit. The argument must already have been
     * checked with isHexDigit; clearing bit 0x20 folds a-f onto A-F.
     */
    private static int hexValue( int c ) {
        return ( c >= 'A' ) ? ( ( c & 0xDF ) - 'A' ) + 10 : ( c - '0' );
    }

    /** Combines two validated hex digits ( high, low ) into 0..255. */
    private static int x2c( byte b1, byte b2 ) {
        return hexValue( b1 ) * 16 + hexValue( b2 );
    }

    /** Combines two validated hex digits ( high, low ) into 0..255. */
    private static int x2c( char b1, char b2 ) {
        return hexValue( b1 ) * 16 + hexValue( b2 );
    }

    private static void log( String s ) {
        System.out.println("URLDecoder: " + s );
    }
}
1.1 jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UEncoder.java
Index: UEncoder.java
===================================================================
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
* [Additional notices, if required by prior licensing conditions]
*
*/
package org.apache.tomcat.util.buf;
import org.apache.tomcat.util.buf.*;
import java.util.BitSet;
import java.io.*;
/** Efficient implementation for encoders.
 *  This class is not thread safe - you need one encoder per thread.
 *  The encoder will save and recycle the internal objects, avoiding
 *  garbage.
 *
 *  You can add extra characters that you want preserved, for example
 *  while encoding a URL you can add "/".
 *
 *  @author Costin Manolache
 */
public final class UEncoder {

    // Not static - the set may differ ( it's better than adding
    // an extra check for "/", "+", etc
    private BitSet safeChars=null;
    // Converter and its output chunk, created on first use and then reused
    private C2BConverter c2b=null;
    private ByteChunk bb=null;

    private String encoding="UTF8";
    private static final int debug=0;

    /** Creates an encoder with the default set of safe characters. */
    public UEncoder() {
        initSafeChars();
    }

    /**
     * Sets the character encoding used to turn unsafe chars into bytes
     * before they are %-escaped.
     *
     * @param s the encoding name handed to the char to byte converter
     */
    public void setEncoding( String s ) {
        // The converter is created lazily with the encoding current at that
        // time; drop it on a change so the next urlEncode() call builds a new
        // one instead of silently keeping the old encoding.
        if( s == null || !s.equals( encoding ) ) {
            c2b=null;
        }
        encoding=s;
    }

    /**
     * Marks an additional character as safe, i.e. it will be written
     * unescaped by this encoder instance.
     *
     * @param c the character to preserve
     */
    public void addSafeCharacter( char c ) {
        safeChars.set( c );
    }

    /** URL Encode string, using the encoding set on this encoder
     *  ( "UTF8" if setEncoding was never called ).
     *
     *  Safe characters are copied as they are; every other character is
     *  converted to bytes and each byte is written as a %xx escape.
     *
     * @param buf where the encoded output is written
     * @param s string to be encoded
     * @throws IOException if the converter or the writer fails
     */
    public void urlEncode( Writer buf, String s )
        throws IOException
    {
        if( c2b==null ) {
            bb=new ByteChunk(16); // small enough.
            c2b=new C2BConverter( bb, encoding );
        }

        int length = s.length();
        for (int i = 0; i < length; i++) {
            int c = (int) s.charAt(i);
            if( safeChars.get( c ) ) {
                if( debug > 0 ) log("Safe: " + (char)c);
                buf.write((char)c);
                continue;
            }

            if( debug > 0 ) log("Unsafe: " + (char)c);
            c2b.convert( (char)c );

            // "surrogate" - UTF is _not_ 16 bit, but 21 !!!!
            // ( while UCS is 31 ). Amazing...
            // A high surrogate followed by a low surrogate is handed to the
            // converter as a pair before flushing.
            if (c >= 0xD800 && c <= 0xDBFF && (i+1) < length) {
                int d = (int) s.charAt(i+1);
                if (d >= 0xDC00 && d <= 0xDFFF) {
                    if( debug > 0 ) log("Unsafe: " + c);
                    c2b.convert( (char)d);
                    i++;
                }
            }

            c2b.flushBuffer();

            urlEncode( buf, bb.getBuffer(), bb.getOffset(),
                       bb.getLength() );
            bb.recycle();
        }
    }

    /**
     * Writes each of the given bytes as a '%' followed by two lower case
     * hex digits.
     *
     * @param buf where the escapes are written
     * @param bytes source array
     * @param off index of the first byte to encode
     * @param len number of bytes to encode
     * @throws IOException if the writer fails
     */
    public void urlEncode( Writer buf, byte bytes[], int off, int len)
        throws IOException
    {
        // len is a count, not an end index
        int end = off + len;
        for( int j=off; j< end; j++ ) {
            buf.write( '%' );
            // high nibble first; the mask also drops the sign extension
            char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16);
            if( debug > 0 ) log("Encode: " + ch);
            buf.write(ch);
            ch = Character.forDigit(bytes[j] & 0xF, 16);
            if( debug > 0 ) log("Encode: " + ch);
            buf.write(ch);
        }
    }

    // -------------------- Internal implementation --------------------

    //
    private void init() {

    }

    /** Builds the default safe set: ASCII letters, digits and $-_.!*'(), */
    private void initSafeChars() {
        safeChars=new BitSet(128);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            safeChars.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            safeChars.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            safeChars.set(i);
        }
        //safe
        safeChars.set('$');
        safeChars.set('-');
        safeChars.set('_');
        safeChars.set('.');

        // Dangerous: someone may treat this as " "
        // RFC1738 does allow it, it's not reserved
        //    safeChars.set('+');
        //extra
        safeChars.set('!');
        safeChars.set('*');
        safeChars.set('\'');
        safeChars.set('(');
        safeChars.set(')');
        safeChars.set(',');
    }

    private static void log( String s ) {
        System.out.println("Encoder: " + s );
    }
}
|