james-mime4j-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ol...@apache.org
Subject svn commit: r891854 - in /james/mime4j/trunk/core/src: main/java/org/apache/james/mime4j/codec/ test/java/org/apache/james/mime4j/codec/
Date Thu, 17 Dec 2009 19:24:00 GMT
Author: olegk
Date: Thu Dec 17 19:24:00 2009
New Revision: 891854

URL: http://svn.apache.org/viewvc?rev=891854&view=rev
Log:
* MIME4J-103: QuotedPrintableInputStream refactoring (first round); 2.5 times better performance
* Fix for MIME4J-143: QuotedPrintableInputStream can now deal with soft line breaks terminated
by a lone LF

Modified:
    james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/QuotedPrintableInputStream.java
    james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableInputStreamTest.java
    james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableTextEncodeTest.java

Modified: james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/QuotedPrintableInputStream.java
URL: http://svn.apache.org/viewvc/james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/QuotedPrintableInputStream.java?rev=891854&r1=891853&r2=891854&view=diff
==============================================================================
--- james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/QuotedPrintableInputStream.java
(original)
+++ james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/QuotedPrintableInputStream.java
Thu Dec 17 19:24:00 2009
@@ -29,16 +29,41 @@
  * Performs Quoted-Printable decoding on an underlying stream.
  */
 public class QuotedPrintableInputStream extends InputStream {
+    
+    private static final int ENCODED_BUFFER_SIZE = 1024 * 2;
+    private static char CR = '\r';
+    private static char LF = '\n';
+    private static char EQ = '=';
+    
     private static Log log = LogFactory.getLog(QuotedPrintableInputStream.class);
     
-    private InputStream stream;
-    ByteQueue byteq = new ByteQueue();
-    ByteQueue pushbackq = new ByteQueue();
-    private byte state = 0;
-    private boolean closed = false;
+    private final InputStream in;
+    private boolean strict;
+    private final ByteQueue data; 
+    private final ByteQueue blanks; 
+    
+    private final byte[] encoded;
+    private int pos = 0; // current index into encoded buffer
+    private int limit = 0; // current size of encoded buffer
+    
+    private boolean closed;
 
-    public QuotedPrintableInputStream(InputStream stream) {
-        this.stream = stream;
+    protected QuotedPrintableInputStream(final int bufsize, final InputStream in, boolean strict) {
+        super();
+        this.in = in;
+        this.strict = strict;
+        this.encoded = new byte[bufsize];
+        this.data = new ByteQueue();
+        this.blanks = new ByteQueue();
+        this.closed = false;
+    }
+    
+    public QuotedPrintableInputStream(final InputStream in, boolean strict) {
+        this(ENCODED_BUFFER_SIZE, in, strict);
+    }
+    
+    public QuotedPrintableInputStream(final InputStream in) {
+        this(ENCODED_BUFFER_SIZE, in, false);
     }
     
     /**
@@ -49,181 +74,187 @@
      */
     @Override
     public void close() throws IOException {
-        this.closed = true;
+        closed = true;
     }
 
-    @Override
-    public int read() throws IOException {
-        if (closed) {
-            throw new IOException("QuotedPrintableInputStream has been closed");
+    private int bufferLength() {
+        return limit - pos;
+    }
+    
+    private int fillBuffer() throws IOException {
+        // Compact buffer if needed
+        if (pos < limit) {
+            System.arraycopy(encoded, pos, encoded, 0, limit - pos);
+            limit -= pos;
+            pos = 0;
+        } else {
+            limit = 0;
+            pos = 0;
+        }
+        
+        int capacity = encoded.length - limit;
+        if (capacity > 0) {
+            int bytesRead = in.read(encoded, limit, capacity);
+            if (bytesRead > 0) {
+                limit += bytesRead;
+            }
+            return bytesRead;
+        } else {
+            return 0;
+        }
+    }
+    
+    private byte advance() {
+        if (pos < limit) {
+            byte b =  encoded[pos];
+            pos++;
+            return b;
+        } else {
+            return -1;
         }
-        fillBuffer();
-        if (byteq.count() == 0)
+    }
+    
+    private byte peek(int i) {
+        if (pos + i < limit) {
+            return encoded[pos + i];
+        } else {
             return -1;
-        else {
-            byte val = byteq.dequeue();
-            if (val >= 0)
-                return val;
-            else
-                return val & 0xFF;
         }
     }
-
-    /**
-     * Pulls bytes out of the underlying stream and places them in the
-     * pushback queue.  This is necessary (vs. reading from the
-     * underlying stream directly) to detect and filter out "transport
-     * padding" whitespace, i.e., all whitespace that appears immediately
-     * before a CRLF.
-     *
-     * @throws IOException Underlying stream threw IOException.
-     */
-    private void populatePushbackQueue() throws IOException {
-        //Debug.verify(pushbackq.count() == 0, "PopulatePushbackQueue called when pushback queue was not empty!");
-
-        if (pushbackq.count() != 0)
-            return;
-
-        while (true) {
-            int i = stream.read();
-            switch (i) {
-                case -1:
-                    // stream is done
-                    pushbackq.clear();  // discard any whitespace preceding EOF
-                    return;
-                case ' ':
-                case '\t':
-                    pushbackq.enqueue((byte)i);
-                    break;
-                case '\r':
-                case '\n':
-                    pushbackq.clear();  // discard any whitespace preceding EOL
-                    pushbackq.enqueue((byte)i);
-                    return;
-                default:
-                    pushbackq.enqueue((byte)i);
-                    return;
+    
+    private void enqueueData() {
+        for (int i = pos; i < limit; i++) {
+            byte b = encoded[i];
+            if (b == LF || b == EQ) {
+                break;
+            }
+            if (Character.isWhitespace(b)) {
+                blanks.enqueue(b);
+            } else {
+                enqueueBlanks();                
+                data.enqueue(b);
+            }
+            pos++;
+        }
+    }    
+    
+    private void enqueueBlanks() {
+        while (blanks.count() > 0) {
+            data.enqueue(blanks.dequeue());
+        }
+    }
+    
+    private void decode() throws IOException {
+        boolean endOfStream = false;
+        while (data.count() == 0) {
+
+            if (bufferLength() < 3) {
+                int bytesRead = fillBuffer();
+                endOfStream = bytesRead == -1;
+            }
+            // end of stream?
+            if (bufferLength() == 0 && endOfStream) {
+                break;
+            }
+            
+            // copy plain bytes until a delimiter is encountered
+            enqueueData();            
+            
+            int len = bufferLength();
+            if (len > 0) {
+                // found a delimiter of some kind
+                if (len >= 3 || endOfStream) {
+                    decodeSpecialSequence();
+                }
             }
         }
     }
 
-    /**
-     * Causes the pushback queue to get populated if it is empty, then
-     * consumes and decodes bytes out of it until one or more bytes are
-     * in the byte queue.  This decoding step performs the actual QP
-     * decoding.
-     *
-     * @throws IOException Underlying stream threw IOException.
-     */
-    private void fillBuffer() throws IOException {
-        byte msdChar = 0;  // first digit of escaped num
-        while (byteq.count() == 0) {
-            if (pushbackq.count() == 0) {
-                populatePushbackQueue();
-                if (pushbackq.count() == 0)
-                    return;
-            }
-
-            byte b = pushbackq.dequeue();
-
-            switch (state) {
-                case 0:  // start state, no bytes pending
-                    if (b != '=') {
-                        byteq.enqueue(b);
-                        break;  // state remains 0
-                    } else {
-                        state = 1;
-                        break;
-                    }
-                case 1:  // encountered "=" so far
-                    if (b == '\r') {
-                        state = 2;
-                        break;
-                    } else if ((b >= '0' && b <= '9') || (b >= 'A' && b <= 'F') || (b >= 'a' && b <= 'f')) {
-                        state = 3;
-                        msdChar = b;  // save until next digit encountered
-                        break;
-                    } else if (b == '=') {
-                        /*
-                         * Special case when == is encountered.
-                         * Emit one = and stay in this state.
-                         */
-                        if (log.isWarnEnabled()) {
-                            log.warn("Malformed MIME; got ==");
-                        }
-                        byteq.enqueue((byte)'=');
-                        break;
-                    } else {
-                        if (log.isWarnEnabled()) {
-                            log.warn("Malformed MIME; expected \\r or "
-                                    + "[0-9A-Z], got " + b);
-                        }
-                        state = 0;
-                        byteq.enqueue((byte)'=');
-                        byteq.enqueue(b);
-                        break;
-                    }
-                case 2:  // encountered "=\r" so far
-                    if (b == '\n') {
-                        state = 0;
-                        break;
-                    } else {
-                        if (log.isWarnEnabled()) {
-                            log.warn("Malformed MIME; expected " 
-                                    + (int)'\n' + ", got " + b);
-                        }
-                        state = 0;
-                        byteq.enqueue((byte)'=');
-                        byteq.enqueue((byte)'\r');
-                        byteq.enqueue(b);
-                        break;
-                    }
-                case 3:  // encountered =<digit> so far; expecting another <digit> to complete the octet
-                    if ((b >= '0' && b <= '9') || (b >= 'A' && b <= 'F') || (b >= 'a' && b <= 'f')) {
-                        byte msd = asciiCharToNumericValue(msdChar);
-                        byte low = asciiCharToNumericValue(b);
-                        state = 0;
-                        byteq.enqueue((byte)((msd << 4) | low));
-                        break;
+    private void decodeSpecialSequence() throws IOException {
+        byte b1 = advance();
+        if (b1 == LF) {
+            // at end of line
+            if (blanks.count() == 0) {
+                data.enqueue((byte) LF);
+            } else {
+                if (blanks.dequeue() != EQ) {
+                    // hard line break
+                    data.enqueue((byte) CR);
+                    data.enqueue((byte) LF);
+                }
+            }
+            blanks.clear();
+        } else if (b1 == EQ) {
+            // found special char '='
+            enqueueBlanks();
+            byte b2 = advance();
+            if (b2 == EQ) {
+                data.enqueue(b2);
+                // deal with '==\r\n' brokenness
+                byte bb1 = peek(0);
+                byte bb2 = peek(1);
+                if (bb1 == LF || (bb1 == CR && bb2 == LF)) {
+                    blanks.enqueue(b2);
+                }
+            } else if (Character.isWhitespace((char) b2)) {
+                // soft line break
+                if (b2 != LF) {
+                    blanks.enqueue(b1);
+                    blanks.enqueue(b2);
+                }
+            } else {
+                byte b3 = advance();
+                int upper = convert(b2);
+                int lower = convert(b3);
+                if (upper < 0 || lower < 0) {
+                    if (strict) {
+                        throw new IOException("Malformed encoded value encountered");
                     } else {
-                        if (log.isWarnEnabled()) {
-                            log.warn("Malformed MIME; expected "
-                                     + "[0-9A-Z], got " + b);
-                        }
-                        state = 0;
-                        byteq.enqueue((byte)'=');
-                        byteq.enqueue(msdChar);
-                        byteq.enqueue(b);
-                        break;
+                        log.warn("Malformed encoded value encountered");
+                        data.enqueue((byte) EQ);
+                        if (b2 != -1) data.enqueue((byte) b2);
+                        if (b3 != -1) data.enqueue((byte) b3);
                     }
-                default:  // should never happen
-                    log.error("Illegal state: " + state);
-                    state = 0;
-                    byteq.enqueue(b);
-                    break;
+                } else {
+                    data.enqueue((byte)((upper << 4) | lower));
+                }
             }
+        } else {
+            throw new IllegalStateException();
         }
     }
-
+    
     /**
      * Converts '0' => 0, 'A' => 10, etc.
      * @param c ASCII character value.
      * @return Numeric value of hexadecimal character.
      */
-    private byte asciiCharToNumericValue(byte c) {
+    private int convert(byte c) {
         if (c >= '0' && c <= '9') {
-            return (byte)(c - '0');
-        } else if (c >= 'A' && c <= 'Z') {
-            return (byte)(0xA + (c - 'A'));
-        } else if (c >= 'a' && c <= 'z') {
-            return (byte)(0xA + (c - 'a'));
-        } else {
-            /*
-             * This should never happen since all calls to this method
-             * are preceded by a check that c is in [0-9A-Za-z]
-             */
-            throw new IllegalArgumentException((char) c 
-                    + " is not a hexadecimal digit");
+            return (c - '0');
+        } else if (c >= 'A' && c <= 'F') {
+            return (0xA + (c - 'A'));
+        } else if (c >= 'a' && c <= 'f') {
+            return (0xA + (c - 'a'));
+        } else {
+            return -1;
+        }
+    }
+
+    @Override
+    public int read() throws IOException {
+        if (closed) {
+            throw new IOException("Stream has been closed");
+        }
+        decode();
+        if (data.count() == 0)
+            return -1;
+        else {
+            byte val = data.dequeue();
+            if (val >= 0)
+                return val;
+            else
+                return val & 0xFF;
         }
     }
 

Modified: james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableInputStreamTest.java
URL: http://svn.apache.org/viewvc/james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableInputStreamTest.java?rev=891854&r1=891853&r2=891854&view=diff
==============================================================================
--- james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableInputStreamTest.java
(original)
+++ james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableInputStreamTest.java
Thu Dec 17 19:24:00 2009
@@ -37,38 +37,102 @@
         BasicConfigurator.configure();
     }
     
-    public void testDecode() throws IOException, UnsupportedEncodingException {
-        ByteArrayInputStream bis = null;
-        QuotedPrintableInputStream decoder = null;
-
-        bis = new ByteArrayInputStream("=e1=e2=E3=E4\r\n".getBytes("US-ASCII"));
-        decoder = new QuotedPrintableInputStream(bis);
+    public void testBasicDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("=e1=e2=E3=E4\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
         assertEquals("\u00e1\u00e2\u00e3\u00e4\r\n", new String(read(decoder), "ISO8859-1"));
-        
-        bis = new ByteArrayInputStream("=e1=g2=E3=E4\r\n".getBytes("US-ASCII"));
-        decoder = new QuotedPrintableInputStream(bis);
+    }
+
+    public void testDecodeBufferWrapping() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream(
+                "=e1=e2=E3=E4\r\n=e1=e2=E3=E4\r\n=e1=e2=E3=E4\r\n=e1=e2=E3=E4\r\n=e1=e2=E3=E4\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
+        assertEquals("\u00e1\u00e2\u00e3\u00e4\r\n\u00e1\u00e2\u00e3\u00e4\r\n\u00e1\u00e2\u00e3" +
+        		"\u00e4\r\n\u00e1\u00e2\u00e3\u00e4\r\n\u00e1\u00e2\u00e3\u00e4\r\n", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testInvalidValueDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("=e1=g2=E3=E4\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
         assertEquals("\u00e1=g2\u00e3\u00e4\r\n", new String(read(decoder), "ISO8859-1"));
-        
-        bis = new ByteArrayInputStream("   =e1 =e2  =E3\t=E4  \t \t    \r\n".getBytes("US-ASCII"));
-        decoder = new QuotedPrintableInputStream(bis);
+    }
+
+    public void testDecodeTrailingBlanks() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("   =e1 =e2  =E3\t=E4  \t \t    \r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
         assertEquals("   \u00e1 \u00e2  \u00e3\t\u00e4\r\n", new String(read(decoder), "ISO8859-1"));
-        
-        /*
-         * Test soft line breaks.
-         */
-        bis = new ByteArrayInputStream("Soft line   = \t \r\nHard line   \r\n".getBytes("US-ASCII"));
-        decoder = new QuotedPrintableInputStream(bis);
+    }
+
+    public void testCanonicalSoftBreakDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("Soft line   =\r\nHard line   \r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
         assertEquals("Soft line   Hard line\r\n", new String(read(decoder), "ISO8859-1"));
-        
+    }
+    
+    public void testSoftBreakLoneLFDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("Soft line   =\nHard line   \r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
+        assertEquals("Soft line   Hard line\r\n", new String(read(decoder), "ISO8859-1"));
+    }
+    
+    public void testSoftBreakTrailingBalnksDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("Soft line   = \t \r\nHard line   \r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
+        assertEquals("Soft line   Hard line\r\n", new String(read(decoder), "ISO8859-1"));
+    }
+    
+    public void testBrokenSoftBreakDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("Soft line   =\rHard line   \r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
+        assertEquals("Soft line   =\rHard line\r\n", new String(read(decoder), "ISO8859-1"));
+    }
+    
+    public void testEscapedEQDecode() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("width==340 height=3d200\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
+        assertEquals("width=340 height=200\r\n", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testBrokenEscapedEQDecode() throws IOException, UnsupportedEncodingException {
         /*
          * This isn't valid qp (==) but it is known to occur in certain
          * messages, especially spam.
          */
-        bis = new ByteArrayInputStream("width==\r\n340 height=3d200\r\n".getBytes("US-ASCII"));
-        decoder = new QuotedPrintableInputStream(bis);
+        ByteArrayInputStream bis = new ByteArrayInputStream("width==\r\n340 height=3d200\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(bis);
         assertEquals("width=340 height=200\r\n", new String(read(decoder), "ISO8859-1"));
     }
 
+    public void testDecodeEndOfStream1() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("01234567".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(6, bis, false);
+        assertEquals("01234567", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testDecodeEndOfStream2() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("012345\r".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(6, bis, false);
+        assertEquals("012345", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testDecodeEndOfStream3() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("012345\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(6, bis, false);
+        assertEquals("012345\n", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testDecodeEndOfStream4() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("01234= ".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(6, bis, false);
+        assertEquals("01234", new String(read(decoder), "ISO8859-1"));
+    }
+
+    public void testDecodeEndOfStream5() throws IOException, UnsupportedEncodingException {
+        ByteArrayInputStream bis = new ByteArrayInputStream("01234=\r\n".getBytes("US-ASCII"));
+        QuotedPrintableInputStream decoder = new QuotedPrintableInputStream(6, bis, false);
+        assertEquals("01234", new String(read(decoder), "ISO8859-1"));
+    }
+
     public void testDecodePrematureClose() throws IOException, UnsupportedEncodingException {
         ByteArrayInputStream bis = null;
         QuotedPrintableInputStream decoder = null;

Modified: james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableTextEncodeTest.java
URL: http://svn.apache.org/viewvc/james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableTextEncodeTest.java?rev=891854&r1=891853&r2=891854&view=diff
==============================================================================
--- james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableTextEncodeTest.java
(original)
+++ james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/QuotedPrintableTextEncodeTest.java
Thu Dec 17 19:24:00 2009
@@ -103,7 +103,7 @@
         for (byte b=0;b<Byte.MAX_VALUE;b++) {
             byte[] content = {b};
             // White space is only escaped when followed by CRLF
-            if (b != 32 && b != 9) { 
+            if (b != 13 && b != 32 && b != 9) { 
                 checkRoundtrip(content);
             }
         }



Mime
View raw message