commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ggreg...@apache.org
Subject svn commit: r1637008 - in /commons/proper/codec/trunk/src: changes/changes.xml main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java
Date Wed, 05 Nov 2014 23:43:14 GMT
Author: ggregory
Date: Wed Nov  5 23:43:14 2014
New Revision: 1637008

URL: http://svn.apache.org/r1637008
Log:
[CODEC-121] QuotedPrintableCodec does not support soft line break per the 'quoted-printable'
example on Wikipedia. Apply patch from TN.

Modified:
    commons/proper/codec/trunk/src/changes/changes.xml
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1637008&r1=1637007&r2=1637008&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Wed Nov  5 23:43:14 2014
@@ -44,6 +44,7 @@ The <action> type attribute can be add,u
   <body>
     <release version="1.10" date="DD November 2014" description="Feature and fix release.">
       <action dev="ggregory" type="add" issue="CODEC-192" due-to="Thomas Neidhart">Add
Daitch-Mokotoff Soundex</action>   
+      <action dev="ggregory" type="add" issue="CODEC-121" due-to="Thomas Neidhart, Java
John">QuotedPrintableCodec does not support soft line break per the 'quoted-printable'
example on Wikipedia</action>   
       <action dev="tn" type="fix" issue="CODEC-185" due-to="Sean Busbey">Added clarification
to Javadoc of Base64 concerning the use of the urlSafe parameter</action>   
       <action dev="tn" type="fix" issue="CODEC-191" due-to="Igor Savin">Added clarification
to the Javadoc of Base[32|64]OutputStream that it is mandatory to call close()</action>
  
       <action dev="ggregory" type="fix" issue="CODEC-188" due-to="Hendrik Saly">Add
support for HMAC Message Authentication Code (MAC) digests</action>   

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java?rev=1637008&r1=1637007&r2=1637008&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java
(original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/net/QuotedPrintableCodec.java
Wed Nov  5 23:43:14 2014
@@ -45,14 +45,19 @@ import org.apache.commons.codec.binary.S
  * <p>
  * Note:
  * <p>
- * Rules #3, #4, and #5 of the quoted-printable spec are not implemented yet because the
complete quoted-printable spec
- * does not lend itself well into the byte[] oriented codec framework. Complete the codec
once the streamable codec
- * framework is ready. The motivation behind providing the codec in a partial form is that
it can already come in handy
- * for those applications that do not require quoted-printable line formatting (rules #3,
#4, #5), for instance Q codec.
+ * Depending on the selected {@code strict} parameter, this class will implement a different
set of rules of the
+ * quoted-printable spec:
+ * <ul>
+ *   <li>{@code strict=false}: only rules #1 and #2 are implemented 
+ *   <li>{@code strict=true}: all rules #1 through #5 are implemented
+ * </ul>
+ * Originally, this class only supported the non-strict mode, but the codec in this partial
form could already be used
+ * for certain applications that do not require quoted-printable line formatting (rules #3,
#4, #5), for instance Q codec.
+ * The strict mode has been added in 1.10.
  * <p>
  * This class is immutable and thread-safe.
  *
- * @see <a href="http://www.ietf.org/rfc/rfc1521.txt"> RFC 1521 MIME (Multipurpose
Internet Mail Extensions) Part One:
+ * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet
Mail Extensions) Part One:
  *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies
</a>
  *
  * @since 1.3
@@ -65,6 +70,11 @@ public class QuotedPrintableCodec implem
     private final Charset charset;
 
     /**
+     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
+     */
+    private final boolean strict;
+
+    /**
      * BitSet of printable characters as defined in RFC 1521.
      */
     private static final BitSet PRINTABLE_CHARS = new BitSet(256);
@@ -74,6 +84,16 @@ public class QuotedPrintableCodec implem
     private static final byte TAB = 9;
 
     private static final byte SPACE = 32;
+
+    private static final byte CR = 13;
+
+    private static final byte LF = 10;
+
+    /**
+     * Safe line length for quoted printable encoded text.
+     */
+    private static final int SAFE_LENGTH = 73;
+
     // Static initializer for printable chars collection
     static {
         // alpha characters
@@ -91,7 +111,18 @@ public class QuotedPrintableCodec implem
      * Default constructor, assumes default charset of {@link Charsets#UTF_8}
      */
     public QuotedPrintableCodec() {
-        this(Charsets.UTF_8);
+        this(Charsets.UTF_8, false);
+    }
+
+    /**
+     * Constructor which allows for the selection of the strict mode.
+     *
+     * @param strict
+     *            if {@code true}, soft line breaks will be used
+     * @since 1.10
+     */
+    public QuotedPrintableCodec(final boolean strict) {
+        this(Charsets.UTF_8, strict);
     }
 
     /**
@@ -102,7 +133,21 @@ public class QuotedPrintableCodec implem
      * @since 1.7
      */
     public QuotedPrintableCodec(final Charset charset) {
+        this(charset, false);
+    }
+
+    /**
+     * Constructor which allows for the selection of a default charset and strict mode.
+     *
+     * @param charset
+     *            the default string charset to use.
+     * @param strict
+     *            if {@code true}, soft line breaks will be used
+     * @since 1.10
+     */
+    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
         this.charset = charset;
+        this.strict = strict;
     }
 
     /**
@@ -122,7 +167,7 @@ public class QuotedPrintableCodec implem
      */
     public QuotedPrintableCodec(final String charsetName)
             throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException
{
-        this(Charset.forName(charsetName));
+        this(Charset.forName(charsetName), false);
     }
 
     /**
@@ -132,13 +177,65 @@ public class QuotedPrintableCodec implem
      *            byte to encode
      * @param buffer
      *            the buffer to write to
+     * @return The number of bytes written to the <code>buffer</code>
      */
-    private static final void encodeQuotedPrintable(final int b, final ByteArrayOutputStream
buffer) {
+    private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream
buffer) {
         buffer.write(ESCAPE_CHAR);
         final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF,
16));
         final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16));
         buffer.write(hex1);
         buffer.write(hex2);
+        return 3;
+    }
+
+    /**
+     * Return the byte at position <code>index</code> of the byte array and
+     * make sure it is unsigned.
+     *
+     * @param index
+     *            position in the array
+     * @param bytes
+     *            the byte array
+     * @return the unsigned octet at position <code>index</code> from the array
+     */
+    private static int getUnsignedOctet(final int index, final byte[] bytes) {
+        int b = bytes[index];
+        if (b < 0) {
+            b = 256 + b;
+        }
+        return b;
+    }
+
+    /**
+     * Write a byte to the buffer.
+     *
+     * @param b
+     *            byte to write
+     * @param encode
+     *            indicates whether the octet shall be encoded
+     * @param buffer
+     *            the buffer to write to
+     * @return the number of bytes that have been written to the buffer
+     */
+    private static int encodeByte(final int b, final boolean encode,
+                                  final ByteArrayOutputStream buffer) {
+        if (encode) {
+            return encodeQuotedPrintable(b, buffer);
+        } else {
+            buffer.write(b);
+            return 1;
+        }
+    }
+
+    /**
+     * Checks whether the given byte is whitespace.
+     *
+     * @param b
+     *            byte to be checked
+     * @return <code>true</code> if the byte is either a space or tab character
+     */
+    private static boolean isWhitespace(final int b) {
+        return b == SPACE || b == TAB;
     }
 
     /**
@@ -154,6 +251,26 @@ public class QuotedPrintableCodec implem
      * @return array of bytes containing quoted-printable data
      */
     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes)
{
+        return encodeQuotedPrintable(printable, bytes, false);
+    }
+
+    /**
+     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe
characters are escaped.
+     * <p>
+     * Depending on the selection of the {@code strict} parameter, this function either implements
the full ruleset
+     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2)
as defined in
+     * RFC 1521 and is suitable for encoding binary data and unformatted text.
+     *
+     * @param printable
+     *            bitset of characters deemed quoted-printable
+     * @param bytes
+     *            array of bytes to be encoded
+     * @param strict
+     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule
#2
+     * @return array of bytes containing quoted-printable data
+     * @since 1.10
+     */
+    public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes,
boolean strict) {
         if (bytes == null) {
             return null;
         }
@@ -161,15 +278,59 @@ public class QuotedPrintableCodec implem
             printable = PRINTABLE_CHARS;
         }
         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-        for (final byte c : bytes) {
-            int b = c;
-            if (b < 0) {
-                b = 256 + b;
+        
+        if (strict) {
+            int pos = 1;
+            // encode up to buffer.length - 3, the last three octets will be treated
+            // separately for simplification of note #3
+            for (int i = 0; i < bytes.length - 3; i++) {
+                int b = getUnsignedOctet(i, bytes);
+                if (pos < SAFE_LENGTH) {
+                    // up to this length it is safe to add any byte, encoded or not
+                    pos += encodeByte(b, !printable.get(b), buffer);
+                } else {
+                    // rule #3: whitespace at the end of a line *must* be encoded
+                    encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
+
+                    // rule #5: soft line break
+                    buffer.write(ESCAPE_CHAR);
+                    buffer.write(CR);
+                    buffer.write(LF);
+                    pos = 1;
+                }
             }
-            if (printable.get(b)) {
-                buffer.write(b);
-            } else {
-                encodeQuotedPrintable(b, buffer);
+
+            // rule #3: whitespace at the end of a line *must* be encoded
+            // if we would do a soft line break after this octet, encode whitespace
+            int b = getUnsignedOctet(bytes.length - 3, bytes);
+            boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH
- 5);
+            pos += encodeByte(b, encode, buffer);
+
+            // note #3: '=' *must not* be the ultimate or penultimate character
+            // simplification: if < 6 bytes left, do a soft line break as we may need
+            //                 exactly 6 bytes space for the last 2 bytes
+            if (pos > SAFE_LENGTH - 2) {
+                buffer.write(ESCAPE_CHAR);
+                buffer.write(CR);
+                buffer.write(LF);
+            }
+            for (int i = bytes.length - 2; i < bytes.length; i++) {
+                b = getUnsignedOctet(i, bytes);
+                // rule #3: trailing whitespace shall be encoded
+                encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
+                encodeByte(b, encode, buffer);
+            }
+        } else {
+            for (final byte c : bytes) {
+                int b = c;
+                if (b < 0) {
+                    b = 256 + b;
+                }
+                if (printable.get(b)) {
+                    buffer.write(b);
+                } else {
+                    encodeQuotedPrintable(b, buffer);
+                }
             }
         }
         return buffer.toByteArray();
@@ -179,8 +340,8 @@ public class QuotedPrintableCodec implem
      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped
characters are converted
      * back to their original representation.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
-     * RFC 1521.
+     * This function fully implements the quoted-printable encoding specification (rule #1
through rule #5) as
+     * defined in RFC 1521.
      *
      * @param bytes
      *            array of quoted-printable characters
@@ -197,13 +358,18 @@ public class QuotedPrintableCodec implem
             final int b = bytes[i];
             if (b == ESCAPE_CHAR) {
                 try {
-                    final int u = Utils.digit16(bytes[++i]);
+                    // if the next octet is a CR we have found a soft line break
+                    if (bytes[++i] == CR) {
+                        continue;
+                    }
+                    final int u = Utils.digit16(bytes[i]);
                     final int l = Utils.digit16(bytes[++i]);
                     buffer.write((char) ((u << 4) + l));
                 } catch (final ArrayIndexOutOfBoundsException e) {
                     throw new DecoderException("Invalid quoted-printable encoding", e);
                 }
-            } else {
+            } else if (b != CR && b != LF) {
+                // every other octet is appended except for CR & LF
                 buffer.write(b);
             }
         }
@@ -213,7 +379,8 @@ public class QuotedPrintableCodec implem
     /**
      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe
characters are escaped.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
+     * Depending on the selection of the {@code strict} parameter, this function either implements
the full ruleset
+     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2)
as defined in
      * RFC 1521 and is suitable for encoding binary data and unformatted text.
      *
      * @param bytes
@@ -222,15 +389,15 @@ public class QuotedPrintableCodec implem
      */
     @Override
     public byte[] encode(final byte[] bytes) {
-        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes);
+        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
     }
 
     /**
      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped
characters are converted
      * back to their original representation.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
-     * RFC 1521.
+     * This function fully implements the quoted-printable encoding specification (rule #1
through rule #5) as
+     * defined in RFC 1521.
      *
      * @param bytes
      *            array of quoted-printable characters
@@ -246,8 +413,9 @@ public class QuotedPrintableCodec implem
     /**
      * Encodes a string into its quoted-printable form using the default string charset.
Unsafe characters are escaped.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
-     * RFC 1521 and is suitable for encoding binary data.
+     * Depending on the selection of the {@code strict} parameter, this function either implements
the full ruleset
+     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2)
as defined in
+     * RFC 1521 and is suitable for encoding binary data and unformatted text.
      *
      * @param str
      *            string to convert to quoted-printable form
@@ -392,7 +560,8 @@ public class QuotedPrintableCodec implem
     /**
      * Encodes a string into its quoted-printable form using the specified charset. Unsafe
characters are escaped.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
+     * Depending on the selection of the {@code strict} parameter, this function either implements
the full ruleset
+     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2)
as defined in
      * RFC 1521 and is suitable for encoding binary data and unformatted text.
      *
      * @param str
@@ -412,7 +581,8 @@ public class QuotedPrintableCodec implem
     /**
      * Encodes a string into its quoted-printable form using the specified charset. Unsafe
characters are escaped.
      * <p>
-     * This function implements a subset of quoted-printable encoding specification (rule
#1 and rule #2) as defined in
+     * Depending on the selection of the {@code strict} parameter, this function either implements
the full ruleset
+     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2)
as defined in
      * RFC 1521 and is suitable for encoding binary data and unformatted text.
      *
      * @param str

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java?rev=1637008&r1=1637007&r2=1637008&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java
(original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/net/QuotedPrintableCodecTest.java
Wed Nov  5 23:43:14 2014
@@ -17,9 +17,7 @@
 
 package org.apache.commons.codec.net;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.fail;
+import static org.junit.Assert.*;
 
 import java.nio.charset.UnsupportedCharsetException;
 
@@ -27,7 +25,6 @@ import org.apache.commons.codec.CharEnco
 import org.apache.commons.codec.Charsets;
 import org.apache.commons.codec.DecoderException;
 import org.apache.commons.codec.EncoderException;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -254,30 +251,87 @@ public class QuotedPrintableCodecTest {
     }
 
     @Test
-    @Ignore
-    /**
-     * The QuotedPrintableCodec documentation states that this is not supported.
-     *
-     * @throws Exception
-     * @see <a href="https://issues.apache.org/jira/browse/CODEC-121">CODEC-121</a>
-     */
     public void testSoftLineBreakDecode() throws Exception {
         final String qpdata = "If you believe that truth=3Dbeauty, then surely=20=\r\nmathematics
is the most beautiful branch of philosophy.";
         final String expected = "If you believe that truth=beauty, then surely mathematics
is the most beautiful branch of philosophy.";
-        assertEquals(expected, new QuotedPrintableCodec().decode(qpdata));
+
+        QuotedPrintableCodec qpcodec = new QuotedPrintableCodec();
+        assertEquals(expected, qpcodec.decode(qpdata));
+
+        String encoded = qpcodec.encode(expected);
+        assertEquals(expected, qpcodec.decode(encoded));
     }
 
     @Test
-    @Ignore
-    /**
-     * The QuotedPrintableCodec documentation states that this is not supported.
-     *
-     * @throws Exception
-     * @see <a href="https://issues.apache.org/jira/browse/CODEC-121">CODEC-121</a>
-     */
     public void testSoftLineBreakEncode() throws Exception {
-        final String qpdata = "If you believe that truth=3Dbeauty, then surely=20=\r\nmathematics
is the most beautiful branch of philosophy.";
+        final String qpdata = "If you believe that truth=3Dbeauty, then surely mathematics
is the most b=\r\neautiful branch of philosophy.";
         final String expected = "If you believe that truth=beauty, then surely mathematics
is the most beautiful branch of philosophy.";
-        assertEquals(qpdata, new QuotedPrintableCodec().encode(expected));
+
+        QuotedPrintableCodec qpcodec = new QuotedPrintableCodec(true);
+        assertEquals(qpdata, qpcodec.encode(expected));
+
+        String decoded = qpcodec.decode(qpdata);
+        assertEquals(qpdata, qpcodec.encode(decoded));
+    }
+
+    @Test
+    public void testSkipNotEncodedCRLF() throws Exception {
+        String qpdata = "CRLF in an\n encoded text should be=20=\r\n\rskipped in the\r decoding.";
+        String expected = "CRLF in an encoded text should be skipped in the decoding.";
+
+        QuotedPrintableCodec qpcodec = new QuotedPrintableCodec(true);
+        assertEquals(expected, qpcodec.decode(qpdata));
+
+        String encoded = qpcodec.encode(expected);
+        assertEquals(expected, qpcodec.decode(encoded));
+    }
+
+    @Test
+    public void testTrailingSpecial() throws Exception {
+        final QuotedPrintableCodec qpcodec = new QuotedPrintableCodec(true);
+
+        String plain ="This is a example of a quoted-printable text file. This might contain
sp=cial chars.";
+        String expected = "This is a example of a quoted-printable text file. This might
contain sp=3D=\r\ncial chars.";
+        assertEquals(expected, qpcodec.encode(plain));
+                
+        plain ="This is a example of a quoted-printable text file. This might contain ta\tbs
as well.";
+        expected = "This is a example of a quoted-printable text file. This might contain
ta=09=\r\nbs as well.";
+        assertEquals(expected, qpcodec.encode(plain));
+    }
+
+    @Test
+    public void testUltimateSoftBreak() throws Exception {
+        final QuotedPrintableCodec qpcodec = new QuotedPrintableCodec(true);
+
+        String plain ="This is a example of a quoted-printable text file. There is no end
to it\t";
+        String expected = "This is a example of a quoted-printable text file. There is no
end to i=\r\nt=09";
+
+        assertEquals(expected, qpcodec.encode(plain));
+
+        plain ="This is a example of a quoted-printable text file. There is no end to it
";
+        expected = "This is a example of a quoted-printable text file. There is no end to
i=\r\nt=20";
+
+        assertEquals(expected, qpcodec.encode(plain));
+
+        // whitespace before soft break
+        plain ="This is a example of a quoted-printable text file. There is no end to   ";
+        expected = "This is a example of a quoted-printable text file. There is no end to=20=\r\n
=20";
+
+        assertEquals(expected, qpcodec.encode(plain));
+
+        // non-printable character before soft break
+        plain ="This is a example of a quoted-printable text file. There is no end to=  ";
+        expected = "This is a example of a quoted-printable text file. There is no end to=3D=\r\n
=20";
+
+        assertEquals(expected, qpcodec.encode(plain));
+    }
+
+    @Test
+    public void testFinalBytes() throws Exception {
+        // whitespace, but does not need to be encoded
+        final String plain ="This is a example of a quoted=printable text file. There is
no tt";
+        final String expected = "This is a example of a quoted=3Dprintable text file. There
is no tt";
+
+        assertEquals(expected, new QuotedPrintableCodec(true).encode(plain));
     }
 }



Mime
View raw message