Repository: couchdb-mochiweb Updated Branches: refs/heads/upstream bd6ae7cbb -> 23dc11959 Support parsing UTF-16 surrogate pairs in mochiweb_html #164 Project: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/repo Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/commit/5a70cdab Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/tree/5a70cdab Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/diff/5a70cdab Branch: refs/heads/upstream Commit: 5a70cdabb5d8edd5da4e9532a59731390e31e622 Parents: bd6ae7c Author: Bob Ippolito Authored: Mon Feb 8 18:17:53 2016 -0800 Committer: Bob Ippolito Committed: Mon Feb 8 18:17:53 2016 -0800 ---------------------------------------------------------------------- CHANGES.md | 5 ++++- src/mochiweb_html.erl | 45 ++++++++++++++++++++++++++++----------- test/mochiweb_html_tests.erl | 6 ++++++ 3 files changed, 43 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/CHANGES.md ---------------------------------------------------------------------- diff --git a/CHANGES.md b/CHANGES.md index 05bf694..af80a19 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,8 @@ -Version 2.13.0 released XXXX-XX-XX +Version 2.13.0 released 2016-02-08 +* Support parsing of UTF-16 surrogate pairs encoded as character + references in mochiweb_html + https://github.com/mochi/mochiweb/issues/164 * Avoid swallowing messages that are not related to the socket during request parsing https://github.com/mochi/mochiweb/pull/161 http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/src/mochiweb_html.erl ---------------------------------------------------------------------- diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl index 3fd93d0..3c5c4f9 100644 --- a/src/mochiweb_html.erl +++ b/src/mochiweb_html.erl @@ -639,13 +639,42 @@ find_gt(Bin, S=#decoder{offset=O}, HasSlash) -> 
tokenize_charref(Bin, S=#decoder{offset=O}) -> try - tokenize_charref(Bin, S, O) + case tokenize_charref_raw(Bin, S, O) of + {C1, S1=#decoder{offset=O1}} when C1 >= 16#D800 andalso C1 =< 16#DFFF -> + %% Surrogate pair + tokeninize_charref_surrogate_pair(Bin, S1, C1); + {Unichar, S1} when is_integer(Unichar) -> + {{data, mochiutf8:codepoint_to_bytes(Unichar), false}, + S1}; + {Unichars, S1} when is_list(Unichars) -> + {{data, unicode:characters_to_binary(Unichars), false}, + S1} + end catch throw:invalid_charref -> {{data, <<"&">>, false}, S} end. -tokenize_charref(Bin, S=#decoder{offset=O}, Start) -> +tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) -> + case Bin of + <<_:O/binary, $&, _/binary>> -> + case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of + {C2, S1} when C2 >= 16#D800 andalso C1 =< 16#DFFF -> + {{data, + unicode:characters_to_binary( + <<C1:16, C2:16>>, + utf16, + utf8), + false}, + S1}; + _ -> + throw(invalid_charref) + end; + _ -> + throw(invalid_charref) + end. + +tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) -> case Bin of <<_:O/binary>> -> throw(invalid_charref); @@ -658,17 +687,9 @@ tokenize_charref(Bin, S=#decoder{offset=O}, Start) -> <<_:O/binary, $;, _/binary>> -> Len = O - Start, <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, - Data = case mochiweb_charref:charref(Raw) of - undefined -> - throw(invalid_charref); - Unichar when is_integer(Unichar) -> - mochiutf8:codepoint_to_bytes(Unichar); - Unichars when is_list(Unichars) -> - unicode:characters_to_binary(Unichars) - end, - {{data, Data, false}, ?INC_COL(S)}; + {mochiweb_charref:charref(Raw), ?INC_COL(S)}; _ -> - tokenize_charref(Bin, ?INC_COL(S), Start) + tokenize_charref_raw(Bin, ?INC_COL(S), Start) end. 
tokenize_doctype(Bin, S) -> http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/test/mochiweb_html_tests.erl ---------------------------------------------------------------------- diff --git a/test/mochiweb_html_tests.erl b/test/mochiweb_html_tests.erl index 3d35400..f67759a 100644 --- a/test/mochiweb_html_tests.erl +++ b/test/mochiweb_html_tests.erl @@ -126,6 +126,12 @@ tokens_test() -> mochiweb_html:tokens(<<"not html < at all">>)), ok. +surrogate_test() -> + %% https://github.com/mochi/mochiweb/issues/164 + ?assertEqual( + [{data,<<240,159,152,138>>,false}], + mochiweb_html:tokens(<<"&#55357;&#56842;">>)). + parse_test() -> D0 = <<"