couchdb-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rnew...@apache.org
Subject [01/31] mochiweb commit: updated refs/heads/upstream to 23dc119
Date Mon, 14 Aug 2017 15:38:11 GMT
Repository: couchdb-mochiweb
Updated Branches:
  refs/heads/upstream bd6ae7cbb -> 23dc11959


Support parsing UTF-16 surrogate pairs in mochiweb_html #164


Project: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/commit/5a70cdab
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/tree/5a70cdab
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/diff/5a70cdab

Branch: refs/heads/upstream
Commit: 5a70cdabb5d8edd5da4e9532a59731390e31e622
Parents: bd6ae7c
Author: Bob Ippolito <bob@redivi.com>
Authored: Mon Feb 8 18:17:53 2016 -0800
Committer: Bob Ippolito <bob@redivi.com>
Committed: Mon Feb 8 18:17:53 2016 -0800

----------------------------------------------------------------------
 CHANGES.md                   |  5 ++++-
 src/mochiweb_html.erl        | 45 ++++++++++++++++++++++++++++-----------
 test/mochiweb_html_tests.erl |  6 ++++++
 3 files changed, 43 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/CHANGES.md
----------------------------------------------------------------------
diff --git a/CHANGES.md b/CHANGES.md
index 05bf694..af80a19 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,8 @@
-Version 2.13.0 released XXXX-XX-XX
+Version 2.13.0 released 2016-02-08
 
+* Support parsing of UTF-16 surrogate pairs encoded as character
+  references in mochiweb_html
+  https://github.com/mochi/mochiweb/issues/164
 * Avoid swallowing messages that are not related to the socket
   during request parsing
   https://github.com/mochi/mochiweb/pull/161

http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/src/mochiweb_html.erl
----------------------------------------------------------------------
diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl
index 3fd93d0..3c5c4f9 100644
--- a/src/mochiweb_html.erl
+++ b/src/mochiweb_html.erl
@@ -639,13 +639,42 @@ find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
 
 tokenize_charref(Bin, S=#decoder{offset=O}) ->
     try
-        tokenize_charref(Bin, S, O)
+        %% Decode one character reference; unknown ones (undefined) fall back to "&".
+        case tokenize_charref_raw(Bin, S, O) of
+            {undefined, _} -> throw(invalid_charref);
+            {C1, S1} when is_integer(C1), C1 >= 16#D800, C1 =< 16#DFFF ->
+                %% Surrogate: needs the following reference to form a pair
+                tokeninize_charref_surrogate_pair(Bin, S1, C1);
+            {Unichar, S1} when is_integer(Unichar) ->
+                {{data, mochiutf8:codepoint_to_bytes(Unichar), false}, S1};
+            {Unichars, S1} when is_list(Unichars) ->
+                {{data, unicode:characters_to_binary(Unichars), false}, S1}
+        end
     catch
         throw:invalid_charref ->
             {{data, <<"&">>, false}, S}
     end.
 
-tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
+tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
+    %% C1 is a surrogate; its partner must be a charref starting right at offset O.
+    case Bin of
+        <<_:O/binary, $&, _/binary>> ->
+            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
+                %% Only a high (D800-DBFF) + low (DC00-DFFF) pair is valid
+                %% UTF-16; anything else is rejected as invalid_charref.
+                {C2, S1} when is_integer(C2), C1 =< 16#DBFF,
+                              C2 >= 16#DC00, C2 =< 16#DFFF ->
+                    Utf16 = <<C1:16, C2:16>>,
+                    {{data, unicode:characters_to_binary(Utf16, utf16, utf8), false},
+                     S1};
+                _ ->
+                    throw(invalid_charref)
+            end;
+        _ ->
+            throw(invalid_charref)
+    end.
+
+tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
     case Bin of
         <<_:O/binary>> ->
             throw(invalid_charref);
@@ -658,17 +687,9 @@ tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
         <<_:O/binary, $;, _/binary>> ->
             Len = O - Start,
             <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
-            Data = case mochiweb_charref:charref(Raw) of
-                       undefined ->
-                           throw(invalid_charref);
-                       Unichar when is_integer(Unichar) ->
-                           mochiutf8:codepoint_to_bytes(Unichar);
-                       Unichars when is_list(Unichars) ->
-                           unicode:characters_to_binary(Unichars)
-                   end,
-            {{data, Data, false}, ?INC_COL(S)};
+            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
         _ ->
-            tokenize_charref(Bin, ?INC_COL(S), Start)
+            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
     end.
 
 tokenize_doctype(Bin, S) ->

http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/5a70cdab/test/mochiweb_html_tests.erl
----------------------------------------------------------------------
diff --git a/test/mochiweb_html_tests.erl b/test/mochiweb_html_tests.erl
index 3d35400..f67759a 100644
--- a/test/mochiweb_html_tests.erl
+++ b/test/mochiweb_html_tests.erl
@@ -126,6 +126,12 @@ tokens_test() ->
        mochiweb_html:tokens(<<"not html < at all">>)),
     ok.
 
+surrogate_test() ->
+    %% https://github.com/mochi/mochiweb/issues/164: a surrogate pair written
+    %% as two numeric character references decodes to one UTF-8 code point.
+    Decoded = mochiweb_html:tokens(<<"&#55357;&#56842;">>),
+    ?assertEqual([{data,<<240,159,152,138>>,false}], Decoded).
+
 parse_test() ->
     D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
 <html>


Mime
View raw message