Skip to content

Commit 095e6f2

Browse files
mbrockJanWielemaker
authored andcommitted
ADDED: #158: Handle surrogate pairs in http/json
The JSON string "\ud83d\udc95" has one codepoint, not two. This is because the spec allows extended characters to be encoded as a pair of 16-bit values, called a "surrogate pair". From RFC 4627: > To escape an extended character that is not in the Basic Multilingual > Plane, the character is represented as a twelve-character sequence, > encoding the UTF-16 surrogate pair. So, for example, a string > containing only the G clef character (U+1D11E) may be represented as > "\uD834\uDD1E". This commit fixes the JSON parser to handle such surrogate pairs.
1 parent 4a2484e commit 095e6f2

File tree

2 files changed

+47
-10
lines changed

2 files changed

+47
-10
lines changed

json.pl

+43-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Author: Jan Wielemaker
44
55
WWW: http://www.swi-prolog.org
6-
Copyright (c) 2007-2021, University of Amsterdam
6+
Copyright (c) 2007-2023, University of Amsterdam
77
VU University Amsterdam
88
CWI, Amsterdam
99
SWI-Prolog Solutions b.v.
@@ -387,17 +387,47 @@
387387
escape(0'r, _, 0'\r) :- !.
388388
escape(0't, _, 0'\t) :- !.
389389
escape(0'u, Stream, C) :-
390-
!,
391-
get_code(Stream, C1),
392-
get_code(Stream, C2),
393-
get_code(Stream, C3),
394-
get_code(Stream, C4),
395-
code_type(C1, xdigit(D1)),
396-
code_type(C2, xdigit(D2)),
397-
code_type(C3, xdigit(D3)),
398-
code_type(C4, xdigit(D4)),
390+
get_XXXX(Stream, H),
391+
( hi_surrogate(H)
392+
-> get_surrogate_tail(Stream, H, C)
393+
; C = H
394+
).
395+
396+
get_XXXX(Stream, C) :-
397+
get_xdigit(Stream, D1),
398+
get_xdigit(Stream, D2),
399+
get_xdigit(Stream, D3),
400+
get_xdigit(Stream, D4),
399401
C is D1<<12+D2<<8+D3<<4+D4.
400402

403+
get_xdigit(Stream, D) :-
404+
get_code(Stream, C),
405+
code_type(C, xdigit(D)),
406+
!.
407+
get_xdigit(Stream, _) :-
408+
syntax_error(hexdigit_expected, Stream).
409+
410+
get_surrogate_tail(Stream, Hi, Codepoint) :-
411+
( get_code(Stream, 0'\\),
412+
get_code(Stream, 0'u),
413+
get_XXXX(Stream, Lo),
414+
surrogate([Hi, Lo], Codepoint)
415+
-> true
416+
; syntax_error(illegal_surrogate_pair, Stream)
417+
).
418+
419+
420+
hi_surrogate(C) :-
421+
C >= 0xD800, C < 0xDC00.
422+
423+
lo_surrogate(C) :-
424+
C >= 0xDC00, C < 0xE000.
425+
426+
surrogate([Hi, Lo], Codepoint) :-
427+
hi_surrogate(Hi),
428+
lo_surrogate(Lo),
429+
Codepoint is (Hi - 0xD800) * 0x400 + (Lo - 0xDC00) + 0x10000.
430+
401431
json_read_constant(0't, Stream, true) :-
402432
!,
403433
must_see(`rue`, Stream, true).
@@ -1091,3 +1121,6 @@
10911121
[ 'Illegal comment' ].
10921122
json_syntax_error(illegal_string_escape) -->
10931123
[ 'Illegal escape sequence in string' ].
1124+
json_syntax_error(illegal_surrogate_pair) -->
1125+
[ 'Illegal escaped surrogate pair in string' ].
1126+

test_json.pl

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@
6060
test(string, X == '\u1234') :-
6161
atom_json_term('"\\u1234"', X, []).
6262

63+
% surrogate pair (an emoji)
64+
test(string, X == '\U0001F495') :-
65+
atom_json_term('"\\ud83d\\udc95"', X, []).
66+
6367
test(int, X == 42) :-
6468
atom_json_term('42', X, []).
6569
test(int, X == -42) :-

0 commit comments

Comments
 (0)