diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c index ed7454dd89d43..ca6d215154e2a 100644 --- a/ext/dom/html_document.c +++ b/ext/dom/html_document.c @@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path( size_t *tree_error_offset ) { - decoding_encoding_ctx->decode.status = LXB_STATUS_OK; - const lxb_char_t *buf_ref = *buf_ref_ref; + + /* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */ + if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) { + lxb_char_t buf[4]; + lxb_char_t *buf_ptr = buf; + lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); + if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) { + buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE); + } + decoding_encoding_ctx->decode.status = LXB_STATUS_OK; + + if (!dom_process_parse_chunk( + ctx, + document, + parser, + buf_ptr - buf, + buf, + buf_ptr - buf, + tokenizer_error_offset, + tree_error_offset + )) { + goto fail_oom; + } + } + const lxb_char_t *last_output = buf_ref; while (buf_ref != buf_end) { /* Fast path converts non-validated UTF-8 -> validated UTF-8 */ diff --git a/ext/dom/tests/modern/html/encoding/gh17481.phpt b/ext/dom/tests/modern/html/encoding/gh17481.phpt new file mode 100644 index 0000000000000..74e13e1300942 --- /dev/null +++ b/ext/dom/tests/modern/html/encoding/gh17481.phpt @@ -0,0 +1,33 @@ +--TEST-- +GH-17481 (UTF-8 corruption in \Dom\HTMLDocument) +--EXTENSIONS-- +dom +--FILE-- +$input"; + if ($endTag) { + $Data .= ''; + } + $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8'); + var_dump($Document->body->textContent === $input); +} + +?> +--EXPECT-- +bool(true) +bool(true) +bool(true) +bool(true) +bool(true) +bool(true)