Skip to content

Commit 4c0528f

Browse files
committed
Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
When the decoder returns early because too few bytes were available to complete a codepoint, that case must be handled separately on the next call; otherwise the while loop just performs a partial byte copy of the split codepoint.
1 parent 27fbdc1 commit 4c0528f

File tree

2 files changed

+58
-2
lines changed

2 files changed

+58
-2
lines changed

ext/dom/html_document.c

+25-2
Original file line numberDiff line numberDiff line change
@@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path(
528528
size_t *tree_error_offset
529529
)
530530
{
531-
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
532-
533531
const lxb_char_t *buf_ref = *buf_ref_ref;
532+
533+
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
534+
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
535+
lxb_char_t buf[4];
536+
lxb_char_t *buf_ptr = buf;
537+
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
538+
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
539+
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
540+
}
541+
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
542+
543+
if (!dom_process_parse_chunk(
544+
ctx,
545+
document,
546+
parser,
547+
buf_ptr - buf,
548+
buf,
549+
buf_ptr - buf,
550+
tokenizer_error_offset,
551+
tree_error_offset
552+
)) {
553+
goto fail_oom;
554+
}
555+
}
556+
534557
const lxb_char_t *last_output = buf_ref;
535558
while (buf_ref != buf_end) {
536559
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
--TEST--
2+
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
8+
$inputs = [
9+
[str_repeat('–', 4096), false],
10+
[str_repeat('😏', 4096), false],
11+
[str_repeat('–', 4096), true],
12+
[str_repeat('😏', 4096), true],
13+
[str_repeat('–', 1358), false],
14+
[str_repeat('–', 1359), false],
15+
];
16+
17+
foreach ($inputs as [$input, $endTag]) {
18+
$Data = "<!DOCTYPE HTML><html>$input";
19+
if ($endTag) {
20+
$Data .= '</html>';
21+
}
22+
$Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
23+
var_dump($Document->body->textContent === $input);
24+
}
25+
26+
?>
27+
--EXPECT--
28+
bool(true)
29+
bool(true)
30+
bool(true)
31+
bool(true)
32+
bool(true)
33+
bool(true)

0 commit comments

Comments
 (0)