Skip to content

Commit 72708f2

Browse files
committed
Merge branch 'PHP-8.4'
* PHP-8.4:
  Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
  Fix GH-17486: Incorrect error line numbers reported in Dom\HTMLDocument::createFromString
2 parents 8a9095a + 2952e16 commit 72708f2

File tree

3 files changed

+104
-2
lines changed

3 files changed

+104
-2
lines changed

ext/dom/html_document.c

+32-2
Original file line numberDiff line numberDiff line change
@@ -553,9 +553,32 @@ static bool dom_decode_encode_fast_path(
553553
size_t *tree_error_offset
554554
)
555555
{
556-
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
557-
558556
const lxb_char_t *buf_ref = *buf_ref_ref;
557+
558+
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
559+
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
560+
lxb_char_t buf[4];
561+
lxb_char_t *buf_ptr = buf;
562+
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
563+
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
564+
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
565+
}
566+
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
567+
568+
if (!dom_process_parse_chunk(
569+
ctx,
570+
document,
571+
parser,
572+
buf_ptr - buf,
573+
buf,
574+
buf_ptr - buf,
575+
tokenizer_error_offset,
576+
tree_error_offset
577+
)) {
578+
goto fail_oom;
579+
}
580+
}
581+
559582
const lxb_char_t *last_output = buf_ref;
560583
while (buf_ref != buf_end) {
561584
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
@@ -904,6 +927,13 @@ PHP_METHOD(Dom_HTMLDocument, createFromString)
904927
if (!result) {
905928
goto fail_oom;
906929
}
930+
931+
/* In the string case we have a single buffer that acts as a sliding window.
932+
* The `current_input_characters` field starts pointing at the start of the buffer, but needs to slide along the
933+
* sliding window as well. */
934+
if (application_data.current_input_characters) {
935+
application_data.current_input_characters += chunk_size;
936+
}
907937
}
908938

909939
if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
--TEST--
2+
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
8+
$inputs = [
9+
[str_repeat('', 4096), false],
10+
[str_repeat('😏', 4096), false],
11+
[str_repeat('', 4096), true],
12+
[str_repeat('😏', 4096), true],
13+
[str_repeat('', 1358), false],
14+
[str_repeat('', 1359), false],
15+
];
16+
17+
foreach ($inputs as [$input, $endTag]) {
18+
$Data = "<!DOCTYPE HTML><html>$input";
19+
if ($endTag) {
20+
$Data .= '</html>';
21+
}
22+
$Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
23+
var_dump($Document->body->textContent === $input);
24+
}
25+
26+
?>
27+
--EXPECT--
28+
bool(true)
29+
bool(true)
30+
bool(true)
31+
bool(true)
32+
bool(true)
33+
bool(true)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
--TEST--
2+
GH-17486 (Incorrect error line numbers reported in Dom\HTMLDocument::createFromString)
3+
--EXTENSIONS--
4+
dom
5+
--INI--
6+
error_reporting=E_ALL
7+
--CREDITS--
8+
xPaw
9+
--FILE--
10+
<?php
11+
12+
$repeated = str_repeat('a', 50000);
13+
14+
$html = <<<HTML
15+
<!DOCTYPE html>
16+
<html lang="en">
17+
<body>
18+
<svg>
19+
<path d="{$repeated}" />
20+
</svg>
21+
<div>&#x0;</div>
22+
</body>
23+
</html>
24+
HTML;
25+
26+
\Dom\HTMLDocument::createFromString($html);
27+
28+
file_put_contents(__DIR__ . '/gh17486.tmp', $html);
29+
\Dom\HTMLDocument::createFromFile(__DIR__ . '/gh17486.tmp');
30+
31+
?>
32+
--CLEAN--
33+
<?php
34+
@unlink(__DIR__ . '/gh17486.tmp');
35+
?>
36+
--EXPECTF--
37+
Warning: Dom\HTMLDocument::createFromString(): tokenizer error null-character-reference in Entity, line: 7, column: 9 in %s on line %d
38+
39+
Warning: Dom\HTMLDocument::createFromFile(): tokenizer error null-character-reference in %s line: 7, column: 9 in %s on line %d

0 commit comments

Comments
 (0)