Fix phpGH-17481: UTF-8 corruption in \Dom\HTMLDocument

nielsdos · nielsdos · commit 4c0528f8cafb · 2025-01-16T20:12:18.000+01:00
We need to properly handle the case where the previous call returned
because it had too few bytes. This needs to be handled separately because
the while loop otherwise just performs a partial byte copy.
diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c
@@ -528,9 +528,32 @@ static bool dom_decode_encode_fast_path(
 	size_t *tree_error_offset
 )
 {
-	decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
-
 	const lxb_char_t *buf_ref = *buf_ref_ref;
+
+	/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
+	if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
+		lxb_char_t buf[4];
+		lxb_char_t *buf_ptr = buf;
+		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
+		if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
+			buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
+		}
+		decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
+
+		if (!dom_process_parse_chunk(
+			ctx,
+			document,
+			parser,
+			buf_ptr - buf,
+			buf,
+			buf_ptr - buf,
+			tokenizer_error_offset,
+			tree_error_offset
+		)) {
+			goto fail_oom;
+		}
+	}
+
 	const lxb_char_t *last_output = buf_ref;
 	while (buf_ref != buf_end) {
 		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
diff --git a/ext/dom/tests/modern/html/parser/gh17481.phpt b/ext/dom/tests/modern/html/parser/gh17481.phpt
@@ -0,0 +1,33 @@
+--TEST--
+GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
+--EXTENSIONS--
+dom
+--FILE--
+<?php
+
+$inputs = [ // each entry: [text placed after <html>, whether to append '</html>']
+    [str_repeat('–', 4096), false],
+    [str_repeat('😏', 4096), false],
+    [str_repeat('–', 4096), true],
+    [str_repeat('😏', 4096), true],
+    [str_repeat('–', 1358), false], // 21-byte prefix + 1358 * 3 bytes = 4095 bytes; presumably just below a 4096-byte chunk boundary (TODO confirm)
+    [str_repeat('–', 1359), false], // 21-byte prefix + 1359 * 3 bytes = 4098 bytes; presumably the last character straddles that boundary (TODO confirm)
+];
+
+foreach ($inputs as [$input, $endTag]) {
+    $Data = "<!DOCTYPE HTML><html>$input";
+    if ($endTag) {
+        $Data .= '</html>';
+    }
+    $Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
+    var_dump($Document->body->textContent === $input); // true only if the parsed text equals the input exactly
+}
+
+?>
+--EXPECT--
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+bool(true)