From a2a90538f792980e46cbbb579b3d67c040fc7976 Mon Sep 17 00:00:00 2001
From: Ilia Alshanetsky
Date: Tue, 16 Jun 2026 15:33:43 -0400
Subject: [PATCH] Fix MIME charset sniffing advancing by name length not value length

php_libxml_sniff_charset_from_string() advanced the parse cursor by the
parameter name length after collecting an unquoted parameter value
(WHATWG mime-sniff step 11.9.1), instead of the value length. When a
Content-Type parameter before charset had a name and value of different
lengths, the cursor misaligned and the charset parameter was missed, so
document loading fell back to the wrong encoding.

Closes GH-22343
---
 .../html/encoding/HTMLDocument_createFromFile_http_header.phpt | 2 ++
 ext/libxml/mime_sniff.c                                        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ext/dom/tests/modern/html/encoding/HTMLDocument_createFromFile_http_header.phpt b/ext/dom/tests/modern/html/encoding/HTMLDocument_createFromFile_http_header.phpt
index 5c602b87f23e..5164ac68041d 100644
--- a/ext/dom/tests/modern/html/encoding/HTMLDocument_createFromFile_http_header.phpt
+++ b/ext/dom/tests/modern/html/encoding/HTMLDocument_createFromFile_http_header.phpt
@@ -46,6 +46,7 @@ $tests = [
         "text/html; ;; ; ;; Charset=\"ISO-8859-1\"",
         "text/html;Charset=\"ISO-8859-1",
         "tex.t/h#\$%!&'*%2B-.^_`|~tml;Charset=\"ISO-8859-1\"", // Note: have to encode + as 2B because of implementation details of http_server()
+        "text/html; abcd=ef;charset=ISO-8859-1",
     ],
     "Valid input, but invalid encoding name" => [
         "text/html;Charset=\"ISO-8859-1\\",
@@ -100,6 +101,7 @@ foreach ($tests as $name => $headers) {
 äöü
 äöü
 äöü
+äöü
 --- Valid input, but invalid encoding name ---
 ���
 ���
diff --git a/ext/libxml/mime_sniff.c b/ext/libxml/mime_sniff.c
index 0ca032f9b795..2840c69701fc 100644
--- a/ext/libxml/mime_sniff.c
+++ b/ext/libxml/mime_sniff.c
@@ -273,7 +273,7 @@ PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *sta
 			/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */
 			size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
 			parameter_value = zend_string_init(start, parameter_value_length, false);
-			start += parameter_name_length;
+			start += parameter_value_length;
 
 			/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
 			while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {