From 66f5709ba77a4a646d57d2099a8bde461a4affac Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Sun, 16 Mar 2025 19:34:08 +0300 Subject: [PATCH 1/4] Refactor traverse_for_entities (used in unescape_html_entities): Optimize scanning for '&' and ';' using memchr Use memcpy instead of character-by-character copying language --- ext/standard/html.c | 229 +++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 96 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 0c6231d590d88..3743a5fe925be 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -809,112 +809,149 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse /* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */ #define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2) static void traverse_for_entities( - const char *old, - size_t oldlen, - zend_string *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */ + const char *input, + size_t input_len, + zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */ int all, int flags, const entity_ht *inv_map, enum entity_charset charset) { - const char *p, - *lim; - char *q; - int doctype = flags & ENT_HTML_DOC_TYPE_MASK; - - lim = old + oldlen; /* terminator address */ - assert(*lim == '\0'); - - for (p = old, q = ZSTR_VAL(ret); p < lim;) { - unsigned code, code2 = 0; - const char *next = NULL; /* when set, next > p, otherwise possible inf loop */ - - /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an - * ASCII range byte can be part of a multi-byte sequence. - * However, they start at 0x40, therefore if we find a 0x26 byte, - * we're sure it represents the '&' character. */ - - /* assumes there are no single-char entities */ - if (p[0] != '&' || (p + 3 >= lim)) { - *(q++) = *(p++); - continue; - } - - /* now p[3] is surely valid and is no terminator */ - - /* numerical entity */ - if (p[1] == '#') { - next = &p[2]; - if (process_numeric_entity(&next, &code) == FAILURE) - goto invalid_code; - - /* If we're in htmlspecialchars_decode, we're only decoding entities + const char *current_ptr = input; + const char *input_end = input + input_len; /* terminator address */ + char *output_ptr = ZSTR_VAL(output); + int doctype = flags & ENT_HTML_DOC_TYPE_MASK; + + assert(*input_end == '\0'); + + while (current_ptr < input_end) { + const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr); + if (!ampersand_ptr) { + size_t tail_len = input_end - current_ptr; + if (tail_len > 0) { + memcpy(output_ptr, current_ptr, tail_len); + output_ptr += tail_len; + } + break; + } + + /* Copy everything up to the found '&' */ + size_t chunk_len = ampersand_ptr - current_ptr; + if (chunk_len > 0) { + memcpy(output_ptr, current_ptr, chunk_len); + output_ptr += chunk_len; + } + + /* Now current_ptr points to the '&' character. */ + current_ptr = ampersand_ptr; + + /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */ + if (current_ptr + 3 >= input_end) { + *output_ptr++ = *current_ptr++; + continue; + } + + unsigned code = 0, code2 = 0; + const char *entity_end_ptr = NULL; + int valid_entity = 1; + + if (current_ptr[1] == '#') { + /* Processing numeric entity */ + const char *num_start = current_ptr + 2; + entity_end_ptr = num_start; + if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) { + valid_entity = 0; + } + /* If we're in htmlspecialchars_decode, we're only decoding entities * that represent &, <, >, " and '. Is this one of them? */ - if (!all && (code > 63U || - stage3_table_be_apos_00000[code].data.ent.entity == NULL)) - goto invalid_code; - - /* are we allowed to decode this entity in this document type? + if (valid_entity && !all && + (code > 63U || + stage3_table_be_apos_00000[code].data.ent.entity == NULL)) + { + valid_entity = 0; + } + /* are we allowed to decode this entity in this document type? * HTML 5 is the only that has a character that cannot be used in * a numeric entity but is allowed literally (U+000D). The * unoptimized version would be ... || !numeric_entity_is_allowed(code) */ - if (!unicode_cp_is_allowed(code, doctype) || - (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)) - goto invalid_code; - } else { - const char *start; - size_t ent_len; - - next = &p[1]; - start = next; - - if (process_named_entity_html(&next, &start, &ent_len) == FAILURE) - goto invalid_code; - - if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) { - if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a' - && start[1] == 'p' && start[2] == 'o' && start[3] == 's') { - /* uses html4 inv_map, which doesn't include apos;. This is a - * hack to support it */ - code = (unsigned) '\''; - } else { - goto invalid_code; - } - } - } - - assert(*next == ';'); - - if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || - (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))) - /* && code2 == '\0' always true for current maps */) - goto invalid_code; - - /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but + if (valid_entity && (!unicode_cp_is_allowed(code, doctype) || + (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) + { + valid_entity = 0; + } + } else { + /* Processing named entity */ + const char *name_start = current_ptr + 1; + /* Search for ';' */ + const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1); + if (!semi_colon_ptr) { + valid_entity = 0; + } else { + size_t name_len = semi_colon_ptr - name_start; + if (name_len == 0) { + valid_entity = 0; + } else { + if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) { + if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 && + name_start[0] == 'a' && name_start[1] == 'p' && + name_start[2] == 'o' && name_start[3] == 's') + { + /* uses html4 inv_map, which doesn't include apos;. This is a + * hack to support it */ + code = (unsigned)'\''; + } else { + valid_entity = 0; + } + } + entity_end_ptr = semi_colon_ptr; + } + } + } + + /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */ + if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') { + *output_ptr++ = *current_ptr++; + continue; + } + + /* Check if quotes are allowed for entities representing ' or " */ + if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || + (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))) + { + valid_entity = 0; + } + + /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but * the call is needed to ensure the codepoint <= U+00FF) */ - if (charset != cs_utf_8) { - /* replace unicode code point */ - if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) - goto invalid_code; /* not representable in target charset */ - } - - q += write_octet_sequence((unsigned char*)q, charset, code); - if (code2) { - q += write_octet_sequence((unsigned char*)q, charset, code2); - } - - /* jump over the valid entity; may go beyond size of buffer; np */ - p = next + 1; - continue; - -invalid_code: - for (; p < next; p++) { - *(q++) = *p; - } - } - - *q = '\0'; - ZSTR_LEN(ret) = (size_t)(q - ZSTR_VAL(ret)); + if (valid_entity && charset != cs_utf_8) { + /* replace unicode code point */ + if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) + valid_entity = 0; + } + + if (valid_entity) { + /* Write the parsed entity into the output buffer */ + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code); + if (code2) { + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2); + } + /* Move current_ptr past the semicolon */ + current_ptr = entity_end_ptr + 1; + } else { + /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */ + if (entity_end_ptr) { + size_t len = entity_end_ptr - current_ptr; + memcpy(output_ptr, current_ptr, len); + output_ptr += len; + current_ptr = entity_end_ptr; + } else { + *output_ptr++ = *current_ptr++; + } + } + } + + *output_ptr = '\0'; + ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output)); } /* }}} */ From f093c30d5f0deb38be64650416dce2f7fd83b40b Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Mon, 17 Mar 2025 18:58:37 +0300 Subject: [PATCH 2/4] CR, refactoring, codestyle fix logic --- ext/standard/html.c | 273 ++++++++++++++++++++++---------------------- 1 file changed, 134 insertions(+), 139 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 3743a5fe925be..253d996e431e3 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -809,149 +809,144 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse /* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */ #define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2) static void traverse_for_entities( - const char *input, - size_t input_len, + const zend_string *input, zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */ - int all, - int flags, + const int all, + const int flags, const entity_ht *inv_map, - enum entity_charset charset) + const enum entity_charset charset) { - const char *current_ptr = input; - const char *input_end = input + input_len; /* terminator address */ - char *output_ptr = ZSTR_VAL(output); - int doctype = flags & ENT_HTML_DOC_TYPE_MASK; - - assert(*input_end == '\0'); - - while (current_ptr < input_end) { - const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr); - if (!ampersand_ptr) { - size_t tail_len = input_end - current_ptr; - if (tail_len > 0) { - memcpy(output_ptr, current_ptr, tail_len); - output_ptr += tail_len; - } - break; - } - - /* Copy everything up to the found '&' */ - size_t chunk_len = ampersand_ptr - current_ptr; - if (chunk_len > 0) { - memcpy(output_ptr, current_ptr, chunk_len); - output_ptr += chunk_len; - } - - /* Now current_ptr points to the '&' character. */ - current_ptr = ampersand_ptr; - - /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */ - if (current_ptr + 3 >= input_end) { - *output_ptr++ = *current_ptr++; - continue; - } - - unsigned code = 0, code2 = 0; - const char *entity_end_ptr = NULL; - int valid_entity = 1; - - if (current_ptr[1] == '#') { - /* Processing numeric entity */ - const char *num_start = current_ptr + 2; - entity_end_ptr = num_start; - if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) { - valid_entity = 0; - } - /* If we're in htmlspecialchars_decode, we're only decoding entities - * that represent &, <, >, " and '. Is this one of them? */ - if (valid_entity && !all && - (code > 63U || - stage3_table_be_apos_00000[code].data.ent.entity == NULL)) - { - valid_entity = 0; - } - /* are we allowed to decode this entity in this document type? - * HTML 5 is the only that has a character that cannot be used in - * a numeric entity but is allowed literally (U+000D). The - * unoptimized version would be ... || !numeric_entity_is_allowed(code) */ - if (valid_entity && (!unicode_cp_is_allowed(code, doctype) || - (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) - { - valid_entity = 0; - } - } else { - /* Processing named entity */ - const char *name_start = current_ptr + 1; - /* Search for ';' */ - const char *semi_colon_ptr = memchr(name_start, ';', LONGEST_ENTITY_LENGTH + 1); - if (!semi_colon_ptr) { - valid_entity = 0; - } else { - size_t name_len = semi_colon_ptr - name_start; - if (name_len == 0) { - valid_entity = 0; - } else { - if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) { - if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 && - name_start[0] == 'a' && name_start[1] == 'p' && - name_start[2] == 'o' && name_start[3] == 's') - { - /* uses html4 inv_map, which doesn't include apos;. This is a + const char *current_ptr = ZSTR_VAL(input); + const char *input_end = current_ptr + input->len; /* terminator address */ + char *output_ptr = ZSTR_VAL(output); + const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; + + while (current_ptr < input_end) { + const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr); + if (!ampersand_ptr) { + const size_t tail_len = input_end - current_ptr; + if (tail_len > 0) { + memcpy(output_ptr, current_ptr, tail_len); + output_ptr += tail_len; + } + break; + } + + /* Copy everything up to the found '&' */ + const size_t chunk_len = ampersand_ptr - current_ptr; + if (chunk_len > 0) { + memcpy(output_ptr, current_ptr, chunk_len); + output_ptr += chunk_len; + } + + /* Now current_ptr points to the '&' character. */ + current_ptr = ampersand_ptr; + + /* If there are less than 4 bytes remaining, there isn't enough for an entity – copy '&' as a normal character */ + if (input_end - current_ptr < 4){ + const size_t remaining = input_end - current_ptr; + memcpy(output_ptr, current_ptr, remaining); + output_ptr += remaining; + break; + } + + unsigned code = 0, code2 = 0; + const char *entity_end_ptr = NULL; + bool valid_entity = true; + + if (current_ptr[1] == '#') { + /* Processing numeric entity */ + const char *num_start = current_ptr + 2; + entity_end_ptr = num_start; + if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) { + valid_entity = false; + } + if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) { + /* If we're in htmlspecialchars_decode, we're only decoding entities + * that represent &, <, >, " and '. Is this one of them? */ + valid_entity = false; + } else if (valid_entity && (!unicode_cp_is_allowed(code, doctype) || + (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) { + /* are we allowed to decode this entity in this document type? + * HTML 5 is the only that has a character that cannot be used in + * a numeric entity but is allowed literally (U+000D). The + * unoptimized version would be ... || !numeric_entity_is_allowed(code) */ + valid_entity = false; + } + } else { + /* Processing named entity */ + const char *name_start = current_ptr + 1; + /* Search for ';' */ + const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start); + const char *semi_colon_ptr = memchr(name_start, ';', max_search_len); + if (!semi_colon_ptr) { + valid_entity = false; + } else { + const size_t name_len = semi_colon_ptr - name_start; + if (name_len == 0) { + valid_entity = false; + } else { + if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) { + if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 && + name_start[0] == 'a' && name_start[1] == 'p' && + name_start[2] == 'o' && name_start[3] == 's') + { + /* uses html4 inv_map, which doesn't include apos;. This is a * hack to support it */ - code = (unsigned)'\''; - } else { - valid_entity = 0; - } - } - entity_end_ptr = semi_colon_ptr; - } - } - } - - /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */ - if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') { - *output_ptr++ = *current_ptr++; - continue; - } - - /* Check if quotes are allowed for entities representing ' or " */ - if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || - (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))) - { - valid_entity = 0; - } - - /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but + code = (unsigned)'\''; + } else { + valid_entity = false; + } + } + entity_end_ptr = semi_colon_ptr; + } + } + } + + /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */ + if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') { + *output_ptr++ = *current_ptr++; + continue; + } + + /* Check if quotes are allowed for entities representing ' or " */ + if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || + (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))) + { + valid_entity = false; + } + + /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but * the call is needed to ensure the codepoint <= U+00FF) */ - if (valid_entity && charset != cs_utf_8) { - /* replace unicode code point */ - if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) - valid_entity = 0; - } - - if (valid_entity) { - /* Write the parsed entity into the output buffer */ - output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code); - if (code2) { - output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2); - } - /* Move current_ptr past the semicolon */ - current_ptr = entity_end_ptr + 1; - } else { - /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */ - if (entity_end_ptr) { - size_t len = entity_end_ptr - current_ptr; - memcpy(output_ptr, current_ptr, len); - output_ptr += len; - current_ptr = entity_end_ptr; - } else { - *output_ptr++ = *current_ptr++; - } - } - } - - *output_ptr = '\0'; - ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output)); + if (valid_entity && charset != cs_utf_8) { + /* replace unicode code point */ + if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) + valid_entity = false; + } + + if (valid_entity) { + /* Write the parsed entity into the output buffer */ + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code); + if (code2) { + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2); + } + /* Move current_ptr past the semicolon */ + current_ptr = entity_end_ptr + 1; + } else { + /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */ + if (entity_end_ptr) { + const size_t len = entity_end_ptr - current_ptr; + memcpy(output_ptr, current_ptr, len); + output_ptr += len; + current_ptr = entity_end_ptr; + } else { + *output_ptr++ = *current_ptr++; + } + } + } + + *output_ptr = '\0'; + ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output)); } /* }}} */ @@ -1036,7 +1031,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl inverse_map = unescape_inverse_map(all, flags); /* replace numeric entities */ - traverse_for_entities(ZSTR_VAL(str), ZSTR_LEN(str), ret, all, flags, inverse_map, charset); + traverse_for_entities(str, ret, all, flags, inverse_map, charset); return ret; } From 24ff7226b989af13160149fbc035c5c012596a14 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Fri, 28 Mar 2025 18:18:58 +0300 Subject: [PATCH 3/4] use macro ZSTR_LEN --- ext/standard/html.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 253d996e431e3..6e713256e8236 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -817,7 +817,7 @@ static void traverse_for_entities( const enum entity_charset charset) { const char *current_ptr = ZSTR_VAL(input); - const char *input_end = current_ptr + input->len; /* terminator address */ + const char *input_end = current_ptr + ZSTR_LEN(input); /* terminator address */ char *output_ptr = ZSTR_VAL(output); const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; From 5f8363bc8d70d1124d70b747386e926312e01335 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Sun, 30 Mar 2025 21:28:03 +0300 Subject: [PATCH 4/4] Back to `goto` instead boolean flag `invalid_entity`. Optimize condition. --- ext/standard/html.c | 75 ++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 6e713256e8236..fbded4160b55c 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -852,39 +852,38 @@ static void traverse_for_entities( unsigned code = 0, code2 = 0; const char *entity_end_ptr = NULL; - bool valid_entity = true; if (current_ptr[1] == '#') { /* Processing numeric entity */ const char *num_start = current_ptr + 2; entity_end_ptr = num_start; if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) { - valid_entity = false; + goto invalid_incomplete_entity; } - if (valid_entity && !all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) { + if (!all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) { /* If we're in htmlspecialchars_decode, we're only decoding entities * that represent &, <, >, " and '. Is this one of them? */ - valid_entity = false; - } else if (valid_entity && (!unicode_cp_is_allowed(code, doctype) || - (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))) { + goto invalid_incomplete_entity; + } else if (!unicode_cp_is_allowed(code, doctype) || + (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)) { /* are we allowed to decode this entity in this document type? * HTML 5 is the only that has a character that cannot be used in * a numeric entity but is allowed literally (U+000D). The * unoptimized version would be ... || !numeric_entity_is_allowed(code) */ - valid_entity = false; + goto invalid_incomplete_entity; } } else { - /* Processing named entity */ + /* Processing named entity */ const char *name_start = current_ptr + 1; /* Search for ';' */ const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start); const char *semi_colon_ptr = memchr(name_start, ';', max_search_len); if (!semi_colon_ptr) { - valid_entity = false; + goto invalid_incomplete_entity; } else { const size_t name_len = semi_colon_ptr - name_start; if (name_len == 0) { - valid_entity = false; + goto invalid_incomplete_entity; } else { if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) { if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 && @@ -895,7 +894,7 @@ static void traverse_for_entities( * hack to support it */ code = (unsigned)'\''; } else { - valid_entity = false; + goto invalid_incomplete_entity; } } entity_end_ptr = semi_colon_ptr; @@ -904,45 +903,51 @@ static void traverse_for_entities( } /* If entity_end_ptr is not found or does not point to ';', consider the entity invalid */ - if (!valid_entity || entity_end_ptr == NULL || *entity_end_ptr != ';') { - *output_ptr++ = *current_ptr++; - continue; + if (entity_end_ptr == NULL) { + goto invalid_incomplete_entity; } /* Check if quotes are allowed for entities representing ' or " */ if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) || (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))) { - valid_entity = false; + goto invalid_complete_entity; } /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but * the call is needed to ensure the codepoint <= U+00FF) */ - if (valid_entity && charset != cs_utf_8) { + if (charset != cs_utf_8) { /* replace unicode code point */ - if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) - valid_entity = false; + if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) { + goto invalid_complete_entity; + } } - if (valid_entity) { - /* Write the parsed entity into the output buffer */ - output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code); - if (code2) { - output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2); - } - /* Move current_ptr past the semicolon */ - current_ptr = entity_end_ptr + 1; + /* Write the parsed entity into the output buffer */ + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code); + if (code2) { + output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2); + } + /* Move current_ptr past the semicolon */ + current_ptr = entity_end_ptr + 1; + continue; + +invalid_incomplete_entity: + /* If the entity is invalid at parse stage or entity_end_ptr was never found, copy '&' as normal */ + *output_ptr++ = *current_ptr++; + continue; + +invalid_complete_entity: + /* If the entity became invalid after we found entity_end_ptr */ + if (entity_end_ptr) { + const size_t len = entity_end_ptr - current_ptr; + memcpy(output_ptr, current_ptr, len); + output_ptr += len; + current_ptr = entity_end_ptr; } else { - /* If the entity is invalid, copy characters from current_ptr up to entity_end_ptr */ - if (entity_end_ptr) { - const size_t len = entity_end_ptr - current_ptr; - memcpy(output_ptr, current_ptr, len); - output_ptr += len; - current_ptr = entity_end_ptr; - } else { - *output_ptr++ = *current_ptr++; - } + *output_ptr++ = *current_ptr++; } + continue; } *output_ptr = '\0';