diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 77bf4319928a8..a383489f8c453 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split) ubrk_close(bi); } +PHP_FUNCTION(grapheme_levenshtein) +{ + zend_string *string1, *string2; + zend_long cost_ins = 1; + zend_long cost_rep = 1; + zend_long cost_del = 1; + + ZEND_PARSE_PARAMETERS_START(2, 5) + Z_PARAM_STR(string1) + Z_PARAM_STR(string2) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(cost_ins) + Z_PARAM_LONG(cost_rep) + Z_PARAM_LONG(cost_del) + ZEND_PARSE_PARAMETERS_END(); + + if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) { + zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) { + zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_del <= 0 || cost_del > UINT_MAX / 4) { + zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + zend_long *p1, *p2, *tmp; + zend_long c0, c1, c2; + zend_long retval; + size_t i2; + char *pstr1, *pstr2; + + UChar *ustring1 = NULL; + UChar *ustring2 = NULL; + + int32_t ustring1_len = 0; + int32_t ustring2_len = 0; + + UErrorCode ustatus1 = U_ZERO_ERROR; + UErrorCode ustatus2 = U_ZERO_ERROR; + + /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means + * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time) + * by having shorter rows (p1 & p2). */ + if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) { + zend_string *tmp = string1; + string1 = string2; + string2 = tmp; + } + + pstr1 = ZSTR_VAL(string1); + pstr2 = ZSTR_VAL(string2); + + intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); + + if ( U_FAILURE( ustatus1 ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, ustatus1 ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + if (ustring1) { + efree( ustring1 ); + } + RETURN_FALSE; + } + + intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); + + if ( U_FAILURE( ustatus2 ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, ustatus2 ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + if (ustring2) { + efree( ustring2 ); + } + if (ustring1) { + efree( ustring1 ); + } + RETURN_FALSE; + } + + UText *ut1 = NULL; + UText *ut2 = NULL; + UBreakIterator *bi1, *bi2; + + int32_t strlen_1, strlen_2; + strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0 ); + strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0 ); + + if (strlen_1 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_2 * cost_ins); + } + if (strlen_2 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_1 * cost_del); + } + + unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE]; + unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE]; + bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1 ); + bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2 ); + + ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1); + ubrk_setUText(bi1, ut1, &ustatus1); + ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2); + ubrk_setUText(bi2, ut2, &ustatus2); + + p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + + for (i2 = 0; i2 <= strlen_2; i2++) { + p1[i2] = i2 * cost_ins; + } + + int32_t current1 = 0; + int32_t current2 = 0; + int32_t pos1 = 0; + int32_t pos2 = 0; + int32_t usrch_pos = 0; + for ( ; pos1 != UBRK_DONE; ) { + current1 = ubrk_current(bi1); + pos1 = ubrk_next(bi1); + if (pos1 == UBRK_DONE) { + break; + } + p2[0] = p1[0] + cost_del; + for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) { + current2 = ubrk_current(bi2); + pos2 = ubrk_next(bi2); + if (pos2 == UBRK_DONE) { + break; + } + usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0); + if (usrch_pos == 0) { + c0 = p1[i2]; + } else { + c0 = p1[i2] + cost_rep; + } + c1 = p1[i2 + 1] + cost_del; + if (c1 < c0) { + c0 = c1; + } + c2 = p2[i2] + cost_ins; + if (c2 < c0) { + c0 = c2; + } + p2[i2 + 1] = c0; + } + ubrk_first(bi2); + tmp = p1; + p1 = p2; + p2 = tmp; + } + + utext_close(ut1); + utext_close(ut2); + + ubrk_close(bi1); + ubrk_close(bi2); + + efree(ustring1); + efree(ustring2); + + retval = p1[strlen_2]; + + efree(p1); + efree(p2); + RETURN_LONG(retval); +} + /* }}} */ diff --git a/ext/intl/php_intl.stub.php b/ext/intl/php_intl.stub.php index f3a80dd511943..572c4a4b333ae 100644 --- a/ext/intl/php_intl.stub.php +++ b/ext/intl/php_intl.stub.php @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = function grapheme_str_split(string $string, int $length = 1): array|false {} +function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {} + /** @param int $next */ function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} diff --git a/ext/intl/php_intl_arginfo.h b/ext/intl/php_intl_arginfo.h index 11c585d8df63b..23a4a1d6fbfc6 100644 --- a/ext/intl/php_intl_arginfo.h +++ b/ext/intl/php_intl_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. - * Stub hash: 168eabfdcbf29189f2327448f104ea98752d1c5a */ + * Stub hash: 5039dc739e445832b7f3e91afb6d62dc272d2fa3 */ ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null") @@ -489,6 +489,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1") ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_BE_LONG|MAY_BE_FALSE) + ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) @@ -899,6 +907,7 @@ ZEND_FUNCTION(grapheme_substr); ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_str_split); +ZEND_FUNCTION(grapheme_levenshtein); ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_utf8); @@ -1086,6 +1095,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr) ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split) + ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein) ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8) diff --git a/ext/intl/tests/grapheme_levenshtein.phpt b/ext/intl/tests/grapheme_levenshtein.phpt new file mode 100644 index 0000000000000..eec36ea2f9801 --- /dev/null +++ b/ext/intl/tests/grapheme_levenshtein.phpt @@ -0,0 +1,104 @@ +--TEST-- +grapheme_levenshtein() function test +--EXTENSIONS-- +intl +--FILE-- + +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- +int(0) +int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) +--- 128 codepoints --- +int(2) +--- 128 codepoints over --- +int(2) +int(256) +--- 128 codepoints over only $string1 --- +int(128) +--- 128 codepoints over only $string2 --- +int(130) +--- 128 codepoints over Hiragana --- +int(2) +--- Variable selector --- +int(1) +int(0) +int(0)