Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add grapheme_levenshtein function. #18087

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions ext/intl/grapheme/grapheme_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split)
ubrk_close(bi);
}

PHP_FUNCTION(grapheme_levenshtein)
{
zend_string *string1, *string2;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;

ZEND_PARSE_PARAMETERS_START(2, 5)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
ZEND_PARSE_PARAMETERS_END();

if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

zend_long *p1, *p2, *tmp;
zend_long c0, c1, c2;
zend_long retval;
size_t i2;
char *pstr1, *pstr2;

UChar *ustring1 = NULL;
UChar *ustring2 = NULL;

int32_t ustring1_len = 0;
int32_t ustring2_len = 0;

UErrorCode ustatus1 = U_ZERO_ERROR;
UErrorCode ustatus2 = U_ZERO_ERROR;

/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}

pstr1 = ZSTR_VAL(string1);
pstr2 = ZSTR_VAL(string2);

intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1);

if ( U_FAILURE( ustatus1 ) ) {
/* Set global error code. */
intl_error_set_code( NULL, ustatus1 );

/* Set error messages. */
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
if (ustring1) {
efree( ustring1 );
}
RETURN_FALSE;
}

intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2);

if ( U_FAILURE( ustatus2 ) ) {
/* Set global error code. */
intl_error_set_code( NULL, ustatus2 );

/* Set error messages. */
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
if (ustring2) {
efree( ustring2 );
}
if (ustring1) {
efree( ustring1 );
}
RETURN_FALSE;
}

UText *ut1 = NULL;
UText *ut2 = NULL;
UBreakIterator *bi1, *bi2;

int32_t strlen_1, strlen_2;
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0 );
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0 );

if (strlen_1 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_2 * cost_ins);
}
if (strlen_2 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_1 * cost_del);
}

unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1 );
bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2 );

ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1);
ubrk_setUText(bi1, ut1, &ustatus1);
ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2);
ubrk_setUText(bi2, ut2, &ustatus2);

p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);

for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}

int32_t current1 = 0;
int32_t current2 = 0;
int32_t pos1 = 0;
int32_t pos2 = 0;
int32_t usrch_pos = 0;
for ( ; pos1 != UBRK_DONE; ) {
current1 = ubrk_current(bi1);
pos1 = ubrk_next(bi1);
if (pos1 == UBRK_DONE) {
break;
}
p2[0] = p1[0] + cost_del;
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
current2 = ubrk_current(bi2);
pos2 = ubrk_next(bi2);
if (pos2 == UBRK_DONE) {
break;
}
usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0);
if (usrch_pos == 0) {
c0 = p1[i2];
} else {
c0 = p1[i2] + cost_rep;
}
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + 1] = c0;
}
ubrk_first(bi2);
tmp = p1;
p1 = p2;
p2 = tmp;
}

utext_close(ut1);
utext_close(ut2);

ubrk_close(bi1);
ubrk_close(bi2);

efree(ustring1);
efree(ustring2);

retval = p1[strlen_2];

efree(p1);
efree(p2);
RETURN_LONG(retval);
}

/* }}} */
2 changes: 2 additions & 0 deletions ext/intl/php_intl.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =

function grapheme_str_split(string $string, int $length = 1): array|false {}

// Levenshtein distance between two strings, counted in grapheme clusters.
// Each cost must be greater than 0; false is returned when an input cannot be converted to UTF-16.
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}

/** @param int $next */
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

Expand Down
12 changes: 11 additions & 1 deletion ext/intl/php_intl_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 104 additions & 0 deletions ext/intl/tests/grapheme_levenshtein.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
--TEST--
grapheme_levenshtein() function test
--EXTENSIONS--
intl
--FILE--
<?php
echo '--- Equal ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('12345', '12345'));

echo '--- First string empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('', ''));
var_dump(grapheme_levenshtein('', '', 10, 10, 10));

echo '--- 1 character ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('12', '21'));

echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('2121', '11', 2));
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('2121', '11', 2, 1, 5));

//
echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('11', '2121'));
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('11', '2121', 5));

echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('111', '121', 2, 3, 2));
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('111', '121', 2, 9, 2));

echo '--- 128 codepoints ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
echo '--- 128 codepoints over ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
var_dump(grapheme_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
var_dump(grapheme_levenshtein("abc", str_repeat("a", 128) . "aaa"));
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));

echo '--- Variable selector ---' . \PHP_EOL;
$ka = "カ́";
var_dump(grapheme_levenshtein("カ", $ka));
// $nabe and $nabe_E0100 look identical when rendered.
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100.
// grapheme_levenshtein() is therefore expected to return 0 for this pair.
$nabe = '邊';
$nabe_E0100 = "邊󠄀";
var_dump(grapheme_levenshtein($nabe, $nabe_E0100));

// combining character
var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}"));
?>
--EXPECT--
--- Equal ---
int(0)
--- First string empty ---
int(3)
--- Second string empty ---
int(3)
--- Both empty ---
int(0)
int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)
--- 128 codepoints ---
int(2)
--- 128 codepoints over ---
int(2)
int(256)
--- 128 codepoints over only $string1 ---
int(128)
--- 128 codepoints over only $string2 ---
int(130)
--- 128 codepoints over Hiragana ---
int(2)
--- Variable selector ---
int(1)
int(0)
int(0)
Loading