Skip to content

Commit 853ec6b

Browse files
committed
fix: bad similarity index calculated by matchingIndex func (#7)
test: update tests to match the matchingIndex changes and add some test cases
test: fix the failing LCS distance test case
1 parent 2edac5c commit 853ec6b

File tree

3 files changed

+26
-10
lines changed

3 files changed

+26
-10
lines changed

lcs_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ func TestLCSBacktrackAll(t *testing.T) {
8989
{"AZBYCWDX/ZAYBWCXD", args{"AZBYCWDX", "ZAYBWCXD"}, []string{"ABCD", "ABCX", "ABWD", "ABWX", "AYCD", "AYCX", "AYWD", "AYWX", "ZBCD", "ZBCX", "ZBWD", "ZBWX", "ZYCD", "ZYCX", "ZYWD", "ZYWX"}, false},
9090
{"AATCC/ACACG", args{"AATCC", "ACACG"}, []string{"AAC", "ACC"}, false},
9191
{"您好女士,你好吗?/先生,你好吗?", args{"您好女士 你好吗?", "先生 你好吗?"}, []string{" 你好吗?"}, false},
92+
{" 是ab是cde22f123g/222222是ab是cd123", args{" 是ab是cde22f123g", "222222是ab是cd123"}, []string{"是ab是cd123"}, false},
9293
}
9394
for _, tt := range tests {
9495
t.Run(tt.name, func(t *testing.T) {

string-analysis.go

+7-4
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,14 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
5252

5353
// matchingIndex returns a matching index ∈ [0..1] computed from two strings and an edit distance
5454
func matchingIndex(str1 string, str2 string, distance int) float32 {
55-
// Compare strings length and make a matching percentage between them
56-
if len(str1) >= len(str2) {
57-
return float32(len(str1)-distance) / float32(len(str1))
55+
// Convert strings to rune slices
56+
runeStr1 := []rune(str1)
57+
runeStr2 := []rune(str2)
58+
// Compare the rune slice lengths and compute the matching ratio relative to the longer one
59+
if len(runeStr1) >= len(runeStr2) {
60+
return float32(len(runeStr1)-distance) / float32(len(runeStr1))
5861
}
59-
return float32(len(str2)-distance) / float32(len(str2))
62+
return float32(len(runeStr2)-distance) / float32(len(runeStr2))
6063
}
6164

6265
// FuzzySearch performs an approximate search on a string list and returns the closest one compared

string-analysis_test.go

+18-6
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ func TestStringsSimilarity(t *testing.T) {
4040
{"Levenshtein : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Levenshtein}, 0.6666667, false},
4141
{"Levenshtein : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Levenshtein}, 0.50, false},
4242
{"Levenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Levenshtein}, 0.80, false},
43+
{"Levenshtein : abcde/бвгдж", args{"abcde", "бвгдж", Levenshtein}, 0, false},
44+
{"Levenshtein : abcde/fghjk", args{"abcde", "fghjk", Levenshtein}, 0, false},
45+
{"Levenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", Levenshtein}, 0.4, false},
46+
{"Levenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Levenshtein}, 0.5, false},
4347

4448
// DamerauLevenshtein method
4549
{"DamerauLevenshtein : First arg empty", args{"", "abcde", DamerauLevenshtein}, 0.0, false},
@@ -52,8 +56,8 @@ func TestStringsSimilarity(t *testing.T) {
5256
{"DamerauLevenshtein : a cat/an abct", args{"a cat", "an abct", DamerauLevenshtein}, 0.5714286, false},
5357
{"DamerauLevenshtein : dixon/dicksonx", args{"dixon", "dicksonx", DamerauLevenshtein}, 0.5, false},
5458
{"DamerauLevenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", DamerauLevenshtein}, 0.8, false},
55-
{"DamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", DamerauLevenshtein}, 0.8666667, false}, // "Hello" in Japanese
56-
{"DamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", DamerauLevenshtein}, 0.875, false},
59+
{"DamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", DamerauLevenshtein}, 0.6, false},
60+
{"DamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", DamerauLevenshtein}, 0.5, false},
5761

5862
// OSADamerauLevenshtein method
5963
{"OSADamerauLevenshtein : First arg empty", args{"", "abcde", OSADamerauLevenshtein}, 0.0, false},
@@ -66,8 +70,8 @@ func TestStringsSimilarity(t *testing.T) {
6670
{"OSADamerauLevenshtein : a cat/an abct", args{"a cat", "an abct", OSADamerauLevenshtein}, 0.428571429, false},
6771
{"OSADamerauLevenshtein : dixon/dicksonx", args{"dixon", "dicksonx", OSADamerauLevenshtein}, 0.5, false},
6872
{"OSADamerauLevenshtein : jellyfish/smellyfish", args{"jellyfish", "smellyfish", OSADamerauLevenshtein}, 0.8, false},
69-
{"OSADamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", OSADamerauLevenshtein}, 0.8666667, false}, // "Hello" in Japanese
70-
{"OSADamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", OSADamerauLevenshtein}, 0.875, false},
73+
{"OSADamerauLevenshtein : こにんち/こんにちは", args{"こにんち", "こんにちは", OSADamerauLevenshtein}, 0.6, false},
74+
{"OSADamerauLevenshtein : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", OSADamerauLevenshtein}, 0.5, false},
7175

7276
// Lcs method
7377
{"LCS : First arg empty", args{"", "abcde", Lcs}, 0.0, false},
@@ -80,6 +84,8 @@ func TestStringsSimilarity(t *testing.T) {
8084
{"LCS : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Lcs}, 0.6666667, false},
8185
{"LCS : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Lcs}, 0.375, false},
8286
{"LCS : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Lcs}, 0.7, false},
87+
{"Lcs : こにんち/こんにちは", args{"こにんち", "こんにちは", Lcs}, 0.4, false}, // "Hello" in Japanese
88+
{"Lcs : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Lcs}, 0.5, false},
8389

8490
// Hamming method
8591
{"Hamming : First arg empty", args{"", "abcde", Hamming}, 0.0, true},
@@ -93,7 +99,7 @@ func TestStringsSimilarity(t *testing.T) {
9399
{"Hamming : dixon/dicksonx", args{"dixon", "dicksonx", Hamming}, 0.0, true},
94100
{"Hamming : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Hamming}, 0.0, true},
95101
{"Hamming : こにんち/こんにちは", args{"こにんち", "こんにちは", Hamming}, 0.0, true}, // "Hello" in Japanese
96-
{"Hamming : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Hamming}, 0.75, false},
102+
{"Hamming : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Hamming}, 0.0, false},
97103

98104
// Jaro method
99105
{"Jaro : First arg empty", args{"", "abcde", Jaro}, 0.0, false},
@@ -104,6 +110,9 @@ func TestStringsSimilarity(t *testing.T) {
104110
{"Jaro : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaro}, 0.9444444, false},
105111
{"Jaro : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaro}, 0.76666665, false},
106112
{"Jaro : jellyfish/smellyfish", args{"jellyfish", "smellyfish", Jaro}, 0.8962963, false},
113+
{"Jaro : こにんち/こんにちは", args{"こにんち", "こんにちは", Jaro}, 0.84999996, false},
114+
{"Jaro : こんににんち/こんにちは", args{"こんににんち", "こんにちは", Jaro}, 0.82222223, false},
115+
{"Jaro : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", Jaro}, 0.8333333, false},
107116

108117
// JaroWinkler method
109118
{"JaroWinkler : First arg empty", args{"", "abcde", JaroWinkler}, 0.0, false},
@@ -114,6 +123,9 @@ func TestStringsSimilarity(t *testing.T) {
114123
{"JaroWinkler : MARTHA/MARHTA", args{"MARTHA", "MARHTA", JaroWinkler}, 0.96111107, false},
115124
{"JaroWinkler : DIXON/DICKSONX", args{"DIXON", "DICKSONX", JaroWinkler}, 0.81333333, false},
116125
{"JaroWinkler : jellyfish/smellyfish", args{"jellyfish", "smellyfish", JaroWinkler}, 0.8962963, false},
126+
{"JaroWinkler : こにんち/こんにちは", args{"こにんち", "こんにちは", JaroWinkler}, 0.86499995, false},
127+
{"JaroWinkler : こんににんち/こんにちは", args{"こんににんち", "こんにちは", JaroWinkler}, 0.8755556, false},
128+
{"JaroWinkler : 🙂😄🙂😄/😄🙂😄🙂", args{"🙂😄🙂😄", "😄🙂😄🙂", JaroWinkler}, 0.8333333, false},
117129

118130
// Cosine method
119131
{"Cosine : First arg empty", args{"", "abcde", Cosine}, 0.0, false},
@@ -138,7 +150,7 @@ func TestStringsSimilarity(t *testing.T) {
138150
return
139151
}
140152
if got != tt.want {
141-
t.Errorf("StringsSimilarity() = %v, want %v", got, tt.want)
153+
t.Errorf("StringsSimilarity() = %v, want %v\nRune string 1: %v, len: %d\nRune string 2: %v, len: %d", got, tt.want, []rune(tt.args.str1), len([]rune(tt.args.str1)), []rune(tt.args.str2), len([]rune(tt.args.str2)))
142154
}
143155
})
144156
}

0 commit comments

Comments
 (0)