Skip to content

Commit a73073a

Browse files
committed
Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and Google backends. - Create a new `SearchBackend` base class with a `check_match` method that checks whether a candidate's artist and title are close enough to the requested ones. - Start actually using the previously undocumented `dist_thresh` configuration option, and document it. It controls the maximum allowable distance when matching artist and title names. These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names; see #4791.
1 parent e914d59 commit a73073a

File tree

4 files changed

+118
-51
lines changed

4 files changed

+118
-51
lines changed

beetsplug/lyrics.py

+71-49
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616

1717
from __future__ import annotations
1818

19-
import difflib
2019
import errno
2120
import itertools
2221
import json
22+
import math
2323
import os.path
2424
import re
2525
import struct
@@ -30,14 +30,15 @@
3030
from functools import cached_property, partial, total_ordering
3131
from http import HTTPStatus
3232
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
33-
from urllib.parse import quote, urlencode
33+
from urllib.parse import quote, urlencode, urlparse
3434

3535
import requests
3636
from typing_extensions import TypedDict
3737
from unidecode import unidecode
3838

3939
import beets
4040
from beets import plugins, ui
41+
from beets.autotag.hooks import string_dist
4142

4243
if TYPE_CHECKING:
4344
from beets.importer import ImportTask
@@ -58,6 +59,7 @@
5859
except ImportError:
5960
HAS_LANGDETECT = False
6061

62+
6163
DIV_RE = re.compile(r"<(/?)div>?", re.I)
6264
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
6365
TAG_RE = re.compile(r"<[^>]*>")
@@ -488,15 +490,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
488490
return lyrics
489491

490492

491-
class Genius(Backend):
493+
class SearchBackend(Backend):
    """Base class for backends that must vet search results before use.

    Provides ``check_match``, which compares a candidate's artist and title
    against the requested ones using the configured ``dist_thresh`` option.
    """

    REQUIRES_BS = True

    @cached_property
    def dist_thresh(self) -> float:
        """Return the ``dist_thresh`` config option as a float.

        The value is read from the configuration once and then cached on the
        instance.
        """
        threshold = self.config["dist_thresh"].get(float)
        return threshold

    def check_match(
        self, target_artist: str, target_title: str, artist: str, title: str
    ) -> bool:
        """Tell whether a candidate is a 'good enough' match for the target.

        ``target_artist`` / ``target_title`` describe what is being looked
        for; ``artist`` / ``title`` describe the candidate. The larger of the
        two ``string_dist`` results, rounded to two decimals, must not exceed
        ``dist_thresh`` for the candidate to be accepted.
        """
        artist_dist = string_dist(target_artist, artist)
        title_dist = string_dist(target_title, title)
        worst_dist = round(max(artist_dist, title_dist), 2)
        threshold = self.dist_thresh

        if worst_dist > threshold:
            # Rejected. Still report near misses (within 0.4 of the
            # threshold): they may be the right candidate with some noise
            # in the name.
            if math.isclose(worst_dist, threshold, abs_tol=0.4):
                self._log.debug(
                    "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
                    artist,
                    title,
                    target_artist,
                    target_title,
                    worst_dist,
                )
            return False

        return True
525+
526+
527+
class Genius(SearchBackend):
492528
"""Fetch lyrics from Genius via genius-api.
493529
494530
Simply adapted from
495531
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
496532
"""
497533

498-
REQUIRES_BS = True
499-
500534
base_url = "https://api.genius.com"
501535

502536
def __init__(self, config, log):
@@ -519,19 +553,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
519553
self._log.debug("Genius API request returned invalid JSON")
520554
return None
521555

522-
# find a matching artist in the json
556+
check = partial(self.check_match, artist, title)
523557
for hit in json["response"]["hits"]:
524-
hit_artist = hit["result"]["primary_artist"]["name"]
525-
526-
if slug(hit_artist) == slug(artist):
527-
html = self.fetch_url(hit["result"]["url"])
558+
result = hit["result"]
559+
if check(result["primary_artist"]["name"], result["title"]):
560+
html = self.fetch_url(result["url"])
528561
if not html:
529562
return None
530563
return self._scrape_lyrics_from_html(html)
531564

532-
self._log.debug(
533-
"Genius failed to find a matching artist for '{0}'", artist
534-
)
535565
return None
536566

537567
def _search(self, artist, title):
@@ -727,10 +757,9 @@ def is_text_notcode(text):
727757
return None
728758

729759

730-
class Google(Backend):
760+
class Google(SearchBackend):
731761
"""Fetch lyrics from Google search results."""
732762

733-
REQUIRES_BS = True
734763
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
735764

736765
def is_lyrics(self, text, artist=None):
@@ -778,21 +807,20 @@ def slugify(self, text):
778807
BY_TRANS = ["by", "par", "de", "von"]
779808
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
780809

781-
def is_page_candidate(self, url_link, url_title, title, artist):
810+
def is_page_candidate(
811+
self, artist: str, title: str, url_link: str, url_title: str
812+
) -> bool:
782813
"""Return True if the URL title makes it a good candidate to be a
783814
page that contains lyrics of title by artist.
784815
"""
785-
title = self.slugify(title.lower())
786-
artist = self.slugify(artist.lower())
787-
sitename = re.search(
788-
"//([^/]+)/.*", self.slugify(url_link.lower())
789-
).group(1)
790-
url_title = self.slugify(url_title.lower())
791-
792-
# Check if URL title contains song title (exact match)
793-
if url_title.find(title) != -1:
816+
title_slug = self.slugify(title.lower())
817+
url_title_slug = self.slugify(url_title.lower())
818+
if title_slug in url_title_slug:
794819
return True
795820

821+
artist = self.slugify(artist.lower())
822+
sitename = urlparse(url_link).netloc
823+
796824
# or try extracting song title from URL title and check if
797825
# they are close enough
798826
tokens = (
@@ -801,12 +829,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
801829
+ self.LYRICS_TRANS
802830
)
803831
tokens = [re.escape(t) for t in tokens]
804-
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
832+
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
805833

806-
song_title = song_title.strip("_|")
807-
typo_ratio = 0.9
808-
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
809-
return ratio >= typo_ratio
834+
return self.check_match(artist, title_slug, artist, song_title)
810835

811836
def fetch(self, artist: str, title: str, *_) -> str | None:
812837
params = {
@@ -828,24 +853,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
828853
self._log.debug("google backend error: {0}", reason)
829854
return None
830855

831-
if "items" in data.keys():
832-
for item in data["items"]:
833-
url_link = item["link"]
834-
url_title = item.get("title", "")
835-
if not self.is_page_candidate(
836-
url_link, url_title, title, artist
837-
):
838-
continue
839-
html = self.fetch_url(url_link)
840-
if not html:
841-
continue
842-
lyrics = scrape_lyrics_from_html(html)
843-
if not lyrics:
844-
continue
845-
846-
if self.is_lyrics(lyrics, artist):
847-
self._log.debug("got lyrics from {0}", item["displayLink"])
848-
return lyrics
856+
check_candidate = partial(self.is_page_candidate, artist, title)
857+
for item in data.get("items", []):
858+
url_link = item["link"]
859+
if not check_candidate(url_link, item.get("title", "")):
860+
continue
861+
html = self.fetch_url(url_link)
862+
if not html:
863+
continue
864+
lyrics = scrape_lyrics_from_html(html)
865+
if not lyrics:
866+
continue
867+
868+
if self.is_lyrics(lyrics, artist):
869+
self._log.debug("got lyrics from {0}", item["displayLink"])
870+
return lyrics
849871

850872
return None
851873

@@ -869,6 +891,7 @@ def __init__(self):
869891
"bing_client_secret": None,
870892
"bing_lang_from": [],
871893
"bing_lang_to": None,
894+
"dist_thresh": 0.11,
872895
"google_API_key": None,
873896
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
874897
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -880,7 +903,6 @@ def __init__(self):
880903
# Musixmatch is disabled by default as they are currently blocking
881904
# requests with the beets user agent.
882905
"sources": [s for s in self.SOURCES if s != "musixmatch"],
883-
"dist_thresh": 0.1,
884906
}
885907
)
886908
self.config["bing_client_secret"].redact = True

docs/changelog.rst

+3
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ Bug fixes:
6161
``lrclib`` over other sources since it returns reliable results quicker than
6262
others.
6363
:bug:`5102`
64+
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
65+
to match lyrics when there was a slight variation in the artist name.
66+
:bug:`4791`
6467

6568
For packagers:
6669

docs/plugins/lyrics.rst

+6
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ configuration file. The available options are:
4242
Default: ``[]``
4343
- **bing_lang_to**: Language to translate lyrics into.
4444
Default: None.
45+
- **dist_thresh**: The maximum distance between the artist and title
46+
combination of the music file and lyrics candidate to consider them a match.
47+
Lower values will make the plugin more strict, higher values will make it
48+
more lenient. This does not apply to the ``lrclib`` backend as it matches
49+
durations.
50+
Default: ``0.11``.
4551
- **fallback**: By default, the file will be left unchanged when no lyrics are
4652
found. Use the empty string ``''`` to reset the lyrics in such a case.
4753
Default: None.

test/plugins/test_lyrics.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,42 @@ def test_slug(self, text, expected):
161161
assert lyrics.slug(text) == expected
162162

163163

164+
class TestSearchBackend:
    """Exercise ``SearchBackend.check_match`` with various thresholds."""

    @pytest.fixture
    def backend(self, dist_thresh):
        """Build a ``SearchBackend`` configured with the given threshold."""
        lyrics_plugin = lyrics.LyricsPlugin()
        lyrics_plugin.config.set({"dist_thresh": dist_thresh})
        return lyrics.SearchBackend(lyrics_plugin.config, lyrics_plugin._log)

    @pytest.mark.parametrize(
        "dist_thresh, target_artist, artist, should_match",
        [
            (0.11, "Target Artist", "Target Artist", True),
            (0.11, "Target Artist", "Target Artis", True),
            (0.11, "Target Artist", "Target Arti", False),
            (0.11, "Psychonaut", "Psychonaut (BEL)", True),
            (0.11, "beets song", "beats song", True),
            (0.10, "beets song", "beats song", False),
            (
                0.11,
                "Lucid Dreams (Forget Me)",
                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
                False,
            ),
            (
                0.12,
                "Lucid Dreams (Forget Me)",
                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
                True,
            ),
        ],
    )
    def test_check_match(self, backend, target_artist, artist, should_match):
        """Check the match decision for each (threshold, names) case."""
        # Both titles are empty, so presumably only the artist distance
        # decides the outcome -- NOTE(review): confirm string_dist("", "").
        matched = backend.check_match(target_artist, "", artist, "")
        assert matched == should_match
198+
199+
164200
@pytest.fixture(scope="module")
165201
def lyrics_root_dir(pytestconfig: pytest.Config):
166202
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ def test_is_page_candidate(
275311
self, backend, lyrics_html, url_title, artist, should_be_candidate
276312
):
277313
result = backend.is_page_candidate(
314+
artist,
315+
self.TITLE,
278316
"http://www.example.com/lyrics/beetssong",
279317
url_title,
280-
self.TITLE,
281-
artist,
282318
)
283319
assert bool(result) == should_be_candidate
284320

0 commit comments

Comments
 (0)