16
16
17
17
from __future__ import annotations
18
18
19
- import difflib
20
19
import errno
21
20
import itertools
22
21
import json
22
+ import math
23
23
import os .path
24
24
import re
25
25
import struct
30
30
from functools import cached_property , partial , total_ordering
31
31
from http import HTTPStatus
32
32
from typing import TYPE_CHECKING , ClassVar , Iterable , Iterator
33
- from urllib .parse import quote , urlencode
33
+ from urllib .parse import quote , urlencode , urlparse
34
34
35
35
import requests
36
36
from typing_extensions import TypedDict
37
37
from unidecode import unidecode
38
38
39
39
import beets
40
40
from beets import plugins , ui
41
+ from beets .autotag .hooks import string_dist
41
42
42
43
if TYPE_CHECKING :
43
44
from beets .importer import ImportTask
58
59
except ImportError :
59
60
HAS_LANGDETECT = False
60
61
62
+
61
63
# Matches "<div" or "</div" with an optional trailing ">", case-insensitively;
# the capture group holds the "/" when the tag is a closing one.
DIV_RE = re .compile (r"<(/?)div>?" , re .I )
62
64
# Matches an HTML comment; with re.S the "." also spans newlines, and because
# ".*" is greedy the match runs from the first "<!--" to the last "-->".
COMMENT_RE = re .compile (r"<!--.*-->" , re .S )
63
65
# Matches any single "<...>" tag (everything up to the next ">").
TAG_RE = re .compile (r"<[^>]*>" )
@@ -488,15 +490,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
488
490
return lyrics
489
491
490
492
491
- class Genius (Backend ):
493
class SearchBackend(Backend):
    """Backend base class that can judge whether a search hit is a match.

    Subclasses call :meth:`check_match` with the artist/title they are
    looking for and the artist/title of a candidate result.
    """

    # NOTE(review): presumably tells the plugin that BeautifulSoup is needed
    # for this backend -- confirm against ``Backend``.
    REQUIRES_BS = True

    @cached_property
    def dist_thresh(self) -> float:
        """Return the ``dist_thresh`` config value as a float (cached)."""
        threshold = self.config["dist_thresh"].get(float)
        return threshold

    def check_match(
        self, target_artist: str, target_title: str, artist: str, title: str
    ) -> bool:
        """Tell whether a candidate is a 'good enough' match for the target.

        The string distance is computed separately for the artist pair and
        for the title pair; the larger of the two, rounded to two decimals,
        must not exceed ``dist_thresh`` for the candidate to be accepted.

        Returns True when the candidate is accepted, False otherwise.
        """
        artist_dist = string_dist(target_artist, artist)
        title_dist = string_dist(target_title, title)
        worst_dist = round(max(artist_dist, title_dist), 2)

        threshold = self.dist_thresh
        if worst_dist <= threshold:
            return True

        # Rejected, but if the distance is within 0.4 of the threshold, log
        # the candidate: it may be the right one with some noise in the name.
        near_miss = math.isclose(worst_dist, threshold, abs_tol=0.4)
        if near_miss:
            self._log.debug(
                "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
                artist,
                title,
                target_artist,
                target_title,
                worst_dist,
            )

        return False
525
+
526
+
527
+ class Genius (SearchBackend ):
492
528
"""Fetch lyrics from Genius via genius-api.
493
529
494
530
Simply adapted from
495
531
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
496
532
"""
497
533
498
- REQUIRES_BS = True
499
-
500
534
base_url = "https://api.genius.com"
501
535
502
536
def __init__ (self , config , log ):
@@ -519,19 +553,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
519
553
self ._log .debug ("Genius API request returned invalid JSON" )
520
554
return None
521
555
522
- # find a matching artist in the json
556
+ check = partial ( self . check_match , artist , title )
523
557
for hit in json ["response" ]["hits" ]:
524
- hit_artist = hit ["result" ]["primary_artist" ]["name" ]
525
-
526
- if slug (hit_artist ) == slug (artist ):
527
- html = self .fetch_url (hit ["result" ]["url" ])
558
+ result = hit ["result" ]
559
+ if check (result ["primary_artist" ]["name" ], result ["title" ]):
560
+ html = self .fetch_url (result ["url" ])
528
561
if not html :
529
562
return None
530
563
return self ._scrape_lyrics_from_html (html )
531
564
532
- self ._log .debug (
533
- "Genius failed to find a matching artist for '{0}'" , artist
534
- )
535
565
return None
536
566
537
567
def _search (self , artist , title ):
@@ -727,10 +757,9 @@ def is_text_notcode(text):
727
757
return None
728
758
729
759
730
- class Google (Backend ):
760
+ class Google (SearchBackend ):
731
761
"""Fetch lyrics from Google search results."""
732
762
733
- REQUIRES_BS = True
734
763
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
735
764
736
765
def is_lyrics (self , text , artist = None ):
@@ -778,21 +807,20 @@ def slugify(self, text):
778
807
BY_TRANS = ["by" , "par" , "de" , "von" ]
779
808
LYRICS_TRANS = ["lyrics" , "paroles" , "letras" , "liedtexte" ]
780
809
781
- def is_page_candidate (self , url_link , url_title , title , artist ):
810
+ def is_page_candidate (
811
+ self , artist : str , title : str , url_link : str , url_title : str
812
+ ) -> bool :
782
813
"""Return True if the URL title makes it a good candidate to be a
783
814
page that contains lyrics of title by artist.
784
815
"""
785
- title = self .slugify (title .lower ())
786
- artist = self .slugify (artist .lower ())
787
- sitename = re .search (
788
- "//([^/]+)/.*" , self .slugify (url_link .lower ())
789
- ).group (1 )
790
- url_title = self .slugify (url_title .lower ())
791
-
792
- # Check if URL title contains song title (exact match)
793
- if url_title .find (title ) != - 1 :
816
+ title_slug = self .slugify (title .lower ())
817
+ url_title_slug = self .slugify (url_title .lower ())
818
+ if title_slug in url_title_slug :
794
819
return True
795
820
821
+ artist = self .slugify (artist .lower ())
822
+ sitename = urlparse (url_link ).netloc
823
+
796
824
# or try extracting song title from URL title and check if
797
825
# they are close enough
798
826
tokens = (
@@ -801,12 +829,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
801
829
+ self .LYRICS_TRANS
802
830
)
803
831
tokens = [re .escape (t ) for t in tokens ]
804
- song_title = re .sub ("(%s)" % "|" .join (tokens ), "" , url_title )
832
+ song_title = re .sub ("(%s)" % "|" .join (tokens ), "" , url_title_slug )
805
833
806
- song_title = song_title .strip ("_|" )
807
- typo_ratio = 0.9
808
- ratio = difflib .SequenceMatcher (None , song_title , title ).ratio ()
809
- return ratio >= typo_ratio
834
+ return self .check_match (artist , title_slug , artist , song_title )
810
835
811
836
def fetch (self , artist : str , title : str , * _ ) -> str | None :
812
837
params = {
@@ -828,24 +853,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
828
853
self ._log .debug ("google backend error: {0}" , reason )
829
854
return None
830
855
831
- if "items" in data .keys ():
832
- for item in data ["items" ]:
833
- url_link = item ["link" ]
834
- url_title = item .get ("title" , "" )
835
- if not self .is_page_candidate (
836
- url_link , url_title , title , artist
837
- ):
838
- continue
839
- html = self .fetch_url (url_link )
840
- if not html :
841
- continue
842
- lyrics = scrape_lyrics_from_html (html )
843
- if not lyrics :
844
- continue
845
-
846
- if self .is_lyrics (lyrics , artist ):
847
- self ._log .debug ("got lyrics from {0}" , item ["displayLink" ])
848
- return lyrics
856
+ check_candidate = partial (self .is_page_candidate , artist , title )
857
+ for item in data .get ("items" , []):
858
+ url_link = item ["link" ]
859
+ if not check_candidate (url_link , item .get ("title" , "" )):
860
+ continue
861
+ html = self .fetch_url (url_link )
862
+ if not html :
863
+ continue
864
+ lyrics = scrape_lyrics_from_html (html )
865
+ if not lyrics :
866
+ continue
867
+
868
+ if self .is_lyrics (lyrics , artist ):
869
+ self ._log .debug ("got lyrics from {0}" , item ["displayLink" ])
870
+ return lyrics
849
871
850
872
return None
851
873
@@ -869,6 +891,7 @@ def __init__(self):
869
891
"bing_client_secret" : None ,
870
892
"bing_lang_from" : [],
871
893
"bing_lang_to" : None ,
894
+ "dist_thresh" : 0.11 ,
872
895
"google_API_key" : None ,
873
896
"google_engine_ID" : "009217259823014548361:lndtuqkycfu" ,
874
897
"genius_api_key" : "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -880,7 +903,6 @@ def __init__(self):
880
903
# Musixmatch is disabled by default as they are currently blocking
881
904
# requests with the beets user agent.
882
905
"sources" : [s for s in self .SOURCES if s != "musixmatch" ],
883
- "dist_thresh" : 0.1 ,
884
906
}
885
907
)
886
908
self .config ["bing_client_secret" ].redact = True
0 commit comments