Skip to content

Commit d3955ba

Browse files
committed
Update Tekstowo backend to fetch lyrics directly
- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format, where non-alphanumeric characters are replaced with underscores.
- Removed the now-redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to directly parse lyrics without any checks.
1 parent 9d2b34d commit d3955ba

File tree

5 files changed

+23
-1252
lines changed

5 files changed

+23
-1252
lines changed

beetsplug/lyrics.py

+18-72
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
"""Fetches, embeds, and displays lyrics."""
1616

17+
from __future__ import annotations
18+
1719
import difflib
1820
import errno
1921
import itertools
@@ -23,6 +25,7 @@
2325
import struct
2426
import unicodedata
2527
import warnings
28+
from functools import partial
2629
from typing import ClassVar
2730
from urllib.parse import quote, urlencode
2831

@@ -47,7 +50,6 @@
4750

4851
import beets
4952
from beets import plugins, ui
50-
from beets.autotag.hooks import string_dist
5153

5254
DIV_RE = re.compile(r"<(/?)div>?", re.I)
5355
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
@@ -288,7 +290,7 @@ class DirectBackend(Backend):
288290
@classmethod
289291
def encode(cls, text: str) -> str:
290292
"""Encode the string for inclusion in a URL."""
291-
return quote(unidecode(text))
293+
raise NotImplementedError
292294

293295
@classmethod
294296
def build_url(cls, *args: str) -> str:
@@ -312,7 +314,7 @@ def encode(cls, text: str) -> str:
312314
for old, new in cls.REPLACEMENTS.items():
313315
text = re.sub(old, new, text)
314316

315-
return super().encode(text)
317+
return quote(unidecode(text))
316318

317319
def fetch(self, artist, title, album=None, length=None):
318320
url = self.build_url(artist, title)
@@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
485487
"""Fetch lyrics from Tekstowo.pl."""
486488

487489
REQUIRES_BS = True
488-
BASE_URL = "http://www.tekstowo.pl"
489-
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
490-
491-
def fetch(self, artist, title, album=None, length=None):
492-
url = self.build_url(title, artist)
493-
search_results = self.fetch_url(url)
494-
if not search_results:
495-
return None
496-
497-
song_page_url = self.parse_search_results(search_results)
498-
if not song_page_url:
499-
return None
500-
501-
song_page_html = self.fetch_url(song_page_url)
502-
if not song_page_html:
503-
return None
504-
505-
return self.extract_lyrics(song_page_html, artist, title)
490+
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
506491

507-
def parse_search_results(self, html):
508-
html = _scrape_strip_cruft(html)
509-
html = _scrape_merge_paragraphs(html)
510-
511-
soup = try_parse_html(html)
512-
if not soup:
513-
return None
514-
515-
content_div = soup.find("div", class_="content")
516-
if not content_div:
517-
return None
518-
519-
card_div = content_div.find("div", class_="card")
520-
if not card_div:
521-
return None
492+
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
522493

523-
song_rows = card_div.find_all("div", class_="box-przeboje")
524-
if not song_rows:
525-
return None
526-
527-
song_row = song_rows[0]
528-
if not song_row:
529-
return None
494+
@classmethod
495+
def encode(cls, text: str) -> str:
496+
return cls.non_alpha_to_underscore(unidecode(text.lower()))
530497

531-
link = song_row.find("a")
532-
if not link:
533-
return None
498+
def fetch(self, artist, title, album=None, length=None):
499+
if html := self.fetch_url(self.build_url(artist, title)):
500+
return self.extract_lyrics(html)
534501

535-
return self.BASE_URL + link.get("href")
502+
return None
536503

537-
def extract_lyrics(self, html, artist, title):
504+
def extract_lyrics(self, html: str) -> str | None:
538505
html = _scrape_strip_cruft(html)
539506
html = _scrape_merge_paragraphs(html)
540507

541508
soup = try_parse_html(html)
542-
if not soup:
543-
return None
544509

545-
info_div = soup.find("div", class_="col-auto")
546-
if not info_div:
547-
return None
548-
549-
info_elements = info_div.find_all("a")
550-
if not info_elements:
551-
return None
552-
553-
html_title = info_elements[-1].get_text()
554-
html_artist = info_elements[-2].get_text()
555-
556-
title_dist = string_dist(html_title, title)
557-
artist_dist = string_dist(html_artist, artist)
510+
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
511+
return lyrics_div.get_text()
558512

559-
thresh = self.config["dist_thresh"].get(float)
560-
if title_dist > thresh or artist_dist > thresh:
561-
return None
562-
563-
lyrics_div = soup.select("div.song-text > div.inner-text")
564-
if not lyrics_div:
565-
return None
566-
567-
return lyrics_div[0].get_text()
513+
return None
568514

569515

570516
def remove_credits(text):

docs/changelog.rst

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ Bug fixes:
4444
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
4545
* Remove single quotes from all SQL queries
4646
:bug:`4709`
47+
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
48+
since recent updates to their website made it unsearchable.
49+
:bug:`5456`
4750

4851
For packagers:
4952

test/plugins/test_lyrics.py

+2-59
Original file line numberDiff line numberDiff line change
@@ -564,10 +564,7 @@ def test_good_lyrics(self):
564564
"""Ensure we are able to scrape a page with lyrics"""
565565
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
566566
mock = MockFetchUrl()
567-
assert (
568-
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
569-
is not None
570-
)
567+
assert tekstowo.extract_lyrics(mock(url))
571568

572569
def test_no_lyrics(self):
573570
"""Ensure we don't crash when scraping the html for a Tekstowo page
@@ -578,61 +575,7 @@ def test_no_lyrics(self):
578575
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
579576
)
580577
mock = MockFetchUrl()
581-
assert (
582-
tekstowo.extract_lyrics(
583-
mock(url),
584-
"Beethoven",
585-
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
586-
)
587-
is None
588-
)
589-
590-
def test_song_no_match(self):
591-
"""Ensure we return None when a song does not match the search query"""
592-
# https://github.com/beetbox/beets/issues/4406
593-
# expected return value None
594-
url = (
595-
"https://www.tekstowo.pl/piosenka,bailey_bigger"
596-
",black_eyed_susan.html"
597-
)
598-
mock = MockFetchUrl()
599-
assert (
600-
tekstowo.extract_lyrics(
601-
mock(url), "Kelly Bailey", "Black Mesa Inbound"
602-
)
603-
is None
604-
)
605-
606-
607-
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
608-
"""tests Tekstowo.parse_search_results()"""
609-
610-
def setUp(self):
611-
"""Set up configuration"""
612-
TekstowoBaseTest.setUp(self)
613-
self.plugin = lyrics.LyricsPlugin()
614-
615-
def test_multiple_results(self):
616-
"""Ensure we are able to scrape a page with multiple search results"""
617-
url = (
618-
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
619-
",tytul,lucid+dreams.html"
620-
)
621-
mock = MockFetchUrl()
622-
assert (
623-
tekstowo.parse_search_results(mock(url))
624-
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
625-
"lucid_dreams__remix__ft__lil_uzi_vert.html"
626-
)
627-
628-
def test_no_results(self):
629-
"""Ensure we are able to scrape a page with no search results"""
630-
url = (
631-
"https://www.tekstowo.pl/szukaj,wykonawca,"
632-
"agfdgja,tytul,agfdgafg.html"
633-
)
634-
mock = MockFetchUrl()
635-
assert tekstowo.parse_search_results(mock(url)) is None
578+
assert not tekstowo.extract_lyrics(mock(url))
636579

637580

638581
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):

0 commit comments

Comments
 (0)