Spaces:
Runtime error
Runtime error
import asyncio
import json
from collections import defaultdict
from itertools import chain
from pathlib import Path
from typing import List, Optional, Tuple, TypedDict
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
| """ | |
| This file scrapes disney songs + lyrics from "https://www.disneyclips.com/lyrics/" | |
| """ | |
# Address of the lyrics index page. It is fetched to list the movies and is
# also the base that every scraped link's href is appended to.
URL = "https://www.disneyclips.com/lyrics/"
async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect the songs linked from one movie's page.

    Args:
        movie_name: Title of the movie; copied unchanged into every result.
        url: Address of the movie page to download.
        session: Open aiohttp session used to perform the request.

    Returns:
        One ``(movie_name, song_name, song_url)`` tuple per link found inside
        the page's ``<table class="songs">`` element. The list is empty when
        the page has no such table. Anchors without an ``href`` are skipped.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        return []

    # URL already ends with "/", so plain concatenation with another "/"
    # yields a doubled slash; urljoin resolves the href against URL cleanly
    # and leaves absolute hrefs untouched.
    return [
        (movie_name, link.text, urljoin(URL, href))
        for link in table.find_all("a")
        if (href := link.get("href"))
    ]
async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Download one song page and extract its lyric text.

    Args:
        movie_name: Title of the movie; passed through to the result.
        lyric_name: Title of the song; passed through to the result.
        url: Address of the song page to download.
        session: Open aiohttp session used to perform the request.

    Returns:
        ``(movie_name, lyric_name, text)``. ``text`` is the content of the
        first ``<p>`` inside ``div#cnt div.main``, with every ``<br>`` turned
        into ". " and every ``<span>`` removed. It is an empty string when the
        page does not contain that structure, so a single odd page does not
        abort a whole batch of concurrent downloads.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", {"id": "cnt"})
    main = None if container is None else container.find("div", {"class": "main"})
    # The first <p> of the main block holds the lyric.
    lyric = None if main is None else main.find("p")
    if lyric is None:
        return (movie_name, lyric_name, "")

    # Line breaks become sentence separators so the lyric reads as one line.
    for br in lyric.find_all("br"):
        br.replace_with(". ")
    # Drop <span> elements so their text is not mixed into the lyric.
    for span in lyric.find_all("span"):
        span.decompose()

    return (movie_name, lyric_name, lyric.text)
async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """List the movies linked from the lyrics index page at ``URL``.

    Args:
        session: Open aiohttp session used to perform the request.

    Returns:
        One ``(movie_name, movie_url)`` tuple per anchor found inside
        ``div#cnt div.main``. Anchors without an ``href`` are skipped.

    Raises:
        AttributeError: If the index page lacks the ``div#cnt`` or
            ``div.main`` container, because the lookup is chained.
    """
    async with session.get(URL) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    # URL already ends with "/", so urljoin is used instead of inserting a
    # second "/" by hand; it also keeps absolute hrefs as they are.
    return [
        (link.text, urljoin(URL, href))
        for link in main.find_all("a")
        if (href := link.get("href"))
    ]
async def scrape_disney_lyrics() -> None:
    """Scrape every movie's songs and lyrics and write them to a JSON file.

    The scrape runs in three stages over one shared HTTP session: fetch the
    movie list, fetch every movie page concurrently to discover its songs,
    then fetch every song page concurrently to extract the lyric.

    The result is written to ``data/lyrics.json`` as a mapping from movie
    name to a list of ``{"name": ..., "text": ...}`` objects. The ``data``
    directory is created if it does not exist yet.
    """
    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)

        # gather() schedules the coroutines itself, so no explicit tasks are
        # needed. Each result is a list of (movie, song, url) tuples.
        songs_per_movie = await asyncio.gather(
            *(
                get_lyrics_names_and_urls_from_movie_url(name, url, session)
                for name, url in movies
            )
        )

        lyrics = await asyncio.gather(
            *(
                get_lyric_from_lyric_url(*song, session)
                for song in chain.from_iterable(songs_per_movie)
            )
        )

    result = defaultdict(list)
    for movie_name, lyric_name, lyric_text in lyrics:
        result[movie_name].append({"name": lyric_name, "text": lyric_text})

    output_path = Path("data/lyrics.json")
    # Opening the file for writing fails if the parent directory is missing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(result, f)
# Run the full scrape when this module is loaded. asyncio.run() creates and
# closes its own event loop; calling asyncio.get_event_loop() with no running
# loop is deprecated.
asyncio.run(scrape_disney_lyrics())