File size: 3,041 Bytes
14a3421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import asyncio
import json
from collections import defaultdict
from itertools import chain
from typing import List, Optional, Tuple, TypedDict

import aiohttp
from bs4 import BeautifulSoup

"""
This file scrapes disney songs + lyrics from "https://www.disneyclips.com/lyrics/"
"""

URL = "https://www.disneyclips.com/lyrics/"


async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Fetch a movie's lyrics page and list its songs.

    Args:
        movie_name: Name of the movie, carried through into each result tuple.
        url: Absolute URL of the movie's lyrics page.
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(movie_name, song_name, song_url)`` tuples; empty when the
        page has no ``<table class="songs">`` element.
    """
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        # Some movie pages have no songs table at all.
        return []
    # URL already ends with "/", so append the relative href directly.
    # (The original inserted an extra "/" producing ".../lyrics//page.htm".)
    return [
        (movie_name, link.text, f"{URL}{link.get('href')}")
        for link in table.find_all("a")
    ]


async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Fetch a single song page and extract its lyric text.

    Args:
        movie_name: Name of the movie, carried through into the result tuple.
        lyric_name: Name of the song, carried through into the result tuple.
        url: Absolute URL of the song's lyric page.
        session: Open aiohttp session used for the request.

    Returns:
        A ``(movie_name, lyric_name, lyric_text)`` tuple.
        (The original annotation said ``str``, but a 3-tuple was returned.)
    """
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
    # The first <p> inside the main div holds the lyric.
    p = div.find_all("p")[0]
    # <br> tags separate lyric lines; join them into one string with ". ".
    for br in p.find_all("br"):
        br.replace_with(". ")
    # Drop <span> elements — presumably non-lyric markup such as singer cues;
    # NOTE(review): exact span content not visible from here, kept as original.
    for span in p.find_all("span"):
        span.decompose()
    return (movie_name, lyric_name, p.text)


async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """Fetch the lyrics index page and list every movie on it.

    Args:
        session: Open aiohttp session used for the request.

    Returns:
        A list of ``(movie_name, movie_url)`` tuples, one per link inside the
        page's ``#cnt > .main`` container.
    """
    async with session.get(URL) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    links = (
        soup.find("div", {"id": "cnt"}).find("div", {"class": "main"}).find_all("a")
    )
    # URL already ends with "/", so append the relative href directly.
    # (The original inserted an extra "/" producing ".../lyrics//page.htm".)
    return [(link.text, f"{URL}{link.get('href')}") for link in links]


async def scrape_disney_lyrics() -> None:
    """Scrape every song lyric from disneyclips.com and write data/lyrics.json.

    Pipeline: fetch the movie index, then (in parallel) each movie's song
    list, then (in parallel) each song's lyric text, and finally group the
    results by movie into ``{movie: [{"name": ..., "text": ...}, ...]}``.
    """
    import os  # local import: only needed to create the output directory

    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)
        # One request per movie page, run concurrently. gather() schedules
        # coroutines itself, so the create_task wrappers were redundant.
        song_lists = await asyncio.gather(
            *[
                get_lyrics_names_and_urls_from_movie_url(*movie, session)
                for movie in movies
            ]
        )
        # One request per song page, run concurrently over the flattened lists.
        lyrics = await asyncio.gather(
            *[
                get_lyric_from_lyric_url(*song, session)
                for song in chain(*song_lists)
            ]
        )

        result = defaultdict(list)
        for movie_name, lyric_name, lyric_text in lyrics:
            result[movie_name].append({"name": lyric_name, "text": lyric_text})

        # The original raised FileNotFoundError when data/ did not exist.
        os.makedirs("data", exist_ok=True)
        with open("data/lyrics.json", "w") as f:
            json.dump(result, f)


if __name__ == "__main__":
    # asyncio.run() replaces the deprecated get_event_loop()/run_until_complete()
    # pattern and closes the loop cleanly when the coroutine finishes. The
    # __main__ guard also stops an import of this module from kicking off a
    # full network scrape as a side effect.
    asyncio.run(scrape_disney_lyrics())