Francesco committed on
Commit
04242a9
0 Parent(s):

first commit

Files changed (1)
  1. data.py +66 -0
data.py ADDED
# Scrapes song names and lyric-page links from https://www.disneyclips.com/lyrics/

import asyncio
from typing import List, Optional, Tuple, TypedDict

import aiohttp
from bs4 import BeautifulSoup


class Lyric(TypedDict):
    name: str
    text: str


class Movie(TypedDict):
    title: str
    lyrics: List[Lyric]


URL = "https://www.disneyclips.com/lyrics/"


async def get_lyrics_urls_from_movie_url(
    url: str, session: aiohttp.ClientSession
) -> Optional[List[Tuple[str, str]]]:
    """Return (song name, lyric page URL) pairs for one movie page, or None if it has no songs table."""
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    names_and_urls = None
    if table:
        # URL already ends with "/", so join without adding another slash.
        names_and_urls = [
            (link.text, f"{URL}{link.get('href')}") for link in table.find_all("a")
        ]
    return names_and_urls


async def get_lyric_from_lyric_url(
    url: str, name: str, session: aiohttp.ClientSession
) -> Lyric:
    """Fetch one lyric page and return the song name with the concatenated paragraph text."""
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
    text = "".join(p.text for p in div.find_all("p"))
    return {"name": name, "text": text}


async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """Return (movie title, movie page URL) pairs from the lyrics index page."""
    async with session.get(URL) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"}).find_all("a")
    return [(link.text, f"{URL}{link.get('href')}") for link in links]


async def main() -> List[Optional[List[Tuple[str, str]]]]:
    async with aiohttp.ClientSession() as session:
        names_and_urls = await get_movie_names_and_urls(session)
        # Fetch every movie's song list concurrently; each entry is a list of
        # (song name, lyric page URL) pairs, or None for movies without a songs table.
        data = await asyncio.gather(
            *(get_lyrics_urls_from_movie_url(url, session) for _, url in names_and_urls)
        )
        return list(data)


if __name__ == "__main__":
    asyncio.run(main())
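
A minimal usage sketch, not part of this commit, showing how the helpers in data.py could be combined to download the full lyric texts and save them as JSON. The scrape_all coroutine and the lyrics.json output path are illustrative assumptions; movies are processed one at a time to keep the request rate modest, while the songs within each movie are fetched concurrently.

import asyncio
import json
from typing import List

import aiohttp

from data import (
    Movie,
    get_lyric_from_lyric_url,
    get_lyrics_urls_from_movie_url,
    get_movie_names_and_urls,
)


async def scrape_all() -> List[Movie]:
    # Hypothetical driver: walk the movie index, then each movie's song table,
    # then each lyric page, building the Movie/Lyric structures from data.py.
    async with aiohttp.ClientSession() as session:
        movies: List[Movie] = []
        for title, movie_url in await get_movie_names_and_urls(session):
            names_and_urls = await get_lyrics_urls_from_movie_url(movie_url, session)
            if not names_and_urls:
                continue  # some movie pages have no songs table
            lyrics = await asyncio.gather(
                *(get_lyric_from_lyric_url(url, name, session) for name, url in names_and_urls)
            )
            movies.append({"title": title, "lyrics": list(lyrics)})
        return movies


if __name__ == "__main__":
    # Assumed output location; adjust as needed.
    with open("lyrics.json", "w") as f:
        json.dump(asyncio.run(scrape_all()), f, indent=2)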