Spaces:
Running
Running
Commit
•
a4106b6
1
Parent(s):
fd633e6
Update songscope.py
Browse files- songscope.py +527 -3
songscope.py
CHANGED
@@ -1,4 +1,528 @@
|
|
1 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import time
|
5 |
+
import random
|
6 |
+
import Levenshtein
|
7 |
+
import nltk
|
8 |
+
nltk.download("vader_lexicon")
|
9 |
+
from nltk.sentiment import SentimentIntensityAnalyzer
|
10 |
+
import azapi
|
11 |
|
12 |
+
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    # uniform() can return either endpoint, so the sleep is always in [min_val, max_val]
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay:  # idiomatic truthiness check instead of '== True'
        print(f"Delayed {val} seconds")
    return val
|
35 |
+
|
36 |
+
def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com, or None if the letter
        index page could not be fetched or contains no artist links.
    """
    # azlyrics indexes artists by the first (non-space) letter of their name
    for char in artist_name:
        if char != " ":
            first_letter = char
            break

    url = f"https://www.azlyrics.com/{first_letter}.html"

    response = requests.get(url)

    # fixed: a failed request previously fell off the end of the function and
    # returned None silently; make the failure explicit and visible
    if response.status_code != 200:
        print(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect every artist link on the letter index page
    artist_links = []
    for artist_div in soup.find_all('div', class_='col-sm-6 text-center artist-col'):
        for anchor in artist_div.find_all('a'):
            artist_links.append(anchor.get('href'))

    # fixed: an empty page previously led to artist_links[None] -> TypeError
    if not artist_links:
        print(f"Error: No artist links found at '{url}'.")
        return None

    # Bare artist slugs: strip the directory prefix and the ".html" suffix.
    # (distinct loop variable so the request URL above is not shadowed)
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

    # Exact slug match first; otherwise fall back to the most similar slug
    # by Jaro similarity.
    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

    best_id = None
    best_sim = -100
    for idx, name in enumerate(artist_urls):
        sim = Levenshtein.jaro(artist_name, name)
        if best_sim < sim:
            best_sim = sim
            best_id = idx

    return f"https://www.azlyrics.com/{artist_links[best_id]}"
|
88 |
+
|
89 |
+
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
    and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
    and whitespace. The function returns the lyrics as a string.

    Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
    specifically prohibits the usage of their content by third-party lyrics providers.
    Always review the website's policies and ensure you are compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str or None
        The lyrics of the song as a single string, or None if the page could
        not be fetched or the lyrics block could not be located.
    """
    # fixed: lyrics_str was previously unbound on every failure path, so the
    # final return raised UnboundLocalError instead of reporting the error
    lyrics_str = None

    # # delay website call by a random amount as to not get banned
    # random_delay(min_val = 1, max_val = 3, print_delay = False)

    # Send an HTTP request to the lyric_url
    response = requests.get(lyric_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main div element containing the lyrics
        main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')

        # Find the div element containing the lyrics within the main div.
        # On azlyrics the lyrics sit in the only div with no class and no id.
        # fixed: guard against main_div being absent (previously AttributeError)
        lyrics_div = None
        if main_div is not None:
            for div in main_div.find_all('div'):
                if not div.has_attr('class') and not div.has_attr('id'):
                    lyrics_div = div
                    break

        if lyrics_div:
            # Clean up the lyrics by removing unnecessary HTML tags and whitespace
            lyrics_str = lyrics_div.get_text(strip = False)
        else:
            print(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    else:
        print(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")

    return lyrics_str
|
148 |
+
|
149 |
+
# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist.

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found.
    """
    # azlyrics indexes artists by the first (non-space) letter of their name
    for char in artist_name:
        if char != " ":
            first_letter = char
            break

    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"

    # Send an HTTP request to the URL
    response = requests.get(url)

    # fixed: a failed request previously fell off the end of the function and
    # returned None implicitly; the docstring promises a ValueError instead
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch the webpage. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the 'href' attribute from each 'a' tag within the artist divs
    artist_links = []
    for artist_div in soup.find_all('div', class_='col-sm-6 text-center artist-col'):
        for anchor in artist_div.find_all('a'):
            artist_links.append(anchor.get('href'))

    # fixed: an empty page previously led to artist_links[None] -> TypeError
    if not artist_links:
        raise ValueError(f"No artist links found at '{url}'.")

    # Bare artist slugs: strip the directory prefix and the ".html" suffix.
    # (distinct loop variable so the request URL above is not shadowed)
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

    # Exact slug match first; otherwise choose the most similar slug by
    # Jaro similarity.
    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

    best_id = None
    best_sim = -100
    for idx, name in enumerate(artist_urls):
        sim = Levenshtein.jaro(artist_name, name)
        if best_sim < sim:
            best_sim = sim
            best_id = idx

    return f"https://www.azlyrics.com/{artist_links[best_id]}"
|
213 |
+
|
214 |
+
# recursively collapse arbitrarily nested lists into a single flat list
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `flat`: List
        The flattened list, preserving left-to-right element order.
    """
    flat = []
    for item in lst:
        # recurse into sublists; wrap scalars so '+=' handles both cases
        flat += flatten_list(item) if isinstance(item, list) else [item]
    return flat
|
236 |
+
|
237 |
+
# lyric pre-processing
def process_lyrics(lyrics: str):
    """
    Pre-processes raw lyric text.

    Strips carriage-return characters, splits the text into lines on newline
    characters, and collapses runs of blank lines so that no two consecutive
    blank entries remain.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned`: List
        The pre-processed lyric lines.
    """
    rows = lyrics.replace('\r', '').split('\n')

    cleaned = []
    for idx, row in enumerate(rows):
        # keep the first line unconditionally; afterwards drop a blank line
        # only when the (original) previous line was also blank
        prev_blank = idx > 0 and rows[idx - 1].strip() == ''
        if not prev_blank or row.strip() != '':
            cleaned.append(row)

    return cleaned
|
263 |
+
|
264 |
+
# splitting pre-processed lyrics into sections (this typically loosely matches a song form)
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into sections.

    Parameters:
    ------
    `lyrics`: str
        The pre-processed lyrics.

    Returns:
    ------
    `all_sections`: List
        The lyrics split into sections (a list of lists of lines).
    """
    # Clean the raw lyric text into a list of lines with blank runs collapsed.
    lyrs_list = process_lyrics(lyrics)

    # Replace each blank line with an explicit "#SEC" section separator marker.
    sectd = []
    for line in lyrs_list:
        if line == "":
            sectd.append("#SEC")
        else:
            sectd.append(line)

    # Drop the trailing element, expected to be the separator produced by the
    # trailing blank line of the scraped lyrics block.
    # NOTE(review): if the input does NOT end with a blank line this deletes a
    # real lyric line — confirm against the scraped data format.
    del sectd[-1]

    all_sections = []
    for id, line in enumerate(sectd):
        if id == 0:
            # current section accumulator, started fresh on the first element
            sec_list = []
        if line == "#SEC":
            # separator reached: close out the current section and start a new one
            all_sections.append(sec_list)
            sec_list = []
        else:
            sec_list.append(line)

    # NOTE(review): lines after the last "#SEC" (held in sec_list when the loop
    # ends) are never appended to all_sections — verify the final section is
    # not silently dropped for typical inputs.

    # Remove the leading empty section assumed to come from the blank line at
    # the top of the scraped lyrics block.
    # NOTE(review): if the lyrics do not start with a blank line, this deletes
    # the first real section — confirm against the scraped data format.
    del all_sections[0]

    return all_sections
|
303 |
+
|
304 |
+
# sentiment analysis model
def analyze_sentiment_vader(text: str):
    """
    Scores a piece of text with the VADER (Valence Aware Dictionary and
    sEntiment Reasoner) sentiment model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        Sentiment label: "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        VADER's compound score for the text.
    """
    analyzer = SentimentIntensityAnalyzer()
    compound_score = analyzer.polarity_scores(text)["compound"]

    # Standard VADER cutoffs: scores within (-0.05, 0.05) count as neutral.
    if compound_score <= -0.05:
        label = "NEGATIVE"
    elif compound_score < 0.05:
        label = "NEUTRAL"
    else:
        label = "POSITIVE"

    return label, compound_score
|
335 |
+
|
336 |
+
# get sentiment of all text items in 'lyrics' column
def get_sentiments(df):
    """
    Adds VADER sentiment columns for every entry in the 'lyrics' column.

    Each row's 'lyrics' value (an iterable of lines) is joined into a single
    string and scored with `analyze_sentiment_vader`. The resulting label
    ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and compound valence (in [-1, 1])
    are written back as the 'sentiment_label' and 'sentiment_valence' columns.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.

    Returns:
    --------
    df : pandas DataFrame
        The same DataFrame with the two sentiment columns added.
    """
    for idx in df.index:
        # join the section's lines with trailing spaces — identical string to
        # the original per-line concatenation
        joined = "".join(line + " " for line in df.loc[idx, 'lyrics'])
        label, valence = analyze_sentiment_vader(joined)
        df.loc[idx, 'sentiment_label'] = label
        df.loc[idx, 'sentiment_valence'] = valence

    return df
|
372 |
+
|
373 |
+
# get just metadata for songs (not lyrics)
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.

    Returns:
    --------
    dict
        A dictionary containing metadata for each song found. The keys are the song titles and the values are
        dictionaries containing various metadata for each song.
    """
    # Resolve the artist page URL, then recover the bare slug from
    # ".../<slug>.html" for the azapi lookup.
    urls = find_artist(artist_name)
    azlyrics_artist_name = urls.split("/")[-1][:-5]

    API = azapi.AZlyrics('google', accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # dict: song title -> metadata dict

    # fixed: identity comparison instead of '== None'
    if song_titles is None:
        return all_songs_info

    az_titles = list(all_songs_info)

    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # Fuzzy-match the requested title to the closest azlyrics title
            # by Jaro similarity.
            best_id = None
            best_sim = -100
            for idx, az_name in enumerate(az_titles):
                sim = Levenshtein.jaro(title, az_name)
                if best_sim < sim:
                    best_sim = sim
                    best_id = idx

            found_data[az_titles[best_id]] = all_songs_info[az_titles[best_id]]

    return found_data
|
420 |
+
|
421 |
+
# combine metadata with found lyrics
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.
    delay: tuple
        A tuple containing the minimum and maximum amount of time (in seconds) to wait between requests to avoid
        being banned by the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        A pandas DataFrame containing metadata and sentiment analysis for each song found,
        one row per lyric section.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")

    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)

    if print_progress:
        print(f"\n\t- All metadata found")

    times = []

    for title, mdata in artist_data.items():
        start = time.time()
        try:
            lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
            artist_data[title]['lyrics'] = sectionize(lyrics)
        # fixed: was 'except:' with the intended exception tuple on the next
        # line as a no-op expression, i.e. a bare except swallowing everything
        except (UnboundLocalError, TypeError, AttributeError):
            print(f"\tCouldn't find lyrics to {title}. Moving to next song.")
            continue

        # as to not get banned
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)

        # fixed: elapsed time was 'start - time.time()' (always negative,
        # previously patched over with abs())
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        # len(times) also gives the number of iterations completed
        remaining = (len(artist_data) - len(times)) * avg_time

        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining / 60, 2)} minutes")
            else:  # less than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining, 2)} seconds")

    if print_progress:
        print(f"\nAll lyrics and metadata found. Returning structured data.")

    # One list per output column, filled one entry per song.
    df_dict = {
        'artist_name': [],
        'song_title': [],
        'release_year': [],
        'lyrics': [],
        'lyrics_section_number': [],
        'album_name': [],
        'release_type': [],
        'lyrics_url': [],
    }

    for title, info in artist_data.items():
        # fixed: songs whose lyrics lookup failed above have no 'lyrics' key
        # and previously raised KeyError here
        if 'lyrics' not in info:
            continue
        # fixed: artist name was hard-coded as "John Mayer"
        df_dict['artist_name'].append(artist_name)
        df_dict['song_title'].append(title)
        df_dict['album_name'].append(info['album'])
        df_dict['release_year'].append(info['year'])
        df_dict['lyrics'].append(info['lyrics'])
        # placeholder section count; the per-section number is assigned below
        df_dict['lyrics_section_number'].append(len(info['lyrics']))
        df_dict['release_type'].append(info['type'])
        df_dict['lyrics_url'].append(info['url'])

    # Explode each song into one row per lyric section.
    new_dict = {key: [] for key in df_dict}

    for i in range(len(df_dict['lyrics'])):
        for sec_id, section in enumerate(df_dict['lyrics'][i]):
            new_dict['song_title'].append(df_dict['song_title'][i])
            new_dict['release_year'].append(df_dict['release_year'][i])
            new_dict['album_name'].append(df_dict['album_name'][i])
            new_dict['artist_name'].append(df_dict['artist_name'][i])
            new_dict['lyrics'].append(section)
            new_dict['lyrics_section_number'].append(sec_id)
            new_dict['release_type'].append(df_dict['release_type'][i])
            new_dict['lyrics_url'].append(df_dict['lyrics_url'][i])

    sents_df = get_sentiments(pd.DataFrame(new_dict))

    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]

    if print_progress:
        print(f"Data retrieval complete!\n\n------------------------")

    return sents_df
|