kmaurinjones committed on
Commit
a4106b6
1 Parent(s): fd633e6

Update songscope.py

Browse files
Files changed (1) hide show
  1. songscope.py +527 -3
songscope.py CHANGED
@@ -1,4 +1,528 @@
1
- import streamlit as st
 
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ import random
6
+ import Levenshtein
7
+ import nltk
8
+ nltk.download("vader_lexicon")
9
+ from nltk.sentiment import SentimentIntensityAnalyzer
10
+ import azapi
11
 
12
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    # idiomatic truthiness test instead of "== True"
    if print_delay:
        print(f"Delayed {val} seconds")
    return val
35
+
36
def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.

    Raises:
    ------
    ValueError: If `artist_name` contains no non-space characters.
    """
    # The letter index page is keyed by the first non-space character.
    # Bug fix: the original left `first_letter` unbound (NameError) for an
    # empty or all-space artist name; fail with a clear error instead.
    first_letter = next((char for char in artist_name if char != " "), None)
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")

    url = f"https://www.azlyrics.com/{first_letter}.html"

    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

        # Collect every artist href on the letter index page.
        artist_links = [anchor.get('href')
                        for artist_div in artist_divs
                        for anchor in artist_div.find_all('a')]

        # Slug = last path component with the ".html" suffix stripped.
        artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

        if artist_name in artist_urls:
            return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

        # No exact slug match: fall back to the closest slug by Jaro similarity.
        best_id = max(range(len(artist_urls)),
                      key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
        return f"https://www.azlyrics.com/{artist_links[best_id]}"
    # Non-200 response: implicitly returns None (original behavior preserved).
88
+
89
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
    and extracts the song lyrics. The lyrics are returned as a single string.

    Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
    specifically prohibits the usage of their content by third-party lyrics providers.
    Always review the website's policies and ensure you are compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str
        The lyrics of the song as a single string.

    Raises:
    ------
    `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
    """
    # Send an HTTP request to the lyric_url
    response = requests.get(lyric_url)

    # Bug fix: the original printed an error and then returned the unbound
    # local `lyrics_str` (UnboundLocalError) on every failure path. Raise the
    # ValueError the docstring promises instead.
    if response.status_code != 200:
        raise ValueError(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # The main div element containing the lyrics markup.
    main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')
    if main_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")

    # azlyrics marks the lyrics block as the only inner div with no class/id.
    lyrics_div = None
    for div in main_div.find_all('div'):
        if not div.has_attr('class') and not div.has_attr('id'):
            lyrics_div = div
            break

    if lyrics_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")

    # Keep whitespace as-is: downstream sectionizing splits on blank lines.
    return lyrics_div.get_text(strip = False)
148
+
149
+ # get artist link on azlyrics
150
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    NOTE(review): this re-definition shadows an earlier, nearly identical
    `find_artist` in this module; consider removing one of the two.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist.

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found.
    """
    # The letter index page is keyed by the first non-space character.
    # Bug fix: the original left `first_letter` unbound (NameError) for an
    # empty or all-space artist name.
    first_letter = next((char for char in artist_name if char != " "), None)
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")

    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Bug fix: the original fell off the end (implicit None) on a non-200
    # response, despite the docstring promising a ValueError.
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch artist index page. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # All 'div' elements with the class "col-sm-6 text-center artist-col"
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

    # Extract the 'href' attribute from each 'a' tag within the artist divs.
    artist_links = [anchor.get('href')
                    for artist_div in artist_divs
                    for anchor in artist_div.find_all('a')]

    # Slug = last path component with the ".html" suffix stripped.
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

    # No exact slug match: choose the most similar slug by Jaro similarity.
    best_id = max(range(len(artist_urls)),
                  key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
    return f"https://www.azlyrics.com/{artist_links[best_id]}"
213
+
214
+ # this will flatten all inner lists (all depths) of a list into a list of depth == 1
215
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        The flattened list.
    """
    # Iterative depth-first traversal with an explicit stack of iterators,
    # avoiding recursion while preserving left-to-right element order.
    flat = []
    stack = [iter(lst)]
    while stack:
        try:
            item = next(stack[-1])
        except StopIteration:
            stack.pop()
            continue
        if isinstance(item, list):
            stack.append(iter(item))
        else:
            flat.append(item)
    return flat
236
+
237
+ # lyric pre-processing
238
def process_lyrics(lyrics: str):
    """
    Pre-processes the lyrics: strips carriage returns, splits on newlines,
    and collapses runs of consecutive blank lines down to a single blank line.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyrics, one list item per line.
    """
    # Drop carriage returns, then split into individual lines.
    lines = lyrics.replace('\r', '').split('\n')

    # Keep a line unless both it and its predecessor are blank.
    cleaned_lines = []
    prev_blank = False
    for line in lines:
        blank = line.strip() == ''
        if not (blank and prev_blank):
            cleaned_lines.append(line)
        prev_blank = blank

    return cleaned_lines
263
+
264
+ # splitting pre-processed lyrics into sections (this typically loosely matches a song form)
265
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into sections (this typically loosely
    matches a song form). Sections are blocks of lines separated by blank
    lines; empty sections are discarded.

    Parameters:
    ------
    `lyrics`: str
        The raw lyrics string (pre-processed internally via `process_lyrics`).

    Returns:
    ------
    `all_sections`: List
        The lyrics split into sections (a list of lists of lines).
    """
    lyrs_list = process_lyrics(lyrics)

    all_sections = []
    current = []
    for line in lyrs_list:
        if line == "":
            # Blank line terminates the current section; skip empty sections
            # (e.g. from leading blank lines) instead of the original's
            # fragile `del all_sections[0]`.
            if current:
                all_sections.append(current)
            current = []
        else:
            current.append(line)

    # Bug fix: the original deleted the trailing section marker and never
    # appended the still-accumulating section, silently dropping the final
    # section of every song. It also crashed (`del sectd[-1]`) on empty input.
    if current:
        all_sections.append(current)

    return all_sections
303
+
304
+ # sentiment analysis model
305
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary
    and sEntiment Reasoner) sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound_score = scores["compound"]

    # Standard VADER cutoffs: +/-0.05 separates neutral from polarized text.
    if compound_score <= -0.05:
        label = "NEGATIVE"
    elif compound_score >= 0.05:
        label = "POSITIVE"
    else:
        label = "NEUTRAL"

    return label, compound_score
335
+
336
+ # get sentiment of all text items in 'lyrics' column
337
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.

    This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
    to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
    ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
    each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
    DataFrame is returned.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.
        Each 'lyrics' cell is assumed to be an iterable of lines.

    Returns:
    --------
    df : pandas DataFrame
        The modified DataFrame with 'sentiment_label' and 'sentiment_valence'
        columns added.
    """
    for row in df.index:
        section_lyrics = df.loc[row, 'lyrics']
        # Join the section's lines into one string (idiomatic join instead of
        # quadratic `+=` concatenation; trailing space kept for parity).
        sec_lyrs_str = "".join(line + " " for line in section_lyrics)
        label, valence = analyze_sentiment_vader(sec_lyrs_str)
        df.loc[row, 'sentiment_label'] = label
        df.loc[row, 'sentiment_valence'] = valence

    return df
372
+
373
+ # get just metadata for songs (not lyrics)
374
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.

    Returns:
    --------
    dict
        A dictionary containing metadata for each song found. The keys are the song titles and the values are
        dictionaries containing various metadata for each song.
    """
    urls = find_artist(artist_name)
    # azlyrics slug: last path component with the ".html" suffix stripped.
    azlyrics_artist_name = urls.split("/")[-1][:-5]

    API = azapi.AZlyrics('google', accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # dict keyed by song title

    az_titles = list(all_songs_info)

    # Idiom fix: identity comparison with None, not "== None".
    if song_titles is None:
        return all_songs_info

    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # Fuzzy match: take the azlyrics title with the highest Jaro
            # similarity to the requested title.
            best_title = max(az_titles,
                             key=lambda az_name: Levenshtein.jaro(title, az_name))
            found_data[best_title] = all_songs_info[best_title]

    return found_data
420
+
421
+ # combine metadata with found lyrics
422
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.
    delay: tuple
        A tuple containing the minimum and maximum amount of time (in seconds) to wait between requests to avoid
        being banned by the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        A pandas DataFrame containing metadata and sentiment analysis for each song found.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")

    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)

    if print_progress:
        print(f"\n\t- All metadata found")

    times = []

    for title, mdata in artist_data.items():
        start = time.time()
        try:
            lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
            artist_data[title]['lyrics'] = sectionize(lyrics)
        # Bug fix: the original read "except: (UnboundLocalError, TypeError,
        # AttributeError)" -- a bare except followed by a no-op tuple
        # expression. Keep the best-effort "skip this song" semantics, but
        # stop swallowing KeyboardInterrupt/SystemExit.
        except Exception:
            print(f"\tCouldn't find lyrics to {title}. Moving to next song.")
            continue

        # as to not get banned
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)

        # Bug fix: the original stored negative durations (start - time.time())
        # and compensated later with abs(); store positive elapsed time directly.
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        # len(times) also gives the number of iterations completed
        remaining = (len(artist_data) - len(times)) * avg_time

        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                remaining = round(remaining / 60, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} minutes")
            else:  # less than one minute remaining
                remaining = round(remaining, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} seconds")

    if print_progress:
        print(f"\nAll lyrics and metadata found. Returning structured data.")

    # Build one row per lyric section. Songs whose lyrics could not be fetched
    # (no 'lyrics' key) are skipped -- the original crashed with KeyError on
    # them in its intermediate-dict pass.
    new_dict = {'artist_name': [], 'song_title': [], 'release_year': [],
                'lyrics': [], 'lyrics_section_number': [], 'album_name': [],
                'release_type': [], 'lyrics_url': []}

    for title, info in artist_data.items():
        if 'lyrics' not in info:
            continue
        for sec_id, section in enumerate(info['lyrics']):
            # Bug fix: the original hard-coded "John Mayer" as artist_name.
            new_dict['artist_name'].append(artist_name)
            new_dict['song_title'].append(title)
            new_dict['release_year'].append(info['year'])
            new_dict['album_name'].append(info['album'])
            new_dict['lyrics'].append(section)
            new_dict['lyrics_section_number'].append(sec_id)
            new_dict['release_type'].append(info['type'])
            new_dict['lyrics_url'].append(info['url'])

    sents_df = get_sentiments(pd.DataFrame(new_dict))

    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]

    if print_progress:
        print(f"Data retrieval complete!\n\n------------------------")

    return sents_df