import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

import azapi
import Levenshtein
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay:
        print(f"Delayed {val:.2f} seconds")
    return val
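
# Example (any positive bounds work; the delay is drawn uniformly from the range):
# random_delay(0.5, 2.0, print_delay=True)  # e.g. "Delayed 1.23 seconds"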
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
    and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
    and whitespace. The function returns the lyrics as a string.

    Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
    specifically prohibits the usage of their content by third-party lyrics providers.
    Always review the website's policies and ensure you are compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str
        The lyrics of the song as a single string.

    Raises:
    ------
    `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
    """
    # # delay website call by a random amount as to not get banned
    # random_delay(min_val=1, max_val=3, print_delay=False)

    # Send an HTTP request to the lyric_url
    response = requests.get(lyric_url)
    # Check if the request was successful
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch the webpage. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the main div element containing the lyrics
    main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')
    if main_div is None:
        raise ValueError(f"Unable to find the lyrics container for '{lyric_url}'.")

    # On azlyrics, the lyrics sit in the only div with neither a class nor an id
    lyrics_div = None
    for div in main_div.find_all('div'):
        if not div.has_attr('class') and not div.has_attr('id'):
            lyrics_div = div
            break

    if lyrics_div is None:
        raise ValueError(f"Unable to find the lyrics for '{lyric_url}'.")

    # Clean up the lyrics by removing HTML tags; keep whitespace so the blank
    # lines between sections survive for sectionize()
    lyrics_str = lyrics_div.get_text(strip=False)
    return lyrics_str
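
# Example usage (the URL shape below follows azlyrics' /lyrics/artist/song.html
# pattern and is illustrative, not a verified live page):
# lyrics = follow_lyrics("https://www.azlyrics.com/lyrics/johnmayer/gravity.html")
# print(lyrics[:80])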
# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist.

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found.
    """
    # azlyrics indexes artists by the first letter of their name, lowercase
    first_letter = None
    for char in artist_name:
        if char != " ":
            first_letter = char.lower()
            break
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character.")

    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"
    # Send an HTTP request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch the artist index. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all the 'div' elements with the class "col-sm-6 text-center artist-col"
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
    # Extract the 'href' attribute from each 'a' tag within the artist divs
    artist_links = []
    for artist_div in artist_divs:
        for anchor in artist_div.find_all('a'):
            artist_links.append(anchor.get('href'))

    # azlyrics artist slugs are lowercase with no spaces (e.g. "johnmayer"),
    # so normalize the query the same way before comparing
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]
    query = artist_name.lower().replace(" ", "")

    if query in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(query)]}"

    # Otherwise, choose the most similar artist slug by Jaro similarity
    best_id = None
    max_sim = -100
    for id, name in enumerate(artist_urls):
        dist = Levenshtein.jaro(query, name)
        if max_sim < dist:
            max_sim = dist
            best_id = id
    if best_id is None:
        raise ValueError(f"No artists found on the index page for '{artist_name}'.")
    return f"https://www.azlyrics.com/{artist_links[best_id]}"
# this will flatten all inner lists (all depths) of a list into a list of depth == 1
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        The flattened list.
    """
    result = []
    for element in lst:
        if isinstance(element, list):
            # Recurse into nested lists of any depth
            result.extend(flatten_list(element))
        else:
            result.append(element)
    return result
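
# Example:
# flatten_list([1, [2, [3, 4]], 5])  # -> [1, 2, 3, 4, 5]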
# lyric pre-processing
def process_lyrics(lyrics: str):
    r"""
    Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
    by `\n`, and removing consecutive whitespace list items.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyrics.
    """
    # Replace "\r" with an empty string
    lyrics = lyrics.replace('\r', '')
    # Split the lyrics by "\n"
    lines = lyrics.split('\n')
    # Collapse runs of blank lines: keep a blank line only when the previous line was non-blank
    cleaned_lines = [
        line for i, line in enumerate(lines)
        if i == 0 or lines[i - 1].strip() != '' or line.strip() != ''
    ]
    return cleaned_lines
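
# Example:
# process_lyrics("Verse one\r\n\n\nVerse two\n")
# -> ['Verse one', '', 'Verse two', '']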
# splitting pre-processed lyrics into sections (this typically loosely matches a song form)
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into sections.

    Parameters:
    ------
    `lyrics`: str
        The pre-processed lyrics.

    Returns:
    ------
    `all_sections`: List
        The lyrics split into sections.
    """
    lyrs_list = process_lyrics(lyrics)
    all_sections = []
    sec_list = []
    # Blank lines delimit sections; empty sections (e.g. from leading blank
    # lines in the scraped text) are skipped rather than appended
    for line in lyrs_list:
        if line.strip() == "":
            if sec_list:
                all_sections.append(sec_list)
                sec_list = []
        else:
            sec_list.append(line)
    # Don't drop a final section that isn't followed by a blank line
    if sec_list:
        all_sections.append(sec_list)
    return all_sections
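
# Example:
# sectionize("\nVerse line 1\nVerse line 2\n\nChorus line 1\n")
# -> [['Verse line 1', 'Verse line 2'], ['Chorus line 1']]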
# sentiment analysis model
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and
    sEntiment Reasoner) sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text, ranging from -1 (most negative) to 1 (most positive).
    """
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    # Determine the sentiment label based on the compound score; +/-0.05 are the
    # thresholds conventionally recommended for VADER
    compound_score = sentiment_scores["compound"]
    if compound_score >= 0.05:
        label = "POSITIVE"
    elif compound_score <= -0.05:
        label = "NEGATIVE"
    else:
        label = "NEUTRAL"
    return label, compound_score
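
# Example (scores are illustrative; exact values depend on the VADER lexicon):
# analyze_sentiment_vader("I love this song")  # -> ('POSITIVE', ~0.6)
# analyze_sentiment_vader("The sky is gray")   # -> ('NEUTRAL', near 0.0)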
# get sentiment of all text items in 'lyrics' column
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.

    This function applies the VADER sentiment analysis model from the Natural Language Toolkit (nltk)
    to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
    ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
    each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
    DataFrame is returned.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed. Each entry is a
        list of lyric lines (one song section).

    Returns:
    --------
    df : pandas DataFrame
        The modified DataFrame with sentiment analysis added as new columns.
    """
    for row in df.index:
        # Each 'lyrics' entry is a list of lines; join them into one string per section
        sec_lyrs_str = " ".join(df.loc[row, 'lyrics'])
        label, valence = analyze_sentiment_vader(sec_lyrs_str)
        df.loc[row, 'sentiment_label'] = label
        df.loc[row, 'sentiment_valence'] = valence
    return df
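
# Example (a hypothetical two-section frame matching the expected 'lyrics' shape):
# demo = pd.DataFrame({'lyrics': [["I love this song"], ["The sky is gray"]]})
# get_sentiments(demo)[['sentiment_label', 'sentiment_valence']]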
# get just metadata for songs (not lyrics)
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist
        will be retrieved.

    Returns:
    --------
    dict
        A dictionary containing metadata for each song found. The keys are the song titles and
        the values are dictionaries containing various metadata for each song.
    """
    url = find_artist(artist_name)
    azlyrics_artist_name = url.split("/")[-1][:-5]
    API = azapi.AZlyrics('google', accuracy=0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # dict keyed by song title
    az_titles = list(all_songs_info)

    if song_titles is None:
        return all_songs_info

    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # Fall back to the most similar azlyrics title by Jaro similarity
            best_id = None
            max_sim = -100
            for id, az_name in enumerate(az_titles):
                dist = Levenshtein.jaro(title, az_name)
                if max_sim < dist:
                    max_sim = dist
                    best_id = id
            found_data[az_titles[best_id]] = all_songs_info[az_titles[best_id]]
    return found_data
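
# Example usage (fields follow azapi's getSongs() output as consumed below:
# 'album', 'year', 'type', 'url'; "Gravity" is an illustrative title, not a
# guaranteed key):
# meta = get_metadata("John Mayer", song_titles=["Gravity"])
# meta["Gravity"]["album"], meta["Gravity"]["year"], meta["Gravity"]["url"]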
# combine metadata with found lyrics
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist
        will be retrieved.
    delay: tuple
        A tuple containing the minimum and maximum amount of time (in seconds) to wait between
        requests to avoid being banned by the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        A pandas DataFrame containing metadata and sentiment analysis for each song found.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")
    artist_data = get_metadata(artist_name=artist_name, song_titles=song_titles)
    if print_progress:
        print("\n\t- All metadata found")

    times = []
    for title, mdata in artist_data.items():
        start = time.time()
        try:
            lyrics = follow_lyrics(lyric_url=mdata['url'])
            mdata['lyrics'] = sectionize(lyrics)
        except (ValueError, requests.RequestException, AttributeError):
            # Record an empty lyrics list so this song yields no rows downstream
            print(f"\tCouldn't find lyrics to {title}. Moving to next song.")
            mdata['lyrics'] = []
        # delay between requests so as to not get banned
        random_delay(min_val=delay[0], max_val=delay[1], print_delay=False)
        # track elapsed time per song to estimate time remaining
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        remaining = (len(artist_data) - len(times)) * avg_time  # len(times) == iterations completed
        # progress reporting
        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining / 60, 2)} minutes")
            else:  # less than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining, 2)} seconds")

    if print_progress:
        print("\nAll lyrics and metadata found. Returning structured data.")
    # one row per song, with the full list of lyric sections
    df_dict = {
        'artist_name': [],
        'song_title': [],
        'release_year': [],
        'lyrics': [],
        'lyrics_section_number': [],
        'album_name': [],
        'release_type': [],
        'lyrics_url': [],
    }
    for title, info in artist_data.items():
        df_dict['artist_name'].append(artist_name)
        df_dict['song_title'].append(title)
        df_dict['album_name'].append(info['album'])
        df_dict['release_year'].append(info['year'])
        df_dict['lyrics'].append(info['lyrics'])
        df_dict['lyrics_section_number'].append(len(info['lyrics']))  # number of sections
        df_dict['release_type'].append(info['type'])
        df_dict['lyrics_url'].append(info['url'])
    # flatten each song into one row per lyric section
    new_dict = {key: [] for key in df_dict}
    for i in range(len(df_dict['lyrics'])):
        for id, inner in enumerate(df_dict['lyrics'][i]):
            new_dict['song_title'].append(df_dict['song_title'][i])
            new_dict['release_year'].append(df_dict['release_year'][i])
            new_dict['album_name'].append(df_dict['album_name'][i])
            new_dict['artist_name'].append(df_dict['artist_name'][i])
            new_dict['lyrics'].append(inner)
            new_dict['lyrics_section_number'].append(id)
            new_dict['release_type'].append(df_dict['release_type'][i])
            new_dict['lyrics_url'].append(df_dict['lyrics_url'][i])

    sents_df = get_sentiments(pd.DataFrame(new_dict))
    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", "sentiment_label",
                         "sentiment_valence", "album_name", "release_type", "lyrics_url"]]
    if print_progress:
        print("Data retrieval complete!\n\n------------------------")
    return sents_df
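
# Example driver (a minimal sketch; the artist and title are illustrative, and the
# calls hit azlyrics.com live, so review their terms of service before running):
if __name__ == "__main__":
    df = get_all_data("John Mayer", song_titles=["Gravity"],
                      delay=(0.5, 2), print_progress=True)
    print(df.head())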