import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import Levenshtein
import nltk
import azapi
from nltk.sentiment import SentimentIntensityAnalyzer

# fetch the VADER lexicon once at import time
nltk.download("vader_lexicon")


def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay:
        print(f"Delayed {val} seconds")
    return val


# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link to an artist's page on azlyrics.com.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML
    content to find the artist's page, and returns the URL to that page.
    Exact matches require azlyrics' lowercase, no-space form of the name;
    anything else falls back to the closest match by Jaro similarity.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.

    Raises:
    ------
    ValueError:
        If the artist index page cannot be fetched.
    """
    # The index page is keyed by the first non-space character of the name;
    # azlyrics serves lowercase index pages (e.g. /j.html)
    for char in artist_name:
        if char != " ":
            first_letter = char.lower()
            break

    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch {url}. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the 'div' elements holding the artist columns
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

    # Extract the 'href' attribute from each 'a' tag within the artist divs
    artist_links = []
    for artist_div in artist_divs:
        for anchor in artist_div.find_all('a'):
            artist_links.append(anchor.get('href'))

    # Reduce each link to its bare artist slug (e.g. "j/johnmayer.html" -> "johnmayer")
    artist_urls = []
    for link in artist_links:
        artist_urls.append(str(link).split("/")[-1][:-5])

    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

    # Otherwise choose the most similar artist slug by Jaro similarity
    min_id = None
    max_sim = -100
    for id, name in enumerate(artist_urls):
        dist = Levenshtein.jaro(artist_name, name)
        if max_sim < dist:
            max_sim = dist
            min_id = id
    return f"https://www.azlyrics.com/{artist_links[min_id]}"
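
# Illustrative call of find_artist (hedged: this hits the live site, and the
# resolved URL below is only an example of azlyrics' "<letter>/<slug>.html"
# link shape, not a guaranteed result). Exact matches require azlyrics'
# lowercase, no-space slug; anything else uses the Jaro-similarity fallback:
#
#   >>> find_artist("johnmayer")
#   'https://www.azlyrics.com/j/johnmayer.html'
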
""" # # delay website call by a random amount as to not get banned # random_delay(min_val = 1, max_val = 3, print_delay = False) # Send an HTTP request to the lyric_url response = requests.get(lyric_url) # Check if the request was successful if response.status_code == 200: # Parse the HTML content soup = BeautifulSoup(response.text, 'html.parser') # Find the main div element containing the lyrics main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center') # Find the div element containing the lyrics within the main div lyrics_div = None for div in main_div.find_all('div'): if not div.has_attr('class') and not div.has_attr('id'): lyrics_div = div break if lyrics_div: # Clean up the lyrics by removing unnecessary HTML tags and whitespace lyrics_str = lyrics_div.get_text(strip = False) else: print(f"Error: Unable to find the lyrics for '{lyric_url}'.") else: print(f"Error: Unable to fetch the webpage. Status code: {response.status_code}") return lyrics_str # get artist link on azlyrics def find_artist(artist_name: str) -> str: """ Finds the link for the artist page on azlyrics.com. Parameters: ------ `artist_name`: str The name of the artist. Returns: ------ `url`: str The URL of the artist page on azlyrics.com. Raises: ------ `ValueError`: If the artist page cannot be found. """ for char in artist_name: if char != " ": first_letter = char break # The target URL url = f"https://www.azlyrics.com/{first_letter}.html" # Send an HTTP request to the URL response = requests.get(url) # Check if the request was successful if response.status_code == 200: # Parse the HTML content soup = BeautifulSoup(response.text, 'html.parser') # Find all the 'div' elements with the class "col-sm-6 text-center artist-col" artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col') # Initialize an empty list to store the artist links artist_links = [] # Extract the 'href' attribute from each 'a' tag within the artist divs for artist_div in artist_divs: for anchor in artist_div.find_all('a'): href = anchor.get('href') artist_links.append(href) # choose most similar artist link from all artist links artist_urls = [] for url in artist_links: artist_urls.append(str(url).split("/")[-1][:-5]) if artist_name in artist_urls: return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}" else: min_id = None max_sim = -100 for id, name in enumerate(artist_urls): dist = Levenshtein.jaro(artist_name, name) if max_sim < dist: max_sim = dist min_id = id return f"https://www.azlyrics.com/{artist_links[min_id]}" # this will flatten all inner lists (all depths) of a list into a list of depth == 1 def flatten_list(lst: list): """ Flattens all inner lists (all depths) of a list into a list of depth == 1. Parameters: ------ `lst`: List The list to be flattened. Returns: ------ `result`: List The flattened list. """ result = [] for element in lst: if isinstance(element, list): result.extend(flatten_list(element)) else: result.append(element) return result # lyric pre-processing def process_lyrics(lyrics: str): """ Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics by `\n`, and removing consecutive whitespace list items. Parameters: ------ `lyrics`: str The lyrics to be pre-processed. Returns: ------ `cleaned_lines`: List The pre-processed lyrics. 
""" # Replace "\r" with an empty string lyrics = lyrics.replace('\r', '') # Split the lyrics by "\n" lines = lyrics.split('\n') # Remove consecutive whitespace list items cleaned_lines = [line for i, line in enumerate(lines) if i == 0 or lines[i - 1].strip() != '' or line.strip() != ''] return cleaned_lines # splitting pre-processed lyrics into sections (this typically loosely matches a song form) def sectionize(lyrics: str): """ Splits the pre-processed lyrics into sections. Parameters: ------ `lyrics`: str The pre-processed lyrics. Returns: ------ `all_sections`: List The lyrics split into sections. """ lyrs_list = process_lyrics(lyrics) sectd = [] for line in lyrs_list: if line == "": sectd.append("#SEC") else: sectd.append(line) del sectd[-1] all_sections = [] for id, line in enumerate(sectd): if id == 0: sec_list = [] if line == "#SEC": all_sections.append(sec_list) sec_list = [] else: sec_list.append(line) del all_sections[0] return all_sections # sentiment analysis model def analyze_sentiment_vader(text: str): """ Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis model. Parameters: ------ `text`: str The text to be analyzed. Returns: ------ `label`: str The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL". `compound_score`: float The compound score of the text. """ sia = SentimentIntensityAnalyzer() sentiment_scores = sia.polarity_scores(text) # Determine the sentiment label based on the compound score compound_score = sentiment_scores["compound"] if compound_score >= 0.05: label = "POSITIVE" elif compound_score <= -0.05: label = "NEGATIVE" else: label = "NEUTRAL" return label, compound_score # get sentiment of all text items in 'lyrics' column def get_sentiments(df): """ Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame. This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk) to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to each text item. The sentiment analysis is added as new columns to the DataFrame and the modified DataFrame is returned. Parameters: ----------- df : pandas DataFrame The DataFrame containing the 'lyrics' column to be analyzed. Returns: -------- df : pandas DataFrame The modified DataFrame with sentiment analysis added as new columns. Raises: ------- None. """ for row in df.index: section_lyrics = df.loc[row, 'lyrics'] sec_lyrs_str = "" for line in section_lyrics: sec_lyrs_str += line + " " label, valence = analyze_sentiment_vader(sec_lyrs_str) df.loc[row, 'sentiment_label'] = label df.loc[row, 'sentiment_valence'] = valence return df # get just metadata for songs (not lyrics) def get_metadata(artist_name: str, song_titles: list = None) -> dict: """ Get all metadata for the passed artist and songs. Parameters: ----------- artist_name: str The name of the artist to search for. song_titles: list A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved. Returns: -------- dict A dictionary containing metadata for each song found. The keys are the song titles and the values are dictionaries containing various metadata for each song. 
""" urls = find_artist(artist_name) azlyrics_artist_name = urls.split("/")[-1][:-5] API = azapi.AZlyrics('google', accuracy = 0.6) API.artist = azlyrics_artist_name all_songs_info = API.getSongs() # dictionary az_titles = [title for title in all_songs_info] if song_titles == None: return all_songs_info else: found_data = {} for title in song_titles: if title in az_titles: found_data[title] = all_songs_info[title] else: min_id = None max_sim = -100 for id, az_name in enumerate(az_titles): dist = Levenshtein.jaro(title, az_name) if max_sim < dist: max_sim = dist min_id = id found_data[az_titles[min_id]] = all_songs_info[az_titles[min_id]] return found_data # combine metadata with found lyrics def get_all_data(artist_name: str, song_titles: list = None, delay: tuple = (0.5, 2), print_progress: bool = False): """ Get all metadata and sentiment analysis for the passed artist and songs. Parameters: ----------- artist_name: str The name of the artist to search for. song_titles: list A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved. delay: tuple A tuple containing the minimum and maximum amount of time (in seconds) to wait between requests to avoid being banned by the server. print_progress: bool Whether to print progress messages or not. Returns: -------- pd.DataFrame A pandas DataFrame containing metadata and sentiment analysis for each song found. """ if print_progress == True: print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...") artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles) if print_progress == True: print(f"\n\t- All metadata found") times = [] for title, mdata in artist_data.items(): start = time.time() # try: # lyrics = follow_lyrics(lyric_url = artist_data[title]['url']) # artist_data[title]['lyrics'] = sectionize(lyrics) # except: (UnboundLocalError, TypeError, AttributeError) # print(f"\tCouldn't find lyrics to {title}. Moving to next song.") # pass lyrics = follow_lyrics(lyric_url = artist_data[title]['url']) artist_data[title]['lyrics'] = sectionize(lyrics) # as to not get banned random_delay(min_val = delay[0], max_val = delay[1], print_delay = False) # time stuff times.append(start - time.time()) avg_time = sum(times) / len(times) remaining = abs((len(artist_data) - len(times)) * avg_time) # len(times) also gives the number of iterations completed # printing stuff if print_progress == True: if remaining >= 60: # more than one minute remaining remaining = round(remaining / 60, 2) print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} minutes") else: # less than one minute remaining remaining = round(remaining, 2) print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} seconds") if print_progress == True: print(f"\nAll lyrics and metadata found. 
Returning structured data.") df_dict = {} df_dict['artist_name'] = [] df_dict['song_title'] = [] df_dict['release_year'] = [] df_dict['lyrics'] = [] df_dict['lyrics_section_number'] = [] df_dict['album_name'] = [] df_dict['release_type'] = [] df_dict['lyrics_url'] = [] for title, info in artist_data.items(): df_dict['artist_name'].append("John Mayer") df_dict['song_title'].append(title) df_dict['album_name'].append(info['album']) df_dict['release_year'].append(info['year']) df_dict['lyrics'].append(info['lyrics']) df_dict['lyrics_section_number'].append(len(info['album'])) df_dict['release_type'].append(info['type']) df_dict['lyrics_url'].append(info['url']) new_dict = {} for key in df_dict: new_dict[key] = [] for i in range(len(df_dict['lyrics'])): for id, inner in enumerate(df_dict['lyrics'][i]): new_dict['song_title'].append(df_dict['song_title'][i]) new_dict['release_year'].append(df_dict['release_year'][i]) new_dict['album_name'].append(df_dict['album_name'][i]) new_dict['artist_name'].append(df_dict['artist_name'][i]) new_dict['lyrics'].append(inner) new_dict['lyrics_section_number'].append(id) new_dict['release_type'].append(df_dict['release_type'][i]) new_dict['lyrics_url'].append(df_dict['lyrics_url'][i]) sents_df = get_sentiments(pd.DataFrame(new_dict)) # reordering columns to better suit the task sents_df = sents_df[["artist_name", "song_title", "release_year", "lyrics", "lyrics_section_number", 'sentiment_label', 'sentiment_valence', "album_name", "release_type", "lyrics_url"]] if print_progress == True: print(f"Data retrieval complete!\n\n------------------------") return sents_df