# SongScope / songscope.py
# kmaurinjones's picture
# Update songscope.py
# 042f3b8
# raw
# history blame
# No virus
# 17.1 kB
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import Levenshtein
import nltk
import azapi
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Sleep for a random duration to simulate human-like behavior between website pings.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time actually slept (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay:  # idiomatic truthiness instead of `== True`
        print(f"Delayed {val} seconds")
    return val
def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.

    NOTE(review): this definition is shadowed by an identical redefinition of
    ``find_artist`` later in this module, so after import only the later copy
    is bound. Consider deleting one of the two copies.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.
        NOTE(review): implicitly returns None when the index page fetch is not
        HTTP 200 — callers should expect that.
    """
    # First non-space character selects the azlyrics index page (".../<letter>.html").
    # NOTE(review): `first_letter` stays unbound if artist_name is empty/all spaces.
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    url = f"https://www.azlyrics.com/{first_letter}.html"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # All artist columns on the index page.
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
        artist_links = []
        for artist_div in artist_divs:
            for anchor in artist_div.find_all('a'):
                href = anchor.get('href')
                artist_links.append(href)
        # Derive bare artist slugs from hrefs ("<path>/<slug>.html" -> "<slug>").
        # NOTE(review): the loop variable shadows the outer `url`.
        artist_urls = []
        for url in artist_links:
            artist_urls.append(str(url).split("/")[-1][:-5])
        if artist_name in artist_urls:
            # Exact slug match.
            return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"
        else:
            # Fall back to the closest slug by Jaro similarity.
            min_id = None
            max_sim = -100
            for id, name in enumerate(artist_urls):
                dist = Levenshtein.jaro(artist_name, name)
                if max_sim < dist:
                    max_sim = dist
                    min_id = id
            # NOTE(review): raises TypeError (index None) if artist_links is empty.
            return f"https://www.azlyrics.com/{artist_links[min_id]}"
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML
    content, and extracts the song lyrics as a single string.

    Note: Web scraping may be against the terms of service of some websites.
    Azlyrics.com specifically prohibits the usage of their content by third-party
    lyrics providers. Always review the website's policies and ensure you are
    compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str
        The lyrics of the song as a single string.

    Raises:
    ------
    `ValueError`: If the lyrics cannot be found or if there's an error fetching
        the webpage. (The original printed an error and then crashed with
        UnboundLocalError on `return lyrics_str`; raising matches the documented
        contract.)
    """
    response = requests.get(lyric_url)
    if response.status_code != 200:
        raise ValueError(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    # Main column that contains the lyrics block.
    main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')
    if main_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    # On azlyrics the lyrics live in the first bare <div> (no class and no id).
    lyrics_div = None
    for div in main_div.find_all('div'):
        if not div.has_attr('class') and not div.has_attr('id'):
            lyrics_div = div
            break
    if lyrics_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    # Keep whitespace: blank lines mark section boundaries for sectionize().
    return lyrics_div.get_text(strip = False)
# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist (compared against azlyrics' lowercase URL slugs).

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found (blank name, failed fetch,
        or an index page with no artist links). The original silently returned
        None in those cases despite documenting ValueError.
    """
    # First non-space character selects the azlyrics index page (".../<letter>.html").
    first_letter = None
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")
    # The target URL (do not reuse the name `url` for loop variables below).
    index_url = f"https://www.azlyrics.com/{first_letter}.html"
    response = requests.get(index_url)
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch '{index_url}'. Status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    # All artist columns on the index page.
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
    artist_links = [anchor.get('href')
                    for artist_div in artist_divs
                    for anchor in artist_div.find_all('a')]
    if not artist_links:
        # Original would crash with `artist_links[None]` (TypeError) here.
        raise ValueError(f"No artist links found on '{index_url}'")
    # Derive bare artist slugs from hrefs ("<path>/<slug>.html" -> "<slug>").
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]
    if artist_name in artist_urls:
        best = artist_urls.index(artist_name)
    else:
        # Fall back to the closest slug by Jaro similarity (higher is closer).
        best = max(range(len(artist_urls)),
                   key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
    return f"https://www.azlyrics.com/{artist_links[best]}"
# this will flatten all inner lists (all depths) of a list into a list of depth == 1
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        A new list with every non-list element of `lst`, in order, with nested
        lists of any depth expanded in place.
    """
    # Iterative depth-first walk with an explicit iterator stack
    # (equivalent to the recursive formulation, same element order).
    flat = []
    stack = [iter(lst)]
    while stack:
        try:
            item = next(stack[-1])
        except StopIteration:
            stack.pop()
            continue
        if isinstance(item, list):
            stack.append(iter(item))
        else:
            flat.append(item)
    return flat
# lyric pre-processing
def process_lyrics(lyrics: str):
    """
    Normalizes raw lyric text into a list of lines.

    Carriage returns are removed, the text is split on newlines, and any run of
    consecutive blank lines is collapsed so that at most one blank line remains
    between non-blank lines.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyric lines.
    """
    lines = lyrics.replace('\r', '').split('\n')
    cleaned_lines = []
    previous = None
    for line in lines:
        # Keep a line unless both it and the line immediately before it
        # (in the *original* split) are blank.
        if previous is None or previous.strip() or line.strip():
            cleaned_lines.append(line)
        previous = line
    return cleaned_lines
# splitting pre-processed lyrics into sections (this typically loosely matches a song form)
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into blank-line-delimited sections
    (this typically loosely matches a song form).

    Parameters:
    ------
    `lyrics`: str
        The raw lyrics (pre-processed internally via `process_lyrics`).

    Returns:
    ------
    `all_sections`: List
        A list of sections, each a list of non-blank lines.

    Fixes vs. the original implementation:
    - the final section is no longer dropped (it was only appended when a
      trailing blank line survived, and `del sectd[-1]` removed it);
    - `del all_sections[0]` no longer discards the first *real* section when
      the lyrics do not start with a blank line;
    - empty input returns [] instead of raising IndexError.
    """
    all_sections = []
    current = []
    for line in process_lyrics(lyrics):
        if line == "":
            # Blank line terminates the current section (ignore empty runs).
            if current:
                all_sections.append(current)
            current = []
        else:
            current.append(line)
    if current:
        # Flush the trailing section — the original lost it.
        all_sections.append(current)
    return all_sections
# sentiment analysis model
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary
    and sEntiment Reasoner) sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text (in [-1, 1]).
    """
    # Reuse one analyzer across calls: constructing SentimentIntensityAnalyzer
    # reloads the VADER lexicon, which the original did on every invocation.
    sia = getattr(analyze_sentiment_vader, "_sia", None)
    if sia is None:
        sia = SentimentIntensityAnalyzer()
        analyze_sentiment_vader._sia = sia
    sentiment_scores = sia.polarity_scores(text)
    # Standard VADER thresholds: >= 0.05 positive, <= -0.05 negative.
    compound_score = sentiment_scores["compound"]
    if compound_score >= 0.05:
        label = "POSITIVE"
    elif compound_score <= -0.05:
        label = "NEGATIVE"
    else:
        label = "NEUTRAL"
    return label, compound_score
# get sentiment of all text items in 'lyrics' column
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column
    of a pandas DataFrame.

    Each row's 'lyrics' value (a list of lines) is joined into a single
    space-separated string and scored with `analyze_sentiment_vader`. Two new
    columns are written back onto the frame: 'sentiment_label' ('POSITIVE',
    'NEGATIVE', or 'NEUTRAL') and 'sentiment_valence' (compound score, -1 to 1).

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.

    Returns:
    --------
    df : pandas DataFrame
        The same DataFrame, modified in place, with sentiment columns added.

    Raises:
    -------
    None.
    """
    for idx in df.index:
        # Join with a trailing space per line, matching the original "+= line + ' '".
        section_text = "".join(f"{line} " for line in df.loc[idx, 'lyrics'])
        label, valence = analyze_sentiment_vader(section_text)
        df.loc[idx, 'sentiment_label'] = label
        df.loc[idx, 'sentiment_valence'] = valence
    return df
# get just metadata for songs (not lyrics)
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs (not the lyrics themselves).

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs
        by the artist will be retrieved. Titles with no exact match fall back
        to the closest azlyrics title by Jaro similarity.

    Returns:
    --------
    dict
        A dictionary keyed by song title; each value is a dictionary of
        metadata for that song as returned by azapi.
    """
    # Resolve the artist's azlyrics slug from their artist-page URL.
    artist_url = find_artist(artist_name)
    azlyrics_artist_name = artist_url.split("/")[-1][:-5]
    API = azapi.AZlyrics('google', accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # {title: {metadata}} per azapi
    if song_titles is None:  # fix: was `== None`
        return all_songs_info
    az_titles = list(all_songs_info)
    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # No exact match: pick the closest azlyrics title by Jaro similarity.
            best = max(az_titles, key=lambda az_name: Levenshtein.jaro(title, az_name))
            found_data[best] = all_songs_info[best]
    return found_data
# combine metadata with found lyrics
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata, lyrics, and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs
        by the artist will be retrieved.
    delay: tuple
        (min, max) seconds to wait between requests to avoid being banned by
        the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        One row per lyric section, with song metadata and VADER sentiment columns.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")
    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)
    if print_progress:
        print(f"\n\t- All metadata found")
    # Fetch and sectionize lyrics for every song, tracking per-song timings
    # for the remaining-time estimate.
    times = []
    for title in artist_data:
        start = time.time()
        lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
        artist_data[title]['lyrics'] = sectionize(lyrics)
        # Polite delay between requests so we don't get banned.
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)
        # fix: original appended `start - time.time()` (negative durations,
        # masked downstream by abs()).
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        # len(times) is also the number of songs completed so far.
        remaining = (len(artist_data) - len(times)) * avg_time
        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining / 60, 2)} minutes")
            else:  # less than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining, 2)} seconds")
    if print_progress:
        print(f"\nAll lyrics and metadata found. Returning structured data.")
    # Build one output row per lyric section of every song (single pass; the
    # original built a per-song dict first and then re-expanded it).
    columns = ["artist_name", "song_title", "release_year", "lyrics",
               "lyrics_section_number", "album_name", "release_type", "lyrics_url"]
    rows = {col: [] for col in columns}
    for title, info in artist_data.items():
        for sec_id, section in enumerate(info['lyrics']):
            rows['artist_name'].append(artist_name)  # fix: was hard-coded "John Mayer"
            rows['song_title'].append(title)
            rows['release_year'].append(info['year'])
            rows['lyrics'].append(section)
            rows['lyrics_section_number'].append(sec_id)
            rows['album_name'].append(info['album'])
            rows['release_type'].append(info['type'])
            rows['lyrics_url'].append(info['url'])
    sents_df = get_sentiments(pd.DataFrame(rows))
    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]
    if print_progress:
        print(f"Data retrieval complete!\n\n------------------------")
    return sents_df