# SongScope / songscope.py
# kmaurinjones's picture
# Update songscope.py
# 042f3b8
# raw
# history blame
# No virus
# 17.1 kB
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import Levenshtein
import nltk
import azapi
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Sleep for a random duration to simulate human-like behavior between website pings.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time actually slept (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay:  # idiomatic truthiness instead of `== True`
        print(f"Delayed {val} seconds")
    return val
def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.

    NOTE(review): this definition is shadowed by an identical redefinition of
    ``find_artist`` later in this module, so after import only the later copy
    is bound. Consider deleting one of the two copies.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.
        NOTE(review): implicitly returns None when the index page fetch is not
        HTTP 200 — callers should expect that.
    """
    # First non-space character selects the azlyrics index page (".../<letter>.html").
    # NOTE(review): `first_letter` stays unbound if artist_name is empty/all spaces.
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    url = f"https://www.azlyrics.com/{first_letter}.html"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # All artist columns on the index page.
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
        artist_links = []
        for artist_div in artist_divs:
            for anchor in artist_div.find_all('a'):
                href = anchor.get('href')
                artist_links.append(href)
        # Derive bare artist slugs from hrefs ("<path>/<slug>.html" -> "<slug>").
        # NOTE(review): the loop variable shadows the outer `url`.
        artist_urls = []
        for url in artist_links:
            artist_urls.append(str(url).split("/")[-1][:-5])
        if artist_name in artist_urls:
            # Exact slug match.
            return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"
        else:
            # Fall back to the closest slug by Jaro similarity.
            min_id = None
            max_sim = -100
            for id, name in enumerate(artist_urls):
                dist = Levenshtein.jaro(artist_name, name)
                if max_sim < dist:
                    max_sim = dist
                    min_id = id
            # NOTE(review): raises TypeError (index None) if artist_links is empty.
            return f"https://www.azlyrics.com/{artist_links[min_id]}"
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML
    content, and extracts the song lyrics as a single string.

    Note: Web scraping may be against the terms of service of some websites.
    Azlyrics.com specifically prohibits the usage of their content by third-party
    lyrics providers. Always review the website's policies and ensure you are
    compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str
        The lyrics of the song as a single string.

    Raises:
    ------
    `ValueError`: If the lyrics cannot be found or if there's an error fetching
        the webpage. (The original printed an error and then crashed with
        UnboundLocalError on `return lyrics_str`; raising matches the documented
        contract.)
    """
    response = requests.get(lyric_url)
    if response.status_code != 200:
        raise ValueError(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    # Main column that contains the lyrics block.
    main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')
    if main_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    # On azlyrics the lyrics live in the first bare <div> (no class and no id).
    lyrics_div = None
    for div in main_div.find_all('div'):
        if not div.has_attr('class') and not div.has_attr('id'):
            lyrics_div = div
            break
    if lyrics_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    # Keep whitespace: blank lines mark section boundaries for sectionize().
    return lyrics_div.get_text(strip = False)
# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist (compared against azlyrics' lowercase URL slugs).

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found (blank name, failed fetch,
        or an index page with no artist links). The original silently returned
        None in those cases despite documenting ValueError.
    """
    # First non-space character selects the azlyrics index page (".../<letter>.html").
    first_letter = None
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")
    # The target URL (do not reuse the name `url` for loop variables below).
    index_url = f"https://www.azlyrics.com/{first_letter}.html"
    response = requests.get(index_url)
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch '{index_url}'. Status code: {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    # All artist columns on the index page.
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
    artist_links = [anchor.get('href')
                    for artist_div in artist_divs
                    for anchor in artist_div.find_all('a')]
    if not artist_links:
        # Original would crash with `artist_links[None]` (TypeError) here.
        raise ValueError(f"No artist links found on '{index_url}'")
    # Derive bare artist slugs from hrefs ("<path>/<slug>.html" -> "<slug>").
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]
    if artist_name in artist_urls:
        best = artist_urls.index(artist_name)
    else:
        # Fall back to the closest slug by Jaro similarity (higher is closer).
        best = max(range(len(artist_urls)),
                   key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
    return f"https://www.azlyrics.com/{artist_links[best]}"
# this will flatten all inner lists (all depths) of a list into a list of depth == 1
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        A new list with every non-list element of `lst`, in order, with nested
        lists of any depth expanded in place.
    """
    # Iterative depth-first walk with an explicit iterator stack
    # (equivalent to the recursive formulation, same element order).
    flat = []
    stack = [iter(lst)]
    while stack:
        try:
            item = next(stack[-1])
        except StopIteration:
            stack.pop()
            continue
        if isinstance(item, list):
            stack.append(iter(item))
        else:
            flat.append(item)
    return flat
# lyric pre-processing
def process_lyrics(lyrics: str):
    """
    Normalizes raw lyric text into a list of lines.

    Carriage returns are removed, the text is split on newlines, and any run of
    consecutive blank lines is collapsed so that at most one blank line remains
    between non-blank lines.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyric lines.
    """
    lines = lyrics.replace('\r', '').split('\n')
    cleaned_lines = []
    previous = None
    for line in lines:
        # Keep a line unless both it and the line immediately before it
        # (in the *original* split) are blank.
        if previous is None or previous.strip() or line.strip():
            cleaned_lines.append(line)
        previous = line
    return cleaned_lines
# splitting pre-processed lyrics into sections (this typically loosely matches a song form)
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into blank-line-delimited sections
    (this typically loosely matches a song form).

    Parameters:
    ------
    `lyrics`: str
        The raw lyrics (pre-processed internally via `process_lyrics`).

    Returns:
    ------
    `all_sections`: List
        A list of sections, each a list of non-blank lines.

    Fixes vs. the original implementation:
    - the final section is no longer dropped (it was only appended when a
      trailing blank line survived, and `del sectd[-1]` removed it);
    - `del all_sections[0]` no longer discards the first *real* section when
      the lyrics do not start with a blank line;
    - empty input returns [] instead of raising IndexError.
    """
    all_sections = []
    current = []
    for line in process_lyrics(lyrics):
        if line == "":
            # Blank line terminates the current section (ignore empty runs).
            if current:
                all_sections.append(current)
            current = []
        else:
            current.append(line)
    if current:
        # Flush the trailing section — the original lost it.
        all_sections.append(current)
    return all_sections
# sentiment analysis model
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary
    and sEntiment Reasoner) sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text (in [-1, 1]).
    """
    # Reuse one analyzer across calls: constructing SentimentIntensityAnalyzer
    # reloads the VADER lexicon, which the original did on every invocation.
    sia = getattr(analyze_sentiment_vader, "_sia", None)
    if sia is None:
        sia = SentimentIntensityAnalyzer()
        analyze_sentiment_vader._sia = sia
    sentiment_scores = sia.polarity_scores(text)
    # Standard VADER thresholds: >= 0.05 positive, <= -0.05 negative.
    compound_score = sentiment_scores["compound"]
    if compound_score >= 0.05:
        label = "POSITIVE"
    elif compound_score <= -0.05:
        label = "NEGATIVE"
    else:
        label = "NEUTRAL"
    return label, compound_score
# get sentiment of all text items in 'lyrics' column
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column
    of a pandas DataFrame.

    Each row's 'lyrics' value (a list of lines) is joined into a single
    space-separated string and scored with `analyze_sentiment_vader`. Two new
    columns are written back onto the frame: 'sentiment_label' ('POSITIVE',
    'NEGATIVE', or 'NEUTRAL') and 'sentiment_valence' (compound score, -1 to 1).

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.

    Returns:
    --------
    df : pandas DataFrame
        The same DataFrame, modified in place, with sentiment columns added.

    Raises:
    -------
    None.
    """
    for idx in df.index:
        # Join with a trailing space per line, matching the original "+= line + ' '".
        section_text = "".join(f"{line} " for line in df.loc[idx, 'lyrics'])
        label, valence = analyze_sentiment_vader(section_text)
        df.loc[idx, 'sentiment_label'] = label
        df.loc[idx, 'sentiment_valence'] = valence
    return df
# get just metadata for songs (not lyrics)
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs (not the lyrics themselves).

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs
        by the artist will be retrieved. Titles with no exact match fall back
        to the closest azlyrics title by Jaro similarity.

    Returns:
    --------
    dict
        A dictionary keyed by song title; each value is a dictionary of
        metadata for that song as returned by azapi.
    """
    # Resolve the artist's azlyrics slug from their artist-page URL.
    artist_url = find_artist(artist_name)
    azlyrics_artist_name = artist_url.split("/")[-1][:-5]
    API = azapi.AZlyrics('google', accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # {title: {metadata}} per azapi
    if song_titles is None:  # fix: was `== None`
        return all_songs_info
    az_titles = list(all_songs_info)
    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # No exact match: pick the closest azlyrics title by Jaro similarity.
            best = max(az_titles, key=lambda az_name: Levenshtein.jaro(title, az_name))
            found_data[best] = all_songs_info[best]
    return found_data
# combine metadata with found lyrics
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata, lyrics, and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs
        by the artist will be retrieved.
    delay: tuple
        (min, max) seconds to wait between requests to avoid being banned by
        the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        One row per lyric section, with song metadata and VADER sentiment columns.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")
    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)
    if print_progress:
        print(f"\n\t- All metadata found")
    # Fetch and sectionize lyrics for every song, tracking per-song timings
    # for the remaining-time estimate.
    times = []
    for title in artist_data:
        start = time.time()
        lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
        artist_data[title]['lyrics'] = sectionize(lyrics)
        # Polite delay between requests so we don't get banned.
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)
        # fix: original appended `start - time.time()` (negative durations,
        # masked downstream by abs()).
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        # len(times) is also the number of songs completed so far.
        remaining = (len(artist_data) - len(times)) * avg_time
        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining / 60, 2)} minutes")
            else:  # less than one minute remaining
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {round(remaining, 2)} seconds")
    if print_progress:
        print(f"\nAll lyrics and metadata found. Returning structured data.")
    # Build one output row per lyric section of every song (single pass; the
    # original built a per-song dict first and then re-expanded it).
    columns = ["artist_name", "song_title", "release_year", "lyrics",
               "lyrics_section_number", "album_name", "release_type", "lyrics_url"]
    rows = {col: [] for col in columns}
    for title, info in artist_data.items():
        for sec_id, section in enumerate(info['lyrics']):
            rows['artist_name'].append(artist_name)  # fix: was hard-coded "John Mayer"
            rows['song_title'].append(title)
            rows['release_year'].append(info['year'])
            rows['lyrics'].append(section)
            rows['lyrics_section_number'].append(sec_id)
            rows['album_name'].append(info['album'])
            rows['release_type'].append(info['type'])
            rows['lyrics_url'].append(info['url'])
    sents_df = get_sentiments(pd.DataFrame(rows))
    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]
    if print_progress:
        print(f"Data retrieval complete!\n\n------------------------")
    return sents_df