kmaurinjones committed on
Commit
a4106b6
1 Parent(s): fd633e6

Update songscope.py

Browse files
Files changed (1) hide show
  1. songscope.py +527 -3
songscope.py CHANGED
@@ -1,4 +1,528 @@
1
- import streamlit as st
 
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ import random
6
+ import Levenshtein
7
+ import nltk
8
+ nltk.download("vader_lexicon")
9
+ from nltk.sentiment import SentimentIntensityAnalyzer
10
+ import azapi
11
 
12
def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.

    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    # idiomatic truthiness test instead of "== True"
    if print_delay:
        print(f"Delayed {val} seconds")
    return val
35
+
36
def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.

    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.

    Raises:
    ------
    ValueError: If `artist_name` contains no non-space characters.
    """
    # The letter index page is keyed by the first non-space character.
    # Bug fix: the original left `first_letter` unbound (NameError) for an
    # empty or all-space artist name; fail with a clear error instead.
    first_letter = next((char for char in artist_name if char != " "), None)
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")

    url = f"https://www.azlyrics.com/{first_letter}.html"

    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

        # Collect every artist href on the letter index page.
        artist_links = [anchor.get('href')
                        for artist_div in artist_divs
                        for anchor in artist_div.find_all('a')]

        # Slug = last path component with the ".html" suffix stripped.
        artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

        if artist_name in artist_urls:
            return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

        # No exact slug match: fall back to the closest slug by Jaro similarity.
        best_id = max(range(len(artist_urls)),
                      key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
        return f"https://www.azlyrics.com/{artist_links[best_id]}"
    # Non-200 response: implicitly returns None (original behavior preserved).
88
+
89
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.

    This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
    and extracts the song lyrics. The lyrics are returned as a single string.

    Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
    specifically prohibits the usage of their content by third-party lyrics providers.
    Always review the website's policies and ensure you are compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.

    Returns:
    ------
    `lyrics_str`: str
        The lyrics of the song as a single string.

    Raises:
    ------
    `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
    """
    # Send an HTTP request to the lyric_url
    response = requests.get(lyric_url)

    # Bug fix: the original printed an error and then returned the unbound
    # local `lyrics_str` (UnboundLocalError) on every failure path. Raise the
    # ValueError the docstring promises instead.
    if response.status_code != 200:
        raise ValueError(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # The main div element containing the lyrics markup.
    main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')
    if main_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")

    # azlyrics marks the lyrics block as the only inner div with no class/id.
    lyrics_div = None
    for div in main_div.find_all('div'):
        if not div.has_attr('class') and not div.has_attr('id'):
            lyrics_div = div
            break

    if lyrics_div is None:
        raise ValueError(f"Error: Unable to find the lyrics for '{lyric_url}'.")

    # Keep whitespace as-is: downstream sectionizing splits on blank lines.
    return lyrics_div.get_text(strip = False)
148
+
149
+ # get artist link on azlyrics
150
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    NOTE(review): this re-definition shadows an earlier, nearly identical
    `find_artist` in this module; consider removing one of the two.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist.

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found.
    """
    # The letter index page is keyed by the first non-space character.
    # Bug fix: the original left `first_letter` unbound (NameError) for an
    # empty or all-space artist name.
    first_letter = next((char for char in artist_name if char != " "), None)
    if first_letter is None:
        raise ValueError("artist_name must contain at least one non-space character")

    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Bug fix: the original fell off the end (implicit None) on a non-200
    # response, despite the docstring promising a ValueError.
    if response.status_code != 200:
        raise ValueError(f"Unable to fetch artist index page. Status code: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # All 'div' elements with the class "col-sm-6 text-center artist-col"
    artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

    # Extract the 'href' attribute from each 'a' tag within the artist divs.
    artist_links = [anchor.get('href')
                    for artist_div in artist_divs
                    for anchor in artist_div.find_all('a')]

    # Slug = last path component with the ".html" suffix stripped.
    artist_urls = [str(link).split("/")[-1][:-5] for link in artist_links]

    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"

    # No exact slug match: choose the most similar slug by Jaro similarity.
    best_id = max(range(len(artist_urls)),
                  key=lambda i: Levenshtein.jaro(artist_name, artist_urls[i]))
    return f"https://www.azlyrics.com/{artist_links[best_id]}"
213
+
214
+ # this will flatten all inner lists (all depths) of a list into a list of depth == 1
215
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        The flattened list.
    """
    # Iterative depth-first traversal with an explicit stack of iterators,
    # avoiding recursion while preserving left-to-right element order.
    flat = []
    stack = [iter(lst)]
    while stack:
        try:
            item = next(stack[-1])
        except StopIteration:
            stack.pop()
            continue
        if isinstance(item, list):
            stack.append(iter(item))
        else:
            flat.append(item)
    return flat
236
+
237
+ # lyric pre-processing
238
def process_lyrics(lyrics: str):
    """
    Pre-processes the lyrics: strips carriage returns, splits on newlines,
    and collapses runs of consecutive blank lines down to a single blank line.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyrics, one list item per line.
    """
    # Drop carriage returns, then split into individual lines.
    lines = lyrics.replace('\r', '').split('\n')

    # Keep a line unless both it and its predecessor are blank.
    cleaned_lines = []
    prev_blank = False
    for line in lines:
        blank = line.strip() == ''
        if not (blank and prev_blank):
            cleaned_lines.append(line)
        prev_blank = blank

    return cleaned_lines
263
+
264
+ # splitting pre-processed lyrics into sections (this typically loosely matches a song form)
265
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into sections (this typically loosely
    matches a song form). Sections are blocks of lines separated by blank
    lines; empty sections are discarded.

    Parameters:
    ------
    `lyrics`: str
        The raw lyrics string (pre-processed internally via `process_lyrics`).

    Returns:
    ------
    `all_sections`: List
        The lyrics split into sections (a list of lists of lines).
    """
    lyrs_list = process_lyrics(lyrics)

    all_sections = []
    current = []
    for line in lyrs_list:
        if line == "":
            # Blank line terminates the current section; skip empty sections
            # (e.g. from leading blank lines) instead of the original's
            # fragile `del all_sections[0]`.
            if current:
                all_sections.append(current)
            current = []
        else:
            current.append(line)

    # Bug fix: the original deleted the trailing section marker and never
    # appended the still-accumulating section, silently dropping the final
    # section of every song. It also crashed (`del sectd[-1]`) on empty input.
    if current:
        all_sections.append(current)

    return all_sections
303
+
304
+ # sentiment analysis model
305
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary
    and sEntiment Reasoner) sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound_score = scores["compound"]

    # Standard VADER cutoffs: +/-0.05 separates neutral from polarized text.
    if compound_score <= -0.05:
        label = "NEGATIVE"
    elif compound_score >= 0.05:
        label = "POSITIVE"
    else:
        label = "NEUTRAL"

    return label, compound_score
335
+
336
+ # get sentiment of all text items in 'lyrics' column
337
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.

    This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
    to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
    ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
    each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
    DataFrame is returned.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.
        Each 'lyrics' cell is assumed to be an iterable of lines.

    Returns:
    --------
    df : pandas DataFrame
        The modified DataFrame with 'sentiment_label' and 'sentiment_valence'
        columns added.
    """
    for row in df.index:
        section_lyrics = df.loc[row, 'lyrics']
        # Join the section's lines into one string (idiomatic join instead of
        # quadratic `+=` concatenation; trailing space kept for parity).
        sec_lyrs_str = "".join(line + " " for line in section_lyrics)
        label, valence = analyze_sentiment_vader(sec_lyrs_str)
        df.loc[row, 'sentiment_label'] = label
        df.loc[row, 'sentiment_valence'] = valence

    return df
372
+
373
+ # get just metadata for songs (not lyrics)
374
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.

    Returns:
    --------
    dict
        A dictionary containing metadata for each song found. The keys are the song titles and the values are
        dictionaries containing various metadata for each song.
    """
    urls = find_artist(artist_name)
    # azlyrics slug: last path component with the ".html" suffix stripped.
    azlyrics_artist_name = urls.split("/")[-1][:-5]

    API = azapi.AZlyrics('google', accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs()  # dict keyed by song title

    az_titles = list(all_songs_info)

    # Idiom fix: identity comparison with None, not "== None".
    if song_titles is None:
        return all_songs_info

    found_data = {}
    for title in song_titles:
        if title in az_titles:
            found_data[title] = all_songs_info[title]
        else:
            # Fuzzy match: take the azlyrics title with the highest Jaro
            # similarity to the requested title.
            best_title = max(az_titles,
                             key=lambda az_name: Levenshtein.jaro(title, az_name))
            found_data[best_title] = all_songs_info[best_title]

    return found_data
420
+
421
+ # combine metadata with found lyrics
422
def get_all_data(artist_name: str, song_titles: list = None,
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata and sentiment analysis for the passed artist and songs.

    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.
    delay: tuple
        A tuple containing the minimum and maximum amount of time (in seconds) to wait between requests to avoid
        being banned by the server.
    print_progress: bool
        Whether to print progress messages or not.

    Returns:
    --------
    pd.DataFrame
        A pandas DataFrame containing metadata and sentiment analysis for each song found.
    """
    if print_progress:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")

    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)

    if print_progress:
        print(f"\n\t- All metadata found")

    times = []

    for title, mdata in artist_data.items():
        start = time.time()
        try:
            lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
            artist_data[title]['lyrics'] = sectionize(lyrics)
        # Bug fix: the original read "except: (UnboundLocalError, TypeError,
        # AttributeError)" -- a bare except followed by a no-op tuple
        # expression. Keep the best-effort "skip this song" semantics, but
        # stop swallowing KeyboardInterrupt/SystemExit.
        except Exception:
            print(f"\tCouldn't find lyrics to {title}. Moving to next song.")
            continue

        # as to not get banned
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)

        # Bug fix: the original stored negative durations (start - time.time())
        # and compensated later with abs(); store positive elapsed time directly.
        times.append(time.time() - start)
        avg_time = sum(times) / len(times)
        # len(times) also gives the number of iterations completed
        remaining = (len(artist_data) - len(times)) * avg_time

        if print_progress:
            if remaining >= 60:  # more than one minute remaining
                remaining = round(remaining / 60, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} minutes")
            else:  # less than one minute remaining
                remaining = round(remaining, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} seconds")

    if print_progress:
        print(f"\nAll lyrics and metadata found. Returning structured data.")

    # Build one row per lyric section. Songs whose lyrics could not be fetched
    # (no 'lyrics' key) are skipped -- the original crashed with KeyError on
    # them in its intermediate-dict pass.
    new_dict = {'artist_name': [], 'song_title': [], 'release_year': [],
                'lyrics': [], 'lyrics_section_number': [], 'album_name': [],
                'release_type': [], 'lyrics_url': []}

    for title, info in artist_data.items():
        if 'lyrics' not in info:
            continue
        for sec_id, section in enumerate(info['lyrics']):
            # Bug fix: the original hard-coded "John Mayer" as artist_name.
            new_dict['artist_name'].append(artist_name)
            new_dict['song_title'].append(title)
            new_dict['release_year'].append(info['year'])
            new_dict['album_name'].append(info['album'])
            new_dict['lyrics'].append(section)
            new_dict['lyrics_section_number'].append(sec_id)
            new_dict['release_type'].append(info['type'])
            new_dict['lyrics_url'].append(info['url'])

    sents_df = get_sentiments(pd.DataFrame(new_dict))

    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]

    if print_progress:
        print(f"Data retrieval complete!\n\n------------------------")

    return sents_df