import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import Levenshtein
import nltk
import azapi
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

def random_delay(min_val: float, max_val: float, print_delay: bool = False):
    """
    Inserts a random delay between website pings to simulate human-like behavior.
    
    Parameters:
    ----------
    min_val: float
        The minimum amount of time to delay (in seconds).
    max_val: float
        The maximum amount of time to delay (in seconds).
    print_delay: bool
        Whether or not to print the delay time.

    Returns:
    -------
    val: float
        The random delay time (in seconds).
    """
    val = random.uniform(min_val, max_val)
    time.sleep(val)
    if print_delay == True:
        print(f"Delayed {val} seconds")
    return val

def find_artist(artist_name: str):
    """
    Finds the link to an artist's page on azlyrics.com.
    
    This function sends an HTTP request to azlyrics.com, scrapes the HTML content
    to find the artist's page, and returns the URL to that page.

    Parameters:
    ----------
    artist_name: str
        The name of the artist.

    Returns:
    -------
    url: str
        The URL to the artist's page on azlyrics.com.
    """
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    
    url = f"https://www.azlyrics.com/{first_letter}.html"
    
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')
        artist_links = []

        for artist_div in artist_divs:
            for anchor in artist_div.find_all('a'):
                href = anchor.get('href')
                artist_links.append(href)

    artist_urls = []
    for url in artist_links:
        artist_urls.append(str(url).split("/")[-1][:-5])

    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"
    else:
        min_id = None
        max_sim = -100
        for id, name in enumerate(artist_urls):
            dist = Levenshtein.jaro(artist_name, name)
            if max_sim < dist:
                max_sim = dist
                min_id = id

        return f"https://www.azlyrics.com/{artist_links[min_id]}"
    
def follow_lyrics(lyric_url: str):
    """
    Retrieves the lyrics of a song from the specified URL on azlyrics.com.
    
    This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
    and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
    and whitespace. The function returns the lyrics as a string.

    Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
    specifically prohibits the usage of their content by third-party lyrics providers.
    Always review the website's policies and ensure you are compliant before scraping data.

    Parameters:
    ------
    `lyric_url`: str
        The URL of the song lyrics on azlyrics.com.
    `song_title`: str
        Title of the song

    Returns:
    ------
    `lyrics_str`: 
        The lyrics of the song as a single string.

    Raises:
    ------
        `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
    """

    # # delay website call by a random amount as to not get banned
    # random_delay(min_val = 1, max_val = 3, print_delay = False)

    # Send an HTTP request to the lyric_url
    response = requests.get(lyric_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main div element containing the lyrics
        main_div = soup.find('div', class_='col-xs-12 col-lg-8 text-center')

        # Find the div element containing the lyrics within the main div
        lyrics_div = None
        for div in main_div.find_all('div'):
            if not div.has_attr('class') and not div.has_attr('id'):
                lyrics_div = div
                break

        if lyrics_div:
            # Clean up the lyrics by removing unnecessary HTML tags and whitespace
            lyrics_str = lyrics_div.get_text(strip = False)
        else:
            print(f"Error: Unable to find the lyrics for '{lyric_url}'.")
    else:
        print(f"Error: Unable to fetch the webpage. Status code: {response.status_code}")

    return lyrics_str

# get artist link on azlyrics
def find_artist(artist_name: str) -> str:
    """
    Finds the link for the artist page on azlyrics.com.

    Parameters:
    ------
    `artist_name`: str
        The name of the artist.

    Returns:
    ------
    `url`: str
        The URL of the artist page on azlyrics.com.

    Raises:
    ------
    `ValueError`: If the artist page cannot be found.
    """
    for char in artist_name:
        if char != " ":
            first_letter = char
            break
    
    # The target URL
    url = f"https://www.azlyrics.com/{first_letter}.html"
    
    # Send an HTTP request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all the 'div' elements with the class "col-sm-6 text-center artist-col"
        artist_divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

        # Initialize an empty list to store the artist links
        artist_links = []

        # Extract the 'href' attribute from each 'a' tag within the artist divs
        for artist_div in artist_divs:
            for anchor in artist_div.find_all('a'):
                href = anchor.get('href')
                artist_links.append(href)

    # choose most similar artist link from all artist links
    artist_urls = []
    for url in artist_links:
        artist_urls.append(str(url).split("/")[-1][:-5])

    if artist_name in artist_urls:
        return f"https://www.azlyrics.com/{artist_links[artist_urls.index(artist_name)]}"
    else:
        min_id = None
        max_sim = -100
        for id, name in enumerate(artist_urls):
            dist = Levenshtein.jaro(artist_name, name)
            if max_sim < dist:
                max_sim = dist
                min_id = id

        return f"https://www.azlyrics.com/{artist_links[min_id]}"

# this will flatten all inner lists (all depths) of a list into a list of depth == 1
def flatten_list(lst: list):
    """
    Flattens all inner lists (all depths) of a list into a list of depth == 1.

    Parameters:
    ------
    `lst`: List
        The list to be flattened.

    Returns:
    ------
    `result`: List
        The flattened list.
    """
    result = []
    for element in lst:
        if isinstance(element, list):
            result.extend(flatten_list(element))
        else:
            result.append(element)
    return result

# lyric pre-processing
def process_lyrics(lyrics: str):
    """
    Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
    by `\n`, and removing consecutive whitespace list items.

    Parameters:
    ------
    `lyrics`: str
        The lyrics to be pre-processed.

    Returns:
    ------
    `cleaned_lines`: List
        The pre-processed lyrics.
    """
    # Replace "\r" with an empty string
    lyrics = lyrics.replace('\r', '')

    # Split the lyrics by "\n"
    lines = lyrics.split('\n')

    # Remove consecutive whitespace list items
    cleaned_lines = [line for i, line in enumerate(lines) if i == 0 or lines[i - 1].strip() != '' or line.strip() != '']

    return cleaned_lines

# splitting pre-processed lyrics into sections (this typically loosely matches a song form)
def sectionize(lyrics: str):
    """
    Splits the pre-processed lyrics into sections.

    Parameters:
    ------
    `lyrics`: str
        The pre-processed lyrics.

    Returns:
    ------
    `all_sections`: List
        The lyrics split into sections.
    """
    lyrs_list = process_lyrics(lyrics)
    
    sectd = []
    for line in lyrs_list:
        if line == "":
            sectd.append("#SEC")
        else:
            sectd.append(line)

    del sectd[-1]
    
    all_sections = []
    for id, line in enumerate(sectd):
        if id == 0:
            sec_list = []
        if line == "#SEC":
            all_sections.append(sec_list)
            sec_list = []
        else:
            sec_list.append(line)

    del all_sections[0]

    return all_sections

# sentiment analysis model
def analyze_sentiment_vader(text: str):
    """
    Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner)
    sentiment analysis model.

    Parameters:
    ------
    `text`: str
        The text to be analyzed.

    Returns:
    ------
    `label`: str
        The sentiment label of the text. Can be "POSITIVE", "NEGATIVE", or "NEUTRAL".
    `compound_score`: float
        The compound score of the text.
    """
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)

    # Determine the sentiment label based on the compound score
    compound_score = sentiment_scores["compound"]
    if compound_score >= 0.05:
        label = "POSITIVE"
    elif compound_score <= -0.05:
        label = "NEGATIVE"
    else:
        label = "NEUTRAL"

    return label, compound_score

# get sentiment of all text items in 'lyrics' column
def get_sentiments(df):
    """
    Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.

    This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
    to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
    ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
    each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
    DataFrame is returned.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the 'lyrics' column to be analyzed.

    Returns:
    --------
    df : pandas DataFrame
        The modified DataFrame with sentiment analysis added as new columns.

    Raises:
    -------
    None.
    """
        
    for row in df.index:
        section_lyrics = df.loc[row, 'lyrics']
        sec_lyrs_str = ""
        for line in section_lyrics:
            sec_lyrs_str += line + " "
        label, valence = analyze_sentiment_vader(sec_lyrs_str)
        df.loc[row, 'sentiment_label'] = label
        df.loc[row, 'sentiment_valence'] = valence

    return df

# get just metadata for songs (not lyrics)
def get_metadata(artist_name: str, song_titles: list = None) -> dict:
    """
    Get all metadata for the passed artist and songs.
    
    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.
    
    Returns:
    --------
    dict
        A dictionary containing metadata for each song found. The keys are the song titles and the values are 
        dictionaries containing various metadata for each song.
    """

    urls = find_artist(artist_name)
    azlyrics_artist_name = urls.split("/")[-1][:-5]
       
    API = azapi.AZlyrics('google',  accuracy = 0.6)
    API.artist = azlyrics_artist_name
    all_songs_info = API.getSongs() # dictionary
    
    az_titles = [title for title in all_songs_info]

    if song_titles == None:
        return all_songs_info
    else:
        found_data = {}
        for title in song_titles:
            if title in az_titles:
                found_data[title] = all_songs_info[title]
            else:
                min_id = None
                max_sim = -100
                for id, az_name in enumerate(az_titles):
                    dist = Levenshtein.jaro(title, az_name)
                    if max_sim < dist:
                        max_sim = dist
                        min_id = id

                found_data[az_titles[min_id]] = all_songs_info[az_titles[min_id]]

        return found_data

# combine metadata with found lyrics
def get_all_data(artist_name: str, song_titles: list = None, 
                 delay: tuple = (0.5, 2), print_progress: bool = False):
    """
    Get all metadata and sentiment analysis for the passed artist and songs.
    
    Parameters:
    -----------
    artist_name: str
        The name of the artist to search for.
    song_titles: list
        A list of song titles to get metadata for. If not specified, all songs by the artist will be retrieved.
    delay: tuple
        A tuple containing the minimum and maximum amount of time (in seconds) to wait between requests to avoid
        being banned by the server.
    print_progress: bool
        Whether to print progress messages or not.
    
    Returns:
    --------
    pd.DataFrame
        A pandas DataFrame containing metadata and sentiment analysis for each song found.
    """
    if print_progress == True:
        print(f"------------------------\n\nFinding song data for '{artist_name}'. This may take a few moments...")

    artist_data = get_metadata(artist_name = artist_name, song_titles = song_titles)

    if print_progress == True:
        print(f"\n\t- All metadata found")

    times = []

    for title, mdata in artist_data.items():
        start = time.time()
        # try:
        #     lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
        #     artist_data[title]['lyrics'] = sectionize(lyrics)
        # except: (UnboundLocalError, TypeError, AttributeError)
        #     print(f"\tCouldn't find lyrics to {title}. Moving to next song.")
        #     pass
        
        lyrics = follow_lyrics(lyric_url = artist_data[title]['url'])
        artist_data[title]['lyrics'] = sectionize(lyrics)
        
        # as to not get banned
        random_delay(min_val = delay[0], max_val = delay[1], print_delay = False)

        # time stuff
        times.append(start - time.time())
        avg_time = sum(times) / len(times)
        remaining = abs((len(artist_data) - len(times)) * avg_time) # len(times) also gives the number of iterations completed

        # printing stuff
        if print_progress == True:
            if remaining >= 60: # more than one minute remaining
                remaining = round(remaining / 60, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} minutes")
            else: # less than one minute remaining
                remaining = round(remaining, 2)
                print(f"\t- Lyrics to '{title}' found. Estimated time remaining: {remaining} seconds")

    if print_progress == True:
        print(f"\nAll lyrics and metadata found. Returning structured data.")

    df_dict = {}
    df_dict['artist_name'] = []
    df_dict['song_title'] = []
    df_dict['release_year'] = []
    df_dict['lyrics'] = []
    df_dict['lyrics_section_number'] = []
    df_dict['album_name'] = []
    df_dict['release_type'] = []
    df_dict['lyrics_url'] = []

    for title, info in artist_data.items():
        df_dict['artist_name'].append("John Mayer")
        df_dict['song_title'].append(title)
        df_dict['album_name'].append(info['album'])
        df_dict['release_year'].append(info['year'])
        df_dict['lyrics'].append(info['lyrics'])
        df_dict['lyrics_section_number'].append(len(info['album']))
        df_dict['release_type'].append(info['type'])
        df_dict['lyrics_url'].append(info['url'])

    new_dict = {}
    for key in df_dict:
        new_dict[key] = []

    for i in range(len(df_dict['lyrics'])):
        for id, inner in enumerate(df_dict['lyrics'][i]):
            new_dict['song_title'].append(df_dict['song_title'][i])
            new_dict['release_year'].append(df_dict['release_year'][i])
            new_dict['album_name'].append(df_dict['album_name'][i])
            new_dict['artist_name'].append(df_dict['artist_name'][i])
            new_dict['lyrics'].append(inner)
            new_dict['lyrics_section_number'].append(id)
            new_dict['release_type'].append(df_dict['release_type'][i])
            new_dict['lyrics_url'].append(df_dict['lyrics_url'][i])

    sents_df = get_sentiments(pd.DataFrame(new_dict))

    # reordering columns to better suit the task
    sents_df = sents_df[["artist_name", "song_title", "release_year",
                         "lyrics", "lyrics_section_number", 'sentiment_label',
                         'sentiment_valence', "album_name", "release_type", "lyrics_url"]]

    if print_progress == True:
        print(f"Data retrieval complete!\n\n------------------------")

    return sents_df